1998-05-19 19:47:22 +00:00
|
|
|
/*
|
2000-06-22 00:29:53 +00:00
|
|
|
* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
|
1998-05-19 19:47:22 +00:00
|
|
|
*
|
1998-05-19 20:03:29 +00:00
|
|
|
* The soft updates code is derived from the appendix of a University
|
|
|
|
* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
|
|
|
|
* "Soft Updates: A Solution to the Metadata Update Problem in File
|
|
|
|
* Systems", CSE-TR-254-95, August 1995).
|
1998-05-19 19:47:22 +00:00
|
|
|
*
|
2000-06-22 00:29:53 +00:00
|
|
|
* Further information about soft updates can be obtained from:
|
1998-05-19 19:47:22 +00:00
|
|
|
*
|
2000-06-22 00:29:53 +00:00
|
|
|
* Marshall Kirk McKusick http://www.mckusick.com/softdep/
|
|
|
|
* 1614 Oxford Street mckusick@mckusick.com
|
|
|
|
* Berkeley, CA 94709-1608 +1-510-843-9542
|
1998-05-19 19:47:22 +00:00
|
|
|
* USA
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
*
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
|
|
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
|
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
|
|
* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
|
|
|
|
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
2000-06-22 00:29:53 +00:00
|
|
|
* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
|
1998-05-19 20:03:29 +00:00
|
|
|
*/
|
|
|
|
|
2002-03-15 04:06:10 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
1998-05-19 20:03:29 +00:00
|
|
|
/*
|
|
|
|
* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
1998-05-19 20:03:29 +00:00
|
|
|
#ifndef DIAGNOSTIC
|
|
|
|
#define DIAGNOSTIC
|
|
|
|
#endif
|
|
|
|
#ifndef DEBUG
|
|
|
|
#define DEBUG
|
|
|
|
#endif
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/kernel.h>
|
1999-06-27 13:26:23 +00:00
|
|
|
#include <sys/systm.h>
|
2002-06-21 06:18:05 +00:00
|
|
|
#include <sys/stdint.h>
|
2000-05-05 09:59:14 +00:00
|
|
|
#include <sys/bio.h>
|
1999-06-27 13:26:23 +00:00
|
|
|
#include <sys/buf.h>
|
1998-05-19 19:47:22 +00:00
|
|
|
#include <sys/malloc.h>
|
|
|
|
#include <sys/mount.h>
|
1998-05-19 20:03:29 +00:00
|
|
|
#include <sys/proc.h>
|
2001-05-08 07:42:20 +00:00
|
|
|
#include <sys/stat.h>
|
1998-05-19 19:47:22 +00:00
|
|
|
#include <sys/syslog.h>
|
|
|
|
#include <sys/vnode.h>
|
1999-08-08 18:43:05 +00:00
|
|
|
#include <sys/conf.h>
|
1998-05-19 19:47:22 +00:00
|
|
|
#include <ufs/ufs/dir.h>
|
Introduce extended attribute support for FFS, allowing arbitrary
(name, value) pairs to be associated with inodes. This support is
used for ACLs, MAC labels, and Capabilities in the TrustedBSD
security extensions, which are currently under development.
In this implementation, attributes are backed to data vnodes in the
style of the quota support in FFS. Support for FFS extended
attributes may be enabled using the FFS_EXTATTR kernel option
(disabled by default). Userland utilities and man pages will be
committed in the next batch. VFS interfaces and man pages have
been in the repo since 4.0-RELEASE and are unchanged.
o ufs/ufs/extattr.h: UFS-specific extattr defines
o ufs/ufs/ufs_extattr.c: bulk of support routines
o ufs/{ufs,ffs,mfs}/*.[ch]: hooks and extattr.h includes
o contrib/softupdates/ffs_softdep.c: extattr.h includes
o conf/options, conf/files, i386/conf/LINT: added FFS_EXTATTR
o coda/coda_vfsops.c: XXX required extattr.h due to ufsmount.h
(This should not be the case, and will be fixed in a future commit)
Currently attributes are not supported in MFS. This will be fixed.
Reviewed by: adrian, bp, freebsd-fs, other unthanked souls
Obtained from: TrustedBSD Project
2000-04-15 03:34:27 +00:00
|
|
|
#include <ufs/ufs/extattr.h>
|
1998-05-19 19:47:22 +00:00
|
|
|
#include <ufs/ufs/quota.h>
|
|
|
|
#include <ufs/ufs/inode.h>
|
|
|
|
#include <ufs/ufs/ufsmount.h>
|
|
|
|
#include <ufs/ffs/fs.h>
|
|
|
|
#include <ufs/ffs/softdep.h>
|
|
|
|
#include <ufs/ffs/ffs_extern.h>
|
|
|
|
#include <ufs/ufs/ufs_extern.h>
|
|
|
|
|
|
|
|
/*
|
1998-05-19 22:54:53 +00:00
|
|
|
* These definitions need to be adapted to the system to which
|
|
|
|
* this file is being ported.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
1998-05-19 20:18:42 +00:00
|
|
|
/*
|
|
|
|
* malloc types defined for the softdep system.
|
|
|
|
*/
|
2000-12-08 20:09:00 +00:00
|
|
|
static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
|
|
|
|
static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
|
|
|
|
static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
|
|
|
|
static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
|
|
|
|
static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
|
|
|
|
static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
|
|
|
|
static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
|
|
|
|
static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
|
|
|
|
static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
|
|
|
|
static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
|
|
|
|
static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
|
|
|
|
static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
|
|
|
|
static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
|
2001-05-17 07:24:03 +00:00
|
|
|
static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
|
1998-05-19 20:18:42 +00:00
|
|
|
|
2003-02-19 05:47:46 +00:00
|
|
|
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
#define D_PAGEDEP 0
|
|
|
|
#define D_INODEDEP 1
|
|
|
|
#define D_NEWBLK 2
|
|
|
|
#define D_BMSAFEMAP 3
|
|
|
|
#define D_ALLOCDIRECT 4
|
|
|
|
#define D_INDIRDEP 5
|
|
|
|
#define D_ALLOCINDIR 6
|
|
|
|
#define D_FREEFRAG 7
|
|
|
|
#define D_FREEBLKS 8
|
|
|
|
#define D_FREEFILE 9
|
|
|
|
#define D_DIRADD 10
|
|
|
|
#define D_MKDIR 11
|
|
|
|
#define D_DIRREM 12
|
2001-05-17 07:24:03 +00:00
|
|
|
#define D_NEWDIRBLK 13
|
|
|
|
#define D_LAST D_NEWDIRBLK
|
1998-05-19 20:18:42 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* translate from workitem type to memory type
|
|
|
|
* MUST match the defines above, such that memtype[D_XXX] == M_XXX
|
|
|
|
*/
|
|
|
|
static struct malloc_type *memtype[] = {
|
|
|
|
M_PAGEDEP,
|
|
|
|
M_INODEDEP,
|
|
|
|
M_NEWBLK,
|
|
|
|
M_BMSAFEMAP,
|
|
|
|
M_ALLOCDIRECT,
|
|
|
|
M_INDIRDEP,
|
|
|
|
M_ALLOCINDIR,
|
|
|
|
M_FREEFRAG,
|
|
|
|
M_FREEBLKS,
|
|
|
|
M_FREEFILE,
|
|
|
|
M_DIRADD,
|
|
|
|
M_MKDIR,
|
2001-05-17 07:24:03 +00:00
|
|
|
M_DIRREM,
|
|
|
|
M_NEWDIRBLK
|
1998-05-19 20:18:42 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
#define DtoM(type) (memtype[type])
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
 * Names of malloc types.
 *
 * TYPENAME maps a workitem type (one of the D_XXX values) to the short
 * description of the corresponding malloc type.  Every index from 0
 * through D_LAST inclusive is a valid slot in memtype[], so the bound
 * check must include D_LAST itself.  Any other value (including negative
 * ones, which the unsigned cast turns into large numbers) yields "???"
 * without touching memtype[].
 */
#define TYPENAME(type)  \
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
|
|
|
|
/*
|
|
|
|
* End system adaptation definitions.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Internal function prototypes.
|
|
|
|
*/
|
2002-03-19 22:40:48 +00:00
|
|
|
static void softdep_error(char *, int);
|
|
|
|
static void drain_output(struct vnode *, int);
|
|
|
|
static int getdirtybuf(struct buf **, int);
|
|
|
|
static void clear_remove(struct thread *);
|
|
|
|
static void clear_inodedeps(struct thread *);
|
|
|
|
static int flush_pagedep_deps(struct vnode *, struct mount *,
|
|
|
|
struct diraddhd *);
|
|
|
|
static int flush_inodedep_deps(struct fs *, ino_t);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
static int flush_deplist(struct allocdirectlst *, int, int *);
|
2002-03-19 22:40:48 +00:00
|
|
|
static int handle_written_filepage(struct pagedep *, struct buf *);
|
|
|
|
static void diradd_inode_written(struct diradd *, struct inodedep *);
|
|
|
|
static int handle_written_inodeblock(struct inodedep *, struct buf *);
|
|
|
|
static void handle_allocdirect_partdone(struct allocdirect *);
|
|
|
|
static void handle_allocindir_partdone(struct allocindir *);
|
|
|
|
static void initiate_write_filepage(struct pagedep *, struct buf *);
|
|
|
|
static void handle_written_mkdir(struct mkdir *, int);
|
2002-06-21 06:18:05 +00:00
|
|
|
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
|
|
|
|
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
|
2002-03-19 22:40:48 +00:00
|
|
|
static void handle_workitem_freefile(struct freefile *);
|
|
|
|
static void handle_workitem_remove(struct dirrem *, struct vnode *);
|
|
|
|
static struct dirrem *newdirrem(struct buf *, struct inode *,
|
|
|
|
struct inode *, int, struct dirrem **);
|
|
|
|
static void free_diradd(struct diradd *);
|
|
|
|
static void free_allocindir(struct allocindir *, struct inodedep *);
|
|
|
|
static void free_newdirblk(struct newdirblk *);
|
2002-06-21 06:18:05 +00:00
|
|
|
static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
|
|
|
|
ufs2_daddr_t *);
|
2002-03-19 22:40:48 +00:00
|
|
|
static void deallocate_dependencies(struct buf *, struct inodedep *);
|
|
|
|
static void free_allocdirect(struct allocdirectlst *,
|
|
|
|
struct allocdirect *, int);
|
|
|
|
static int check_inode_unwritten(struct inodedep *);
|
|
|
|
static int free_inodedep(struct inodedep *);
|
|
|
|
static void handle_workitem_freeblocks(struct freeblks *, int);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
|
2002-03-19 22:40:48 +00:00
|
|
|
static void setup_allocindir_phase2(struct buf *, struct inode *,
|
|
|
|
struct allocindir *);
|
2002-06-21 06:18:05 +00:00
|
|
|
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
|
|
|
|
ufs2_daddr_t);
|
2002-03-19 22:40:48 +00:00
|
|
|
static void handle_workitem_freefrag(struct freefrag *);
|
2002-06-21 06:18:05 +00:00
|
|
|
static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
|
2002-03-19 22:40:48 +00:00
|
|
|
static void allocdirect_merge(struct allocdirectlst *,
|
|
|
|
struct allocdirect *, struct allocdirect *);
|
|
|
|
static struct bmsafemap *bmsafemap_lookup(struct buf *);
|
2002-06-21 06:18:05 +00:00
|
|
|
static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
|
2002-03-19 22:40:48 +00:00
|
|
|
static int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
|
|
|
|
static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
|
|
|
|
static void pause_timer(void *);
|
|
|
|
static int request_cleanup(int, int);
|
|
|
|
static int process_worklist_item(struct mount *, int);
|
|
|
|
static void add_to_worklist(struct worklist *);
|
1998-05-19 22:54:53 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Exported softdep operations.
|
|
|
|
*/
|
2002-03-19 22:40:48 +00:00
|
|
|
static void softdep_disk_io_initiation(struct buf *);
|
|
|
|
static void softdep_disk_write_complete(struct buf *);
|
|
|
|
static void softdep_deallocate_dependencies(struct buf *);
|
|
|
|
static void softdep_move_dependencies(struct buf *, struct buf *);
|
|
|
|
static int softdep_count_dependencies(struct buf *bp, int);
|
2000-01-09 22:40:09 +00:00
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Locking primitives.
|
|
|
|
*
|
|
|
|
* For a uniprocessor, all we need to do is protect against disk
|
|
|
|
* interrupts. For a multiprocessor, this lock would have to be
|
|
|
|
* a mutex. A single mutex is used throughout this file, though
|
|
|
|
* finer grain locking could be used if contention warranted it.
|
|
|
|
*
|
|
|
|
* For a multiprocessor, the sleep call would accept a lock and
|
|
|
|
* release it after the sleep processing was complete. In a uniprocessor
|
|
|
|
* implementation there is no such interlock, so we simply mark
|
|
|
|
* the places where it needs to be done with the `interlocked' form
|
|
|
|
* of the lock calls. Since the uniprocessor sleep already interlocks
|
|
|
|
* the spl, there is nothing that really needs to be done.
|
|
|
|
*/
|
|
|
|
#ifndef /* NOT */ DEBUG
|
1998-05-19 20:03:29 +00:00
|
|
|
static struct lockit {
|
|
|
|
int lkt_spl;
|
|
|
|
} lk = { 0 };
|
|
|
|
#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
|
|
|
|
#define FREE_LOCK(lk) splx((lk)->lkt_spl)
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#else /* DEBUG */
|
2001-09-27 21:04:13 +00:00
|
|
|
#define NOHOLDER ((struct thread *)-1)
|
|
|
|
#define SPECIAL_FLAG ((struct thread *)-2)
|
1998-05-19 19:47:22 +00:00
|
|
|
static struct lockit {
|
|
|
|
int lkt_spl;
|
2001-09-27 21:04:13 +00:00
|
|
|
struct thread *lkt_held;
|
2001-09-12 08:38:13 +00:00
|
|
|
} lk = { 0, NOHOLDER };
|
1998-05-19 19:47:22 +00:00
|
|
|
|
2002-03-19 22:40:48 +00:00
|
|
|
static void acquire_lock(struct lockit *);
|
|
|
|
static void free_lock(struct lockit *);
|
|
|
|
void softdep_panic(char *);
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#define ACQUIRE_LOCK(lk) acquire_lock(lk)
|
|
|
|
#define FREE_LOCK(lk) free_lock(lk)
|
|
|
|
|
|
|
|
static void
|
|
|
|
acquire_lock(lk)
|
|
|
|
struct lockit *lk;
|
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *holder;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk->lkt_held != NOHOLDER) {
|
2001-02-23 09:01:31 +00:00
|
|
|
holder = lk->lkt_held;
|
|
|
|
FREE_LOCK(lk);
|
2001-09-12 08:38:13 +00:00
|
|
|
if (holder == curthread)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_lock: locking against myself");
|
|
|
|
else
|
2001-09-12 08:38:13 +00:00
|
|
|
panic("softdep_lock: lock held by %p", holder);
|
1999-05-22 04:43:04 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
lk->lkt_spl = splbio();
|
2001-09-12 08:38:13 +00:00
|
|
|
lk->lkt_held = curthread;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
free_lock(lk)
|
|
|
|
struct lockit *lk;
|
|
|
|
{
|
|
|
|
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk->lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_unlock: lock not held");
|
2001-09-12 08:38:13 +00:00
|
|
|
lk->lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
splx(lk->lkt_spl);
|
|
|
|
}
|
|
|
|
|
2002-01-12 20:57:36 +00:00
|
|
|
/*
|
|
|
|
* Function to release soft updates lock and panic.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_panic(msg)
|
|
|
|
char *msg;
|
|
|
|
{
|
|
|
|
|
|
|
|
if (lk.lkt_held != NOHOLDER)
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic(msg);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
|
2002-08-04 10:29:36 +00:00
|
|
|
static int interlocked_sleep(struct lockit *, int, void *, struct mtx *, int,
|
2002-03-19 22:40:48 +00:00
|
|
|
const char *, int);
|
2002-01-12 20:57:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* When going to sleep, we must save our SPL so that it does
|
|
|
|
* not get lost if some other process uses the lock while we
|
|
|
|
* are sleeping. We restore it after we have slept. This routine
|
|
|
|
* wraps the interlocking with functions that sleep. The list
|
|
|
|
* below enumerates the available set of operations.
|
|
|
|
*/
|
|
|
|
#define UNKNOWN 0
|
|
|
|
#define SLEEP 1
|
|
|
|
#define LOCKBUF 2
|
|
|
|
|
|
|
|
static int
|
2002-08-04 10:29:36 +00:00
|
|
|
interlocked_sleep(lk, op, ident, mtx, flags, wmesg, timo)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct lockit *lk;
|
2002-01-12 20:57:36 +00:00
|
|
|
int op;
|
|
|
|
void *ident;
|
2002-08-04 10:29:36 +00:00
|
|
|
struct mtx *mtx;
|
2002-01-12 20:57:36 +00:00
|
|
|
int flags;
|
|
|
|
const char *wmesg;
|
|
|
|
int timo;
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *holder;
|
2002-01-12 20:57:36 +00:00
|
|
|
int s, retval;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
2002-01-12 20:57:36 +00:00
|
|
|
s = lk->lkt_spl;
|
|
|
|
# ifdef DEBUG
|
|
|
|
if (lk->lkt_held == NOHOLDER)
|
|
|
|
panic("interlocked_sleep: lock not held");
|
|
|
|
lk->lkt_held = NOHOLDER;
|
|
|
|
# endif /* DEBUG */
|
|
|
|
switch (op) {
|
|
|
|
case SLEEP:
|
2002-08-04 10:29:36 +00:00
|
|
|
retval = msleep(ident, mtx, flags, wmesg, timo);
|
2002-01-12 20:57:36 +00:00
|
|
|
break;
|
|
|
|
case LOCKBUF:
|
|
|
|
retval = BUF_LOCK((struct buf *)ident, flags);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("interlocked_sleep: unknown operation");
|
|
|
|
}
|
|
|
|
# ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk->lkt_held != NOHOLDER) {
|
2001-02-23 09:01:31 +00:00
|
|
|
holder = lk->lkt_held;
|
|
|
|
FREE_LOCK(lk);
|
2001-09-12 08:38:13 +00:00
|
|
|
if (holder == curthread)
|
2002-01-12 20:57:36 +00:00
|
|
|
panic("interlocked_sleep: locking against self");
|
1998-05-19 19:47:22 +00:00
|
|
|
else
|
2002-01-12 20:57:36 +00:00
|
|
|
panic("interlocked_sleep: lock held by %p", holder);
|
1999-05-22 04:43:04 +00:00
|
|
|
}
|
2001-09-12 08:38:13 +00:00
|
|
|
lk->lkt_held = curthread;
|
2002-01-12 20:57:36 +00:00
|
|
|
# endif /* DEBUG */
|
|
|
|
lk->lkt_spl = s;
|
|
|
|
return (retval);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Place holder for real semaphores.
|
|
|
|
*/
|
|
|
|
struct sema {
|
|
|
|
int value;
|
2001-09-27 21:04:13 +00:00
|
|
|
struct thread *holder;
|
1998-05-19 19:47:22 +00:00
|
|
|
char *name;
|
|
|
|
int prio;
|
|
|
|
int timo;
|
|
|
|
};
|
2002-03-19 22:40:48 +00:00
|
|
|
static void sema_init(struct sema *, char *, int, int);
|
|
|
|
static int sema_get(struct sema *, struct lockit *);
|
|
|
|
static void sema_release(struct sema *);
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
static void
|
|
|
|
sema_init(semap, name, prio, timo)
|
|
|
|
struct sema *semap;
|
|
|
|
char *name;
|
|
|
|
int prio, timo;
|
|
|
|
{
|
|
|
|
|
2001-09-12 08:38:13 +00:00
|
|
|
semap->holder = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
semap->value = 0;
|
|
|
|
semap->name = name;
|
|
|
|
semap->prio = prio;
|
|
|
|
semap->timo = timo;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
sema_get(semap, interlock)
|
|
|
|
struct sema *semap;
|
|
|
|
struct lockit *interlock;
|
|
|
|
{
|
|
|
|
|
|
|
|
if (semap->value++ > 0) {
|
|
|
|
if (interlock != NULL) {
|
2002-01-12 20:57:36 +00:00
|
|
|
interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
|
2002-08-04 10:29:36 +00:00
|
|
|
NULL, semap->prio, semap->name,
|
|
|
|
semap->timo);
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(interlock);
|
2002-01-12 20:57:36 +00:00
|
|
|
} else {
|
|
|
|
tsleep((caddr_t)semap, semap->prio, semap->name,
|
|
|
|
semap->timo);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
2001-09-12 08:38:13 +00:00
|
|
|
semap->holder = curthread;
|
1998-05-19 19:47:22 +00:00
|
|
|
if (interlock != NULL)
|
|
|
|
FREE_LOCK(interlock);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
sema_release(semap)
|
|
|
|
struct sema *semap;
|
|
|
|
{
|
|
|
|
|
2001-09-12 08:38:13 +00:00
|
|
|
if (semap->value <= 0 || semap->holder != curthread) {
|
|
|
|
if (lk.lkt_held != NOHOLDER)
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("sema_release: not held");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
if (--semap->value > 0) {
|
|
|
|
semap->value = 0;
|
|
|
|
wakeup(semap);
|
|
|
|
}
|
2001-09-12 08:38:13 +00:00
|
|
|
semap->holder = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Worklist queue management.
|
|
|
|
* These routines require that the lock be held.
|
|
|
|
*/
|
|
|
|
#ifndef /* NOT */ DEBUG
|
|
|
|
#define WORKLIST_INSERT(head, item) do { \
|
1998-05-19 20:03:29 +00:00
|
|
|
(item)->wk_state |= ONWORKLIST; \
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_INSERT_HEAD(head, item, wk_list); \
|
|
|
|
} while (0)
|
|
|
|
#define WORKLIST_REMOVE(item) do { \
|
1998-05-19 20:03:29 +00:00
|
|
|
(item)->wk_state &= ~ONWORKLIST; \
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_REMOVE(item, wk_list); \
|
|
|
|
} while (0)
|
1998-05-19 20:18:42 +00:00
|
|
|
#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#else /* DEBUG */
|
2002-03-19 22:40:48 +00:00
|
|
|
static void worklist_insert(struct workhead *, struct worklist *);
|
|
|
|
static void worklist_remove(struct worklist *);
|
|
|
|
static void workitem_free(struct worklist *, int);
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
|
|
|
|
#define WORKLIST_REMOVE(item) worklist_remove(item)
|
|
|
|
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
|
|
|
|
|
|
|
|
static void
|
|
|
|
worklist_insert(head, item)
|
|
|
|
struct workhead *head;
|
|
|
|
struct worklist *item;
|
|
|
|
{
|
|
|
|
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("worklist_insert: lock not held");
|
2001-02-23 09:01:31 +00:00
|
|
|
if (item->wk_state & ONWORKLIST) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("worklist_insert: already on list");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
item->wk_state |= ONWORKLIST;
|
|
|
|
LIST_INSERT_HEAD(head, item, wk_list);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
worklist_remove(item)
|
|
|
|
struct worklist *item;
|
|
|
|
{
|
|
|
|
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("worklist_remove: lock not held");
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((item->wk_state & ONWORKLIST) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("worklist_remove: not on list");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
item->wk_state &= ~ONWORKLIST;
|
|
|
|
LIST_REMOVE(item, wk_list);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
workitem_free(item, type)
|
|
|
|
struct worklist *item;
|
|
|
|
int type;
|
|
|
|
{
|
|
|
|
|
2001-02-23 09:01:31 +00:00
|
|
|
if (item->wk_state & ONWORKLIST) {
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held != NOHOLDER)
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("workitem_free: still on list");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
|
|
|
if (item->wk_type != type) {
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held != NOHOLDER)
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("workitem_free: type mismatch");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 20:18:42 +00:00
|
|
|
FREE(item, DtoM(type));
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Workitem queue management
|
|
|
|
*/
|
|
|
|
static struct workhead softdep_workitem_pending;
|
2000-12-13 08:30:35 +00:00
|
|
|
static int num_on_worklist; /* number of worklist items to be processed */
|
|
|
|
static int softdep_worklist_busy; /* 1 => trying to do unmount */
|
2001-01-30 06:31:59 +00:00
|
|
|
static int softdep_worklist_req; /* serialized waiters */
|
1998-05-19 21:45:53 +00:00
|
|
|
static int max_softdeps; /* maximum number of structs before slowdown */
|
2003-01-07 18:23:50 +00:00
|
|
|
static int maxindirdeps = 50; /* max number of indirdeps before slowdown */
|
1998-05-19 21:45:53 +00:00
|
|
|
static int tickdelay = 2; /* number of ticks to pause during slowdown */
|
|
|
|
static int proc_waiting; /* tracks whether we have a timeout posted */
|
2000-11-20 06:22:39 +00:00
|
|
|
static int *stat_countp; /* statistic to count in proc_waiting timeout */
|
|
|
|
static struct callout_handle handle; /* handle on posted proc_waiting timeout */
|
2001-09-12 08:38:13 +00:00
|
|
|
static struct thread *filesys_syncer; /* proc of filesystem syncer process */
|
1999-05-07 02:26:47 +00:00
|
|
|
static int req_clear_inodedeps; /* syncer process flush some inodedeps */
|
2002-01-22 06:17:22 +00:00
|
|
|
#define FLUSH_INODES 1
|
1999-05-07 02:26:47 +00:00
|
|
|
static int req_clear_remove; /* syncer process flush some freeblks */
|
2002-01-22 06:17:22 +00:00
|
|
|
#define FLUSH_REMOVE 2
|
|
|
|
#define FLUSH_REMOVE_WAIT 3
|
1999-05-14 01:26:46 +00:00
|
|
|
/*
 * Runtime statistics.  All counters start at zero; they are exported
 * through sysctl only when the kernel is built with DEBUG.
 */

/* Cleanup pushes and limit tracking. */
static int stat_worklist_push;	/* number of worklist cleanups */
static int stat_blk_limit_push;	/* number of times block limit neared */
static int stat_ino_limit_push;	/* number of times inode limit neared */

/* Slowdowns actually imposed. */
static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */

/* Buffers that had to be redirtied, by reason. */
static int stat_indir_blk_ptrs;	/* indirect block pointers not written */
static int stat_inode_bitmap;	/* inode bitmap not written */
static int stat_direct_blk_ptrs; /* direct block pointers not written */
static int stat_dir_entry;	/* directory entry cannot be written */
|
1998-05-19 21:45:53 +00:00
|
|
|
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>

/* Tunable limits, adjustable at run time under debug.*. */
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW,
    &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW,
    &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW,
    &maxindirdeps, 0, "");

/* Statistics counters; writable so that they can be reset. */
SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW,
    &stat_worklist_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW,
    &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW,
    &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
    &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
    &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
    &stat_sync_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
    &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW,
    &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
    &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW,
    &stat_dir_entry, 0, "");
#endif /* DEBUG */
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Add an item to the end of the work queue.
|
|
|
|
* This routine requires that the lock be held.
|
|
|
|
* This is the only routine that adds items to the list.
|
|
|
|
* The following routine is the only one that removes items
|
|
|
|
* and does so in order from first to last.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
add_to_worklist(wk)
|
|
|
|
struct worklist *wk;
|
|
|
|
{
|
|
|
|
static struct worklist *worklist_tail;
|
|
|
|
|
2001-02-23 09:01:31 +00:00
|
|
|
if (wk->wk_state & ONWORKLIST) {
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held != NOHOLDER)
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("add_to_worklist: already on list");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
wk->wk_state |= ONWORKLIST;
|
1999-05-14 01:26:46 +00:00
|
|
|
if (LIST_FIRST(&softdep_workitem_pending) == NULL)
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
|
1999-05-14 01:26:46 +00:00
|
|
|
else
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
|
|
|
|
worklist_tail = wk;
|
2000-12-13 08:30:35 +00:00
|
|
|
num_on_worklist += 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Process that runs once per second to handle items in the background queue.
|
|
|
|
*
|
|
|
|
* Note that we ensure that everything is done in the order in which they
|
|
|
|
* appear in the queue. The code below depends on this property to ensure
|
|
|
|
* that blocks of a file are freed before the inode itself is freed. This
|
|
|
|
* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
|
|
|
|
* until all the old ones have been purged from the dependency lists.
|
|
|
|
*/
|
2000-06-16 13:00:33 +00:00
|
|
|
int
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_process_worklist(matchmnt)
|
|
|
|
struct mount *matchmnt;
|
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td = curthread;
|
2002-03-17 01:25:47 +00:00
|
|
|
int cnt, matchcnt, loopcount;
|
2000-12-13 08:30:35 +00:00
|
|
|
long starttime;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
1998-05-19 21:45:53 +00:00
|
|
|
/*
|
1999-06-15 23:37:29 +00:00
|
|
|
* Record the process identifier of our caller so that we can give
|
|
|
|
* this process preferential treatment in request_cleanup below.
|
1998-05-19 21:45:53 +00:00
|
|
|
*/
|
2001-09-12 08:38:13 +00:00
|
|
|
filesys_syncer = td;
|
1998-05-19 19:47:22 +00:00
|
|
|
matchcnt = 0;
|
2001-01-30 06:31:59 +00:00
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* There is no danger of having multiple processes run this
|
2001-01-30 06:31:59 +00:00
|
|
|
* code, but we have to single-thread it when softdep_flushfiles()
|
|
|
|
* is in operation to get an accurate count of the number of items
|
1998-05-19 19:47:22 +00:00
|
|
|
* related to its mount point that are in the list.
|
|
|
|
*/
|
2001-01-30 06:31:59 +00:00
|
|
|
if (matchmnt == NULL) {
|
|
|
|
if (softdep_worklist_busy < 0)
|
|
|
|
return(-1);
|
|
|
|
softdep_worklist_busy += 1;
|
|
|
|
}
|
|
|
|
|
1999-05-07 02:26:47 +00:00
|
|
|
/*
|
|
|
|
* If requested, try removing inode or removal dependencies.
|
|
|
|
*/
|
|
|
|
if (req_clear_inodedeps) {
|
2001-09-12 08:38:13 +00:00
|
|
|
clear_inodedeps(td);
|
2000-11-20 06:22:39 +00:00
|
|
|
req_clear_inodedeps -= 1;
|
|
|
|
wakeup_one(&proc_waiting);
|
1999-05-07 02:26:47 +00:00
|
|
|
}
|
|
|
|
if (req_clear_remove) {
|
2001-09-12 08:38:13 +00:00
|
|
|
clear_remove(td);
|
2000-11-20 06:22:39 +00:00
|
|
|
req_clear_remove -= 1;
|
|
|
|
wakeup_one(&proc_waiting);
|
1999-05-07 02:26:47 +00:00
|
|
|
}
|
2000-01-09 23:35:38 +00:00
|
|
|
loopcount = 1;
|
2000-12-13 08:30:35 +00:00
|
|
|
starttime = time_second;
|
|
|
|
while (num_on_worklist > 0) {
|
2002-03-17 01:25:47 +00:00
|
|
|
if ((cnt = process_worklist_item(matchmnt, 0)) == -1)
|
|
|
|
break;
|
|
|
|
else
|
|
|
|
matchcnt += cnt;
|
2001-01-30 06:31:59 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If a umount operation wants to run the worklist
|
|
|
|
* accurately, abort.
|
|
|
|
*/
|
|
|
|
if (softdep_worklist_req && matchmnt == NULL) {
|
|
|
|
matchcnt = -1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
1999-05-07 02:26:47 +00:00
|
|
|
/*
|
|
|
|
* If requested, try removing inode or removal dependencies.
|
|
|
|
*/
|
|
|
|
if (req_clear_inodedeps) {
|
2001-09-12 08:38:13 +00:00
|
|
|
clear_inodedeps(td);
|
2000-11-20 06:22:39 +00:00
|
|
|
req_clear_inodedeps -= 1;
|
|
|
|
wakeup_one(&proc_waiting);
|
1999-05-07 02:26:47 +00:00
|
|
|
}
|
|
|
|
if (req_clear_remove) {
|
2001-09-12 08:38:13 +00:00
|
|
|
clear_remove(td);
|
2000-11-20 06:22:39 +00:00
|
|
|
req_clear_remove -= 1;
|
|
|
|
wakeup_one(&proc_waiting);
|
1999-05-07 02:26:47 +00:00
|
|
|
}
|
2000-01-09 23:35:38 +00:00
|
|
|
/*
|
|
|
|
* We do not generally want to stop for buffer space, but if
|
|
|
|
* we are really being a buffer hog, we will stop and wait.
|
|
|
|
*/
|
|
|
|
if (loopcount++ % 128 == 0)
|
|
|
|
bwillwrite();
|
2000-12-13 08:30:35 +00:00
|
|
|
/*
|
|
|
|
* Never allow processing to run for more than one
|
|
|
|
* second. Otherwise the other syncer tasks may get
|
|
|
|
* excessively backlogged.
|
|
|
|
*/
|
2001-01-30 06:31:59 +00:00
|
|
|
if (starttime != time_second && matchmnt == NULL) {
|
|
|
|
matchcnt = -1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (matchmnt == NULL) {
|
|
|
|
softdep_worklist_busy -= 1;
|
|
|
|
if (softdep_worklist_req && softdep_worklist_busy == 0)
|
|
|
|
wakeup(&softdep_worklist_req);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2000-12-13 08:30:35 +00:00
|
|
|
return (matchcnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Process one item on the worklist.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
process_worklist_item(matchmnt, flags)
|
|
|
|
struct mount *matchmnt;
|
|
|
|
int flags;
|
|
|
|
{
|
|
|
|
struct worklist *wk;
|
|
|
|
struct mount *mp;
|
|
|
|
struct vnode *vp;
|
|
|
|
int matchcnt = 0;
|
|
|
|
|
2002-10-23 21:47:02 +00:00
|
|
|
/*
|
|
|
|
* If we are being called because of a process doing a
|
|
|
|
* copy-on-write, then it is not safe to write as we may
|
|
|
|
* recurse into the copy-on-write routine.
|
|
|
|
*/
|
|
|
|
if (curthread->td_proc->p_flag & P_COWINPROGRESS)
|
|
|
|
return (-1);
|
2000-12-13 08:30:35 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
/*
|
|
|
|
* Normally we just process each item on the worklist in order.
|
|
|
|
* However, if we are in a situation where we cannot lock any
|
|
|
|
* inodes, we have to skip over any dirrem requests whose
|
|
|
|
* vnodes are resident and locked.
|
|
|
|
*/
|
2002-03-17 01:25:47 +00:00
|
|
|
vp = NULL;
|
2000-12-13 08:30:35 +00:00
|
|
|
LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
|
2002-03-17 01:25:47 +00:00
|
|
|
if (wk->wk_state & INPROGRESS)
|
|
|
|
continue;
|
2000-12-13 08:30:35 +00:00
|
|
|
if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
|
|
|
|
break;
|
2002-03-17 01:25:47 +00:00
|
|
|
wk->wk_state |= INPROGRESS;
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
VFS_VGET(WK_DIRREM(wk)->dm_mnt, WK_DIRREM(wk)->dm_oldinum,
|
|
|
|
LK_NOWAIT | LK_EXCLUSIVE, &vp);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
wk->wk_state &= ~INPROGRESS;
|
|
|
|
if (vp != NULL)
|
2000-12-13 08:30:35 +00:00
|
|
|
break;
|
|
|
|
}
|
2001-03-01 21:43:46 +00:00
|
|
|
if (wk == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
2002-01-22 06:17:22 +00:00
|
|
|
return (-1);
|
2001-03-01 21:43:46 +00:00
|
|
|
}
|
2000-12-13 08:30:35 +00:00
|
|
|
WORKLIST_REMOVE(wk);
|
|
|
|
num_on_worklist -= 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
2000-12-13 08:30:35 +00:00
|
|
|
switch (wk->wk_type) {
|
|
|
|
|
|
|
|
case D_DIRREM:
|
|
|
|
/* removal of a directory entry */
|
|
|
|
mp = WK_DIRREM(wk)->dm_mnt;
|
|
|
|
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
|
|
|
|
panic("%s: dirrem on suspended filesystem",
|
|
|
|
"process_worklist_item");
|
|
|
|
if (mp == matchmnt)
|
|
|
|
matchcnt += 1;
|
2002-03-17 01:25:47 +00:00
|
|
|
handle_workitem_remove(WK_DIRREM(wk), vp);
|
2000-12-13 08:30:35 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case D_FREEBLKS:
|
|
|
|
/* releasing blocks and/or fragments from a file */
|
|
|
|
mp = WK_FREEBLKS(wk)->fb_mnt;
|
|
|
|
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
|
|
|
|
panic("%s: freeblks on suspended filesystem",
|
|
|
|
"process_worklist_item");
|
|
|
|
if (mp == matchmnt)
|
|
|
|
matchcnt += 1;
|
2001-03-21 04:09:01 +00:00
|
|
|
handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
|
2000-12-13 08:30:35 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case D_FREEFRAG:
|
|
|
|
/* releasing a fragment when replaced as a file grows */
|
|
|
|
mp = WK_FREEFRAG(wk)->ff_mnt;
|
|
|
|
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
|
|
|
|
panic("%s: freefrag on suspended filesystem",
|
|
|
|
"process_worklist_item");
|
|
|
|
if (mp == matchmnt)
|
|
|
|
matchcnt += 1;
|
|
|
|
handle_workitem_freefrag(WK_FREEFRAG(wk));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case D_FREEFILE:
|
|
|
|
/* releasing an inode when its link count drops to 0 */
|
|
|
|
mp = WK_FREEFILE(wk)->fx_mnt;
|
|
|
|
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
|
|
|
|
panic("%s: freefile on suspended filesystem",
|
|
|
|
"process_worklist_item");
|
|
|
|
if (mp == matchmnt)
|
|
|
|
matchcnt += 1;
|
|
|
|
handle_workitem_freefile(WK_FREEFILE(wk));
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
panic("%s_process_worklist: Unknown type %s",
|
|
|
|
"softdep", TYPENAME(wk->wk_type));
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
return (matchcnt);
|
|
|
|
}
|
|
|
|
|
2000-01-10 00:24:24 +00:00
|
|
|
/*
|
|
|
|
* Move dependencies from one buffer to another.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
softdep_move_dependencies(oldbp, newbp)
|
|
|
|
struct buf *oldbp;
|
|
|
|
struct buf *newbp;
|
|
|
|
{
|
|
|
|
struct worklist *wk, *wktail;
|
|
|
|
|
|
|
|
if (LIST_FIRST(&newbp->b_dep) != NULL)
|
|
|
|
panic("softdep_move_dependencies: need merge code");
|
|
|
|
wktail = 0;
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2000-01-14 04:39:28 +00:00
|
|
|
while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
|
2000-01-10 00:24:24 +00:00
|
|
|
LIST_REMOVE(wk, wk_list);
|
|
|
|
if (wktail == 0)
|
|
|
|
LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
|
|
|
|
else
|
|
|
|
LIST_INSERT_AFTER(wktail, wk, wk_list);
|
|
|
|
wktail = wk;
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Purge the work list of all items associated with a particular mount point.
|
|
|
|
*/
|
|
|
|
int
|
2001-09-12 08:38:13 +00:00
|
|
|
softdep_flushworklist(oldmnt, countp, td)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct mount *oldmnt;
|
2000-07-24 05:28:33 +00:00
|
|
|
int *countp;
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
struct vnode *devvp;
|
2000-07-24 05:28:33 +00:00
|
|
|
int count, error = 0;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
2001-01-30 06:31:59 +00:00
|
|
|
* Await our turn to clear out the queue, then serialize access.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
2001-01-30 06:31:59 +00:00
|
|
|
while (softdep_worklist_busy) {
|
|
|
|
softdep_worklist_req += 1;
|
|
|
|
tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
|
|
|
|
softdep_worklist_req -= 1;
|
|
|
|
}
|
|
|
|
softdep_worklist_busy = -1;
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Alternately flush the block device associated with the mount
|
|
|
|
* point and process any dependencies that the flushing
|
2000-07-24 05:28:33 +00:00
|
|
|
* creates. We continue until no more worklist dependencies
|
|
|
|
* are found.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
2000-07-24 05:28:33 +00:00
|
|
|
*countp = 0;
|
1998-05-19 19:47:22 +00:00
|
|
|
devvp = VFSTOUFS(oldmnt)->um_devvp;
|
2000-07-24 05:28:33 +00:00
|
|
|
while ((count = softdep_process_worklist(oldmnt)) > 0) {
|
|
|
|
*countp += count;
|
2001-09-12 08:38:13 +00:00
|
|
|
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
|
2002-02-27 18:32:23 +00:00
|
|
|
error = VOP_FSYNC(devvp, td->td_ucred, MNT_WAIT, td);
|
2001-09-12 08:38:13 +00:00
|
|
|
VOP_UNLOCK(devvp, 0, td);
|
1998-05-19 19:47:22 +00:00
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
softdep_worklist_busy = 0;
|
2001-01-30 06:31:59 +00:00
|
|
|
if (softdep_worklist_req)
|
|
|
|
wakeup(&softdep_worklist_req);
|
2000-07-24 05:28:33 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush all vnodes and worklist items associated with a specified mount point.
|
|
|
|
*/
|
|
|
|
int
|
2001-09-12 08:38:13 +00:00
|
|
|
softdep_flushfiles(oldmnt, flags, td)
|
2000-07-24 05:28:33 +00:00
|
|
|
struct mount *oldmnt;
|
|
|
|
int flags;
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
2000-07-24 05:28:33 +00:00
|
|
|
{
|
|
|
|
int error, count, loopcnt;
|
|
|
|
|
2002-03-15 04:06:10 +00:00
|
|
|
error = 0;
|
|
|
|
|
2000-07-24 05:28:33 +00:00
|
|
|
/*
|
|
|
|
* Alternately flush the vnodes associated with the mount
|
|
|
|
* point and process any dependencies that the flushing
|
|
|
|
* creates. In theory, this loop can happen at most twice,
|
|
|
|
* but we give it a few extra just to be sure.
|
|
|
|
*/
|
|
|
|
for (loopcnt = 10; loopcnt > 0; loopcnt--) {
|
|
|
|
/*
|
|
|
|
* Do another flush in case any vnodes were brought in
|
|
|
|
* as part of the cleanup operations.
|
|
|
|
*/
|
2001-09-12 08:38:13 +00:00
|
|
|
if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
|
2000-07-24 05:28:33 +00:00
|
|
|
break;
|
2001-09-12 08:38:13 +00:00
|
|
|
if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
|
2000-07-24 05:28:33 +00:00
|
|
|
count == 0)
|
|
|
|
break;
|
|
|
|
}
|
1998-05-19 20:03:29 +00:00
|
|
|
/*
|
|
|
|
* If we are unmounting then it is an error to fail. If we
|
|
|
|
* are simply trying to downgrade to read-only, then filesystem
|
|
|
|
* activity can keep us busy forever, so we just fail with EBUSY.
|
|
|
|
*/
|
|
|
|
if (loopcnt == 0) {
|
1998-05-19 20:18:42 +00:00
|
|
|
if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
|
1998-05-19 20:03:29 +00:00
|
|
|
panic("softdep_flushfiles: looping");
|
|
|
|
error = EBUSY;
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Structure hashing.
|
|
|
|
*
|
|
|
|
* There are three types of structures that can be looked up:
|
|
|
|
* 1) pagedep structures identified by mount point, inode number,
|
|
|
|
* and logical block.
|
|
|
|
* 2) inodedep structures identified by mount point and inode number.
|
|
|
|
* 3) newblk structures identified by mount point and
|
|
|
|
* physical block number.
|
|
|
|
*
|
|
|
|
* The "pagedep" and "inodedep" dependency structures are hashed
|
|
|
|
* separately from the file blocks and inodes to which they correspond.
|
|
|
|
* This separation helps when the in-memory copy of an inode or
|
|
|
|
* file block must be replaced. It also obviates the need to access
|
|
|
|
* an inode or file page when simply updating (or de-allocating)
|
|
|
|
* dependency structures. Lookup of newblk structures is needed to
|
|
|
|
* find newly allocated blocks when trying to associate them with
|
|
|
|
* their allocdirect or allocindir structure.
|
|
|
|
*
|
|
|
|
* The lookup routines optionally create and hash a new instance when
|
|
|
|
* an existing entry is not found.
|
|
|
|
*/
|
|
|
|
/* Flag bits accepted by the dependency lookup routines. */
#define DEPALLOC	0x0001	/* create and hash a new entry on a miss */
#define NODELAY		0x0002	/* caller cannot do background work */
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Structures and routines associated with pagedep caching.
|
|
|
|
*/
|
2000-05-26 02:09:24 +00:00
|
|
|
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
|
1998-05-19 19:47:22 +00:00
|
|
|
u_long pagedep_hash; /* size of hash table - 1 */
|
|
|
|
#define PAGEDEP_HASH(mp, inum, lbn) \
|
1998-05-19 20:03:29 +00:00
|
|
|
(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
|
|
|
|
pagedep_hash])
|
1998-05-19 19:47:22 +00:00
|
|
|
static struct sema pagedep_in_progress;
|
|
|
|
|
|
|
|
/*
|
2002-02-07 00:54:32 +00:00
|
|
|
* Look up a pagedep. Return 1 if found, 0 if not found or found
|
|
|
|
* when asked to allocate but not associated with any buffer.
|
1998-05-19 19:47:22 +00:00
|
|
|
* If not found, allocate if DEPALLOC flag is passed.
|
|
|
|
* Found or allocated entry is returned in pagedeppp.
|
|
|
|
* This routine must be called with splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
pagedep_lookup(ip, lbn, flags, pagedeppp)
|
|
|
|
struct inode *ip;
|
|
|
|
ufs_lbn_t lbn;
|
|
|
|
int flags;
|
|
|
|
struct pagedep **pagedeppp;
|
|
|
|
{
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct pagedep_hashhead *pagedephd;
|
|
|
|
struct mount *mp;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("pagedep_lookup: lock not held");
|
|
|
|
#endif
|
|
|
|
mp = ITOV(ip)->v_mount;
|
|
|
|
pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
|
|
|
|
top:
|
2001-02-04 16:08:18 +00:00
|
|
|
LIST_FOREACH(pagedep, pagedephd, pd_hash)
|
1998-05-19 19:47:22 +00:00
|
|
|
if (ip->i_number == pagedep->pd_ino &&
|
|
|
|
lbn == pagedep->pd_lbn &&
|
|
|
|
mp == pagedep->pd_mnt)
|
|
|
|
break;
|
|
|
|
if (pagedep) {
|
|
|
|
*pagedeppp = pagedep;
|
2002-02-07 00:54:32 +00:00
|
|
|
if ((flags & DEPALLOC) != 0 &&
|
|
|
|
(pagedep->pd_state & ONWORKLIST) == 0)
|
|
|
|
return (0);
|
1998-05-19 19:47:22 +00:00
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
if ((flags & DEPALLOC) == 0) {
|
|
|
|
*pagedeppp = NULL;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
if (sema_get(&pagedep_in_progress, &lk) == 0) {
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
|
2000-12-08 21:51:06 +00:00
|
|
|
M_SOFTDEP_FLAGS|M_ZERO);
|
1998-05-19 20:18:42 +00:00
|
|
|
pagedep->pd_list.wk_type = D_PAGEDEP;
|
1998-05-19 19:47:22 +00:00
|
|
|
pagedep->pd_mnt = mp;
|
|
|
|
pagedep->pd_ino = ip->i_number;
|
|
|
|
pagedep->pd_lbn = lbn;
|
|
|
|
LIST_INIT(&pagedep->pd_dirremhd);
|
|
|
|
LIST_INIT(&pagedep->pd_pendinghd);
|
|
|
|
for (i = 0; i < DAHASHSZ; i++)
|
|
|
|
LIST_INIT(&pagedep->pd_diraddhd[i]);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
|
|
|
|
sema_release(&pagedep_in_progress);
|
|
|
|
*pagedeppp = pagedep;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Structures and routines associated with inodedep caching.
|
|
|
|
*/
|
2000-05-26 02:09:24 +00:00
|
|
|
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
|
1998-05-19 21:45:53 +00:00
|
|
|
static u_long inodedep_hash; /* size of hash table - 1 */
|
|
|
|
static long num_inodedep; /* number of inodedep allocated */
|
1998-05-19 19:47:22 +00:00
|
|
|
#define INODEDEP_HASH(fs, inum) \
|
1998-05-19 20:03:29 +00:00
|
|
|
(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
|
1998-05-19 19:47:22 +00:00
|
|
|
static struct sema inodedep_in_progress;
|
|
|
|
|
|
|
|
/*
|
2003-01-01 18:49:04 +00:00
|
|
|
* Look up an inodedep. Return 1 if found, 0 if not found.
|
1998-05-19 19:47:22 +00:00
|
|
|
* If not found, allocate if DEPALLOC flag is passed.
|
|
|
|
* Found or allocated entry is returned in inodedeppp.
|
|
|
|
* This routine must be called with splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
inodedep_lookup(fs, inum, flags, inodedeppp)
|
|
|
|
struct fs *fs;
|
|
|
|
ino_t inum;
|
|
|
|
int flags;
|
|
|
|
struct inodedep **inodedeppp;
|
|
|
|
{
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct inodedep_hashhead *inodedephd;
|
1998-05-19 21:45:53 +00:00
|
|
|
int firsttry;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("inodedep_lookup: lock not held");
|
|
|
|
#endif
|
1998-05-19 21:45:53 +00:00
|
|
|
firsttry = 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
inodedephd = INODEDEP_HASH(fs, inum);
|
|
|
|
top:
|
2001-02-04 16:08:18 +00:00
|
|
|
LIST_FOREACH(inodedep, inodedephd, id_hash)
|
1998-05-19 19:47:22 +00:00
|
|
|
if (inum == inodedep->id_ino && fs == inodedep->id_fs)
|
|
|
|
break;
|
|
|
|
if (inodedep) {
|
|
|
|
*inodedeppp = inodedep;
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
if ((flags & DEPALLOC) == 0) {
|
|
|
|
*inodedeppp = NULL;
|
|
|
|
return (0);
|
|
|
|
}
|
1999-06-15 23:37:29 +00:00
|
|
|
/*
|
|
|
|
* If we are over our limit, try to improve the situation.
|
|
|
|
*/
|
2001-02-20 11:14:38 +00:00
|
|
|
if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
|
1999-06-15 23:37:29 +00:00
|
|
|
request_cleanup(FLUSH_INODES, 1)) {
|
1998-05-19 21:45:53 +00:00
|
|
|
firsttry = 0;
|
|
|
|
goto top;
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
if (sema_get(&inodedep_in_progress, &lk) == 0) {
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
goto top;
|
|
|
|
}
|
1998-05-19 21:45:53 +00:00
|
|
|
num_inodedep += 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_INODEDEP, M_SOFTDEP_FLAGS);
|
1998-05-19 20:18:42 +00:00
|
|
|
inodedep->id_list.wk_type = D_INODEDEP;
|
1998-05-19 19:47:22 +00:00
|
|
|
inodedep->id_fs = fs;
|
|
|
|
inodedep->id_ino = inum;
|
|
|
|
inodedep->id_state = ALLCOMPLETE;
|
|
|
|
inodedep->id_nlinkdelta = 0;
|
2002-06-21 06:18:05 +00:00
|
|
|
inodedep->id_savedino1 = NULL;
|
1998-05-19 19:47:22 +00:00
|
|
|
inodedep->id_savedsize = -1;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
inodedep->id_savedextsize = -1;
|
1998-05-19 19:47:22 +00:00
|
|
|
inodedep->id_buf = NULL;
|
|
|
|
LIST_INIT(&inodedep->id_pendinghd);
|
|
|
|
LIST_INIT(&inodedep->id_inowait);
|
1998-05-19 21:45:53 +00:00
|
|
|
LIST_INIT(&inodedep->id_bufwait);
|
1998-05-19 19:47:22 +00:00
|
|
|
TAILQ_INIT(&inodedep->id_inoupdt);
|
|
|
|
TAILQ_INIT(&inodedep->id_newinoupdt);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
TAILQ_INIT(&inodedep->id_extupdt);
|
|
|
|
TAILQ_INIT(&inodedep->id_newextupdt);
|
1998-05-19 19:47:22 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
|
|
|
|
sema_release(&inodedep_in_progress);
|
|
|
|
*inodedeppp = inodedep;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Structures and routines associated with newblk caching.
|
|
|
|
*/
|
2000-05-26 02:09:24 +00:00
|
|
|
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
|
1998-05-19 19:47:22 +00:00
|
|
|
u_long newblk_hash; /* size of hash table - 1 */
|
|
|
|
#define NEWBLK_HASH(fs, inum) \
|
1998-05-19 20:03:29 +00:00
|
|
|
(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
|
1998-05-19 19:47:22 +00:00
|
|
|
static struct sema newblk_in_progress;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Look up a newblk. Return 1 if found, 0 if not found.
|
|
|
|
* If not found, allocate if DEPALLOC flag is passed.
|
|
|
|
* Found or allocated entry is returned in newblkpp.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
newblk_lookup(fs, newblkno, flags, newblkpp)
|
|
|
|
struct fs *fs;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t newblkno;
|
1998-05-19 19:47:22 +00:00
|
|
|
int flags;
|
|
|
|
struct newblk **newblkpp;
|
|
|
|
{
|
|
|
|
struct newblk *newblk;
|
|
|
|
struct newblk_hashhead *newblkhd;
|
|
|
|
|
|
|
|
newblkhd = NEWBLK_HASH(fs, newblkno);
|
|
|
|
top:
|
2001-02-04 16:08:18 +00:00
|
|
|
LIST_FOREACH(newblk, newblkhd, nb_hash)
|
1998-05-19 19:47:22 +00:00
|
|
|
if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
|
|
|
|
break;
|
|
|
|
if (newblk) {
|
|
|
|
*newblkpp = newblk;
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
if ((flags & DEPALLOC) == 0) {
|
|
|
|
*newblkpp = NULL;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
if (sema_get(&newblk_in_progress, 0) == 0)
|
|
|
|
goto top;
|
|
|
|
MALLOC(newblk, struct newblk *, sizeof(struct newblk),
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_NEWBLK, M_SOFTDEP_FLAGS);
|
1998-05-19 19:47:22 +00:00
|
|
|
newblk->nb_state = 0;
|
|
|
|
newblk->nb_fs = fs;
|
|
|
|
newblk->nb_newblkno = newblkno;
|
|
|
|
LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
|
|
|
|
sema_release(&newblk_in_progress);
|
|
|
|
*newblkpp = newblk;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Executed during filesystem system initialization before
|
2002-05-16 21:28:32 +00:00
|
|
|
* mounting any filesystems.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_initialize()
|
|
|
|
{
|
|
|
|
|
|
|
|
LIST_INIT(&mkdirlisthd);
|
|
|
|
LIST_INIT(&softdep_workitem_pending);
|
2002-11-20 05:16:11 +00:00
|
|
|
max_softdeps = desiredvnodes * 4;
|
1998-05-19 21:45:53 +00:00
|
|
|
pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
|
1998-05-19 20:03:29 +00:00
|
|
|
&pagedep_hash);
|
1998-05-19 19:47:22 +00:00
|
|
|
sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
|
1998-05-19 21:45:53 +00:00
|
|
|
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
|
1998-05-19 19:47:22 +00:00
|
|
|
sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
|
1998-05-19 20:03:29 +00:00
|
|
|
newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
|
1998-05-19 19:47:22 +00:00
|
|
|
sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
|
2002-01-08 19:32:18 +00:00
|
|
|
|
2002-07-01 17:59:40 +00:00
|
|
|
/* hooks through which the main kernel code calls us */
|
|
|
|
softdep_process_worklist_hook = softdep_process_worklist;
|
|
|
|
softdep_fsync_hook = softdep_fsync;
|
|
|
|
|
2002-01-08 19:32:18 +00:00
|
|
|
/* initialise bioops hack */
|
|
|
|
bioops.io_start = softdep_disk_io_initiation;
|
|
|
|
bioops.io_complete = softdep_disk_write_complete;
|
|
|
|
bioops.io_deallocate = softdep_deallocate_dependencies;
|
|
|
|
bioops.io_movedeps = softdep_move_dependencies;
|
|
|
|
bioops.io_countdeps = softdep_count_dependencies;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
2002-07-01 11:00:47 +00:00
|
|
|
/*
|
|
|
|
* Executed after all filesystems have been unmounted during
|
|
|
|
* filesystem module unload.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_uninitialize()
|
|
|
|
{
|
|
|
|
|
2002-07-01 17:59:40 +00:00
|
|
|
softdep_process_worklist_hook = NULL;
|
|
|
|
softdep_fsync_hook = NULL;
|
2002-07-01 11:00:47 +00:00
|
|
|
hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
|
|
|
|
hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
|
|
|
|
hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Called at mount time to notify the dependency code that a
|
|
|
|
* filesystem wishes to use it.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
softdep_mount(devvp, mp, fs, cred)
|
|
|
|
struct vnode *devvp;
|
|
|
|
struct mount *mp;
|
|
|
|
struct fs *fs;
|
|
|
|
struct ucred *cred;
|
|
|
|
{
|
2002-06-21 06:18:05 +00:00
|
|
|
struct csum_total cstotal;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct cg *cgp;
|
|
|
|
struct buf *bp;
|
|
|
|
int error, cyl;
|
|
|
|
|
1998-05-19 23:07:25 +00:00
|
|
|
mp->mnt_flag &= ~MNT_ASYNC;
|
1998-05-19 19:47:22 +00:00
|
|
|
mp->mnt_flag |= MNT_SOFTDEP;
|
|
|
|
/*
|
|
|
|
* When doing soft updates, the counters in the
|
|
|
|
* superblock may have gotten out of sync, so we have
|
|
|
|
* to scan the cylinder groups and recalculate them.
|
|
|
|
*/
|
|
|
|
if (fs->fs_clean != 0)
|
|
|
|
return (0);
|
|
|
|
bzero(&cstotal, sizeof cstotal);
|
|
|
|
for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
|
|
|
|
if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
|
|
|
|
fs->fs_cgsize, cred, &bp)) != 0) {
|
|
|
|
brelse(bp);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
|
|
|
cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
|
|
|
|
cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
|
|
|
|
cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
|
|
|
|
cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
|
|
|
|
fs->fs_cs(fs, cyl) = cgp->cg_cs;
|
|
|
|
brelse(bp);
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
1999-05-07 05:11:31 +00:00
|
|
|
if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
|
2001-03-21 04:09:01 +00:00
|
|
|
printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
|
1998-05-19 19:47:22 +00:00
|
|
|
#endif
|
|
|
|
bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Protecting the freemaps (or bitmaps).
 *
 * To eliminate the need to execute fsck before mounting a filesystem
 * after a power failure, one must (conservatively) guarantee that the
 * on-disk copy of the bitmaps never indicates that a live inode or block is
 * free.  So, when a block or inode is allocated, the bitmap should be
 * updated (on disk) before any new pointers.  When a block or inode is
 * freed, the bitmap should not be updated until all pointers have been
 * reset.  The latter dependency is handled by the delayed de-allocation
 * approach described below for block and inode de-allocation.  The former
 * dependency is handled by calling the following procedure when a block or
 * inode is allocated. When an inode is allocated an "inodedep" is created
 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 * Each "inodedep" is also inserted into the hash indexing structure so
 * that any additional link additions can be made dependent on the inode
 * allocation.
 *
 * The ufs filesystem maintains a number of free block counts (e.g., per
 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 * in addition to the bitmaps.  These counts are used to improve efficiency
 * during allocation and therefore must be consistent with the bitmaps.
 * There is no convenient way to guarantee post-crash consistency of these
 * counts with simple update ordering, for two main reasons: (1) The counts
 * and bitmaps for a single cylinder group block are not in the same disk
 * sector.  If a disk write is interrupted (e.g., by power failure), one may
 * be written and the other not.  (2) Some of the counts are located in the
 * superblock rather than the cylinder group block. So, we focus our soft
 * updates implementation on protecting the bitmaps. When mounting a
 * filesystem, we recompute the auxiliary counts from the bitmaps.
 */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called just after updating the cylinder group block to allocate an inode.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_setup_inomapdep(bp, ip, newinum)
|
|
|
|
struct buf *bp; /* buffer for cylgroup block with inode map */
|
|
|
|
struct inode *ip; /* inode related to allocation */
|
|
|
|
ino_t newinum; /* new inode number being allocated */
|
|
|
|
{
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct bmsafemap *bmsafemap;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a dependency for the newly allocated inode.
|
|
|
|
* Panic if it already exists as something is seriously wrong.
|
|
|
|
* Otherwise add it to the dependency list for the buffer holding
|
|
|
|
* the cylinder group map from which it was allocated.
|
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_setup_inomapdep: found inode");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
inodedep->id_buf = bp;
|
|
|
|
inodedep->id_state &= ~DEPCOMPLETE;
|
|
|
|
bmsafemap = bmsafemap_lookup(bp);
|
|
|
|
LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called just after updating the cylinder group block to
|
|
|
|
* allocate block or fragment.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_setup_blkmapdep(bp, fs, newblkno)
|
|
|
|
struct buf *bp; /* buffer for cylgroup block with block map */
|
|
|
|
struct fs *fs; /* filesystem doing allocation */
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t newblkno; /* number of newly allocated block */
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
struct newblk *newblk;
|
|
|
|
struct bmsafemap *bmsafemap;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a dependency for the newly allocated block.
|
|
|
|
* Add it to the dependency list for the buffer holding
|
|
|
|
* the cylinder group map from which it was allocated.
|
|
|
|
*/
|
|
|
|
if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
|
|
|
|
panic("softdep_setup_blkmapdep: found block");
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
|
|
|
|
LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the bmsafemap associated with a cylinder group buffer.
|
|
|
|
* If none exists, create one. The buffer must be locked when
|
|
|
|
* this routine is called and this routine must be called with
|
|
|
|
* splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static struct bmsafemap *
|
|
|
|
bmsafemap_lookup(bp)
|
|
|
|
struct buf *bp;
|
|
|
|
{
|
|
|
|
struct bmsafemap *bmsafemap;
|
|
|
|
struct worklist *wk;
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("bmsafemap_lookup: lock not held");
|
|
|
|
#endif
|
2001-02-04 13:13:25 +00:00
|
|
|
LIST_FOREACH(wk, &bp->b_dep, wk_list)
|
1998-05-19 20:18:42 +00:00
|
|
|
if (wk->wk_type == D_BMSAFEMAP)
|
1998-05-19 19:47:22 +00:00
|
|
|
return (WK_BMSAFEMAP(wk));
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_BMSAFEMAP, M_SOFTDEP_FLAGS);
|
1998-05-19 20:18:42 +00:00
|
|
|
bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
|
1998-05-19 19:47:22 +00:00
|
|
|
bmsafemap->sm_list.wk_state = 0;
|
|
|
|
bmsafemap->sm_buf = bp;
|
|
|
|
LIST_INIT(&bmsafemap->sm_allocdirecthd);
|
|
|
|
LIST_INIT(&bmsafemap->sm_allocindirhd);
|
|
|
|
LIST_INIT(&bmsafemap->sm_inodedephd);
|
|
|
|
LIST_INIT(&bmsafemap->sm_newblkhd);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
|
|
|
|
return (bmsafemap);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Direct block allocation dependencies.
|
|
|
|
*
|
|
|
|
* When a new block is allocated, the corresponding disk locations must be
|
|
|
|
* initialized (with zeros or new data) before the on-disk inode points to
|
|
|
|
* them. Also, the freemap from which the block was allocated must be
|
|
|
|
* updated (on disk) before the inode's pointer. These two dependencies are
|
|
|
|
* independent of each other and are needed for all file blocks and indirect
|
|
|
|
* blocks that are pointed to directly by the inode. Just before the
|
|
|
|
* "in-core" version of the inode is updated with a newly allocated block
|
|
|
|
* number, a procedure (below) is called to setup allocation dependency
|
|
|
|
* structures. These structures are removed when the corresponding
|
|
|
|
* dependencies are satisfied or when the block allocation becomes obsolete
|
|
|
|
* (i.e., the file is deleted, the block is de-allocated, or the block is a
|
|
|
|
* fragment that gets upgraded). All of these cases are handled in
|
|
|
|
* procedures described later.
|
|
|
|
*
|
|
|
|
* When a file extension causes a fragment to be upgraded, either to a larger
|
|
|
|
* fragment or to a full block, the on-disk location may change (if the
|
|
|
|
* previous fragment could not simply be extended). In this case, the old
|
|
|
|
* fragment must be de-allocated, but not until after the inode's pointer has
|
|
|
|
* been updated. In most cases, this is handled by later procedures, which
|
|
|
|
* will construct a "freefrag" structure to be added to the workitem queue
|
|
|
|
* when the inode update is complete (or obsolete). The main exception to
|
|
|
|
* this is when an allocation occurs while a pending allocation dependency
|
|
|
|
* (for the same block pointer) remains. This case is handled in the main
|
|
|
|
* allocation dependency setup procedure by immediately freeing the
|
|
|
|
* unreferenced fragments.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
|
|
|
|
struct inode *ip; /* inode to which block is being added */
|
|
|
|
ufs_lbn_t lbn; /* block pointer within inode */
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t newblkno; /* disk block number being added */
|
|
|
|
ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
|
1998-05-19 19:47:22 +00:00
|
|
|
long newsize; /* size of new block */
|
|
|
|
long oldsize; /* size of new block */
|
|
|
|
struct buf *bp; /* bp for allocated block */
|
|
|
|
{
|
|
|
|
struct allocdirect *adp, *oldadp;
|
|
|
|
struct allocdirectlst *adphead;
|
|
|
|
struct bmsafemap *bmsafemap;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct newblk *newblk;
|
|
|
|
|
|
|
|
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
|
2000-12-08 21:51:06 +00:00
|
|
|
M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
|
1998-05-19 20:18:42 +00:00
|
|
|
adp->ad_list.wk_type = D_ALLOCDIRECT;
|
1998-05-19 19:47:22 +00:00
|
|
|
adp->ad_lbn = lbn;
|
|
|
|
adp->ad_newblkno = newblkno;
|
|
|
|
adp->ad_oldblkno = oldblkno;
|
|
|
|
adp->ad_newsize = newsize;
|
|
|
|
adp->ad_oldsize = oldsize;
|
|
|
|
adp->ad_state = ATTACHED;
|
2001-05-17 07:24:03 +00:00
|
|
|
LIST_INIT(&adp->ad_newdirblk);
|
1998-05-19 19:47:22 +00:00
|
|
|
if (newblkno == oldblkno)
|
|
|
|
adp->ad_freefrag = NULL;
|
|
|
|
else
|
|
|
|
adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
|
|
|
|
|
|
|
|
if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
|
|
|
|
panic("softdep_setup_allocdirect: lost block");
|
|
|
|
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-02-20 11:14:38 +00:00
|
|
|
inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
|
1998-05-19 19:47:22 +00:00
|
|
|
adp->ad_inodedep = inodedep;
|
|
|
|
|
|
|
|
if (newblk->nb_state == DEPCOMPLETE) {
|
|
|
|
adp->ad_state |= DEPCOMPLETE;
|
|
|
|
adp->ad_buf = NULL;
|
|
|
|
} else {
|
|
|
|
bmsafemap = newblk->nb_bmsafemap;
|
|
|
|
adp->ad_buf = bmsafemap->sm_buf;
|
|
|
|
LIST_REMOVE(newblk, nb_deps);
|
|
|
|
LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
|
|
|
|
}
|
|
|
|
LIST_REMOVE(newblk, nb_hash);
|
|
|
|
FREE(newblk, M_NEWBLK);
|
|
|
|
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
|
|
|
|
if (lbn >= NDADDR) {
|
|
|
|
/* allocating an indirect block */
|
2001-02-23 09:01:31 +00:00
|
|
|
if (oldblkno != 0) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_setup_allocdirect: non-zero indir");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Allocating a direct block.
|
|
|
|
*
|
|
|
|
* If we are allocating a directory block, then we must
|
|
|
|
* allocate an associated pagedep to track additions and
|
|
|
|
* deletions.
|
|
|
|
*/
|
|
|
|
if ((ip->i_mode & IFMT) == IFDIR &&
|
|
|
|
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The list of allocdirects must be kept in sorted and ascending
|
|
|
|
* order so that the rollback routines can quickly determine the
|
|
|
|
* first uncommitted block (the size of the file stored on disk
|
|
|
|
* ends at the end of the lowest committed fragment, or if there
|
|
|
|
* are no fragments, at the end of the highest committed block).
|
|
|
|
* Since files generally grow, the typical case is that the new
|
|
|
|
* block is to be added at the end of the list. We speed this
|
|
|
|
* special case by checking against the last allocdirect in the
|
|
|
|
* list before laboriously traversing the list looking for the
|
|
|
|
* insertion point.
|
|
|
|
*/
|
|
|
|
adphead = &inodedep->id_newinoupdt;
|
|
|
|
oldadp = TAILQ_LAST(adphead, allocdirectlst);
|
|
|
|
if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
|
|
|
|
/* insert at end of list */
|
|
|
|
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
|
|
|
|
if (oldadp != NULL && oldadp->ad_lbn == lbn)
|
|
|
|
allocdirect_merge(adphead, adp, oldadp);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return;
|
|
|
|
}
|
2001-02-04 16:08:18 +00:00
|
|
|
TAILQ_FOREACH(oldadp, adphead, ad_next) {
|
1998-05-19 19:47:22 +00:00
|
|
|
if (oldadp->ad_lbn >= lbn)
|
|
|
|
break;
|
|
|
|
}
|
2001-02-23 09:01:31 +00:00
|
|
|
if (oldadp == NULL) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_setup_allocdirect: lost entry");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
/* insert in middle of list */
|
|
|
|
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
|
|
|
|
if (oldadp->ad_lbn == lbn)
|
|
|
|
allocdirect_merge(adphead, adp, oldadp);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Replace an old allocdirect dependency with a newer one.
|
|
|
|
* This routine must be called with splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
allocdirect_merge(adphead, newadp, oldadp)
|
|
|
|
struct allocdirectlst *adphead; /* head of list holding allocdirects */
|
|
|
|
struct allocdirect *newadp; /* allocdirect being added */
|
|
|
|
struct allocdirect *oldadp; /* existing allocdirect being checked */
|
|
|
|
{
|
2001-05-17 07:24:03 +00:00
|
|
|
struct worklist *wk;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct freefrag *freefrag;
|
2001-05-17 07:24:03 +00:00
|
|
|
struct newdirblk *newdirblk;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("allocdirect_merge: lock not held");
|
|
|
|
#endif
|
|
|
|
if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
|
|
|
|
newadp->ad_oldsize != oldadp->ad_newsize ||
|
2001-02-23 09:01:31 +00:00
|
|
|
newadp->ad_lbn >= NDADDR) {
|
|
|
|
FREE_LOCK(&lk);
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s %jd != new %jd || old size %ld != new %ld",
|
2002-06-21 06:18:05 +00:00
|
|
|
"allocdirect_merge: old blkno",
|
|
|
|
(intmax_t)newadp->ad_oldblkno,
|
|
|
|
(intmax_t)oldadp->ad_newblkno,
|
|
|
|
newadp->ad_oldsize, oldadp->ad_newsize);
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
newadp->ad_oldblkno = oldadp->ad_oldblkno;
|
|
|
|
newadp->ad_oldsize = oldadp->ad_oldsize;
|
|
|
|
/*
|
|
|
|
* If the old dependency had a fragment to free or had never
|
|
|
|
* previously had a block allocated, then the new dependency
|
|
|
|
* can immediately post its freefrag and adopt the old freefrag.
|
|
|
|
* This action is done by swapping the freefrag dependencies.
|
|
|
|
* The new dependency gains the old one's freefrag, and the
|
|
|
|
* old one gets the new one and then immediately puts it on
|
|
|
|
* the worklist when it is freed by free_allocdirect. It is
|
|
|
|
* not possible to do this swap when the old dependency had a
|
|
|
|
* non-zero size but no previous fragment to free. This condition
|
|
|
|
* arises when the new block is an extension of the old block.
|
|
|
|
* Here, the first part of the fragment allocated to the new
|
|
|
|
* dependency is part of the block currently claimed on disk by
|
|
|
|
* the old dependency, so cannot legitimately be freed until the
|
|
|
|
* conditions for the new dependency are fulfilled.
|
|
|
|
*/
|
|
|
|
if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
|
|
|
|
freefrag = newadp->ad_freefrag;
|
|
|
|
newadp->ad_freefrag = oldadp->ad_freefrag;
|
|
|
|
oldadp->ad_freefrag = freefrag;
|
|
|
|
}
|
2001-05-17 07:24:03 +00:00
|
|
|
/*
|
|
|
|
* If we are tracking a new directory-block allocation,
|
|
|
|
* move it from the old allocdirect to the new allocdirect.
|
|
|
|
*/
|
|
|
|
if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
|
|
|
|
newdirblk = WK_NEWDIRBLK(wk);
|
|
|
|
WORKLIST_REMOVE(&newdirblk->db_list);
|
|
|
|
if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
|
|
|
|
panic("allocdirect_merge: extra newdirblk");
|
|
|
|
WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
free_allocdirect(adphead, oldadp, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate a new freefrag structure if needed.
|
|
|
|
*/
|
|
|
|
static struct freefrag *
|
|
|
|
newfreefrag(ip, blkno, size)
|
|
|
|
struct inode *ip;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t blkno;
|
1998-05-19 19:47:22 +00:00
|
|
|
long size;
|
|
|
|
{
|
|
|
|
struct freefrag *freefrag;
|
|
|
|
struct fs *fs;
|
|
|
|
|
|
|
|
if (blkno == 0)
|
|
|
|
return (NULL);
|
|
|
|
fs = ip->i_fs;
|
|
|
|
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
|
|
|
|
panic("newfreefrag: frag size");
|
|
|
|
MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_FREEFRAG, M_SOFTDEP_FLAGS);
|
1998-05-19 20:18:42 +00:00
|
|
|
freefrag->ff_list.wk_type = D_FREEFRAG;
|
2002-02-02 01:42:44 +00:00
|
|
|
freefrag->ff_state = 0;
|
1998-05-19 19:47:22 +00:00
|
|
|
freefrag->ff_inum = ip->i_number;
|
2000-07-11 22:07:57 +00:00
|
|
|
freefrag->ff_mnt = ITOV(ip)->v_mount;
|
1998-05-19 19:47:22 +00:00
|
|
|
freefrag->ff_blkno = blkno;
|
|
|
|
freefrag->ff_fragsize = size;
|
|
|
|
return (freefrag);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This workitem de-allocates fragments that were replaced during
|
|
|
|
* file block allocation.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
handle_workitem_freefrag(freefrag)
|
|
|
|
struct freefrag *freefrag;
|
|
|
|
{
|
2002-06-21 06:18:05 +00:00
|
|
|
struct ufsmount *ump = VFSTOUFS(freefrag->ff_mnt);
|
2002-02-02 01:42:44 +00:00
|
|
|
|
2002-06-21 06:18:05 +00:00
|
|
|
ffs_blkfree(ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
|
|
|
|
freefrag->ff_fragsize, freefrag->ff_inum);
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE(freefrag, M_FREEFRAG);
|
|
|
|
}
|
|
|
|
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
/*
 * Set up a dependency structure for an external attributes data block.
 * This routine follows much of the structure of softdep_setup_allocdirect.
 * See the description of softdep_setup_allocdirect above for details.
 *
 * Arguments:
 *	ip		inode supplying the file system (i_fs) and the inode
 *			number used to find the inodedep
 *	lbn		logical block number recorded in the allocdirect;
 *			a value >= NXADDR causes a panic
 *	newblkno	disk block number being recorded; a newblk entry
 *			for it must already exist or the routine panics
 *	oldblkno	previous disk block number; when it differs from
 *			newblkno a freefrag is requested for it
 *	newsize		recorded as ad_newsize
 *	oldsize		recorded as ad_oldsize and passed to newfreefrag()
 *	bp		buffer whose dependency list receives the new
 *			allocdirect
 *
 * The allocdirect is also queued on the inodedep id_newextupdt list,
 * which is kept sorted by ascending lbn; an existing entry for the same
 * lbn is merged via allocdirect_merge().
 */
|
|
|
|
void
|
|
|
|
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
|
|
|
|
struct inode *ip;
|
|
|
|
ufs_lbn_t lbn;
|
|
|
|
ufs2_daddr_t newblkno;
|
|
|
|
ufs2_daddr_t oldblkno;
|
|
|
|
long newsize;
|
|
|
|
long oldsize;
|
|
|
|
struct buf *bp;
|
|
|
|
{
|
|
|
|
struct allocdirect *adp, *oldadp;
|
|
|
|
struct allocdirectlst *adphead;
|
|
|
|
struct bmsafemap *bmsafemap;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct newblk *newblk;
|
|
|
|
|
|
|
|
/*
 * Build the allocdirect record. M_ZERO clears every field that is
 * not explicitly assigned below. EXTDATA is set in the state word
 * alongside ATTACHED.
 */
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
|
|
|
|
M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
|
|
|
|
adp->ad_list.wk_type = D_ALLOCDIRECT;
|
|
|
|
adp->ad_lbn = lbn;
|
|
|
|
adp->ad_newblkno = newblkno;
|
|
|
|
adp->ad_oldblkno = oldblkno;
|
|
|
|
adp->ad_newsize = newsize;
|
|
|
|
adp->ad_oldsize = oldsize;
|
|
|
|
adp->ad_state = ATTACHED | EXTDATA;
|
|
|
|
LIST_INIT(&adp->ad_newdirblk);
|
|
|
|
/*
 * An unchanged block number means there is no old block to release;
 * otherwise ask newfreefrag() for a record covering the old block.
 */
if (newblkno == oldblkno)
|
|
|
|
adp->ad_freefrag = NULL;
|
|
|
|
else
|
|
|
|
adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
|
|
|
|
|
|
|
|
/*
 * The newblk entry for this block must already exist (the lookup is
 * made with flags of 0); a return of 0 is treated as fatal.
 */
if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
|
|
|
|
panic("softdep_setup_allocext: lost block");
|
|
|
|
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
|
|
|
|
adp->ad_inodedep = inodedep;
|
|
|
|
|
|
|
|
/*
 * Take over the state tracked by the newblk. If it is already
 * DEPCOMPLETE the allocdirect is marked DEPCOMPLETE too and records
 * no buffer. Otherwise the allocdirect records the bmsafemap buffer
 * and replaces the newblk on the bmsafemap dependency lists.
 */
if (newblk->nb_state == DEPCOMPLETE) {
|
|
|
|
adp->ad_state |= DEPCOMPLETE;
|
|
|
|
adp->ad_buf = NULL;
|
|
|
|
} else {
|
|
|
|
bmsafemap = newblk->nb_bmsafemap;
|
|
|
|
adp->ad_buf = bmsafemap->sm_buf;
|
|
|
|
LIST_REMOVE(newblk, nb_deps);
|
|
|
|
LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
|
|
|
|
}
|
|
|
|
/* The newblk has served its purpose: unhash it and free it. */
LIST_REMOVE(newblk, nb_hash);
|
|
|
|
FREE(newblk, M_NEWBLK);
|
|
|
|
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
|
|
|
|
/*
 * NOTE(review): the test below is lbn >= NXADDR while the panic text
 * prints a plain greater-than sign, and the check is made only after
 * the allocdirect has been queued on bp. The lock is released before
 * panicking.
 */
if (lbn >= NXADDR) {
|
|
|
|
FREE_LOCK(&lk);
|
2002-07-20 01:09:35 +00:00
|
|
|
panic("softdep_setup_allocext: lbn %lld > NXADDR",
|
|
|
|
(long long)lbn);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The list of allocdirects must be kept in sorted and ascending
|
|
|
|
* order so that the rollback routines can quickly determine the
|
|
|
|
* first uncommitted block (the size of the file stored on disk
|
|
|
|
* ends at the end of the lowest committed fragment, or if there
|
|
|
|
* are no fragments, at the end of the highest committed block).
|
|
|
|
* Since files generally grow, the typical case is that the new
|
|
|
|
* block is to be added at the end of the list. We speed this
|
|
|
|
* special case by checking against the last allocdirect in the
|
|
|
|
* list before laboriously traversing the list looking for the
|
|
|
|
* insertion point.
|
|
|
|
*/
|
|
|
|
adphead = &inodedep->id_newextupdt;
|
|
|
|
oldadp = TAILQ_LAST(adphead, allocdirectlst);
|
|
|
|
if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
|
|
|
|
/* insert at end of list */
|
|
|
|
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
|
|
|
|
if (oldadp != NULL && oldadp->ad_lbn == lbn)
|
|
|
|
allocdirect_merge(adphead, adp, oldadp);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
TAILQ_FOREACH(oldadp, adphead, ad_next) {
|
|
|
|
if (oldadp->ad_lbn >= lbn)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (oldadp == NULL) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_setup_allocext: lost entry");
|
|
|
|
}
|
|
|
|
/* insert in middle of list */
|
|
|
|
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
|
|
|
|
if (oldadp->ad_lbn == lbn)
|
|
|
|
allocdirect_merge(adphead, adp, oldadp);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Indirect block allocation dependencies.
|
|
|
|
*
|
|
|
|
* The same dependencies that exist for a direct block also exist when
|
|
|
|
* a new block is allocated and pointed to by an entry in a block of
|
|
|
|
* indirect pointers. The undo/redo states described above are also
|
|
|
|
* used here. Because an indirect block contains many pointers that
|
|
|
|
* may have dependencies, a second copy of the entire in-memory indirect
|
|
|
|
* block is kept. The buffer cache copy is always completely up-to-date.
|
|
|
|
* The second copy, which is used only as a source for disk writes,
|
|
|
|
* contains only the safe pointers (i.e., those that have no remaining
|
|
|
|
* update dependencies). The second copy is freed when all pointers
|
|
|
|
* are safe. The cache is not allowed to replace indirect blocks with
|
|
|
|
* pending update dependencies. If a buffer containing an indirect
|
|
|
|
* block with dependencies is written, these routines will mark it
|
|
|
|
* dirty again. It can only be successfully written once all the
|
|
|
|
* dependencies are removed. The ffs_fsync routine and
|
|
|
|
* softdep_sync_metadata work together to get all the dependencies
|
|
|
|
* removed so that a file can be successfully written to disk. Three
|
|
|
|
* procedures are used when setting up indirect block pointer
|
|
|
|
* dependencies. The division is necessary because of the organization
|
|
|
|
* of the "balloc" routine and because of the distinction between file
|
|
|
|
* pages and file metadata blocks.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Allocate a new allocindir structure.
 *
 * The structure is allocated zeroed (M_ZERO), typed as D_ALLOCINDIR,
 * placed in the ATTACHED state and filled in from the arguments. The
 * caller receives the new structure; nothing is linked onto any list
 * here.
 */
|
|
|
|
static struct allocindir *
|
|
|
|
newallocindir(ip, ptrno, newblkno, oldblkno)
|
|
|
|
struct inode *ip; /* inode for file being extended */
|
|
|
|
int ptrno; /* offset of pointer in indirect block */
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t newblkno; /* disk block number being added */
|
|
|
|
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
struct allocindir *aip;
|
|
|
|
|
|
|
|
MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
|
2000-12-08 21:51:06 +00:00
|
|
|
M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
|
1998-05-19 20:18:42 +00:00
|
|
|
aip->ai_list.wk_type = D_ALLOCINDIR;
|
1998-05-19 19:47:22 +00:00
|
|
|
aip->ai_state = ATTACHED;
|
|
|
|
aip->ai_offset = ptrno;
|
|
|
|
aip->ai_newblkno = newblkno;
|
|
|
|
aip->ai_oldblkno = oldblkno;
|
|
|
|
/*
 * newfreefrag() is called unconditionally, with a full file system
 * block (fs_bsize) as the size of the old block.
 * NOTE(review): what newfreefrag() returns when oldblkno is 0 is
 * decided inside that routine and is not visible here - confirm.
 */
aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
|
|
|
|
return (aip);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Called just before setting an indirect block pointer
 * to a newly allocated file page.
 *
 * A new allocindir is created for the pointer and attached to the
 * dependency list of nbp (the buffer holding the allocated page) while
 * the lock is held. The lock is then released and the work is finished
 * by setup_allocindir_phase2() using bp (the indirect block buffer).
 */
|
|
|
|
void
|
|
|
|
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
|
|
|
|
struct inode *ip; /* inode for file being extended */
|
|
|
|
ufs_lbn_t lbn; /* allocated block number within file */
|
|
|
|
struct buf *bp; /* buffer with indirect blk referencing page */
|
|
|
|
int ptrno; /* offset of pointer in indirect block */
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t newblkno; /* disk block number being added */
|
|
|
|
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
|
1998-05-19 19:47:22 +00:00
|
|
|
struct buf *nbp; /* buffer holding allocated page */
|
|
|
|
{
|
|
|
|
struct allocindir *aip;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
|
|
|
|
/* The allocindir is allocated before the lock is taken. */
aip = newallocindir(ip, ptrno, newblkno, oldblkno);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
/*
 * If we are allocating a directory page, then we must
 * allocate an associated pagedep to track additions and
 * deletions. The pagedep is queued on nbp only when
 * pagedep_lookup() returns 0.
 * NOTE(review): presumably a return of 0 means the pagedep was
 * newly created rather than found - confirm in pagedep_lookup().
 */
|
|
|
|
if ((ip->i_mode & IFMT) == IFDIR &&
|
|
|
|
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
|
|
|
|
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
|
|
|
|
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
/* Phase 2 does its own locking, so it is called with the lock released. */
setup_allocindir_phase2(bp, ip, aip);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Called just before setting an indirect block pointer to a
 * newly allocated indirect block.
 *
 * A new allocindir is created with an old block number of 0, attached
 * to the dependency list of nbp while the lock is held, and then handed
 * to setup_allocindir_phase2() together with bp, the indirect block
 * that references the new block.
 */
|
|
|
|
void
|
|
|
|
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
|
|
|
|
struct buf *nbp; /* newly allocated indirect block */
|
|
|
|
struct inode *ip; /* inode for file being extended */
|
|
|
|
struct buf *bp; /* indirect block referencing allocated block */
|
|
|
|
int ptrno; /* offset of pointer in indirect block */
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t newblkno; /* disk block number being added */
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
struct allocindir *aip;
|
|
|
|
|
|
|
|
/* This routine takes no old block argument; 0 is passed as oldblkno. */
aip = newallocindir(ip, ptrno, newblkno, 0);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
/* Phase 2 does its own locking, so it is called with the lock released. */
setup_allocindir_phase2(bp, ip, aip);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Called to finish the allocation of the "aip" allocated
 * by one of the two routines above.
 *
 * The routine loops until an indirdep is attached to bp. Each pass
 * searches bp->b_dep under the lock. If no indirdep is found and none
 * was preallocated, the lock is dropped, a new indirdep holding a saved
 * copy of the block is built, and the search is repeated. Once an
 * indirdep is available, aip takes over the newblk state for its block,
 * is merged with any older allocindir for the same pointer offset, and
 * the old pointer value is stored in the saved copy of the block.
 */
|
|
|
|
static void
|
|
|
|
setup_allocindir_phase2(bp, ip, aip)
|
|
|
|
struct buf *bp; /* in-memory copy of the indirect block */
|
|
|
|
struct inode *ip; /* inode for file being extended */
|
|
|
|
struct allocindir *aip; /* allocindir allocated by the above routines */
|
|
|
|
{
|
|
|
|
struct worklist *wk;
|
|
|
|
struct indirdep *indirdep, *newindirdep;
|
|
|
|
struct bmsafemap *bmsafemap;
|
|
|
|
struct allocindir *oldaip;
|
|
|
|
struct freefrag *freefrag;
|
|
|
|
struct newblk *newblk;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t blkno;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
|
|
|
|
/* A non-negative logical block number is rejected as not an indirect block. */
if (bp->b_lblkno >= 0)
|
|
|
|
panic("setup_allocindir_phase2: not indir blk");
|
|
|
|
/*
 * Each pass first searches for an indirdep already on bp. The loop
 * exits through the break near the bottom once one is in hand.
 */
for (indirdep = NULL, newindirdep = NULL; ; ) {
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-02-04 16:08:18 +00:00
|
|
|
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
|
1998-05-19 20:18:42 +00:00
|
|
|
if (wk->wk_type != D_INDIRDEP)
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
indirdep = WK_INDIRDEP(wk);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/*
 * Nothing found, but a previous pass built an indirdep: attach it to
 * bp and clear newindirdep so that it is not freed below.
 */
if (indirdep == NULL && newindirdep) {
|
|
|
|
indirdep = newindirdep;
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
|
|
|
|
newindirdep = NULL;
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
if (indirdep) {
|
|
|
|
/* The lookup is made with the lock released; the lock is retaken after it. */
if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
|
|
|
|
&newblk) == 0)
|
|
|
|
panic("setup_allocindir: lost block");
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
/*
 * Take over the newblk state: either mark aip DEPCOMPLETE, or record
 * the bmsafemap buffer and replace the newblk on the bmsafemap lists.
 */
if (newblk->nb_state == DEPCOMPLETE) {
|
|
|
|
aip->ai_state |= DEPCOMPLETE;
|
|
|
|
aip->ai_buf = NULL;
|
|
|
|
} else {
|
|
|
|
bmsafemap = newblk->nb_bmsafemap;
|
|
|
|
aip->ai_buf = bmsafemap->sm_buf;
|
|
|
|
LIST_REMOVE(newblk, nb_deps);
|
|
|
|
LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
|
|
|
|
aip, ai_deps);
|
|
|
|
}
|
|
|
|
LIST_REMOVE(newblk, nb_hash);
|
|
|
|
FREE(newblk, M_NEWBLK);
|
|
|
|
aip->ai_indirdep = indirdep;
|
|
|
|
/*
|
|
|
|
* Check to see if there is an existing dependency
|
|
|
|
* for this block. If there is, merge the old
|
|
|
|
* dependency into the new one.
|
|
|
|
*/
|
|
|
|
if (aip->ai_oldblkno == 0)
|
|
|
|
oldaip = NULL;
|
|
|
|
else
|
2001-02-04 12:37:48 +00:00
|
|
|
|
|
|
|
|
|
|
|
LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
|
1998-05-19 19:47:22 +00:00
|
|
|
if (oldaip->ai_offset == aip->ai_offset)
|
|
|
|
break;
|
2000-06-18 22:05:57 +00:00
|
|
|
freefrag = NULL;
|
1998-05-19 19:47:22 +00:00
|
|
|
if (oldaip != NULL) {
|
2001-02-23 09:01:31 +00:00
|
|
|
/* The older dependency must have installed the block that aip replaces. */
if (oldaip->ai_newblkno != aip->ai_oldblkno) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("setup_allocindir_phase2: blkno");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
aip->ai_oldblkno = oldaip->ai_oldblkno;
|
2000-06-18 22:05:57 +00:00
|
|
|
/*
 * aip inherits the older freefrag. The freefrag that aip held is set
 * aside and processed after the lock is released.
 */
freefrag = aip->ai_freefrag;
|
|
|
|
aip->ai_freefrag = oldaip->ai_freefrag;
|
|
|
|
oldaip->ai_freefrag = NULL;
|
1998-05-19 19:47:22 +00:00
|
|
|
free_allocindir(oldaip, NULL);
|
|
|
|
}
|
|
|
|
LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
|
2002-06-21 06:18:05 +00:00
|
|
|
/*
 * Store the old pointer value in the saved copy of the indirect
 * block, using the pointer width that matches the file system type.
 */
if (ip->i_ump->um_fstype == UFS1)
|
|
|
|
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
|
|
|
|
[aip->ai_offset] = aip->ai_oldblkno;
|
|
|
|
else
|
|
|
|
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
|
|
|
|
[aip->ai_offset] = aip->ai_oldblkno;
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
2000-06-18 22:05:57 +00:00
|
|
|
if (freefrag != NULL)
|
|
|
|
handle_workitem_freefrag(freefrag);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
/*
 * A newindirdep still set here was not attached above, which happens
 * when an indirdep was found on bp instead; release the unused one
 * and its saved buffer.
 */
if (newindirdep) {
|
2003-01-07 18:23:50 +00:00
|
|
|
brelse(newindirdep->ir_savebp);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
if (indirdep)
|
|
|
|
break;
|
|
|
|
/*
 * No indirdep exists yet: build one with the lock released and go
 * around the loop again.
 * NOTE(review): M_ZERO is not requested for this allocation, so only
 * the fields assigned below are initialized here.
 */
MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_INDIRDEP, M_SOFTDEP_FLAGS);
|
1998-05-19 20:18:42 +00:00
|
|
|
newindirdep->ir_list.wk_type = D_INDIRDEP;
|
1998-05-19 19:47:22 +00:00
|
|
|
newindirdep->ir_state = ATTACHED;
|
2002-06-21 06:18:05 +00:00
|
|
|
if (ip->i_ump->um_fstype == UFS1)
|
|
|
|
newindirdep->ir_state |= UFS1FMT;
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_INIT(&newindirdep->ir_deplisthd);
|
|
|
|
LIST_INIT(&newindirdep->ir_donehd);
|
2002-03-15 18:49:47 +00:00
|
|
|
if (bp->b_blkno == bp->b_lblkno) {
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
|
|
|
|
NULL, NULL);
|
2002-03-15 18:49:47 +00:00
|
|
|
bp->b_blkno = blkno;
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
newindirdep->ir_savebp =
|
|
|
|
getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
|
1999-06-26 02:47:16 +00:00
|
|
|
BUF_KERNPROC(newindirdep->ir_savebp);
|
1998-05-19 23:07:25 +00:00
|
|
|
bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Block de-allocation dependencies.
|
|
|
|
*
|
|
|
|
* When blocks are de-allocated, the on-disk pointers must be nullified before
|
|
|
|
* the blocks are made available for use by other files. (The true
|
|
|
|
* requirement is that old pointers must be nullified before new on-disk
|
|
|
|
* pointers are set. We chose this slightly more stringent requirement to
|
|
|
|
* reduce complexity.) Our implementation handles this dependency by updating
|
|
|
|
* the inode (or indirect block) appropriately but delaying the actual block
|
|
|
|
* de-allocation (i.e., freemap and free space count manipulation) until
|
|
|
|
* after the updated versions reach stable storage. After the disk is
|
|
|
|
* updated, the blocks can be safely de-allocated whenever it is convenient.
|
|
|
|
* This implementation handles only the common case of reducing a file's
|
|
|
|
* length to zero. Other cases are handled by the conventional synchronous
|
|
|
|
* write approach.
|
|
|
|
*
|
|
|
|
* The ffs implementation with which we worked double-checks
|
|
|
|
* the state of the block pointers and file size as it reduces
|
|
|
|
* a file's length. Some of this code is replicated here in our
|
|
|
|
* soft updates implementation. The freeblks->fb_chkcnt field is
|
|
|
|
* used to transfer a part of this information to the procedure
|
|
|
|
* that eventually de-allocates the blocks.
|
|
|
|
*
|
|
|
|
* This routine should be called from the routine that shortens
|
|
|
|
* a file's length, before the inode's size or block pointers
|
|
|
|
* are modified. It will save the block pointer information for
|
|
|
|
* later release and zero the inode so that the calling routine
|
|
|
|
* can release it.
|
|
|
|
*/
|
|
|
|
void
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
softdep_setup_freeblocks(ip, length, flags)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct inode *ip; /* The inode whose length is to be reduced */
|
|
|
|
off_t length; /* The new length for the file */
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
int flags; /* IO_EXT and/or IO_NORMAL */
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
struct freeblks *freeblks;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct allocdirect *adp;
|
|
|
|
struct vnode *vp;
|
|
|
|
struct buf *bp;
|
|
|
|
struct fs *fs;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
ufs2_daddr_t extblocks, datablocks;
|
2000-06-18 22:05:57 +00:00
|
|
|
int i, delay, error;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
fs = ip->i_fs;
|
|
|
|
if (length != 0)
|
2001-05-08 07:42:20 +00:00
|
|
|
panic("softdep_setup_freeblocks: non-zero length");
|
1998-05-19 19:47:22 +00:00
|
|
|
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
|
2000-12-08 21:51:06 +00:00
|
|
|
M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
|
1998-05-19 20:18:42 +00:00
|
|
|
freeblks->fb_list.wk_type = D_FREEBLKS;
|
1998-05-19 19:47:22 +00:00
|
|
|
freeblks->fb_uid = ip->i_uid;
|
|
|
|
freeblks->fb_previousinum = ip->i_number;
|
|
|
|
freeblks->fb_devvp = ip->i_devvp;
|
2000-07-11 22:07:57 +00:00
|
|
|
freeblks->fb_mnt = ITOV(ip)->v_mount;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
extblocks = 0;
|
|
|
|
if (fs->fs_magic == FS_UFS2_MAGIC)
|
|
|
|
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
|
|
|
|
datablocks = DIP(ip, i_blocks) - extblocks;
|
|
|
|
if ((flags & IO_NORMAL) == 0) {
|
|
|
|
freeblks->fb_oldsize = 0;
|
|
|
|
freeblks->fb_chkcnt = 0;
|
|
|
|
} else {
|
|
|
|
freeblks->fb_oldsize = ip->i_size;
|
|
|
|
ip->i_size = 0;
|
|
|
|
DIP(ip, i_size) = 0;
|
|
|
|
freeblks->fb_chkcnt = datablocks;
|
|
|
|
for (i = 0; i < NDADDR; i++) {
|
|
|
|
freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
|
|
|
|
DIP(ip, i_db[i]) = 0;
|
|
|
|
}
|
|
|
|
for (i = 0; i < NIADDR; i++) {
|
|
|
|
freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
|
|
|
|
DIP(ip, i_ib[i]) = 0;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If the file was removed, then the space being freed was
|
|
|
|
* accounted for then (see softdep_filereleased()). If the
|
|
|
|
* file is merely being truncated, then we account for it now.
|
|
|
|
*/
|
|
|
|
if ((ip->i_flag & IN_SPACECOUNTED) == 0)
|
|
|
|
fs->fs_pendingblocks += datablocks;
|
|
|
|
}
|
|
|
|
if ((flags & IO_EXT) == 0) {
|
|
|
|
freeblks->fb_oldextsize = 0;
|
|
|
|
} else {
|
|
|
|
freeblks->fb_oldextsize = ip->i_din2->di_extsize;
|
|
|
|
ip->i_din2->di_extsize = 0;
|
|
|
|
freeblks->fb_chkcnt += extblocks;
|
|
|
|
for (i = 0; i < NXADDR; i++) {
|
|
|
|
freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
|
|
|
|
ip->i_din2->di_extb[i] = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
DIP(ip, i_blocks) -= freeblks->fb_chkcnt;
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Push the zero'ed inode to its disk buffer so that we are free
|
|
|
|
* to delete its dependencies below. Once the dependencies are gone
|
|
|
|
* the buffer can be safely released.
|
|
|
|
*/
|
|
|
|
if ((error = bread(ip->i_devvp,
|
|
|
|
fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
|
2002-02-02 01:42:44 +00:00
|
|
|
(int)fs->fs_bsize, NOCRED, &bp)) != 0) {
|
|
|
|
brelse(bp);
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_error("softdep_setup_freeblocks", error);
|
2002-02-02 01:42:44 +00:00
|
|
|
}
|
2002-06-21 06:18:05 +00:00
|
|
|
if (ip->i_ump->um_fstype == UFS1)
|
|
|
|
*((struct ufs1_dinode *)bp->b_data +
|
|
|
|
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
|
|
|
|
else
|
|
|
|
*((struct ufs2_dinode *)bp->b_data +
|
|
|
|
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Find and eliminate any inode dependencies.
|
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((inodedep->id_state & IOSTARTED) != 0) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_setup_freeblocks: inode busy");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
When deleting a file, the ordering of events imposed by soft updates
is to first write the deleted directory entry to disk, second write
the zero'ed inode to disk, and finally to release the freed blocks
and the inode back to the cylinder-group map. As this ordering
requires two disk writes to occur which are normally spaced about
30 seconds apart (except when memory is under duress), it takes
about a minute from the time that a file is deleted until its inode
and data blocks show up in the cylinder-group map for reallocation.
If a file has had only a brief lifetime (less than 30 seconds from
creation to deletion), neither its inode nor its directory entry
may have been written to disk. If its directory entry has not been
written to disk, then we need not wait for that directory block to
be written as the on-disk directory block does not reference the
inode. Similarly, if the allocated inode has never been written to
disk, we do not have to wait for it to be written back either as
its on-disk representation is still zero'ed out. Thus, in the case
of a short lived file, we can simply release the blocks and inode
to the cylinder-group map immediately. As the inode and its blocks
are released immediately, they are immediately available for other
uses. If they are not released for a minute, then other inodes and
blocks must be allocated for short lived files, cluttering up the
vnode and buffer caches. The previous code was a bit too aggressive
in trying to release the blocks and inode back to the cylinder-group
map resulting in their being made available when in fact the inode
on disk had not yet been zero'ed. This patch takes a more conservative
approach to doing the release which avoids doing the release prematurely.
2000-11-14 09:00:25 +00:00
|
|
|
/*
|
|
|
|
* Add the freeblks structure to the list of operations that
|
|
|
|
* must await the zero'ed inode being written to disk. If we
|
|
|
|
* still have a bitmap dependency (delay == 0), then the inode
|
|
|
|
* has never been written to disk, so we can process the
|
|
|
|
* freeblks below once we have deleted the dependencies.
|
|
|
|
*/
|
|
|
|
delay = (inodedep->id_state & DEPCOMPLETE);
|
|
|
|
if (delay)
|
|
|
|
WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Because the file length has been truncated to zero, any
|
|
|
|
* pending block allocation dependency structures associated
|
|
|
|
* with this inode are obsolete and can simply be de-allocated.
|
|
|
|
* We must first merge the two dependency lists to get rid of
|
|
|
|
* any duplicate freefrag structures, then purge the merged list.
|
2000-06-18 22:05:57 +00:00
|
|
|
* If we still have a bitmap dependency, then the inode has never
|
|
|
|
* been written to disk, so we can free any fragments without delay.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (flags & IO_NORMAL) {
|
|
|
|
merge_inode_lists(&inodedep->id_newinoupdt,
|
|
|
|
&inodedep->id_inoupdt);
|
|
|
|
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
|
|
|
|
free_allocdirect(&inodedep->id_inoupdt, adp, delay);
|
|
|
|
}
|
|
|
|
if (flags & IO_EXT) {
|
|
|
|
merge_inode_lists(&inodedep->id_newextupdt,
|
|
|
|
&inodedep->id_extupdt);
|
|
|
|
while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
|
|
|
|
free_allocdirect(&inodedep->id_extupdt, adp, delay);
|
|
|
|
}
|
2000-01-11 06:52:35 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
bdwrite(bp);
|
|
|
|
/*
|
|
|
|
* We must wait for any I/O in progress to finish so that
|
|
|
|
* all potential buffers on the dirty list will be visible.
|
|
|
|
* Once they are all there, walk the list and get rid of
|
|
|
|
* any dependencies.
|
|
|
|
*/
|
|
|
|
vp = ITOV(ip);
|
1999-03-02 06:38:07 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
1999-05-07 02:26:47 +00:00
|
|
|
drain_output(vp, 1);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
restart:
|
2002-09-25 02:49:48 +00:00
|
|
|
VI_LOCK(vp);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
|
|
|
|
if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
|
|
|
|
((flags & IO_NORMAL) == 0 &&
|
|
|
|
(bp->b_xflags & BX_ALTDATA) == 0))
|
|
|
|
continue;
|
2002-09-25 02:49:48 +00:00
|
|
|
VI_UNLOCK(vp);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (getdirtybuf(&bp, MNT_WAIT) == 0)
|
|
|
|
goto restart;
|
1998-05-19 19:47:22 +00:00
|
|
|
(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
|
|
|
|
deallocate_dependencies(bp, inodedep);
|
1998-06-11 17:44:32 +00:00
|
|
|
bp->b_flags |= B_INVAL | B_NOCACHE;
|
1999-03-02 06:38:07 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
brelse(bp);
|
1999-03-02 06:38:07 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
goto restart;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2002-09-25 02:49:48 +00:00
|
|
|
VI_UNLOCK(vp);
|
When deleting a file, the ordering of events imposed by soft updates
is to first write the deleted directory entry to disk, second write
the zero'ed inode to disk, and finally to release the freed blocks
and the inode back to the cylinder-group map. As this ordering
requires two disk writes to occur which are normally spaced about
30 seconds apart (except when memory is under duress), it takes
about a minute from the time that a file is deleted until its inode
and data blocks show up in the cylinder-group map for reallocation.
If a file has had only a brief lifetime (less than 30 seconds from
creation to deletion), neither its inode nor its directory entry
may have been written to disk. If its directory entry has not been
written to disk, then we need not wait for that directory block to
be written as the on-disk directory block does not reference the
inode. Similarly, if the allocated inode has never been written to
disk, we do not have to wait for it to be written back either as
its on-disk representation is still zero'ed out. Thus, in the case
of a short lived file, we can simply release the blocks and inode
to the cylinder-group map immediately. As the inode and its blocks
are released immediately, they are immediately available for other
uses. If they are not released for a minute, then other inodes and
blocks must be allocated for short lived files, cluttering up the
vnode and buffer caches. The previous code was a bit too aggressive
in trying to release the blocks and inode back to the cylinder-group
map resulting in their being made available when in fact the inode
on disk had not yet been zero'ed. This patch takes a more conservative
approach to doing the release which avoids doing the release prematurely.
2000-11-14 09:00:25 +00:00
|
|
|
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
|
|
|
|
(void) free_inodedep(inodedep);
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
When deleting a file, the ordering of events imposed by soft updates
is to first write the deleted directory entry to disk, second write
the zero'ed inode to disk, and finally to release the freed blocks
and the inode back to the cylinder-group map. As this ordering
requires two disk writes to occur which are normally spaced about
30 seconds apart (except when memory is under duress), it takes
about a minute from the time that a file is deleted until its inode
and data blocks show up in the cylinder-group map for reallocation.
If a file has had only a brief lifetime (less than 30 seconds from
creation to deletion), neither its inode nor its directory entry
may have been written to disk. If its directory entry has not been
written to disk, then we need not wait for that directory block to
be written as the on-disk directory block does not reference the
inode. Similarly, if the allocated inode has never been written to
disk, we do not have to wait for it to be written back either as
its on-disk representation is still zero'ed out. Thus, in the case
of a short lived file, we can simply release the blocks and inode
to the cylinder-group map immediately. As the inode and its blocks
are released immediately, they are immediately available for other
uses. If they are not released for a minute, then other inodes and
blocks must be allocated for short lived files, cluttering up the
vnode and buffer caches. The previous code was a bit too aggressive
in trying to release the blocks and inode back to the cylinder-group
map resulting in their being made available when in fact the inode
on disk had not yet been zero'ed. This patch takes a more conservative
approach to doing the release which avoids doing the release prematurely.
2000-11-14 09:00:25 +00:00
|
|
|
* If the inode has never been written to disk (delay == 0),
|
|
|
|
* then we can process the freeblks now that we have deleted
|
|
|
|
* the dependencies.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
When deleting a file, the ordering of events imposed by soft updates
is to first write the deleted directory entry to disk, second write
the zero'ed inode to disk, and finally to release the freed blocks
and the inode back to the cylinder-group map. As this ordering
requires two disk writes to occur which are normally spaced about
30 seconds apart (except when memory is under duress), it takes
about a minute from the time that a file is deleted until its inode
and data blocks show up in the cylinder-group map for reallocation.
If a file has had only a brief lifetime (less than 30 seconds from
creation to deletion), neither its inode nor its directory entry
may have been written to disk. If its directory entry has not been
written to disk, then we need not wait for that directory block to
be written as the on-disk directory block does not reference the
inode. Similarly, if the allocated inode has never been written to
disk, we do not have to wait for it to be written back either as
its on-disk representation is still zero'ed out. Thus, in the case
of a short lived file, we can simply release the blocks and inode
to the cylinder-group map immediately. As the inode and its blocks
are released immediately, they are immediately available for other
uses. If they are not released for a minute, then other inodes and
blocks must be allocated for short lived files, cluttering up the
vnode and buffer caches. The previous code was a bit too aggressive
in trying to release the blocks and inode back to the cylinder-group
map resulting in their being made available when in fact the inode
on disk had not yet been zero'ed. This patch takes a more conservative
approach to doing the release which avoids doing the release prematurely.
2000-11-14 09:00:25 +00:00
|
|
|
if (!delay)
|
2001-03-21 04:09:01 +00:00
|
|
|
handle_workitem_freeblocks(freeblks, 0);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reclaim any dependency structures from a buffer that is about to
|
|
|
|
* be reallocated to a new vnode. The buffer must be locked, thus,
|
|
|
|
* no I/O completion operations can occur while we are manipulating
|
|
|
|
* its associated dependencies. The mutex is held so that other I/O's
|
|
|
|
* associated with related dependencies do not occur.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
deallocate_dependencies(bp, inodedep)
|
|
|
|
struct buf *bp;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
{
|
|
|
|
struct worklist *wk;
|
|
|
|
struct indirdep *indirdep;
|
|
|
|
struct allocindir *aip;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct dirrem *dirrem;
|
|
|
|
struct diradd *dap;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
|
|
|
|
switch (wk->wk_type) {
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_INDIRDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
indirdep = WK_INDIRDEP(wk);
|
|
|
|
/*
|
|
|
|
* None of the indirect pointers will ever be visible,
|
|
|
|
* so they can simply be tossed. GOINGAWAY ensures
|
|
|
|
* that allocated pointers will be saved in the buffer
|
|
|
|
* cache until they are freed. Note that they will
|
|
|
|
* only be able to be found by their physical address
|
|
|
|
* since the inode mapping the logical address will
|
|
|
|
* be gone. The save buffer used for the safe copy
|
|
|
|
* was allocated in setup_allocindir_phase2 using
|
|
|
|
* the physical address so it could be used for this
|
|
|
|
* purpose. Hence we swap the safe copy with the real
|
|
|
|
* copy, allowing the safe copy to be freed and holding
|
|
|
|
* on to the real copy for later use in indir_trunc.
|
|
|
|
*/
|
2001-02-23 09:01:31 +00:00
|
|
|
if (indirdep->ir_state & GOINGAWAY) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("deallocate_dependencies: already gone");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
indirdep->ir_state |= GOINGAWAY;
|
2003-01-07 18:23:50 +00:00
|
|
|
VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
|
|
|
|
free_allocindir(aip, inodedep);
|
|
|
|
if (bp->b_lblkno >= 0 ||
|
2001-02-23 09:01:31 +00:00
|
|
|
bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("deallocate_dependencies: not indir");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 20:50:41 +00:00
|
|
|
bcopy(bp->b_data, indirdep->ir_savebp->b_data,
|
|
|
|
bp->b_bcount);
|
1998-05-19 19:47:22 +00:00
|
|
|
WORKLIST_REMOVE(wk);
|
|
|
|
WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_PAGEDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
pagedep = WK_PAGEDEP(wk);
|
|
|
|
/*
|
|
|
|
* None of the directory additions will ever be
|
|
|
|
* visible, so they can simply be tossed.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < DAHASHSZ; i++)
|
1999-05-22 04:43:04 +00:00
|
|
|
while ((dap =
|
|
|
|
LIST_FIRST(&pagedep->pd_diraddhd[i])))
|
1998-05-19 19:47:22 +00:00
|
|
|
free_diradd(dap);
|
|
|
|
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
|
|
|
|
free_diradd(dap);
|
|
|
|
/*
|
|
|
|
* Copy any directory remove dependencies to the list
|
|
|
|
* to be processed after the zero'ed inode is written.
|
|
|
|
* If the inode has already been written, then they
|
|
|
|
* can be dumped directly onto the work list.
|
|
|
|
*/
|
2001-02-04 16:08:18 +00:00
|
|
|
LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_REMOVE(dirrem, dm_next);
|
|
|
|
dirrem->dm_dirinum = pagedep->pd_ino;
|
2000-01-17 06:35:11 +00:00
|
|
|
if (inodedep == NULL ||
|
|
|
|
(inodedep->id_state & ALLCOMPLETE) ==
|
|
|
|
ALLCOMPLETE)
|
1998-05-19 19:47:22 +00:00
|
|
|
add_to_worklist(&dirrem->dm_list);
|
|
|
|
else
|
1998-05-19 21:45:53 +00:00
|
|
|
WORKLIST_INSERT(&inodedep->id_bufwait,
|
1998-05-19 19:47:22 +00:00
|
|
|
&dirrem->dm_list);
|
|
|
|
}
|
2001-06-05 01:49:37 +00:00
|
|
|
if ((pagedep->pd_state & NEWBLOCK) != 0) {
|
2001-06-13 23:13:13 +00:00
|
|
|
LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
|
|
|
|
if (wk->wk_type == D_NEWDIRBLK &&
|
|
|
|
WK_NEWDIRBLK(wk)->db_pagedep ==
|
|
|
|
pagedep)
|
|
|
|
break;
|
|
|
|
if (wk != NULL) {
|
|
|
|
WORKLIST_REMOVE(wk);
|
|
|
|
free_newdirblk(WK_NEWDIRBLK(wk));
|
|
|
|
} else {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("deallocate_dependencies: "
|
|
|
|
"lost pagedep");
|
|
|
|
}
|
2001-06-05 01:49:37 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
WORKLIST_REMOVE(&pagedep->pd_list);
|
|
|
|
LIST_REMOVE(pagedep, pd_hash);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(pagedep, D_PAGEDEP);
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_ALLOCINDIR:
|
1998-05-19 19:47:22 +00:00
|
|
|
free_allocindir(WK_ALLOCINDIR(wk), inodedep);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_ALLOCDIRECT:
|
|
|
|
case D_INODEDEP:
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("deallocate_dependencies: Unexpected type %s",
|
|
|
|
TYPENAME(wk->wk_type));
|
|
|
|
/* NOTREACHED */
|
|
|
|
|
|
|
|
default:
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("deallocate_dependencies: Unknown type %s",
|
|
|
|
TYPENAME(wk->wk_type));
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free an allocdirect. Generate a new freefrag work request if appropriate.
|
|
|
|
* This routine must be called with splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
free_allocdirect(adphead, adp, delay)
|
|
|
|
struct allocdirectlst *adphead;
|
|
|
|
struct allocdirect *adp;
|
|
|
|
int delay;
|
|
|
|
{
|
2001-05-17 07:24:03 +00:00
|
|
|
struct newdirblk *newdirblk;
|
|
|
|
struct worklist *wk;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("free_allocdirect: lock not held");
|
|
|
|
#endif
|
|
|
|
if ((adp->ad_state & DEPCOMPLETE) == 0)
|
|
|
|
LIST_REMOVE(adp, ad_deps);
|
|
|
|
TAILQ_REMOVE(adphead, adp, ad_next);
|
|
|
|
if ((adp->ad_state & COMPLETE) == 0)
|
|
|
|
WORKLIST_REMOVE(&adp->ad_list);
|
|
|
|
if (adp->ad_freefrag != NULL) {
|
|
|
|
if (delay)
|
1998-05-19 21:45:53 +00:00
|
|
|
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
|
1998-05-19 19:47:22 +00:00
|
|
|
&adp->ad_freefrag->ff_list);
|
|
|
|
else
|
|
|
|
add_to_worklist(&adp->ad_freefrag->ff_list);
|
|
|
|
}
|
2001-05-17 07:24:03 +00:00
|
|
|
if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
|
|
|
|
newdirblk = WK_NEWDIRBLK(wk);
|
|
|
|
WORKLIST_REMOVE(&newdirblk->db_list);
|
|
|
|
if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
|
|
|
|
panic("free_allocdirect: extra newdirblk");
|
|
|
|
if (delay)
|
|
|
|
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
|
|
|
|
&newdirblk->db_list);
|
|
|
|
else
|
|
|
|
free_newdirblk(newdirblk);
|
|
|
|
}
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(adp, D_ALLOCDIRECT);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
2001-05-17 07:24:03 +00:00
|
|
|
/*
|
|
|
|
* Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
|
|
|
|
* This routine must be called with splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
free_newdirblk(newdirblk)
|
|
|
|
struct newdirblk *newdirblk;
|
|
|
|
{
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct diradd *dap;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
2001-05-17 07:24:03 +00:00
|
|
|
panic("free_newdirblk: lock not held");
|
|
|
|
#endif
|
|
|
|
/*
|
2001-05-19 19:24:26 +00:00
|
|
|
* If the pagedep is still linked onto the directory buffer
|
|
|
|
* dependency chain, then some of the entries on the
|
|
|
|
* pd_pendinghd list may not be committed to disk yet. In
|
|
|
|
* this case, we will simply clear the NEWBLOCK flag and
|
|
|
|
* let the pd_pendinghd list be processed when the pagedep
|
|
|
|
* is next written. If the pagedep is no longer on the buffer
|
|
|
|
* dependency chain, then all the entries on the pd_pending
|
|
|
|
* list are committed to disk and we can free them here.
|
2001-05-17 07:24:03 +00:00
|
|
|
*/
|
|
|
|
pagedep = newdirblk->db_pagedep;
|
|
|
|
pagedep->pd_state &= ~NEWBLOCK;
|
2001-05-19 19:24:26 +00:00
|
|
|
if ((pagedep->pd_state & ONWORKLIST) == 0)
|
|
|
|
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
|
|
|
|
free_diradd(dap);
|
2001-05-17 07:24:03 +00:00
|
|
|
/*
|
|
|
|
* If no dependencies remain, the pagedep will be freed.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < DAHASHSZ; i++)
|
|
|
|
if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
|
|
|
|
break;
|
2001-05-18 22:16:28 +00:00
|
|
|
if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
|
2001-05-17 07:24:03 +00:00
|
|
|
LIST_REMOVE(pagedep, pd_hash);
|
|
|
|
WORKITEM_FREE(pagedep, D_PAGEDEP);
|
|
|
|
}
|
|
|
|
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Prepare an inode to be freed. The actual free operation is not
|
|
|
|
* done until the zero'ed inode has been written to disk.
|
|
|
|
*/
|
|
|
|
void
|
1998-05-19 20:18:42 +00:00
|
|
|
softdep_freefile(pvp, ino, mode)
|
2002-06-21 06:18:05 +00:00
|
|
|
struct vnode *pvp;
|
|
|
|
ino_t ino;
|
|
|
|
int mode;
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
1998-05-19 20:18:42 +00:00
|
|
|
struct inode *ip = VTOI(pvp);
|
1998-05-19 19:47:22 +00:00
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct freefile *freefile;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This sets up the inode de-allocation dependency.
|
|
|
|
*/
|
|
|
|
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_FREEFILE, M_SOFTDEP_FLAGS);
|
1998-05-19 20:18:42 +00:00
|
|
|
freefile->fx_list.wk_type = D_FREEFILE;
|
1998-05-19 19:47:22 +00:00
|
|
|
freefile->fx_list.wk_state = 0;
|
1998-05-19 20:18:42 +00:00
|
|
|
freefile->fx_mode = mode;
|
|
|
|
freefile->fx_oldinum = ino;
|
1998-05-19 19:47:22 +00:00
|
|
|
freefile->fx_devvp = ip->i_devvp;
|
2000-07-11 22:07:57 +00:00
|
|
|
freefile->fx_mnt = ITOV(ip)->v_mount;
|
2001-05-08 07:42:20 +00:00
|
|
|
if ((ip->i_flag & IN_SPACECOUNTED) == 0)
|
|
|
|
ip->i_fs->fs_pendinginodes += 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the inodedep does not exist, then the zero'ed inode has
|
2000-01-18 01:33:05 +00:00
|
|
|
* been written to disk. If the allocated inode has never been
|
|
|
|
* written to disk, then the on-disk inode is zero'ed. In either
|
|
|
|
* case we can free the file immediately.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2000-01-18 01:33:05 +00:00
|
|
|
if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
|
|
|
|
check_inode_unwritten(inodedep)) {
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
2000-01-10 00:24:24 +00:00
|
|
|
handle_workitem_freefile(freefile);
|
1998-05-19 19:47:22 +00:00
|
|
|
return;
|
|
|
|
}
|
2000-01-18 01:33:05 +00:00
|
|
|
WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
|
2000-01-18 01:33:05 +00:00
|
|
|
/*
|
|
|
|
* Check to see if an inode has never been written to disk. If
|
|
|
|
* so free the inodedep and return success, otherwise return failure.
|
|
|
|
* This routine must be called with splbio interrupts blocked.
|
|
|
|
*
|
|
|
|
* If we still have a bitmap dependency, then the inode has never
|
|
|
|
* been written to disk. Drop the dependency as it is no longer
|
|
|
|
* necessary since the inode is being deallocated. We set the
|
|
|
|
* ALLCOMPLETE flags since the bitmap now properly shows that the
|
|
|
|
* inode is not allocated. Even if the inode is actively being
|
|
|
|
* written, it has been rolled back to its zero'ed state, so we
|
|
|
|
* are ensured that a zero inode is what is on the disk. For short
|
|
|
|
* lived files, this change will usually result in removing all the
|
|
|
|
* dependencies from the inode so that it can be freed immediately.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
check_inode_unwritten(inodedep)
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
{
|
|
|
|
|
|
|
|
if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
|
|
|
|
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
|
|
|
|
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
|
|
|
|
LIST_FIRST(&inodedep->id_inowait) != NULL ||
|
|
|
|
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
|
|
|
|
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
|
|
|
|
TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
|
2000-01-18 01:33:05 +00:00
|
|
|
inodedep->id_nlinkdelta != 0)
|
|
|
|
return (0);
|
|
|
|
inodedep->id_state |= ALLCOMPLETE;
|
|
|
|
LIST_REMOVE(inodedep, id_deps);
|
|
|
|
inodedep->id_buf = NULL;
|
2000-06-18 22:14:28 +00:00
|
|
|
if (inodedep->id_state & ONWORKLIST)
|
|
|
|
WORKLIST_REMOVE(&inodedep->id_list);
|
2002-06-21 06:18:05 +00:00
|
|
|
if (inodedep->id_savedino1 != NULL) {
|
|
|
|
FREE(inodedep->id_savedino1, M_INODEDEP);
|
|
|
|
inodedep->id_savedino1 = NULL;
|
2000-01-10 00:24:24 +00:00
|
|
|
}
|
2001-02-23 09:01:31 +00:00
|
|
|
if (free_inodedep(inodedep) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
2000-01-18 01:33:05 +00:00
|
|
|
panic("check_inode_unwritten: busy inode");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2000-01-18 01:33:05 +00:00
|
|
|
return (1);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to free an inodedep structure. Return 1 if it could be freed.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
free_inodedep(inodedep)
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
{
|
|
|
|
|
|
|
|
if ((inodedep->id_state & ONWORKLIST) != 0 ||
|
1998-05-19 20:03:29 +00:00
|
|
|
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
|
1998-05-19 21:45:53 +00:00
|
|
|
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_FIRST(&inodedep->id_inowait) != NULL ||
|
|
|
|
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
|
|
|
|
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
|
|
|
|
TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
|
2002-06-21 06:18:05 +00:00
|
|
|
inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
|
1998-05-19 19:47:22 +00:00
|
|
|
return (0);
|
|
|
|
LIST_REMOVE(inodedep, id_hash);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(inodedep, D_INODEDEP);
|
1998-05-19 21:45:53 +00:00
|
|
|
num_inodedep -= 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This workitem routine performs the block de-allocation.
|
|
|
|
* The workitem is added to the pending list after the updated
|
|
|
|
* inode block has been written to disk. As mentioned above,
|
|
|
|
* checks regarding the number of blocks de-allocated (compared
|
|
|
|
* to the number of blocks allocated for the file) are also
|
|
|
|
* performed in this function.
|
|
|
|
*/
|
|
|
|
static void
|
2001-03-21 04:09:01 +00:00
|
|
|
handle_workitem_freeblocks(freeblks, flags)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct freeblks *freeblks;
|
2001-03-21 04:09:01 +00:00
|
|
|
int flags;
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
2002-02-02 01:42:44 +00:00
|
|
|
struct inode *ip;
|
2001-03-21 04:09:01 +00:00
|
|
|
struct vnode *vp;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct fs *fs;
|
2002-06-21 06:18:05 +00:00
|
|
|
int i, nblocks, level, bsize;
|
|
|
|
ufs2_daddr_t bn, blocksreleased = 0;
|
1998-05-19 19:47:22 +00:00
|
|
|
int error, allerror = 0;
|
|
|
|
ufs_lbn_t baselbns[NIADDR], tmpval;
|
|
|
|
|
2002-02-02 01:42:44 +00:00
|
|
|
fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
|
1998-05-19 19:47:22 +00:00
|
|
|
tmpval = 1;
|
|
|
|
baselbns[0] = NDADDR;
|
|
|
|
for (i = 1; i < NIADDR; i++) {
|
|
|
|
tmpval *= NINDIR(fs);
|
|
|
|
baselbns[i] = baselbns[i - 1] + tmpval;
|
|
|
|
}
|
|
|
|
nblocks = btodb(fs->fs_bsize);
|
|
|
|
blocksreleased = 0;
|
|
|
|
/*
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
* Release all extended attribute blocks or frags.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (freeblks->fb_oldextsize > 0) {
|
|
|
|
for (i = (NXADDR - 1); i >= 0; i--) {
|
|
|
|
if ((bn = freeblks->fb_eblks[i]) == 0)
|
|
|
|
continue;
|
|
|
|
bsize = sblksize(fs, freeblks->fb_oldextsize, i);
|
|
|
|
ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize,
|
|
|
|
freeblks->fb_previousinum);
|
|
|
|
blocksreleased += btodb(bsize);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
/*
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
* Release all data blocks or frags.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (freeblks->fb_oldsize > 0) {
|
|
|
|
/*
|
|
|
|
* Indirect blocks first.
|
|
|
|
*/
|
|
|
|
for (level = (NIADDR - 1); level >= 0; level--) {
|
|
|
|
if ((bn = freeblks->fb_iblks[level]) == 0)
|
|
|
|
continue;
|
|
|
|
if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
|
|
|
|
level, baselbns[level], &blocksreleased)) == 0)
|
|
|
|
allerror = error;
|
|
|
|
ffs_blkfree(fs, freeblks->fb_devvp, bn, fs->fs_bsize,
|
|
|
|
freeblks->fb_previousinum);
|
|
|
|
fs->fs_pendingblocks -= nblocks;
|
|
|
|
blocksreleased += nblocks;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* All direct blocks or frags.
|
|
|
|
*/
|
|
|
|
for (i = (NDADDR - 1); i >= 0; i--) {
|
|
|
|
if ((bn = freeblks->fb_dblks[i]) == 0)
|
|
|
|
continue;
|
|
|
|
bsize = sblksize(fs, freeblks->fb_oldsize, i);
|
|
|
|
ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize,
|
|
|
|
freeblks->fb_previousinum);
|
|
|
|
fs->fs_pendingblocks -= btodb(bsize);
|
|
|
|
blocksreleased += btodb(bsize);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2001-03-21 04:09:01 +00:00
|
|
|
/*
|
|
|
|
* If we still have not finished background cleanup, then check
|
|
|
|
* to see if the block count needs to be adjusted.
|
|
|
|
*/
|
|
|
|
if (freeblks->fb_chkcnt != blocksreleased &&
|
2002-03-17 01:25:47 +00:00
|
|
|
(fs->fs_flags & FS_UNCLEAN) != 0 &&
|
|
|
|
VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum,
|
|
|
|
(flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
|
2001-03-21 04:09:01 +00:00
|
|
|
ip = VTOI(vp);
|
2002-06-21 06:18:05 +00:00
|
|
|
DIP(ip, i_blocks) += freeblks->fb_chkcnt - blocksreleased;
|
2001-03-21 04:09:01 +00:00
|
|
|
ip->i_flag |= IN_CHANGE;
|
|
|
|
vput(vp);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
#ifdef DIAGNOSTIC
|
2001-03-21 04:09:01 +00:00
|
|
|
if (freeblks->fb_chkcnt != blocksreleased &&
|
|
|
|
((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
|
2002-09-28 19:04:49 +00:00
|
|
|
printf("handle_workitem_freeblocks: block count\n");
|
1998-05-19 19:47:22 +00:00
|
|
|
if (allerror)
|
|
|
|
softdep_error("handle_workitem_freeblks", allerror);
|
|
|
|
#endif /* DIAGNOSTIC */
|
2001-03-21 04:09:01 +00:00
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(freeblks, D_FREEBLKS);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Release blocks associated with the inode ip and stored in the indirect
|
|
|
|
* block dbn. If level is greater than SINGLE, the block is an indirect block
|
|
|
|
* and recursive calls to indirtrunc must be used to cleanse other indirect
|
|
|
|
* blocks.
|
|
|
|
*/
|
|
|
|
static int
|
2002-02-02 01:42:44 +00:00
|
|
|
indir_trunc(freeblks, dbn, level, lbn, countp)
|
|
|
|
struct freeblks *freeblks;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t dbn;
|
1998-05-19 19:47:22 +00:00
|
|
|
int level;
|
|
|
|
ufs_lbn_t lbn;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t *countp;
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
struct buf *bp;
|
|
|
|
struct fs *fs;
|
|
|
|
struct worklist *wk;
|
|
|
|
struct indirdep *indirdep;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs1_daddr_t *bap1 = 0;
|
|
|
|
ufs2_daddr_t nb, *bap2 = 0;
|
|
|
|
ufs_lbn_t lbnadd;
|
|
|
|
int i, nblocks, ufs1fmt;
|
1998-05-19 19:47:22 +00:00
|
|
|
int error, allerror = 0;
|
|
|
|
|
2002-02-02 01:42:44 +00:00
|
|
|
fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
|
1998-05-19 19:47:22 +00:00
|
|
|
lbnadd = 1;
|
|
|
|
for (i = level; i > 0; i--)
|
|
|
|
lbnadd *= NINDIR(fs);
|
|
|
|
/*
|
|
|
|
* Get buffer of block pointers to be freed. This routine is not
|
|
|
|
* called until the zero'ed inode has been written, so it is safe
|
|
|
|
* to free blocks as they are encountered. Because the inode has
|
|
|
|
* been zero'ed, calls to bmap on these blocks will fail. So, we
|
|
|
|
* have to use the on-disk address and the block device for the
|
|
|
|
* filesystem to look them up. If the file was deleted before its
|
|
|
|
* indirect blocks were all written to disk, the routine that set
|
|
|
|
* us up (deallocate_dependencies) will have arranged to leave
|
|
|
|
* a complete copy of the indirect block in memory for our use.
|
|
|
|
* Otherwise we have to read the blocks in from the disk.
|
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2002-02-02 01:42:44 +00:00
|
|
|
if ((bp = incore(freeblks->fb_devvp, dbn)) != NULL &&
|
1998-05-19 19:47:22 +00:00
|
|
|
(wk = LIST_FIRST(&bp->b_dep)) != NULL) {
|
1998-05-19 20:18:42 +00:00
|
|
|
if (wk->wk_type != D_INDIRDEP ||
|
1998-05-19 19:47:22 +00:00
|
|
|
(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
|
2001-02-23 09:01:31 +00:00
|
|
|
(indirdep->ir_state & GOINGAWAY) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("indir_trunc: lost indirdep");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
WORKLIST_REMOVE(wk);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(indirdep, D_INDIRDEP);
|
2001-02-23 09:01:31 +00:00
|
|
|
if (LIST_FIRST(&bp->b_dep) != NULL) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("indir_trunc: dangling dep");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2003-01-07 18:23:50 +00:00
|
|
|
VFSTOUFS(freeblks->fb_mnt)->um_numindirdeps -= 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
} else {
|
|
|
|
FREE_LOCK(&lk);
|
2002-02-02 01:42:44 +00:00
|
|
|
error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
|
|
|
|
NOCRED, &bp);
|
|
|
|
if (error) {
|
|
|
|
brelse(bp);
|
1998-05-19 19:47:22 +00:00
|
|
|
return (error);
|
2002-02-02 01:42:44 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Recursively free indirect blocks.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UFS1) {
|
|
|
|
ufs1fmt = 1;
|
|
|
|
bap1 = (ufs1_daddr_t *)bp->b_data;
|
|
|
|
} else {
|
|
|
|
ufs1fmt = 0;
|
|
|
|
bap2 = (ufs2_daddr_t *)bp->b_data;
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
nblocks = btodb(fs->fs_bsize);
|
|
|
|
for (i = NINDIR(fs) - 1; i >= 0; i--) {
|
2002-06-21 06:18:05 +00:00
|
|
|
if (ufs1fmt)
|
|
|
|
nb = bap1[i];
|
|
|
|
else
|
|
|
|
nb = bap2[i];
|
|
|
|
if (nb == 0)
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
if (level != 0) {
|
2002-02-02 01:42:44 +00:00
|
|
|
if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
|
1998-05-19 19:47:22 +00:00
|
|
|
level - 1, lbn + (i * lbnadd), countp)) != 0)
|
|
|
|
allerror = error;
|
|
|
|
}
|
2002-02-02 01:42:44 +00:00
|
|
|
ffs_blkfree(fs, freeblks->fb_devvp, nb, fs->fs_bsize,
|
|
|
|
freeblks->fb_previousinum);
|
2001-05-08 07:42:20 +00:00
|
|
|
fs->fs_pendingblocks -= nblocks;
|
1998-05-19 19:47:22 +00:00
|
|
|
*countp += nblocks;
|
|
|
|
}
|
1998-06-11 17:44:32 +00:00
|
|
|
bp->b_flags |= B_INVAL | B_NOCACHE;
|
1998-05-19 19:47:22 +00:00
|
|
|
brelse(bp);
|
|
|
|
return (allerror);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free an allocindir.
|
|
|
|
* This routine must be called with splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
free_allocindir(aip, inodedep)
|
|
|
|
struct allocindir *aip;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
{
|
|
|
|
struct freefrag *freefrag;
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("free_allocindir: lock not held");
|
|
|
|
#endif
|
|
|
|
if ((aip->ai_state & DEPCOMPLETE) == 0)
|
|
|
|
LIST_REMOVE(aip, ai_deps);
|
|
|
|
if (aip->ai_state & ONWORKLIST)
|
|
|
|
WORKLIST_REMOVE(&aip->ai_list);
|
|
|
|
LIST_REMOVE(aip, ai_next);
|
|
|
|
if ((freefrag = aip->ai_freefrag) != NULL) {
|
|
|
|
if (inodedep == NULL)
|
|
|
|
add_to_worklist(&freefrag->ff_list);
|
|
|
|
else
|
1998-05-19 21:45:53 +00:00
|
|
|
WORKLIST_INSERT(&inodedep->id_bufwait,
|
1998-05-19 19:47:22 +00:00
|
|
|
&freefrag->ff_list);
|
|
|
|
}
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(aip, D_ALLOCINDIR);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Directory entry addition dependencies.
|
|
|
|
*
|
|
|
|
* When adding a new directory entry, the inode (with its incremented link
|
|
|
|
* count) must be written to disk before the directory entry's pointer to it.
|
|
|
|
* Also, if the inode is newly allocated, the corresponding freemap must be
|
|
|
|
* updated (on disk) before the directory entry's pointer. These requirements
|
|
|
|
* are met via undo/redo on the directory entry's pointer, which consists
|
|
|
|
* simply of the inode number.
|
|
|
|
*
|
|
|
|
* As directory entries are added and deleted, the free space within a
|
2002-05-16 21:28:32 +00:00
|
|
|
* directory block can become fragmented. The ufs filesystem will compact
|
1998-05-19 19:47:22 +00:00
|
|
|
* a fragmented directory block to make space for a new entry. When this
|
|
|
|
* occurs, the offsets of previously added entries change. Any "diradd"
|
|
|
|
* dependency structures corresponding to these entries must be updated with
|
|
|
|
* the new offsets.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This routine is called after the in-memory inode's link
|
|
|
|
* count has been incremented, but before the directory entry's
|
|
|
|
* pointer to the inode has been set.
|
|
|
|
*/
|
2001-05-17 07:24:03 +00:00
|
|
|
int
|
|
|
|
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct buf *bp; /* buffer containing directory block */
|
|
|
|
struct inode *dp; /* inode for directory */
|
|
|
|
off_t diroffset; /* offset of new entry in directory */
|
2002-06-21 06:18:05 +00:00
|
|
|
ino_t newinum; /* inode referenced by new directory entry */
|
1998-05-19 19:47:22 +00:00
|
|
|
struct buf *newdirbp; /* non-NULL => contents of new mkdir */
|
2001-05-17 07:24:03 +00:00
|
|
|
int isnewblk; /* entry is in a newly allocated block */
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
int offset; /* offset of new entry within directory block */
|
|
|
|
ufs_lbn_t lbn; /* block in directory containing new entry */
|
|
|
|
struct fs *fs;
|
|
|
|
struct diradd *dap;
|
2001-05-17 07:24:03 +00:00
|
|
|
struct allocdirect *adp;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct inodedep *inodedep;
|
2001-05-17 07:24:03 +00:00
|
|
|
struct newdirblk *newdirblk = 0;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct mkdir *mkdir1, *mkdir2;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Whiteouts have no dependencies.
|
|
|
|
*/
|
|
|
|
if (newinum == WINO) {
|
|
|
|
if (newdirbp != NULL)
|
|
|
|
bdwrite(newdirbp);
|
2001-05-17 07:24:03 +00:00
|
|
|
return (0);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
fs = dp->i_fs;
|
|
|
|
lbn = lblkno(fs, diroffset);
|
|
|
|
offset = blkoff(fs, diroffset);
|
2000-12-08 21:51:06 +00:00
|
|
|
MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
|
|
|
|
M_SOFTDEP_FLAGS|M_ZERO);
|
1998-05-19 20:18:42 +00:00
|
|
|
dap->da_list.wk_type = D_DIRADD;
|
1998-05-19 19:47:22 +00:00
|
|
|
dap->da_offset = offset;
|
|
|
|
dap->da_newinum = newinum;
|
|
|
|
dap->da_state = ATTACHED;
|
2001-05-17 07:24:03 +00:00
|
|
|
if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
|
|
|
|
MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
|
|
|
|
M_NEWDIRBLK, M_SOFTDEP_FLAGS);
|
|
|
|
newdirblk->db_list.wk_type = D_NEWDIRBLK;
|
|
|
|
newdirblk->db_state = 0;
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
if (newdirbp == NULL) {
|
|
|
|
dap->da_state |= DEPCOMPLETE;
|
1998-05-19 20:18:42 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
} else {
|
|
|
|
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
|
|
|
|
MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_SOFTDEP_FLAGS);
|
1998-05-19 20:18:42 +00:00
|
|
|
mkdir1->md_list.wk_type = D_MKDIR;
|
1998-05-19 19:47:22 +00:00
|
|
|
mkdir1->md_state = MKDIR_BODY;
|
|
|
|
mkdir1->md_diradd = dap;
|
|
|
|
MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_SOFTDEP_FLAGS);
|
1998-05-19 20:18:42 +00:00
|
|
|
mkdir2->md_list.wk_type = D_MKDIR;
|
1998-05-19 19:47:22 +00:00
|
|
|
mkdir2->md_state = MKDIR_PARENT;
|
|
|
|
mkdir2->md_diradd = dap;
|
|
|
|
/*
|
1998-05-19 22:54:53 +00:00
|
|
|
* Dependency on "." and ".." being written to disk.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
1999-03-02 00:19:47 +00:00
|
|
|
mkdir1->md_buf = newdirbp;
|
1999-03-02 06:38:07 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
|
|
|
|
WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
|
1999-03-02 06:38:07 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
bdwrite(newdirbp);
|
|
|
|
/*
|
|
|
|
* Dependency on link count increase for parent directory
|
|
|
|
*/
|
1999-03-02 06:38:07 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-05-17 07:24:03 +00:00
|
|
|
if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
|
1998-05-19 19:47:22 +00:00
|
|
|
|| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
|
|
|
|
dap->da_state &= ~MKDIR_PARENT;
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(mkdir2, D_MKDIR);
|
1998-05-19 19:47:22 +00:00
|
|
|
} else {
|
|
|
|
LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
|
1998-05-19 21:45:53 +00:00
|
|
|
WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
1998-05-19 21:45:53 +00:00
|
|
|
* Link into parent directory pagedep to await its being written.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
|
|
|
|
dap->da_pagedep = pagedep;
|
|
|
|
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
|
|
|
|
da_pdlist);
|
1998-05-19 21:45:53 +00:00
|
|
|
/*
|
|
|
|
* Link into its inodedep. Put it on the id_bufwait list if the inode
|
|
|
|
* is not yet written. If it is written, do the post-inode write
|
|
|
|
* processing to put it on the id_pendinghd list.
|
|
|
|
*/
|
|
|
|
(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
|
|
|
|
if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
|
|
|
|
diradd_inode_written(dap, inodedep);
|
1998-05-19 19:47:22 +00:00
|
|
|
else
|
1998-05-19 21:45:53 +00:00
|
|
|
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
|
2001-05-17 07:24:03 +00:00
|
|
|
if (isnewblk) {
|
|
|
|
/*
|
|
|
|
* Directories growing into indirect blocks are rare
|
|
|
|
* enough and the frequency of new block allocation
|
|
|
|
* in those cases even more rare, that we choose not
|
|
|
|
* to bother tracking them. Rather we simply force the
|
|
|
|
* new directory entry to disk.
|
|
|
|
*/
|
|
|
|
if (lbn >= NDADDR) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
/*
|
|
|
|
* We only have a new allocation when at the
|
|
|
|
* beginning of a new block, not when we are
|
|
|
|
* expanding into an existing block.
|
|
|
|
*/
|
|
|
|
if (blkoff(fs, diroffset) == 0)
|
|
|
|
return (1);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We only have a new allocation when at the beginning
|
|
|
|
* of a new fragment, not when we are expanding into an
|
|
|
|
* existing fragment. Also, there is nothing to do if we
|
|
|
|
* are already tracking this block.
|
|
|
|
*/
|
|
|
|
if (fragoff(fs, diroffset) != 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
if ((pagedep->pd_state & NEWBLOCK) != 0) {
|
|
|
|
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Find our associated allocdirect and have it track us.
|
|
|
|
*/
|
|
|
|
if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
|
|
|
|
panic("softdep_setup_directory_add: lost inodedep");
|
|
|
|
adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
|
|
|
|
if (adp == NULL || adp->ad_lbn != lbn) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_setup_directory_add: lost entry");
|
|
|
|
}
|
|
|
|
pagedep->pd_state |= NEWBLOCK;
|
|
|
|
newdirblk->db_pagedep = pagedep;
|
|
|
|
WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
2001-05-17 07:24:03 +00:00
|
|
|
return (0);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This procedure is called to change the offset of a directory
|
|
|
|
* entry when compacting a directory block which must be owned
|
|
|
|
* exclusively by the caller. Note that the actual entry movement
|
|
|
|
* must be done in this procedure to ensure that no I/O completions
|
|
|
|
* occur while the move is in progress.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
|
|
|
|
struct inode *dp; /* inode for directory */
|
|
|
|
caddr_t base; /* address of dp->i_offset */
|
|
|
|
caddr_t oldloc; /* address of old directory location */
|
|
|
|
caddr_t newloc; /* address of new directory location */
|
|
|
|
int entrysize; /* size of directory entry */
|
|
|
|
{
|
1998-05-19 20:03:29 +00:00
|
|
|
int offset, oldoffset, newoffset;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct diradd *dap;
|
|
|
|
ufs_lbn_t lbn;
|
|
|
|
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
lbn = lblkno(dp->i_fs, dp->i_offset);
|
1998-05-19 20:03:29 +00:00
|
|
|
offset = blkoff(dp->i_fs, dp->i_offset);
|
1998-05-19 19:47:22 +00:00
|
|
|
if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
|
|
|
|
goto done;
|
1998-05-19 20:03:29 +00:00
|
|
|
oldoffset = offset + (oldloc - base);
|
|
|
|
newoffset = offset + (newloc - base);
|
2001-02-04 12:37:48 +00:00
|
|
|
|
|
|
|
LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
|
1998-05-19 19:47:22 +00:00
|
|
|
if (dap->da_offset != oldoffset)
|
|
|
|
continue;
|
|
|
|
dap->da_offset = newoffset;
|
|
|
|
if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
|
|
|
|
break;
|
|
|
|
LIST_REMOVE(dap, da_pdlist);
|
|
|
|
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
|
|
|
|
dap, da_pdlist);
|
|
|
|
break;
|
|
|
|
}
|
1999-02-17 20:01:20 +00:00
|
|
|
if (dap == NULL) {
|
2001-02-04 12:37:48 +00:00
|
|
|
|
|
|
|
LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
|
1999-02-17 20:01:20 +00:00
|
|
|
if (dap->da_offset == oldoffset) {
|
|
|
|
dap->da_offset = newoffset;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
done:
|
|
|
|
bcopy(oldloc, newloc, entrysize);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a diradd dependency structure. This routine must be called
|
|
|
|
* with splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
free_diradd(dap)
|
|
|
|
struct diradd *dap;
|
|
|
|
{
|
|
|
|
struct dirrem *dirrem;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct mkdir *mkdir, *nextmd;
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held == NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("free_diradd: lock not held");
|
|
|
|
#endif
|
|
|
|
WORKLIST_REMOVE(&dap->da_list);
|
|
|
|
LIST_REMOVE(dap, da_pdlist);
|
|
|
|
if ((dap->da_state & DIRCHG) == 0) {
|
|
|
|
pagedep = dap->da_pagedep;
|
|
|
|
} else {
|
|
|
|
dirrem = dap->da_previous;
|
|
|
|
pagedep = dirrem->dm_pagedep;
|
1998-08-12 20:46:47 +00:00
|
|
|
dirrem->dm_dirinum = pagedep->pd_ino;
|
1998-05-19 20:03:29 +00:00
|
|
|
add_to_worklist(&dirrem->dm_list);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
|
|
|
|
0, &inodedep) != 0)
|
|
|
|
(void) free_inodedep(inodedep);
|
|
|
|
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
|
|
|
|
for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
|
|
|
|
nextmd = LIST_NEXT(mkdir, md_mkdirs);
|
|
|
|
if (mkdir->md_diradd != dap)
|
|
|
|
continue;
|
|
|
|
dap->da_state &= ~mkdir->md_state;
|
|
|
|
WORKLIST_REMOVE(&mkdir->md_list);
|
|
|
|
LIST_REMOVE(mkdir, md_mkdirs);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(mkdir, D_MKDIR);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("free_diradd: unfound ref");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(dap, D_DIRADD);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Directory entry removal dependencies.
|
|
|
|
*
|
|
|
|
* When removing a directory entry, the entry's inode pointer must be
|
|
|
|
* zero'ed on disk before the corresponding inode's link count is decremented
|
|
|
|
* (possibly freeing the inode for re-use). This dependency is handled by
|
|
|
|
* updating the directory entry but delaying the inode count reduction until
|
|
|
|
* after the directory block has been written to disk. After this point, the
|
|
|
|
* inode count can be decremented whenever it is convenient.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This routine should be called immediately after removing
|
|
|
|
* a directory entry. The inode's link count should not be
|
|
|
|
* decremented by the calling procedure -- the soft updates
|
|
|
|
* code will do this task when it is safe.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_setup_remove(bp, dp, ip, isrmdir)
|
|
|
|
struct buf *bp; /* buffer containing directory block */
|
|
|
|
struct inode *dp; /* inode for the directory being modified */
|
|
|
|
struct inode *ip; /* inode for directory entry being removed */
|
|
|
|
int isrmdir; /* indicates if doing RMDIR */
|
|
|
|
{
|
2000-01-17 06:28:18 +00:00
|
|
|
struct dirrem *dirrem, *prevdirrem;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
|
|
|
|
*/
|
2000-01-17 06:28:18 +00:00
|
|
|
dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the COMPLETE flag is clear, then there were no active
|
|
|
|
* entries and we want to roll back to a zeroed entry until
|
|
|
|
* the new inode is committed to disk. If the COMPLETE flag is
|
|
|
|
* set then we have deleted an entry that never made it to
|
|
|
|
* disk. If the entry we deleted resulted from a name change,
|
|
|
|
* then the old name still resides on disk. We cannot delete
|
|
|
|
* its inode (returned to us in prevdirrem) until the zeroed
|
|
|
|
* directory entry gets to disk. The new inode has never been
|
|
|
|
* referenced on the disk, so can be deleted immediately.
|
|
|
|
*/
|
1998-05-19 19:47:22 +00:00
|
|
|
if ((dirrem->dm_state & COMPLETE) == 0) {
|
|
|
|
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
|
|
|
|
dm_next);
|
2000-01-10 00:24:24 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
} else {
|
2000-01-17 06:28:18 +00:00
|
|
|
if (prevdirrem != NULL)
|
|
|
|
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
|
|
|
|
prevdirrem, dm_next);
|
1998-05-19 19:47:22 +00:00
|
|
|
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
|
2000-01-10 00:24:24 +00:00
|
|
|
FREE_LOCK(&lk);
|
2002-03-17 01:25:47 +00:00
|
|
|
handle_workitem_remove(dirrem, NULL);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate a new dirrem if appropriate and return it along with
|
|
|
|
* its associated pagedep. Called without a lock, returns with lock.
|
|
|
|
*/
|
2000-01-09 23:35:38 +00:00
|
|
|
static long num_dirrem; /* number of dirrem allocated */
|
1998-05-19 19:47:22 +00:00
|
|
|
static struct dirrem *
|
2000-01-17 06:28:18 +00:00
|
|
|
newdirrem(bp, dp, ip, isrmdir, prevdirremp)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct buf *bp; /* buffer containing directory block */
|
|
|
|
struct inode *dp; /* inode for the directory being modified */
|
|
|
|
struct inode *ip; /* inode for directory entry being removed */
|
|
|
|
int isrmdir; /* indicates if doing RMDIR */
|
2000-01-17 06:28:18 +00:00
|
|
|
struct dirrem **prevdirremp; /* previously referenced inode, if any */
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
1998-05-19 20:03:29 +00:00
|
|
|
int offset;
|
1998-05-19 19:47:22 +00:00
|
|
|
ufs_lbn_t lbn;
|
|
|
|
struct diradd *dap;
|
|
|
|
struct dirrem *dirrem;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Whiteouts have no deletion dependencies.
|
|
|
|
*/
|
|
|
|
if (ip == NULL)
|
|
|
|
panic("newdirrem: whiteout");
|
2000-01-09 23:35:38 +00:00
|
|
|
/*
|
|
|
|
* If we are over our limit, try to improve the situation.
|
|
|
|
* Limiting the number of dirrem structures will also limit
|
|
|
|
* the number of freefile and freeblks structures.
|
|
|
|
*/
|
2000-12-13 08:30:35 +00:00
|
|
|
if (num_dirrem > max_softdeps / 2)
|
2000-01-09 23:35:38 +00:00
|
|
|
(void) request_cleanup(FLUSH_REMOVE, 0);
|
|
|
|
num_dirrem += 1;
|
1998-05-19 19:47:22 +00:00
|
|
|
MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
|
2000-12-08 21:51:06 +00:00
|
|
|
M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
|
1998-05-19 20:18:42 +00:00
|
|
|
dirrem->dm_list.wk_type = D_DIRREM;
|
1998-05-19 19:47:22 +00:00
|
|
|
dirrem->dm_state = isrmdir ? RMDIR : 0;
|
|
|
|
dirrem->dm_mnt = ITOV(ip)->v_mount;
|
|
|
|
dirrem->dm_oldinum = ip->i_number;
|
2000-01-17 06:28:18 +00:00
|
|
|
*prevdirremp = NULL;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
lbn = lblkno(dp->i_fs, dp->i_offset);
|
1998-05-19 20:03:29 +00:00
|
|
|
offset = blkoff(dp->i_fs, dp->i_offset);
|
1998-05-19 19:47:22 +00:00
|
|
|
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
|
|
|
|
dirrem->dm_pagedep = pagedep;
|
1998-06-10 20:45:46 +00:00
|
|
|
/*
|
|
|
|
* Check for a diradd dependency for the same directory entry.
|
|
|
|
* If present, then both dependencies become obsolete and can
|
|
|
|
* be de-allocated. Check for an entry on both the pd_dirraddhd
|
|
|
|
* list and the pd_pendinghd list.
|
|
|
|
*/
|
2001-02-04 12:37:48 +00:00
|
|
|
|
|
|
|
LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
|
1998-06-10 20:45:46 +00:00
|
|
|
if (dap->da_offset == offset)
|
|
|
|
break;
|
|
|
|
if (dap == NULL) {
|
2001-02-04 12:37:48 +00:00
|
|
|
|
|
|
|
LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
|
1998-06-10 20:45:46 +00:00
|
|
|
if (dap->da_offset == offset)
|
|
|
|
break;
|
|
|
|
if (dap == NULL)
|
|
|
|
return (dirrem);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
1998-06-10 20:45:46 +00:00
|
|
|
/*
|
2000-01-17 06:28:18 +00:00
|
|
|
* Must be ATTACHED at this point.
|
1998-06-10 20:45:46 +00:00
|
|
|
*/
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((dap->da_state & ATTACHED) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-06-10 20:45:46 +00:00
|
|
|
panic("newdirrem: not ATTACHED");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
|
|
|
if (dap->da_newinum != ip->i_number) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-06-10 20:45:46 +00:00
|
|
|
panic("newdirrem: inum %d should be %d",
|
|
|
|
ip->i_number, dap->da_newinum);
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2000-01-17 06:28:18 +00:00
|
|
|
/*
|
|
|
|
* If we are deleting a changed name that never made it to disk,
|
|
|
|
* then return the dirrem describing the previous inode (which
|
|
|
|
* represents the inode currently referenced from this entry on disk).
|
|
|
|
*/
|
|
|
|
if ((dap->da_state & DIRCHG) != 0) {
|
|
|
|
*prevdirremp = dap->da_previous;
|
|
|
|
dap->da_state &= ~DIRCHG;
|
|
|
|
dap->da_pagedep = pagedep;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We are deleting an entry that never made it to disk.
|
|
|
|
* Mark it COMPLETE so we can delete its inode immediately.
|
|
|
|
*/
|
1998-06-10 20:45:46 +00:00
|
|
|
dirrem->dm_state |= COMPLETE;
|
2000-01-17 06:28:18 +00:00
|
|
|
free_diradd(dap);
|
1998-05-19 19:47:22 +00:00
|
|
|
return (dirrem);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Directory entry change dependencies.
|
|
|
|
*
|
|
|
|
* Changing an existing directory entry requires that an add operation
|
|
|
|
* be completed first followed by a deletion. The semantics for the addition
|
|
|
|
* are identical to the description of adding a new entry above except
|
|
|
|
* that the rollback is to the old inode number rather than zero. Once
|
|
|
|
* the addition dependency is completed, the removal is done as described
|
|
|
|
* in the removal routine above.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This routine should be called immediately after changing
|
|
|
|
* a directory entry. The inode's link count should not be
|
|
|
|
* decremented by the calling procedure -- the soft updates
|
|
|
|
* code will perform this task when it is safe.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
|
|
|
|
struct buf *bp; /* buffer containing directory block */
|
|
|
|
struct inode *dp; /* inode for the directory being modified */
|
|
|
|
struct inode *ip; /* inode for directory entry being removed */
|
2002-06-21 06:18:05 +00:00
|
|
|
ino_t newinum; /* new inode number for changed entry */
|
1998-05-19 19:47:22 +00:00
|
|
|
int isrmdir; /* indicates if doing RMDIR */
|
|
|
|
{
|
|
|
|
int offset;
|
1998-06-12 20:48:30 +00:00
|
|
|
struct diradd *dap = NULL;
|
2000-01-17 06:28:18 +00:00
|
|
|
struct dirrem *dirrem, *prevdirrem;
|
1998-06-12 20:48:30 +00:00
|
|
|
struct pagedep *pagedep;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct inodedep *inodedep;
|
|
|
|
|
|
|
|
offset = blkoff(dp->i_fs, dp->i_offset);
|
|
|
|
|
|
|
|
/*
|
1998-06-12 20:48:30 +00:00
|
|
|
* Whiteouts do not need diradd dependencies.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
1998-06-12 20:48:30 +00:00
|
|
|
if (newinum != WINO) {
|
1998-05-19 19:47:22 +00:00
|
|
|
MALLOC(dap, struct diradd *, sizeof(struct diradd),
|
2000-12-08 21:51:06 +00:00
|
|
|
M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
|
1998-05-19 20:18:42 +00:00
|
|
|
dap->da_list.wk_type = D_DIRADD;
|
1998-05-19 19:47:22 +00:00
|
|
|
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
|
|
|
|
dap->da_offset = offset;
|
|
|
|
dap->da_newinum = newinum;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1998-06-12 20:48:30 +00:00
|
|
|
* Allocate a new dirrem and ACQUIRE_LOCK.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
2000-01-17 06:28:18 +00:00
|
|
|
dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
|
1998-06-12 20:48:30 +00:00
|
|
|
pagedep = dirrem->dm_pagedep;
|
1998-08-12 20:46:47 +00:00
|
|
|
/*
|
|
|
|
* The possible values for isrmdir:
|
|
|
|
* 0 - non-directory file rename
|
|
|
|
* 1 - directory rename within same directory
|
|
|
|
* inum - directory rename to new directory of given inode number
|
|
|
|
* When renaming to a new directory, we are both deleting and
|
|
|
|
* creating a new directory entry, so the link count on the new
|
|
|
|
* directory should not change. Thus we do not need the followup
|
|
|
|
* dirrem which is usually done in handle_workitem_remove. We set
|
|
|
|
* the DIRCHG flag to tell handle_workitem_remove to skip the
|
|
|
|
* followup dirrem.
|
|
|
|
*/
|
|
|
|
if (isrmdir > 1)
|
|
|
|
dirrem->dm_state |= DIRCHG;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
1998-06-12 20:48:30 +00:00
|
|
|
* Whiteouts have no additional dependencies,
|
|
|
|
* so just put the dirrem on the correct list.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
1998-06-12 20:48:30 +00:00
|
|
|
if (newinum == WINO) {
|
|
|
|
if ((dirrem->dm_state & COMPLETE) == 0) {
|
|
|
|
LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
|
|
|
|
dm_next);
|
|
|
|
} else {
|
|
|
|
dirrem->dm_dirinum = pagedep->pd_ino;
|
|
|
|
add_to_worklist(&dirrem->dm_list);
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
2000-01-17 06:28:18 +00:00
|
|
|
/*
|
|
|
|
* If the COMPLETE flag is clear, then there were no active
|
|
|
|
* entries and we want to roll back to the previous inode until
|
|
|
|
* the new inode is committed to disk. If the COMPLETE flag is
|
|
|
|
* set, then we have deleted an entry that never made it to disk.
|
|
|
|
* If the entry we deleted resulted from a name change, then the old
|
|
|
|
* inode reference still resides on disk. Any rollback that we do
|
|
|
|
* needs to be to that old inode (returned to us in prevdirrem). If
|
|
|
|
* the entry we deleted resulted from a create, then there is
|
|
|
|
* no entry on the disk, so we want to roll back to zero rather
|
|
|
|
* than the uncommitted inode. In either of the COMPLETE cases we
|
|
|
|
* want to immediately free the unwritten and unreferenced inode.
|
|
|
|
*/
|
|
|
|
if ((dirrem->dm_state & COMPLETE) == 0) {
|
|
|
|
dap->da_previous = dirrem;
|
|
|
|
} else {
|
|
|
|
if (prevdirrem != NULL) {
|
|
|
|
dap->da_previous = prevdirrem;
|
|
|
|
} else {
|
|
|
|
dap->da_state &= ~DIRCHG;
|
|
|
|
dap->da_pagedep = pagedep;
|
|
|
|
}
|
|
|
|
dirrem->dm_dirinum = pagedep->pd_ino;
|
|
|
|
add_to_worklist(&dirrem->dm_list);
|
|
|
|
}
|
1998-06-12 20:48:30 +00:00
|
|
|
/*
|
|
|
|
* Link into its inodedep. Put it on the id_bufwait list if the inode
|
|
|
|
* is not yet written. If it is written, do the post-inode write
|
|
|
|
* processing to put it on the id_pendinghd list.
|
|
|
|
*/
|
|
|
|
if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
|
|
|
|
(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
|
|
|
|
dap->da_state |= COMPLETE;
|
|
|
|
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
|
|
|
|
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
|
|
|
|
} else {
|
|
|
|
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
|
1998-05-19 19:47:22 +00:00
|
|
|
dap, da_pdlist);
|
1998-05-19 21:45:53 +00:00
|
|
|
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
|
1998-06-12 20:48:30 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-01-10 00:24:24 +00:00
|
|
|
* Called whenever the link count on an inode is changed.
|
1998-05-19 19:47:22 +00:00
|
|
|
* It creates an inode dependency so that the new reference(s)
|
|
|
|
* to the inode cannot be committed to disk until the updated
|
|
|
|
* inode has been written.
|
|
|
|
*/
|
|
|
|
void
|
2000-01-10 00:24:24 +00:00
|
|
|
softdep_change_linkcnt(ip)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct inode *ip; /* the inode with the increased link count */
|
|
|
|
{
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
|
2001-02-23 09:01:31 +00:00
|
|
|
if (ip->i_nlink < ip->i_effnlink) {
|
|
|
|
FREE_LOCK(&lk);
|
2000-01-10 00:24:24 +00:00
|
|
|
panic("softdep_change_linkcnt: bad delta");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2000-01-10 00:24:24 +00:00
|
|
|
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
2001-05-08 07:42:20 +00:00
|
|
|
/*
|
|
|
|
* Called when the effective link count and the reference count
|
|
|
|
* on an inode drops to zero. At this point there are no names
|
|
|
|
* referencing the file in the filesystem and no active file
|
|
|
|
* references. The space associated with the file will be freed
|
|
|
|
* as soon as the necessary soft dependencies are cleared.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_releasefile(ip)
|
|
|
|
struct inode *ip; /* inode with the zero effective link count */
|
|
|
|
{
|
|
|
|
struct inodedep *inodedep;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
struct fs *fs;
|
|
|
|
int extblocks;
|
2001-05-08 07:42:20 +00:00
|
|
|
|
|
|
|
if (ip->i_effnlink > 0)
|
|
|
|
panic("softdep_filerelease: file still referenced");
|
|
|
|
/*
|
|
|
|
* We may be called several times as the real reference count
|
|
|
|
* drops to zero. We only want to account for the space once.
|
|
|
|
*/
|
|
|
|
if (ip->i_flag & IN_SPACECOUNTED)
|
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* We have to deactivate a snapshot otherwise copyonwrites may
|
|
|
|
* add blocks and the cleanup may remove blocks after we have
|
|
|
|
* tried to account for them.
|
|
|
|
*/
|
|
|
|
if ((ip->i_flags & SF_SNAPSHOT) != 0)
|
|
|
|
ffs_snapremove(ITOV(ip));
|
|
|
|
/*
|
|
|
|
* If we are tracking an nlinkdelta, we have to also remember
|
|
|
|
* whether we accounted for the freed space yet.
|
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep)))
|
|
|
|
inodedep->id_state |= SPACECOUNTED;
|
|
|
|
FREE_LOCK(&lk);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
fs = ip->i_fs;
|
|
|
|
extblocks = 0;
|
|
|
|
if (fs->fs_magic == FS_UFS2_MAGIC)
|
|
|
|
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
|
|
|
|
ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
|
2001-05-08 07:42:20 +00:00
|
|
|
ip->i_fs->fs_pendinginodes += 1;
|
|
|
|
ip->i_flag |= IN_SPACECOUNTED;
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* This workitem decrements the inode's link count.
|
|
|
|
* If the link count reaches zero, the file is removed.
|
|
|
|
*/
|
|
|
|
static void
|
2002-03-17 01:25:47 +00:00
|
|
|
handle_workitem_remove(dirrem, xp)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct dirrem *dirrem;
|
2002-03-17 01:25:47 +00:00
|
|
|
struct vnode *xp;
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td = curthread;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct vnode *vp;
|
|
|
|
struct inode *ip;
|
2000-01-11 06:52:35 +00:00
|
|
|
ino_t oldinum;
|
1998-05-19 19:47:22 +00:00
|
|
|
int error;
|
|
|
|
|
2002-03-17 01:25:47 +00:00
|
|
|
if ((vp = xp) == NULL &&
|
|
|
|
(error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, LK_EXCLUSIVE,
|
|
|
|
&vp)) != 0) {
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_error("handle_workitem_remove: vget", error);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ip = VTOI(vp);
|
2000-01-10 00:24:24 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
|
|
|
|
FREE_LOCK(&lk);
|
2000-01-11 06:52:35 +00:00
|
|
|
panic("handle_workitem_remove: lost inodedep");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Normal file deletion.
|
|
|
|
*/
|
|
|
|
if ((dirrem->dm_state & RMDIR) == 0) {
|
|
|
|
ip->i_nlink--;
|
2002-06-21 06:18:05 +00:00
|
|
|
DIP(ip, i_nlink) = ip->i_nlink;
|
2000-01-10 00:24:24 +00:00
|
|
|
ip->i_flag |= IN_CHANGE;
|
2001-02-23 09:01:31 +00:00
|
|
|
if (ip->i_nlink < ip->i_effnlink) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 21:45:53 +00:00
|
|
|
panic("handle_workitem_remove: bad file delta");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2000-01-10 00:24:24 +00:00
|
|
|
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
vput(vp);
|
2000-01-09 23:35:38 +00:00
|
|
|
num_dirrem -= 1;
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(dirrem, D_DIRREM);
|
1998-05-19 19:47:22 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Directory deletion. Decrement reference count for both the
|
|
|
|
* just deleted parent directory entry and the reference for ".".
|
|
|
|
* Next truncate the directory to length zero. When the
|
|
|
|
* truncation completes, arrange to have the reference count on
|
|
|
|
* the parent decremented to account for the loss of "..".
|
|
|
|
*/
|
|
|
|
ip->i_nlink -= 2;
|
2002-06-21 06:18:05 +00:00
|
|
|
DIP(ip, i_nlink) = ip->i_nlink;
|
2000-01-10 00:24:24 +00:00
|
|
|
ip->i_flag |= IN_CHANGE;
|
2001-02-23 09:01:31 +00:00
|
|
|
if (ip->i_nlink < ip->i_effnlink) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_workitem_remove: bad dir delta");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2000-01-10 00:24:24 +00:00
|
|
|
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
|
|
|
|
FREE_LOCK(&lk);
|
2002-02-27 18:32:23 +00:00
|
|
|
if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_error("handle_workitem_remove: truncate", error);
|
1998-08-12 20:46:47 +00:00
|
|
|
/*
|
|
|
|
* Rename a directory to a new parent. Since, we are both deleting
|
|
|
|
* and creating a new directory entry, the link count on the new
|
|
|
|
* directory should not change. Thus we skip the followup dirrem.
|
|
|
|
*/
|
|
|
|
if (dirrem->dm_state & DIRCHG) {
|
|
|
|
vput(vp);
|
2000-01-09 23:35:38 +00:00
|
|
|
num_dirrem -= 1;
|
1998-08-12 20:46:47 +00:00
|
|
|
WORKITEM_FREE(dirrem, D_DIRREM);
|
|
|
|
return;
|
|
|
|
}
|
2000-01-10 00:24:24 +00:00
|
|
|
/*
|
2000-01-18 01:33:05 +00:00
|
|
|
* If the inodedep does not exist, then the zero'ed inode has
|
|
|
|
* been written to disk. If the allocated inode has never been
|
|
|
|
* written to disk, then the on-disk inode is zero'ed. In either
|
|
|
|
* case we can remove the file immediately.
|
2000-01-10 00:24:24 +00:00
|
|
|
*/
|
1998-05-19 19:47:22 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
2000-01-11 06:52:35 +00:00
|
|
|
dirrem->dm_state = 0;
|
|
|
|
oldinum = dirrem->dm_oldinum;
|
|
|
|
dirrem->dm_oldinum = dirrem->dm_dirinum;
|
2000-01-18 01:33:05 +00:00
|
|
|
if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
|
|
|
|
check_inode_unwritten(inodedep)) {
|
2000-01-10 00:24:24 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
vput(vp);
|
2002-03-17 01:25:47 +00:00
|
|
|
handle_workitem_remove(dirrem, NULL);
|
2000-01-11 06:52:35 +00:00
|
|
|
return;
|
2000-01-10 00:24:24 +00:00
|
|
|
}
|
2000-01-18 01:33:05 +00:00
|
|
|
WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
|
2000-01-11 06:52:35 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
vput(vp);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inode de-allocation dependencies.
|
|
|
|
*
|
|
|
|
* When an inode's link count is reduced to zero, it can be de-allocated. We
|
|
|
|
* found it convenient to postpone de-allocation until after the inode is
|
|
|
|
* written to disk with its new link count (zero). At this point, all of the
|
|
|
|
* on-disk inode's block pointers are nullified and, with careful dependency
|
|
|
|
* list ordering, all dependencies related to the inode will be satisfied and
|
|
|
|
* the corresponding dependency structures de-allocated. So, if/when the
|
|
|
|
* inode is reused, there will be no mixing of old dependencies with new
|
|
|
|
* ones. This artificial dependency is set up by the block de-allocation
|
|
|
|
* procedure above (softdep_setup_freeblocks) and completed by the
|
|
|
|
* following procedure.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
handle_workitem_freefile(freefile)
|
|
|
|
struct freefile *freefile;
|
|
|
|
{
|
2000-07-11 22:07:57 +00:00
|
|
|
struct fs *fs;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct inodedep *idp;
|
|
|
|
int error;
|
|
|
|
|
2000-07-11 22:07:57 +00:00
|
|
|
fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
|
1998-05-19 19:47:22 +00:00
|
|
|
#ifdef DEBUG
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-02-23 09:01:31 +00:00
|
|
|
error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
2001-02-23 09:01:31 +00:00
|
|
|
if (error)
|
|
|
|
panic("handle_workitem_freefile: inodedep survived");
|
1998-05-19 19:47:22 +00:00
|
|
|
#endif
|
2001-05-08 07:42:20 +00:00
|
|
|
fs->fs_pendinginodes -= 1;
|
2002-02-02 01:42:44 +00:00
|
|
|
if ((error = ffs_freefile(fs, freefile->fx_devvp, freefile->fx_oldinum,
|
|
|
|
freefile->fx_mode)) != 0)
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_error("handle_workitem_freefile", error);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(freefile, D_FREEFILE);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Disk writes.
|
|
|
|
*
|
|
|
|
* The dependency structures constructed above are most actively used when file
|
|
|
|
* system blocks are written to disk. No constraints are placed on when a
|
|
|
|
* block can be written, but unsatisfied update dependencies are made safe by
|
|
|
|
* modifying (or replacing) the source memory for the duration of the disk
|
|
|
|
* write. When the disk write completes, the memory block is again brought
|
|
|
|
* up-to-date.
|
|
|
|
*
|
|
|
|
* In-core inode structure reclamation.
|
|
|
|
*
|
|
|
|
* Because there are a finite number of "in-core" inode structures, they are
|
|
|
|
* reused regularly. By transferring all inode-related dependencies to the
|
|
|
|
* in-memory inode block and indexing them separately (via "inodedep"s), we
|
|
|
|
* can allow "in-core" inode structures to be reused at any time and avoid
|
|
|
|
* any increase in contention.
|
|
|
|
*
|
|
|
|
* Called just before entering the device driver to initiate a new disk I/O.
|
|
|
|
* The buffer must be locked, thus, no I/O completion operations can occur
|
|
|
|
* while we are manipulating its associated dependencies.
|
|
|
|
*/
|
2000-01-09 22:40:09 +00:00
|
|
|
static void
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_disk_io_initiation(bp)
|
|
|
|
struct buf *bp; /* structure describing disk write to occur */
|
|
|
|
{
|
|
|
|
struct worklist *wk, *nextwk;
|
|
|
|
struct indirdep *indirdep;
|
2002-06-21 06:18:05 +00:00
|
|
|
struct inodedep *inodedep;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We only care about write operations. There should never
|
|
|
|
* be dependencies for reads.
|
|
|
|
*/
|
2000-03-20 10:44:49 +00:00
|
|
|
if (bp->b_iocmd == BIO_READ)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_disk_io_initiation: read");
|
|
|
|
/*
|
|
|
|
* Do any necessary pre-I/O processing.
|
|
|
|
*/
|
|
|
|
for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
|
|
|
|
nextwk = LIST_NEXT(wk, wk_list);
|
|
|
|
switch (wk->wk_type) {
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_PAGEDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
initiate_write_filepage(WK_PAGEDEP(wk), bp);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_INODEDEP:
|
2002-06-21 06:18:05 +00:00
|
|
|
inodedep = WK_INODEDEP(wk);
|
|
|
|
if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
|
|
|
|
initiate_write_inodeblock_ufs1(inodedep, bp);
|
|
|
|
else
|
|
|
|
initiate_write_inodeblock_ufs2(inodedep, bp);
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_INDIRDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
indirdep = WK_INDIRDEP(wk);
|
|
|
|
if (indirdep->ir_state & GOINGAWAY)
|
|
|
|
panic("disk_io_initiation: indirdep gone");
|
|
|
|
/*
|
|
|
|
* If there are no remaining dependencies, this
|
|
|
|
* will be writing the real pointers, so the
|
|
|
|
* dependency can be freed.
|
|
|
|
*/
|
|
|
|
if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
|
2003-01-07 18:23:50 +00:00
|
|
|
indirdep->ir_savebp->b_flags |=
|
|
|
|
B_INVAL | B_NOCACHE;
|
1998-05-19 19:47:22 +00:00
|
|
|
brelse(indirdep->ir_savebp);
|
|
|
|
/* inline expand WORKLIST_REMOVE(wk); */
|
|
|
|
wk->wk_state &= ~ONWORKLIST;
|
|
|
|
LIST_REMOVE(wk, wk_list);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(indirdep, D_INDIRDEP);
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Replace up-to-date version with safe version.
|
|
|
|
*/
|
2000-09-07 23:02:55 +00:00
|
|
|
MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
M_INDIRDEP, M_SOFTDEP_FLAGS);
|
1998-05-19 19:47:22 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
indirdep->ir_state &= ~ATTACHED;
|
|
|
|
indirdep->ir_state |= UNDONE;
|
1998-05-19 23:07:25 +00:00
|
|
|
bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
|
|
|
|
bcopy(indirdep->ir_savebp->b_data, bp->b_data,
|
|
|
|
bp->b_bcount);
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_MKDIR:
|
|
|
|
case D_BMSAFEMAP:
|
|
|
|
case D_ALLOCDIRECT:
|
|
|
|
case D_ALLOCINDIR:
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
default:
|
|
|
|
panic("handle_disk_io_initiation: Unexpected type %s",
|
|
|
|
TYPENAME(wk->wk_type));
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called from within the procedure above to deal with unsatisfied
|
|
|
|
* allocation dependencies in a directory. The buffer must be locked,
|
|
|
|
* thus, no I/O completion operations can occur while we are
|
|
|
|
* manipulating its associated dependencies.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
initiate_write_filepage(pagedep, bp)
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct buf *bp;
|
|
|
|
{
|
|
|
|
struct diradd *dap;
|
|
|
|
struct direct *ep;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (pagedep->pd_state & IOSTARTED) {
|
|
|
|
/*
|
|
|
|
* This can only happen if there is a driver that does not
|
|
|
|
* understand chaining. Here biodone will reissue the call
|
|
|
|
* to strategy for the incomplete buffers.
|
|
|
|
*/
|
|
|
|
printf("initiate_write_filepage: already started\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pagedep->pd_state |= IOSTARTED;
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
for (i = 0; i < DAHASHSZ; i++) {
|
2001-02-04 16:08:18 +00:00
|
|
|
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
|
1998-05-19 19:47:22 +00:00
|
|
|
ep = (struct direct *)
|
|
|
|
((char *)bp->b_data + dap->da_offset);
|
2001-02-23 09:01:31 +00:00
|
|
|
if (ep->d_ino != dap->da_newinum) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("%s: dir inum %d != new %d",
|
|
|
|
"initiate_write_filepage",
|
|
|
|
ep->d_ino, dap->da_newinum);
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
if (dap->da_state & DIRCHG)
|
|
|
|
ep->d_ino = dap->da_previous->dm_oldinum;
|
|
|
|
else
|
|
|
|
ep->d_ino = 0;
|
|
|
|
dap->da_state &= ~ATTACHED;
|
|
|
|
dap->da_state |= UNDONE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2002-06-21 06:18:05 +00:00
|
|
|
* Version of initiate_write_inodeblock that handles UFS1 dinodes.
|
|
|
|
* Note that any bug fixes made to this routine must be done in the
|
|
|
|
* version found below.
|
|
|
|
*
|
|
|
|
* Called from within the procedure above to deal with unsatisfied
|
|
|
|
* allocation dependencies in an inodeblock. The buffer must be
|
|
|
|
* locked, thus, no I/O completion operations can occur while we
|
|
|
|
* are manipulating its associated dependencies.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
initiate_write_inodeblock_ufs1(inodedep, bp)
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct buf *bp; /* The inode block */
|
|
|
|
{
|
|
|
|
struct allocdirect *adp, *lastadp;
|
|
|
|
struct ufs1_dinode *dp;
|
|
|
|
struct fs *fs;
|
|
|
|
ufs_lbn_t i, prevlbn = 0;
|
|
|
|
int deplist;
|
|
|
|
|
|
|
|
if (inodedep->id_state & IOSTARTED)
|
|
|
|
panic("initiate_write_inodeblock_ufs1: already started");
|
|
|
|
inodedep->id_state |= IOSTARTED;
|
|
|
|
fs = inodedep->id_fs;
|
|
|
|
dp = (struct ufs1_dinode *)bp->b_data +
|
|
|
|
ino_to_fsbo(fs, inodedep->id_ino);
|
|
|
|
/*
|
|
|
|
* If the bitmap is not yet written, then the allocated
|
|
|
|
* inode cannot be written to disk.
|
|
|
|
*/
|
|
|
|
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
|
|
|
|
if (inodedep->id_savedino1 != NULL)
|
|
|
|
panic("initiate_write_inodeblock_ufs1: I/O underway");
|
|
|
|
MALLOC(inodedep->id_savedino1, struct ufs1_dinode *,
|
|
|
|
sizeof(struct ufs1_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
|
|
|
|
*inodedep->id_savedino1 = *dp;
|
|
|
|
bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If no dependencies, then there is nothing to roll back.
|
|
|
|
*/
|
|
|
|
inodedep->id_savedsize = dp->di_size;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
inodedep->id_savedextsize = 0;
|
2002-06-21 06:18:05 +00:00
|
|
|
if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
|
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* Set the dependencies to busy.
|
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
|
|
|
|
adp = TAILQ_NEXT(adp, ad_next)) {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_write_inodeblock: lbn order");
|
|
|
|
}
|
|
|
|
prevlbn = adp->ad_lbn;
|
|
|
|
if (adp->ad_lbn < NDADDR &&
|
|
|
|
dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
|
|
|
|
FREE_LOCK(&lk);
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s: direct pointer #%jd mismatch %d != %jd",
|
2002-06-21 06:18:05 +00:00
|
|
|
"softdep_write_inodeblock",
|
|
|
|
(intmax_t)adp->ad_lbn,
|
|
|
|
dp->di_db[adp->ad_lbn],
|
|
|
|
(intmax_t)adp->ad_newblkno);
|
|
|
|
}
|
|
|
|
if (adp->ad_lbn >= NDADDR &&
|
|
|
|
dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
|
|
|
|
FREE_LOCK(&lk);
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s: indirect pointer #%jd mismatch %d != %jd",
|
2002-06-21 06:18:05 +00:00
|
|
|
"softdep_write_inodeblock",
|
|
|
|
(intmax_t)adp->ad_lbn - NDADDR,
|
|
|
|
dp->di_ib[adp->ad_lbn - NDADDR],
|
|
|
|
(intmax_t)adp->ad_newblkno);
|
|
|
|
}
|
|
|
|
deplist |= 1 << adp->ad_lbn;
|
|
|
|
if ((adp->ad_state & ATTACHED) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_write_inodeblock: Unknown state 0x%x",
|
|
|
|
adp->ad_state);
|
|
|
|
}
|
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
adp->ad_state &= ~ATTACHED;
|
|
|
|
adp->ad_state |= UNDONE;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The on-disk inode cannot claim to be any larger than the last
|
|
|
|
* fragment that has been written. Otherwise, the on-disk inode
|
|
|
|
* might have fragments that were not the last block in the file
|
|
|
|
* which would corrupt the filesystem.
|
|
|
|
*/
|
|
|
|
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
|
|
|
|
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
|
|
|
|
if (adp->ad_lbn >= NDADDR)
|
|
|
|
break;
|
|
|
|
dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
|
|
|
|
/* keep going until hitting a rollback to a frag */
|
|
|
|
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
|
|
|
|
continue;
|
|
|
|
dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
|
|
|
|
for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_write_inodeblock: lost dep1");
|
|
|
|
}
|
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
dp->di_db[i] = 0;
|
|
|
|
}
|
|
|
|
for (i = 0; i < NIADDR; i++) {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (dp->di_ib[i] != 0 &&
|
|
|
|
(deplist & ((1 << NDADDR) << i)) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_write_inodeblock: lost dep2");
|
|
|
|
}
|
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
dp->di_ib[i] = 0;
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we have zero'ed out the last allocated block of the file,
|
|
|
|
* roll back the size to the last currently allocated block.
|
|
|
|
* We know that this last allocated block is a full-sized as
|
|
|
|
* we already checked for fragments in the loop above.
|
|
|
|
*/
|
|
|
|
if (lastadp != NULL &&
|
|
|
|
dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
|
|
|
|
for (i = lastadp->ad_lbn; i >= 0; i--)
|
|
|
|
if (dp->di_db[i] != 0)
|
|
|
|
break;
|
|
|
|
dp->di_size = (i + 1) * fs->fs_bsize;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The only dependencies are for indirect blocks.
|
|
|
|
*
|
|
|
|
* The file size for indirect block additions is not guaranteed.
|
|
|
|
* Such a guarantee would be non-trivial to achieve. The conventional
|
|
|
|
* synchronous write implementation also does not make this guarantee.
|
|
|
|
* Fsck should catch and fix discrepancies. Arguably, the file size
|
|
|
|
* can be over-estimated without destroying integrity when the file
|
|
|
|
* moves into the indirect blocks (i.e., is large). If we want to
|
|
|
|
* postpone fsck, we are stuck with this argument.
|
|
|
|
*/
|
|
|
|
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
|
|
|
|
dp->di_ib[adp->ad_lbn - NDADDR] = 0;
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Version of initiate_write_inodeblock that handles UFS2 dinodes.
|
|
|
|
* Note that any bug fixes made to this routine must be done in the
|
|
|
|
* version found above.
|
|
|
|
*
|
1998-05-19 19:47:22 +00:00
|
|
|
* Called from within the procedure above to deal with unsatisfied
|
|
|
|
* allocation dependencies in an inodeblock. The buffer must be
|
|
|
|
* locked, thus, no I/O completion operations can occur while we
|
|
|
|
* are manipulating its associated dependencies.
|
|
|
|
*/
|
|
|
|
static void
|
2002-06-21 06:18:05 +00:00
|
|
|
initiate_write_inodeblock_ufs2(inodedep, bp)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct buf *bp; /* The inode block */
|
|
|
|
{
|
|
|
|
struct allocdirect *adp, *lastadp;
|
2002-06-21 06:18:05 +00:00
|
|
|
struct ufs2_dinode *dp;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct fs *fs;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs_lbn_t i, prevlbn = 0;
|
|
|
|
int deplist;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
if (inodedep->id_state & IOSTARTED)
|
2002-06-21 06:18:05 +00:00
|
|
|
panic("initiate_write_inodeblock_ufs2: already started");
|
1998-05-19 19:47:22 +00:00
|
|
|
inodedep->id_state |= IOSTARTED;
|
|
|
|
fs = inodedep->id_fs;
|
2002-06-21 06:18:05 +00:00
|
|
|
dp = (struct ufs2_dinode *)bp->b_data +
|
1998-05-19 19:47:22 +00:00
|
|
|
ino_to_fsbo(fs, inodedep->id_ino);
|
|
|
|
/*
|
|
|
|
* If the bitmap is not yet written, then the allocated
|
|
|
|
* inode cannot be written to disk.
|
|
|
|
*/
|
|
|
|
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
|
2002-06-21 06:18:05 +00:00
|
|
|
if (inodedep->id_savedino2 != NULL)
|
|
|
|
panic("initiate_write_inodeblock_ufs2: I/O underway");
|
|
|
|
MALLOC(inodedep->id_savedino2, struct ufs2_dinode *,
|
|
|
|
sizeof(struct ufs2_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
|
|
|
|
*inodedep->id_savedino2 = *dp;
|
|
|
|
bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
|
1998-05-19 19:47:22 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If no dependencies, then there is nothing to roll back.
|
|
|
|
*/
|
|
|
|
inodedep->id_savedsize = dp->di_size;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
inodedep->id_savedextsize = dp->di_extsize;
|
|
|
|
if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
|
|
|
|
TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
|
1998-05-19 19:47:22 +00:00
|
|
|
return;
|
|
|
|
/*
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
* Set the ext data dependencies to busy.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
|
|
|
|
adp = TAILQ_NEXT(adp, ad_next)) {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_write_inodeblock: lbn order");
|
|
|
|
}
|
|
|
|
prevlbn = adp->ad_lbn;
|
|
|
|
if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("%s: direct pointer #%jd mismatch %jd != %jd",
|
|
|
|
"softdep_write_inodeblock",
|
|
|
|
(intmax_t)adp->ad_lbn,
|
|
|
|
(intmax_t)dp->di_extb[adp->ad_lbn],
|
|
|
|
(intmax_t)adp->ad_newblkno);
|
|
|
|
}
|
|
|
|
deplist |= 1 << adp->ad_lbn;
|
|
|
|
if ((adp->ad_state & ATTACHED) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_write_inodeblock: Unknown state 0x%x",
|
|
|
|
adp->ad_state);
|
|
|
|
}
|
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
adp->ad_state &= ~ATTACHED;
|
|
|
|
adp->ad_state |= UNDONE;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The on-disk inode cannot claim to be any larger than the last
|
|
|
|
* fragment that has been written. Otherwise, the on-disk inode
|
|
|
|
* might have fragments that were not the last block in the ext
|
|
|
|
* data which would corrupt the filesystem.
|
|
|
|
*/
|
|
|
|
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
|
|
|
|
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
|
|
|
|
dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
|
|
|
|
/* keep going until hitting a rollback to a frag */
|
|
|
|
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
|
|
|
|
continue;
|
|
|
|
dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
|
|
|
|
for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
panic("softdep_write_inodeblock: lost dep1");
|
|
|
|
}
|
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
dp->di_extb[i] = 0;
|
|
|
|
}
|
|
|
|
lastadp = NULL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we have zero'ed out the last allocated block of the ext
|
|
|
|
* data, roll back the size to the last currently allocated block.
|
|
|
|
* We know that this last allocated block is a full-sized as
|
|
|
|
* we already checked for fragments in the loop above.
|
|
|
|
*/
|
|
|
|
if (lastadp != NULL &&
|
|
|
|
dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
|
|
|
|
for (i = lastadp->ad_lbn; i >= 0; i--)
|
|
|
|
if (dp->di_extb[i] != 0)
|
|
|
|
break;
|
|
|
|
dp->di_extsize = (i + 1) * fs->fs_bsize;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Set the file data dependencies to busy.
|
|
|
|
*/
|
1998-05-19 19:47:22 +00:00
|
|
|
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
|
|
|
|
adp = TAILQ_NEXT(adp, ad_next)) {
|
|
|
|
#ifdef DIAGNOSTIC
|
2001-02-23 09:01:31 +00:00
|
|
|
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_write_inodeblock: lbn order");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
prevlbn = adp->ad_lbn;
|
|
|
|
if (adp->ad_lbn < NDADDR &&
|
2001-02-23 09:01:31 +00:00
|
|
|
dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
|
|
|
|
FREE_LOCK(&lk);
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s: direct pointer #%jd mismatch %jd != %jd",
|
2002-06-21 06:18:05 +00:00
|
|
|
"softdep_write_inodeblock",
|
|
|
|
(intmax_t)adp->ad_lbn,
|
|
|
|
(intmax_t)dp->di_db[adp->ad_lbn],
|
|
|
|
(intmax_t)adp->ad_newblkno);
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
if (adp->ad_lbn >= NDADDR &&
|
2001-02-23 09:01:31 +00:00
|
|
|
dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
|
|
|
|
FREE_LOCK(&lk);
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s indirect pointer #%jd mismatch %jd != %jd",
|
2002-06-21 06:18:05 +00:00
|
|
|
"softdep_write_inodeblock:",
|
|
|
|
(intmax_t)adp->ad_lbn - NDADDR,
|
|
|
|
(intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
|
|
|
|
(intmax_t)adp->ad_newblkno);
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
deplist |= 1 << adp->ad_lbn;
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((adp->ad_state & ATTACHED) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_write_inodeblock: Unknown state 0x%x",
|
|
|
|
adp->ad_state);
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
adp->ad_state &= ~ATTACHED;
|
|
|
|
adp->ad_state |= UNDONE;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The on-disk inode cannot claim to be any larger than the last
|
|
|
|
* fragment that has been written. Otherwise, the on-disk inode
|
|
|
|
* might have fragments that were not the last block in the file
|
|
|
|
* which would corrupt the filesystem.
|
|
|
|
*/
|
|
|
|
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
|
|
|
|
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
|
|
|
|
if (adp->ad_lbn >= NDADDR)
|
|
|
|
break;
|
|
|
|
dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
|
|
|
|
/* keep going until hitting a rollback to a frag */
|
|
|
|
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
|
|
|
|
continue;
|
|
|
|
dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
|
|
|
|
for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
|
|
|
|
#ifdef DIAGNOSTIC
|
2001-02-23 09:01:31 +00:00
|
|
|
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
panic("softdep_write_inodeblock: lost dep2");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
dp->di_db[i] = 0;
|
|
|
|
}
|
|
|
|
for (i = 0; i < NIADDR; i++) {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (dp->di_ib[i] != 0 &&
|
2001-02-23 09:01:31 +00:00
|
|
|
(deplist & ((1 << NDADDR) << i)) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
panic("softdep_write_inodeblock: lost dep3");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
dp->di_ib[i] = 0;
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we have zero'ed out the last allocated block of the file,
|
|
|
|
* roll back the size to the last currently allocated block.
|
|
|
|
* We know that this last allocated block is a full-sized as
|
|
|
|
* we already checked for fragments in the loop above.
|
|
|
|
*/
|
|
|
|
if (lastadp != NULL &&
|
|
|
|
dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
|
|
|
|
for (i = lastadp->ad_lbn; i >= 0; i--)
|
|
|
|
if (dp->di_db[i] != 0)
|
|
|
|
break;
|
|
|
|
dp->di_size = (i + 1) * fs->fs_bsize;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The only dependencies are for indirect blocks.
|
|
|
|
*
|
|
|
|
* The file size for indirect block additions is not guaranteed.
|
|
|
|
* Such a guarantee would be non-trivial to achieve. The conventional
|
|
|
|
* synchronous write implementation also does not make this guarantee.
|
|
|
|
* Fsck should catch and fix discrepancies. Arguably, the file size
|
|
|
|
* can be over-estimated without destroying integrity when the file
|
|
|
|
* moves into the indirect blocks (i.e., is large). If we want to
|
|
|
|
* postpone fsck, we are stuck with this argument.
|
|
|
|
*/
|
|
|
|
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
|
|
|
|
dp->di_ib[adp->ad_lbn - NDADDR] = 0;
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This routine is called during the completion interrupt
|
|
|
|
* service routine for a disk write (from the procedure called
|
2002-05-16 21:28:32 +00:00
|
|
|
* by the device driver to inform the filesystem caches of
|
1998-05-19 19:47:22 +00:00
|
|
|
* a request completion). It should be called early in this
|
|
|
|
* procedure, before the block is made available to other
|
|
|
|
* processes or other routines are called.
|
|
|
|
*/
|
2000-01-09 22:40:09 +00:00
|
|
|
static void
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_disk_write_complete(bp)
|
|
|
|
struct buf *bp; /* describes the completed disk write */
|
|
|
|
{
|
|
|
|
struct worklist *wk;
|
|
|
|
struct workhead reattach;
|
|
|
|
struct newblk *newblk;
|
|
|
|
struct allocindir *aip;
|
|
|
|
struct allocdirect *adp;
|
|
|
|
struct indirdep *indirdep;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct bmsafemap *bmsafemap;
|
|
|
|
|
2002-11-20 05:14:16 +00:00
|
|
|
/*
|
|
|
|
* If an error occurred while doing the write, then the data
|
|
|
|
* has not hit the disk and the dependencies cannot be unrolled.
|
|
|
|
*/
|
|
|
|
if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
|
|
|
|
return;
|
1998-05-19 19:47:22 +00:00
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held != NOHOLDER)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_disk_write_complete: lock is held");
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = SPECIAL_FLAG;
|
1998-05-19 19:47:22 +00:00
|
|
|
#endif
|
|
|
|
LIST_INIT(&reattach);
|
|
|
|
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
|
|
|
|
WORKLIST_REMOVE(wk);
|
|
|
|
switch (wk->wk_type) {
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_PAGEDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
if (handle_written_filepage(WK_PAGEDEP(wk), bp))
|
|
|
|
WORKLIST_INSERT(&reattach, wk);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_INODEDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
|
|
|
|
WORKLIST_INSERT(&reattach, wk);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_BMSAFEMAP:
|
1998-05-19 19:47:22 +00:00
|
|
|
bmsafemap = WK_BMSAFEMAP(wk);
|
1999-05-07 02:26:47 +00:00
|
|
|
while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
|
1998-05-19 19:47:22 +00:00
|
|
|
newblk->nb_state |= DEPCOMPLETE;
|
|
|
|
newblk->nb_bmsafemap = NULL;
|
|
|
|
LIST_REMOVE(newblk, nb_deps);
|
|
|
|
}
|
1999-05-22 04:43:04 +00:00
|
|
|
while ((adp =
|
|
|
|
LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
|
1998-05-19 19:47:22 +00:00
|
|
|
adp->ad_state |= DEPCOMPLETE;
|
|
|
|
adp->ad_buf = NULL;
|
|
|
|
LIST_REMOVE(adp, ad_deps);
|
|
|
|
handle_allocdirect_partdone(adp);
|
|
|
|
}
|
1999-05-22 04:43:04 +00:00
|
|
|
while ((aip =
|
|
|
|
LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
|
1998-05-19 19:47:22 +00:00
|
|
|
aip->ai_state |= DEPCOMPLETE;
|
|
|
|
aip->ai_buf = NULL;
|
|
|
|
LIST_REMOVE(aip, ai_deps);
|
|
|
|
handle_allocindir_partdone(aip);
|
|
|
|
}
|
|
|
|
while ((inodedep =
|
1999-05-22 04:43:04 +00:00
|
|
|
LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
|
1998-05-19 19:47:22 +00:00
|
|
|
inodedep->id_state |= DEPCOMPLETE;
|
|
|
|
LIST_REMOVE(inodedep, id_deps);
|
|
|
|
inodedep->id_buf = NULL;
|
|
|
|
}
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_MKDIR:
|
1998-05-19 19:47:22 +00:00
|
|
|
handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_ALLOCDIRECT:
|
1998-05-19 19:47:22 +00:00
|
|
|
adp = WK_ALLOCDIRECT(wk);
|
|
|
|
adp->ad_state |= COMPLETE;
|
|
|
|
handle_allocdirect_partdone(adp);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_ALLOCINDIR:
|
1998-05-19 19:47:22 +00:00
|
|
|
aip = WK_ALLOCINDIR(wk);
|
|
|
|
aip->ai_state |= COMPLETE;
|
|
|
|
handle_allocindir_partdone(aip);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_INDIRDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
indirdep = WK_INDIRDEP(wk);
|
2001-02-23 09:01:31 +00:00
|
|
|
if (indirdep->ir_state & GOINGAWAY) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("disk_write_complete: indirdep gone");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 23:07:25 +00:00
|
|
|
bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
|
|
|
|
FREE(indirdep->ir_saveddata, M_INDIRDEP);
|
|
|
|
indirdep->ir_saveddata = 0;
|
1998-05-19 19:47:22 +00:00
|
|
|
indirdep->ir_state &= ~UNDONE;
|
|
|
|
indirdep->ir_state |= ATTACHED;
|
|
|
|
while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
|
|
|
|
handle_allocindir_partdone(aip);
|
2001-02-23 09:01:31 +00:00
|
|
|
if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1999-02-17 20:01:20 +00:00
|
|
|
panic("disk_write_complete: not gone");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
WORKLIST_INSERT(&reattach, wk);
|
1999-05-14 01:26:46 +00:00
|
|
|
if ((bp->b_flags & B_DELWRI) == 0)
|
|
|
|
stat_indir_blk_ptrs++;
|
1998-05-19 19:47:22 +00:00
|
|
|
bdirty(bp);
|
|
|
|
continue;
|
|
|
|
|
|
|
|
default:
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_disk_write_complete: Unknown type %s",
|
|
|
|
TYPENAME(wk->wk_type));
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Reattach any requests that must be redone.
|
|
|
|
*/
|
|
|
|
while ((wk = LIST_FIRST(&reattach)) != NULL) {
|
|
|
|
WORKLIST_REMOVE(wk);
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, wk);
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
2001-09-12 08:38:13 +00:00
|
|
|
if (lk.lkt_held != SPECIAL_FLAG)
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_disk_write_complete: lock lost");
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called from within softdep_disk_write_complete above. Note that
|
|
|
|
* this routine is always called from interrupt level with further
|
|
|
|
* splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
handle_allocdirect_partdone(adp)
|
|
|
|
struct allocdirect *adp; /* the completed allocdirect */
|
|
|
|
{
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
struct allocdirectlst *listhead;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct allocdirect *listadp;
|
|
|
|
struct inodedep *inodedep;
|
2000-06-18 22:05:57 +00:00
|
|
|
long bsize, delay;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
|
|
|
|
return;
|
2001-02-23 09:01:31 +00:00
|
|
|
if (adp->ad_buf != NULL) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_allocdirect_partdone: dangling dep");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* The on-disk inode cannot claim to be any larger than the last
|
|
|
|
* fragment that has been written. Otherwise, the on-disk inode
|
|
|
|
* might have fragments that were not the last block in the file
|
|
|
|
* which would corrupt the filesystem. Thus, we cannot free any
|
|
|
|
* allocdirects after one whose ad_oldblkno claims a fragment as
|
|
|
|
* these blocks must be rolled back to zero before writing the inode.
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
* We check the currently active set of allocdirects in id_inoupdt
|
|
|
|
* or id_extupdt as appropriate.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
inodedep = adp->ad_inodedep;
|
|
|
|
bsize = inodedep->id_fs->fs_bsize;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (adp->ad_state & EXTDATA)
|
|
|
|
listhead = &inodedep->id_extupdt;
|
|
|
|
else
|
|
|
|
listhead = &inodedep->id_inoupdt;
|
|
|
|
TAILQ_FOREACH(listadp, listhead, ad_next) {
|
1998-05-19 19:47:22 +00:00
|
|
|
/* found our block */
|
|
|
|
if (listadp == adp)
|
|
|
|
break;
|
|
|
|
/* continue if ad_oldlbn is not a fragment */
|
|
|
|
if (listadp->ad_oldsize == 0 ||
|
|
|
|
listadp->ad_oldsize == bsize)
|
|
|
|
continue;
|
|
|
|
/* hit a fragment */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we have reached the end of the current list without
|
|
|
|
* finding the just finished dependency, then it must be
|
|
|
|
* on the future dependency list. Future dependencies cannot
|
|
|
|
* be freed until they are moved to the current list.
|
|
|
|
*/
|
|
|
|
if (listadp == NULL) {
|
|
|
|
#ifdef DEBUG
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (adp->ad_state & EXTDATA)
|
|
|
|
listhead = &inodedep->id_newextupdt;
|
|
|
|
else
|
|
|
|
listhead = &inodedep->id_newinoupdt;
|
|
|
|
TAILQ_FOREACH(listadp, listhead, ad_next)
|
1998-05-19 19:47:22 +00:00
|
|
|
/* found our block */
|
|
|
|
if (listadp == adp)
|
|
|
|
break;
|
2001-02-23 09:01:31 +00:00
|
|
|
if (listadp == NULL) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_allocdirect_partdone: lost dep");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
#endif /* DEBUG */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we have found the just finished dependency, then free
|
|
|
|
* it along with anything that follows it that is complete.
|
2000-06-18 22:05:57 +00:00
|
|
|
* If the inode still has a bitmap dependency, then it has
|
|
|
|
* never been written to disk, hence the on-disk inode cannot
|
|
|
|
* reference the old fragment so we can free it without delay.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
2000-06-18 22:05:57 +00:00
|
|
|
delay = (inodedep->id_state & DEPCOMPLETE);
|
1998-05-19 19:47:22 +00:00
|
|
|
for (; adp; adp = listadp) {
|
|
|
|
listadp = TAILQ_NEXT(adp, ad_next);
|
|
|
|
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
|
|
|
|
return;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
free_allocdirect(listhead, adp, delay);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called from within softdep_disk_write_complete above. Note that
|
|
|
|
* this routine is always called from interrupt level with further
|
|
|
|
* splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
handle_allocindir_partdone(aip)
|
|
|
|
struct allocindir *aip; /* the completed allocindir */
|
|
|
|
{
|
|
|
|
struct indirdep *indirdep;
|
|
|
|
|
|
|
|
if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
|
|
|
|
return;
|
2001-02-23 09:01:31 +00:00
|
|
|
if (aip->ai_buf != NULL) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_allocindir_partdone: dangling dependency");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
indirdep = aip->ai_indirdep;
|
|
|
|
if (indirdep->ir_state & UNDONE) {
|
|
|
|
LIST_REMOVE(aip, ai_next);
|
|
|
|
LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
|
|
|
|
return;
|
|
|
|
}
|
2002-06-21 06:18:05 +00:00
|
|
|
if (indirdep->ir_state & UFS1FMT)
|
|
|
|
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
|
|
|
|
aip->ai_newblkno;
|
|
|
|
else
|
|
|
|
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
|
|
|
|
aip->ai_newblkno;
|
1998-05-19 19:47:22 +00:00
|
|
|
LIST_REMOVE(aip, ai_next);
|
|
|
|
if (aip->ai_freefrag != NULL)
|
|
|
|
add_to_worklist(&aip->ai_freefrag->ff_list);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(aip, D_ALLOCINDIR);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called from within softdep_disk_write_complete above to restore
|
|
|
|
* in-memory inode block contents to their most up-to-date state. Note
|
|
|
|
* that this routine is always called from interrupt level with further
|
|
|
|
* splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
handle_written_inodeblock(inodedep, bp)
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct buf *bp; /* buffer containing the inode block */
|
|
|
|
{
|
|
|
|
struct worklist *wk, *filefree;
|
|
|
|
struct allocdirect *adp, *nextadp;
|
2002-06-21 06:18:05 +00:00
|
|
|
struct ufs1_dinode *dp1 = NULL;
|
|
|
|
struct ufs2_dinode *dp2 = NULL;
|
|
|
|
int hadchanges, fstype;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((inodedep->id_state & IOSTARTED) == 0) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_written_inodeblock: not started");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
inodedep->id_state &= ~IOSTARTED;
|
|
|
|
inodedep->id_state |= COMPLETE;
|
2002-06-21 06:18:05 +00:00
|
|
|
if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
|
|
|
|
fstype = UFS1;
|
|
|
|
dp1 = (struct ufs1_dinode *)bp->b_data +
|
|
|
|
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
|
|
|
|
} else {
|
|
|
|
fstype = UFS2;
|
|
|
|
dp2 = (struct ufs2_dinode *)bp->b_data +
|
|
|
|
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* If we had to rollback the inode allocation because of
|
|
|
|
* bitmaps being incomplete, then simply restore it.
|
|
|
|
* Keep the block dirty so that it will not be reclaimed until
|
|
|
|
* all associated dependencies have been cleared and the
|
|
|
|
* corresponding updates written to disk.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
if (inodedep->id_savedino1 != NULL) {
|
|
|
|
if (fstype == UFS1)
|
|
|
|
*dp1 = *inodedep->id_savedino1;
|
|
|
|
else
|
|
|
|
*dp2 = *inodedep->id_savedino2;
|
|
|
|
FREE(inodedep->id_savedino1, M_INODEDEP);
|
|
|
|
inodedep->id_savedino1 = NULL;
|
1999-05-14 01:26:46 +00:00
|
|
|
if ((bp->b_flags & B_DELWRI) == 0)
|
|
|
|
stat_inode_bitmap++;
|
1998-05-19 19:47:22 +00:00
|
|
|
bdirty(bp);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Roll forward anything that had to be rolled back before
|
|
|
|
* the inode could be updated.
|
|
|
|
*/
|
|
|
|
hadchanges = 0;
|
|
|
|
for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
|
|
|
|
nextadp = TAILQ_NEXT(adp, ad_next);
|
2001-02-23 09:01:31 +00:00
|
|
|
if (adp->ad_state & ATTACHED) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_written_inodeblock: new entry");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2002-06-21 06:18:05 +00:00
|
|
|
if (fstype == UFS1) {
|
|
|
|
if (adp->ad_lbn < NDADDR) {
|
|
|
|
if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) {
|
|
|
|
lk.lkt_held = NOHOLDER;
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s %s #%jd mismatch %d != %jd",
|
2002-06-21 06:18:05 +00:00
|
|
|
"handle_written_inodeblock:",
|
|
|
|
"direct pointer",
|
|
|
|
(intmax_t)adp->ad_lbn,
|
|
|
|
dp1->di_db[adp->ad_lbn],
|
|
|
|
(intmax_t)adp->ad_oldblkno);
|
|
|
|
}
|
|
|
|
dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
|
|
|
|
} else {
|
|
|
|
if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) {
|
|
|
|
lk.lkt_held = NOHOLDER;
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s: %s #%jd allocated as %d",
|
2002-06-21 06:18:05 +00:00
|
|
|
"handle_written_inodeblock",
|
|
|
|
"indirect pointer",
|
|
|
|
(intmax_t)adp->ad_lbn - NDADDR,
|
|
|
|
dp1->di_ib[adp->ad_lbn - NDADDR]);
|
|
|
|
}
|
|
|
|
dp1->di_ib[adp->ad_lbn - NDADDR] =
|
|
|
|
adp->ad_newblkno;
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
} else {
|
2002-06-21 06:18:05 +00:00
|
|
|
if (adp->ad_lbn < NDADDR) {
|
|
|
|
if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) {
|
|
|
|
lk.lkt_held = NOHOLDER;
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s: %s #%jd %s %jd != %jd",
|
2002-06-21 06:18:05 +00:00
|
|
|
"handle_written_inodeblock",
|
|
|
|
"direct pointer",
|
|
|
|
(intmax_t)adp->ad_lbn, "mismatch",
|
|
|
|
(intmax_t)dp2->di_db[adp->ad_lbn],
|
|
|
|
(intmax_t)adp->ad_oldblkno);
|
|
|
|
}
|
|
|
|
dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
|
|
|
|
} else {
|
|
|
|
if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) {
|
|
|
|
lk.lkt_held = NOHOLDER;
|
2002-06-23 18:17:27 +00:00
|
|
|
panic("%s: %s #%jd allocated as %jd",
|
2002-06-21 06:18:05 +00:00
|
|
|
"handle_written_inodeblock",
|
|
|
|
"indirect pointer",
|
|
|
|
(intmax_t)adp->ad_lbn - NDADDR,
|
|
|
|
(intmax_t)
|
|
|
|
dp2->di_ib[adp->ad_lbn - NDADDR]);
|
|
|
|
}
|
|
|
|
dp2->di_ib[adp->ad_lbn - NDADDR] =
|
|
|
|
adp->ad_newblkno;
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
adp->ad_state &= ~UNDONE;
|
|
|
|
adp->ad_state |= ATTACHED;
|
|
|
|
hadchanges = 1;
|
|
|
|
}
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
|
|
|
|
nextadp = TAILQ_NEXT(adp, ad_next);
|
|
|
|
if (adp->ad_state & ATTACHED) {
|
|
|
|
lk.lkt_held = NOHOLDER;
|
|
|
|
panic("handle_written_inodeblock: new entry");
|
|
|
|
}
|
|
|
|
if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) {
|
|
|
|
lk.lkt_held = NOHOLDER;
|
|
|
|
panic("%s: direct pointers #%jd %s %jd != %jd",
|
|
|
|
"handle_written_inodeblock",
|
|
|
|
(intmax_t)adp->ad_lbn, "mismatch",
|
|
|
|
(intmax_t)dp2->di_extb[adp->ad_lbn],
|
|
|
|
(intmax_t)adp->ad_oldblkno);
|
|
|
|
}
|
|
|
|
dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
|
|
|
|
adp->ad_state &= ~UNDONE;
|
|
|
|
adp->ad_state |= ATTACHED;
|
|
|
|
hadchanges = 1;
|
|
|
|
}
|
1999-05-14 01:26:46 +00:00
|
|
|
if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
|
|
|
|
stat_direct_blk_ptrs++;
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Reset the file size to its most up-to-date value.
|
|
|
|
*/
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_written_inodeblock: bad size");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2002-06-21 06:18:05 +00:00
|
|
|
if (fstype == UFS1) {
|
|
|
|
if (dp1->di_size != inodedep->id_savedsize) {
|
|
|
|
dp1->di_size = inodedep->id_savedsize;
|
|
|
|
hadchanges = 1;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (dp2->di_size != inodedep->id_savedsize) {
|
|
|
|
dp2->di_size = inodedep->id_savedsize;
|
|
|
|
hadchanges = 1;
|
|
|
|
}
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (dp2->di_extsize != inodedep->id_savedextsize) {
|
|
|
|
dp2->di_extsize = inodedep->id_savedextsize;
|
|
|
|
hadchanges = 1;
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
inodedep->id_savedsize = -1;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
inodedep->id_savedextsize = -1;
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* If there were any rollbacks in the inode block, then it must be
|
|
|
|
* marked dirty so that its will eventually get written back in
|
|
|
|
* its correct form.
|
|
|
|
*/
|
|
|
|
if (hadchanges)
|
|
|
|
bdirty(bp);
|
|
|
|
/*
|
|
|
|
* Process any allocdirects that completed during the update.
|
|
|
|
*/
|
|
|
|
if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
|
|
|
|
handle_allocdirect_partdone(adp);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
|
|
|
|
handle_allocdirect_partdone(adp);
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Process deallocations that were held pending until the
|
|
|
|
* inode had been written to disk. Freeing of the inode
|
|
|
|
* is delayed until after all blocks have been freed to
|
|
|
|
* avoid creation of new <vfsid, inum, lbn> triples
|
|
|
|
* before the old ones have been deleted.
|
|
|
|
*/
|
|
|
|
filefree = NULL;
|
1998-05-19 21:45:53 +00:00
|
|
|
while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
|
1998-05-19 19:47:22 +00:00
|
|
|
WORKLIST_REMOVE(wk);
|
|
|
|
switch (wk->wk_type) {
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_FREEFILE:
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* We defer adding filefree to the worklist until
|
|
|
|
* all other additions have been made to ensure
|
|
|
|
* that it will be done after all the old blocks
|
|
|
|
* have been freed.
|
|
|
|
*/
|
2001-02-23 09:01:31 +00:00
|
|
|
if (filefree != NULL) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_written_inodeblock: filefree");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
filefree = wk;
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_MKDIR:
|
1998-05-19 19:47:22 +00:00
|
|
|
handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_DIRADD:
|
1998-05-19 21:45:53 +00:00
|
|
|
diradd_inode_written(WK_DIRADD(wk), inodedep);
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_FREEBLKS:
|
|
|
|
case D_FREEFRAG:
|
|
|
|
case D_DIRREM:
|
1998-05-19 19:47:22 +00:00
|
|
|
add_to_worklist(wk);
|
|
|
|
continue;
|
|
|
|
|
2001-05-17 07:24:03 +00:00
|
|
|
case D_NEWDIRBLK:
|
|
|
|
free_newdirblk(WK_NEWDIRBLK(wk));
|
|
|
|
continue;
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
default:
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_written_inodeblock: Unknown type %s",
|
|
|
|
TYPENAME(wk->wk_type));
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
1998-05-19 21:45:53 +00:00
|
|
|
if (filefree != NULL) {
|
2001-02-23 09:01:31 +00:00
|
|
|
if (free_inodedep(inodedep) == 0) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 21:45:53 +00:00
|
|
|
panic("handle_written_inodeblock: live inodedep");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
add_to_worklist(filefree);
|
1998-05-19 21:45:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If no outstanding dependencies, free it.
|
|
|
|
*/
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (free_inodedep(inodedep) ||
|
|
|
|
(TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
|
|
|
|
TAILQ_FIRST(&inodedep->id_extupdt) == 0))
|
1998-05-19 19:47:22 +00:00
|
|
|
return (0);
|
|
|
|
return (hadchanges);
|
|
|
|
}
|
|
|
|
|
1998-05-19 21:45:53 +00:00
|
|
|
/*
|
|
|
|
* Process a diradd entry after its dependent inode has been written.
|
|
|
|
* This routine must be called with splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
diradd_inode_written(dap, inodedep)
|
|
|
|
struct diradd *dap;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
{
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
|
|
|
|
dap->da_state |= COMPLETE;
|
|
|
|
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
|
|
|
|
if (dap->da_state & DIRCHG)
|
|
|
|
pagedep = dap->da_previous->dm_pagedep;
|
|
|
|
else
|
|
|
|
pagedep = dap->da_pagedep;
|
|
|
|
LIST_REMOVE(dap, da_pdlist);
|
|
|
|
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
|
|
|
|
}
|
|
|
|
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Handle the completion of a mkdir dependency.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
handle_written_mkdir(mkdir, type)
|
|
|
|
struct mkdir *mkdir;
|
|
|
|
int type;
|
|
|
|
{
|
|
|
|
struct diradd *dap;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
|
2001-02-23 09:01:31 +00:00
|
|
|
if (mkdir->md_state != type) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_written_mkdir: bad type");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
dap = mkdir->md_diradd;
|
|
|
|
dap->da_state &= ~type;
|
|
|
|
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
|
|
|
|
dap->da_state |= DEPCOMPLETE;
|
|
|
|
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
|
|
|
|
if (dap->da_state & DIRCHG)
|
|
|
|
pagedep = dap->da_previous->dm_pagedep;
|
|
|
|
else
|
|
|
|
pagedep = dap->da_pagedep;
|
|
|
|
LIST_REMOVE(dap, da_pdlist);
|
|
|
|
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
|
|
|
|
}
|
|
|
|
LIST_REMOVE(mkdir, md_mkdirs);
|
1998-05-19 20:18:42 +00:00
|
|
|
WORKITEM_FREE(mkdir, D_MKDIR);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called from within softdep_disk_write_complete above.
|
|
|
|
* A write operation was just completed. Removed inodes can
|
|
|
|
* now be freed and associated block pointers may be committed.
|
|
|
|
* Note that this routine is always called from interrupt level
|
|
|
|
* with further splbio interrupts blocked.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
handle_written_filepage(pagedep, bp)
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct buf *bp; /* buffer containing the written page */
|
|
|
|
{
|
|
|
|
struct dirrem *dirrem;
|
|
|
|
struct diradd *dap, *nextdap;
|
|
|
|
struct direct *ep;
|
|
|
|
int i, chgs;
|
|
|
|
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((pagedep->pd_state & IOSTARTED) == 0) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_written_filepage: not started");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
pagedep->pd_state &= ~IOSTARTED;
|
|
|
|
/*
|
|
|
|
* Process any directory removals that have been committed.
|
|
|
|
*/
|
|
|
|
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
|
|
|
|
LIST_REMOVE(dirrem, dm_next);
|
|
|
|
dirrem->dm_dirinum = pagedep->pd_ino;
|
|
|
|
add_to_worklist(&dirrem->dm_list);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Free any directory additions that have been committed.
|
2001-05-17 07:24:03 +00:00
|
|
|
* If it is a newly allocated block, we have to wait until
|
|
|
|
* the on-disk directory inode claims the new block.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
2001-05-17 07:24:03 +00:00
|
|
|
if ((pagedep->pd_state & NEWBLOCK) == 0)
|
|
|
|
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
|
|
|
|
free_diradd(dap);
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Uncommitted directory entries must be restored.
|
|
|
|
*/
|
|
|
|
for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
|
|
|
|
for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
|
|
|
|
dap = nextdap) {
|
|
|
|
nextdap = LIST_NEXT(dap, da_pdlist);
|
2001-02-23 09:01:31 +00:00
|
|
|
if (dap->da_state & ATTACHED) {
|
2001-09-12 08:38:13 +00:00
|
|
|
lk.lkt_held = NOHOLDER;
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("handle_written_filepage: attached");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
ep = (struct direct *)
|
|
|
|
((char *)bp->b_data + dap->da_offset);
|
|
|
|
ep->d_ino = dap->da_newinum;
|
|
|
|
dap->da_state &= ~UNDONE;
|
|
|
|
dap->da_state |= ATTACHED;
|
|
|
|
chgs = 1;
|
|
|
|
/*
|
|
|
|
* If the inode referenced by the directory has
|
|
|
|
* been written out, then the dependency can be
|
|
|
|
* moved to the pending list.
|
|
|
|
*/
|
|
|
|
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
|
|
|
|
LIST_REMOVE(dap, da_pdlist);
|
|
|
|
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
|
|
|
|
da_pdlist);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If there were any rollbacks in the directory, then it must be
|
|
|
|
* marked dirty so that its will eventually get written back in
|
|
|
|
* its correct form.
|
|
|
|
*/
|
1999-05-14 01:26:46 +00:00
|
|
|
if (chgs) {
|
|
|
|
if ((bp->b_flags & B_DELWRI) == 0)
|
|
|
|
stat_dir_entry++;
|
1998-05-19 19:47:22 +00:00
|
|
|
bdirty(bp);
|
2001-05-17 07:24:03 +00:00
|
|
|
return (1);
|
1999-05-14 01:26:46 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
2002-02-07 00:54:32 +00:00
|
|
|
* If we are not waiting for a new directory block to be
|
|
|
|
* claimed by its inode, then the pagedep will be freed.
|
|
|
|
* Otherwise it will remain to track any new entries on
|
|
|
|
* the page in case they are fsync'ed.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
2002-02-07 00:54:32 +00:00
|
|
|
if ((pagedep->pd_state & NEWBLOCK) == 0) {
|
2001-06-13 23:13:13 +00:00
|
|
|
LIST_REMOVE(pagedep, pd_hash);
|
|
|
|
WORKITEM_FREE(pagedep, D_PAGEDEP);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2001-05-17 07:24:03 +00:00
|
|
|
return (0);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Writing back in-core inode structures.
|
|
|
|
*
|
2002-05-16 21:28:32 +00:00
|
|
|
* The filesystem only accesses an inode's contents when it occupies an
|
1998-05-19 19:47:22 +00:00
|
|
|
* "in-core" inode structure. These "in-core" structures are separate from
|
|
|
|
* the page frames used to cache inode blocks. Only the latter are
|
|
|
|
* transferred to/from the disk. So, when the updated contents of the
|
|
|
|
* "in-core" inode structure are copied to the corresponding in-memory inode
|
|
|
|
* block, the dependencies are also transferred. The following procedure is
|
|
|
|
* called when copying a dirty "in-core" inode to a cached inode block.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called when an inode is loaded from disk. If the effective link count
|
|
|
|
* differed from the actual link count when it was last flushed, then we
|
|
|
|
* need to ensure that the correct effective link count is put back.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_load_inodeblock(ip)
|
|
|
|
struct inode *ip; /* the "in_core" copy of the inode */
|
|
|
|
{
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check for alternate nlink count.
|
|
|
|
*/
|
|
|
|
ip->i_effnlink = ip->i_nlink;
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return;
|
|
|
|
}
|
2000-01-10 00:24:24 +00:00
|
|
|
ip->i_effnlink -= inodedep->id_nlinkdelta;
|
2001-05-08 07:42:20 +00:00
|
|
|
if (inodedep->id_state & SPACECOUNTED)
|
|
|
|
ip->i_flag |= IN_SPACECOUNTED;
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This routine is called just before the "in-core" inode
|
|
|
|
* information is to be copied to the in-memory inode block.
|
|
|
|
* Recall that an inode block contains several inodes. If
|
|
|
|
* the force flag is set, then the dependencies will be
|
|
|
|
* cleared so that the update can always be made. Note that
|
|
|
|
* the buffer is locked when this routine is called, so we
|
|
|
|
* will never be in the middle of writing the inode block
|
|
|
|
* to disk.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_update_inodeblock(ip, bp, waitfor)
|
|
|
|
struct inode *ip; /* the "in_core" copy of the inode */
|
|
|
|
struct buf *bp; /* the buffer containing the inode block */
|
1999-01-06 18:18:06 +00:00
|
|
|
int waitfor; /* nonzero => update must be allowed */
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
struct inodedep *inodedep;
|
1998-05-19 21:45:53 +00:00
|
|
|
struct worklist *wk;
|
1998-05-19 19:47:22 +00:00
|
|
|
int error, gotit;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the effective link count is not equal to the actual link
|
|
|
|
* count, then we must track the difference in an inodedep while
|
|
|
|
* the inode is (potentially) tossed out of the cache. Otherwise,
|
|
|
|
* if there is no existing inodedep, then there are no dependencies
|
|
|
|
* to track.
|
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2000-01-17 06:35:11 +00:00
|
|
|
if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
2000-01-17 06:35:11 +00:00
|
|
|
if (ip->i_effnlink != ip->i_nlink)
|
|
|
|
panic("softdep_update_inodeblock: bad link count");
|
1998-05-19 19:47:22 +00:00
|
|
|
return;
|
|
|
|
}
|
2001-02-23 09:01:31 +00:00
|
|
|
if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_update_inodeblock: bad delta");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Changes have been initiated. Anything depending on these
|
|
|
|
* changes cannot occur until this inode has been written.
|
|
|
|
*/
|
|
|
|
inodedep->id_state &= ~COMPLETE;
|
|
|
|
if ((inodedep->id_state & ONWORKLIST) == 0)
|
|
|
|
WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
|
|
|
|
/*
|
|
|
|
* Any new dependencies associated with the incore inode must
|
|
|
|
* now be moved to the list associated with the buffer holding
|
|
|
|
* the in-memory copy of the inode. Once merged process any
|
|
|
|
* allocdirects that are completed by the merger.
|
|
|
|
*/
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
|
1998-05-19 19:47:22 +00:00
|
|
|
if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
|
|
|
|
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
|
|
|
|
if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL)
|
|
|
|
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
|
1998-05-19 21:45:53 +00:00
|
|
|
/*
|
|
|
|
* Now that the inode has been pushed into the buffer, the
|
|
|
|
* operations dependent on the inode being written to disk
|
|
|
|
* can be moved to the id_bufwait so that they will be
|
|
|
|
* processed when the buffer I/O completes.
|
|
|
|
*/
|
|
|
|
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
|
|
|
|
WORKLIST_REMOVE(wk);
|
|
|
|
WORKLIST_INSERT(&inodedep->id_bufwait, wk);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Newly allocated inodes cannot be written until the bitmap
|
|
|
|
* that allocates them have been written (indicated by
|
|
|
|
* DEPCOMPLETE being set in id_state). If we are doing a
|
|
|
|
* forced sync (e.g., an fsync on a file), we force the bitmap
|
|
|
|
* to be written so that the update can be done.
|
|
|
|
*/
|
|
|
|
if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
|
|
|
|
FREE_LOCK(&lk);
|
1999-06-16 23:27:55 +00:00
|
|
|
if (gotit &&
|
2000-03-20 11:29:10 +00:00
|
|
|
(error = BUF_WRITE(inodedep->id_buf)) != 0)
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_error("softdep_update_inodeblock: bwrite", error);
|
|
|
|
if ((inodedep->id_state & DEPCOMPLETE) == 0)
|
|
|
|
panic("softdep_update_inodeblock: update failed");
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
* Merge a new inode dependency list (such as id_newinoupdt) into an
|
|
|
|
* old inode dependency list (such as id_inoupdt). This routine must be
|
|
|
|
* called with splbio interrupts blocked.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
static void
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
merge_inode_lists(newlisthead, oldlisthead)
|
|
|
|
struct allocdirectlst *newlisthead;
|
|
|
|
struct allocdirectlst *oldlisthead;
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
|
|
|
struct allocdirect *listadp, *newadp;
|
|
|
|
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
newadp = TAILQ_FIRST(newlisthead);
|
|
|
|
for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
|
1998-05-19 19:47:22 +00:00
|
|
|
if (listadp->ad_lbn < newadp->ad_lbn) {
|
|
|
|
listadp = TAILQ_NEXT(listadp, ad_next);
|
|
|
|
continue;
|
|
|
|
}
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
TAILQ_REMOVE(newlisthead, newadp, ad_next);
|
1998-05-19 19:47:22 +00:00
|
|
|
TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
|
|
|
|
if (listadp->ad_lbn == newadp->ad_lbn) {
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
allocdirect_merge(oldlisthead, newadp,
|
1998-05-19 19:47:22 +00:00
|
|
|
listadp);
|
|
|
|
listadp = newadp;
|
|
|
|
}
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
newadp = TAILQ_FIRST(newlisthead);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
|
|
|
|
TAILQ_REMOVE(newlisthead, newadp, ad_next);
|
|
|
|
TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are doing an fsync, then we must ensure that any directory
|
|
|
|
* entries for the inode have been written after the inode gets to disk.
|
|
|
|
*/
|
2000-06-16 13:00:33 +00:00
|
|
|
int
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_fsync(vp)
|
|
|
|
struct vnode *vp; /* the "in_core" copy of the inode */
|
|
|
|
{
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct worklist *wk;
|
2000-01-09 23:14:57 +00:00
|
|
|
struct diradd *dap;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct mount *mnt;
|
|
|
|
struct vnode *pvp;
|
|
|
|
struct inode *ip;
|
|
|
|
struct buf *bp;
|
|
|
|
struct fs *fs;
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td = curthread;
|
2000-01-09 23:14:57 +00:00
|
|
|
int error, flushparent;
|
1998-05-19 19:47:22 +00:00
|
|
|
ino_t parentino;
|
|
|
|
ufs_lbn_t lbn;
|
|
|
|
|
|
|
|
ip = VTOI(vp);
|
|
|
|
fs = ip->i_fs;
|
2000-01-09 23:14:57 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
|
|
|
|
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
|
|
|
|
TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
|
2000-01-09 23:14:57 +00:00
|
|
|
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
|
2001-02-23 09:01:31 +00:00
|
|
|
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
|
|
|
|
FREE_LOCK(&lk);
|
2000-01-09 23:14:57 +00:00
|
|
|
panic("softdep_fsync: pending ops");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2000-01-09 23:14:57 +00:00
|
|
|
for (error = 0, flushparent = 0; ; ) {
|
1998-05-19 19:47:22 +00:00
|
|
|
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
|
|
|
|
break;
|
2001-02-23 09:01:31 +00:00
|
|
|
if (wk->wk_type != D_DIRADD) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 20:03:29 +00:00
|
|
|
panic("softdep_fsync: Unexpected type %s",
|
1998-05-19 19:47:22 +00:00
|
|
|
TYPENAME(wk->wk_type));
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
dap = WK_DIRADD(wk);
|
|
|
|
/*
|
2001-05-17 07:24:03 +00:00
|
|
|
* Flush our parent if this directory entry has a MKDIR_PARENT
|
|
|
|
* dependency or is contained in a newly allocated block.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
if (dap->da_state & DIRCHG)
|
|
|
|
pagedep = dap->da_previous->dm_pagedep;
|
|
|
|
else
|
|
|
|
pagedep = dap->da_pagedep;
|
|
|
|
mnt = pagedep->pd_mnt;
|
|
|
|
parentino = pagedep->pd_ino;
|
|
|
|
lbn = pagedep->pd_lbn;
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
|
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_fsync: dirty");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2001-05-17 07:24:03 +00:00
|
|
|
if ((dap->da_state & MKDIR_PARENT) ||
|
|
|
|
(pagedep->pd_state & NEWBLOCK))
|
|
|
|
flushparent = 1;
|
|
|
|
else
|
|
|
|
flushparent = 0;
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* If we are being fsync'ed as part of vgone'ing this vnode,
|
|
|
|
* then we will not be able to release and recover the
|
|
|
|
* vnode below, so we just have to give up on writing its
|
|
|
|
* directory entry out. It will eventually be written, just
|
|
|
|
* not now, but then the user was not asking to have it
|
|
|
|
* written, so we are not breaking any promises.
|
|
|
|
*/
|
2002-08-04 10:29:36 +00:00
|
|
|
mp_fixme("This operation is not atomic wrt the rest of the code");
|
|
|
|
VI_LOCK(vp);
|
|
|
|
if (vp->v_iflag & VI_XLOCK) {
|
|
|
|
VI_UNLOCK(vp);
|
1998-05-19 19:47:22 +00:00
|
|
|
break;
|
2002-08-04 10:29:36 +00:00
|
|
|
} else
|
|
|
|
VI_UNLOCK(vp);
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* We prevent deadlock by always fetching inodes from the
|
|
|
|
* root, moving down the directory tree. Thus, when fetching
|
2002-03-17 01:25:47 +00:00
|
|
|
* our parent directory, we first try to get the lock. If
|
|
|
|
* that fails, we must unlock ourselves before requesting
|
|
|
|
* the lock on our parent. See the comment in ufs_lookup
|
|
|
|
* for details on possible races.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
FREE_LOCK(&lk);
|
2002-03-17 01:25:47 +00:00
|
|
|
if (VFS_VGET(mnt, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
|
|
|
|
VOP_UNLOCK(vp, 0, td);
|
|
|
|
error = VFS_VGET(mnt, parentino, LK_EXCLUSIVE, &pvp);
|
|
|
|
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
}
|
2001-05-17 07:24:03 +00:00
|
|
|
/*
|
|
|
|
* All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
|
|
|
|
* that are contained in direct blocks will be resolved by
|
|
|
|
* doing a UFS_UPDATE. Pagedeps contained in indirect blocks
|
|
|
|
* may require a complete sync'ing of the directory. So, we
|
|
|
|
* try the cheap and fast UFS_UPDATE first, and if that fails,
|
|
|
|
* then we do the slower VOP_FSYNC of the directory.
|
|
|
|
*/
|
1998-05-19 19:47:22 +00:00
|
|
|
if (flushparent) {
|
1999-05-07 05:11:31 +00:00
|
|
|
if ((error = UFS_UPDATE(pvp, 1)) != 0) {
|
1998-05-19 19:47:22 +00:00
|
|
|
vput(pvp);
|
|
|
|
return (error);
|
|
|
|
}
|
2001-05-17 07:24:03 +00:00
|
|
|
if ((pagedep->pd_state & NEWBLOCK) &&
|
2002-02-27 18:32:23 +00:00
|
|
|
(error = VOP_FSYNC(pvp, td->td_ucred, MNT_WAIT, td))) {
|
2001-05-17 07:24:03 +00:00
|
|
|
vput(pvp);
|
|
|
|
return (error);
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Flush directory page containing the inode's name.
|
|
|
|
*/
|
2002-02-27 18:32:23 +00:00
|
|
|
error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
|
1998-05-19 19:47:22 +00:00
|
|
|
&bp);
|
2000-01-09 23:14:57 +00:00
|
|
|
if (error == 0)
|
2000-03-20 11:29:10 +00:00
|
|
|
error = BUF_WRITE(bp);
|
2002-02-02 01:42:44 +00:00
|
|
|
else
|
|
|
|
brelse(bp);
|
1998-05-19 21:45:53 +00:00
|
|
|
vput(pvp);
|
1998-05-19 19:47:22 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2000-01-09 23:14:57 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
|
|
|
|
break;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1999-05-14 01:26:46 +00:00
|
|
|
/*
|
|
|
|
* Flush all the dirty bitmaps associated with the block device
|
|
|
|
* before flushing the rest of the dirty blocks so as to reduce
|
|
|
|
* the number of dependencies that will have to be rolled back.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
softdep_fsync_mountdev(vp)
|
|
|
|
struct vnode *vp;
|
|
|
|
{
|
|
|
|
struct buf *bp, *nbp;
|
|
|
|
struct worklist *wk;
|
|
|
|
|
2000-01-10 12:04:27 +00:00
|
|
|
if (!vn_isdisk(vp, NULL))
|
1999-11-22 10:33:55 +00:00
|
|
|
panic("softdep_fsync_mountdev: vnode not a disk");
|
1999-05-14 01:26:46 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
2002-09-25 02:49:48 +00:00
|
|
|
VI_LOCK(vp);
|
1999-05-14 01:26:46 +00:00
|
|
|
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
|
|
|
|
nbp = TAILQ_NEXT(bp, b_vnbufs);
|
2002-09-25 02:49:48 +00:00
|
|
|
VI_UNLOCK(vp);
|
1999-05-14 01:26:46 +00:00
|
|
|
/*
|
|
|
|
* If it is already scheduled, skip to the next buffer.
|
|
|
|
*/
|
2002-09-25 02:49:48 +00:00
|
|
|
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
|
|
|
|
VI_LOCK(vp);
|
1999-05-14 01:26:46 +00:00
|
|
|
continue;
|
2002-09-25 02:49:48 +00:00
|
|
|
}
|
2001-02-23 09:01:31 +00:00
|
|
|
if ((bp->b_flags & B_DELWRI) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
1999-05-14 01:26:46 +00:00
|
|
|
panic("softdep_fsync_mountdev: not dirty");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1999-05-14 01:26:46 +00:00
|
|
|
/*
|
|
|
|
* We are only interested in bitmaps with outstanding
|
|
|
|
* dependencies.
|
|
|
|
*/
|
|
|
|
if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
|
2000-01-30 20:32:59 +00:00
|
|
|
wk->wk_type != D_BMSAFEMAP ||
|
|
|
|
(bp->b_xflags & BX_BKGRDINPROG)) {
|
1999-06-26 02:47:16 +00:00
|
|
|
BUF_UNLOCK(bp);
|
2002-09-25 02:49:48 +00:00
|
|
|
VI_LOCK(vp);
|
1999-05-14 01:26:46 +00:00
|
|
|
continue;
|
1999-06-26 02:47:16 +00:00
|
|
|
}
|
1999-05-14 01:26:46 +00:00
|
|
|
bremfree(bp);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
(void) bawrite(bp);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
/*
|
|
|
|
* Since we may have slept during the I/O, we need
|
|
|
|
* to start from a known point.
|
|
|
|
*/
|
2002-09-25 02:49:48 +00:00
|
|
|
VI_LOCK(vp);
|
1999-05-14 01:26:46 +00:00
|
|
|
nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
|
|
|
|
}
|
2002-09-25 02:49:48 +00:00
|
|
|
VI_UNLOCK(vp);
|
1999-05-14 01:26:46 +00:00
|
|
|
drain_output(vp, 1);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* This routine is called when we are trying to synchronously flush a
|
|
|
|
* file. This routine must eliminate any filesystem metadata dependencies
|
|
|
|
* so that the syncing routine can succeed by pushing the dirty blocks
|
|
|
|
* associated with the file. If any I/O errors occur, they are returned.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
softdep_sync_metadata(ap)
|
|
|
|
struct vop_fsync_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct ucred *a_cred;
|
|
|
|
int a_waitfor;
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *a_td;
|
1998-05-19 19:47:22 +00:00
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
struct vnode *vp = ap->a_vp;
|
1998-05-19 20:03:29 +00:00
|
|
|
struct pagedep *pagedep;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct allocdirect *adp;
|
|
|
|
struct allocindir *aip;
|
|
|
|
struct buf *bp, *nbp;
|
|
|
|
struct worklist *wk;
|
1998-05-19 20:03:29 +00:00
|
|
|
int i, error, waitfor;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether this vnode is involved in a filesystem
|
|
|
|
* that is doing soft dependency processing.
|
|
|
|
*/
|
2000-01-10 12:04:27 +00:00
|
|
|
if (!vn_isdisk(vp, NULL)) {
|
1998-05-19 19:47:22 +00:00
|
|
|
if (!DOINGSOFTDEP(vp))
|
|
|
|
return (0);
|
|
|
|
} else
|
2000-10-09 17:31:39 +00:00
|
|
|
if (vp->v_rdev->si_mountpoint == NULL ||
|
|
|
|
(vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
|
1998-05-19 19:47:22 +00:00
|
|
|
return (0);
|
|
|
|
/*
|
|
|
|
* Ensure that any direct block dependencies have been cleared.
|
|
|
|
*/
|
|
|
|
ACQUIRE_LOCK(&lk);
|
1999-05-07 02:26:47 +00:00
|
|
|
if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* For most files, the only metadata dependencies are the
|
|
|
|
* cylinder group maps that allocate their inode or blocks.
|
|
|
|
* The block allocation dependencies can be found by traversing
|
|
|
|
* the dependency lists for any buffers that remain on their
|
|
|
|
* dirty buffer list. The inode allocation dependency will
|
|
|
|
* be resolved when the inode is updated with MNT_WAIT.
|
|
|
|
* This work is done in two passes. The first pass grabs most
|
|
|
|
* of the buffers and begins asynchronously writing them. The
|
|
|
|
* only way to wait for these asynchronous writes is to sleep
|
|
|
|
* on the filesystem vnode which may stay busy for a long time
|
|
|
|
* if the filesystem is active. So, instead, we make a second
|
|
|
|
* pass over the dependencies blocking on each write. In the
|
|
|
|
* usual case we will be blocking against a write that we
|
|
|
|
* initiated, so when it is done the dependency will have been
|
|
|
|
* resolved. Thus the second pass is expected to end quickly.
|
|
|
|
*/
|
|
|
|
waitfor = MNT_NOWAIT;
|
|
|
|
top:
|
2002-01-11 19:59:27 +00:00
|
|
|
/*
|
|
|
|
* We must wait for any I/O in progress to finish so that
|
|
|
|
* all potential buffers on the dirty list will be visible.
|
|
|
|
*/
|
|
|
|
drain_output(vp, 1);
|
1998-10-31 15:33:32 +00:00
|
|
|
if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return (0);
|
|
|
|
}
|
2002-09-25 02:49:48 +00:00
|
|
|
mp_fixme("The locking is somewhat complicated nonexistant here.");
|
1998-10-31 15:33:32 +00:00
|
|
|
bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
|
2001-05-08 07:13:00 +00:00
|
|
|
/* While syncing snapshots, we must allow recursive lookups */
|
|
|
|
bp->b_lock.lk_flags |= LK_CANRECURSE;
|
1998-05-19 19:47:22 +00:00
|
|
|
loop:
|
|
|
|
/*
|
|
|
|
* As we hold the buffer locked, none of its dependencies
|
|
|
|
* will disappear.
|
|
|
|
*/
|
2001-02-04 16:08:18 +00:00
|
|
|
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
|
1998-05-19 19:47:22 +00:00
|
|
|
switch (wk->wk_type) {
|
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_ALLOCDIRECT:
|
1998-05-19 19:47:22 +00:00
|
|
|
adp = WK_ALLOCDIRECT(wk);
|
|
|
|
if (adp->ad_state & DEPCOMPLETE)
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
nbp = adp->ad_buf;
|
1998-05-19 22:54:53 +00:00
|
|
|
if (getdirtybuf(&nbp, waitfor) == 0)
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
if (waitfor == MNT_NOWAIT) {
|
|
|
|
bawrite(nbp);
|
2000-03-20 11:29:10 +00:00
|
|
|
} else if ((error = BUF_WRITE(nbp)) != 0) {
|
2001-05-08 07:13:00 +00:00
|
|
|
break;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_ALLOCINDIR:
|
1998-05-19 19:47:22 +00:00
|
|
|
aip = WK_ALLOCINDIR(wk);
|
|
|
|
if (aip->ai_state & DEPCOMPLETE)
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
nbp = aip->ai_buf;
|
|
|
|
if (getdirtybuf(&nbp, waitfor) == 0)
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
if (waitfor == MNT_NOWAIT) {
|
|
|
|
bawrite(nbp);
|
2000-03-20 11:29:10 +00:00
|
|
|
} else if ((error = BUF_WRITE(nbp)) != 0) {
|
2001-05-08 07:13:00 +00:00
|
|
|
break;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_INDIRDEP:
|
1998-05-19 20:03:29 +00:00
|
|
|
restart:
|
2001-02-04 12:37:48 +00:00
|
|
|
|
|
|
|
LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
|
1998-05-19 19:47:22 +00:00
|
|
|
if (aip->ai_state & DEPCOMPLETE)
|
|
|
|
continue;
|
|
|
|
nbp = aip->ai_buf;
|
1998-05-19 20:03:29 +00:00
|
|
|
if (getdirtybuf(&nbp, MNT_WAIT) == 0)
|
|
|
|
goto restart;
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
2000-03-20 11:29:10 +00:00
|
|
|
if ((error = BUF_WRITE(nbp)) != 0) {
|
2001-05-08 07:13:00 +00:00
|
|
|
break;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
ACQUIRE_LOCK(&lk);
|
1998-05-19 20:03:29 +00:00
|
|
|
goto restart;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_INODEDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
|
|
|
|
WK_INODEDEP(wk)->id_ino)) != 0) {
|
|
|
|
FREE_LOCK(&lk);
|
2001-05-08 07:13:00 +00:00
|
|
|
break;
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
1998-05-19 20:18:42 +00:00
|
|
|
case D_PAGEDEP:
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* We are trying to sync a directory that may
|
|
|
|
* have dependencies on both its own metadata
|
|
|
|
* and/or dependencies on the inodes of any
|
|
|
|
* recently allocated files. We walk its diradd
|
|
|
|
* lists pushing out the associated inode.
|
|
|
|
*/
|
1998-05-19 20:03:29 +00:00
|
|
|
pagedep = WK_PAGEDEP(wk);
|
|
|
|
for (i = 0; i < DAHASHSZ; i++) {
|
|
|
|
if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
|
|
|
|
continue;
|
1999-05-22 04:43:04 +00:00
|
|
|
if ((error =
|
|
|
|
flush_pagedep_deps(vp, pagedep->pd_mnt,
|
|
|
|
&pagedep->pd_diraddhd[i]))) {
|
1998-05-19 20:03:29 +00:00
|
|
|
FREE_LOCK(&lk);
|
2001-05-08 07:13:00 +00:00
|
|
|
break;
|
1998-05-19 20:03:29 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
1999-03-02 00:19:47 +00:00
|
|
|
case D_MKDIR:
|
|
|
|
/*
|
|
|
|
* This case should never happen if the vnode has
|
|
|
|
* been properly sync'ed. However, if this function
|
|
|
|
* is used at a place where the vnode has not yet
|
|
|
|
* been sync'ed, this dependency can show up. So,
|
|
|
|
* rather than panic, just flush it.
|
|
|
|
*/
|
|
|
|
nbp = WK_MKDIR(wk)->md_buf;
|
|
|
|
if (getdirtybuf(&nbp, waitfor) == 0)
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1999-03-02 00:19:47 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
if (waitfor == MNT_NOWAIT) {
|
|
|
|
bawrite(nbp);
|
2000-03-20 11:29:10 +00:00
|
|
|
} else if ((error = BUF_WRITE(nbp)) != 0) {
|
2001-05-08 07:13:00 +00:00
|
|
|
break;
|
1999-03-02 00:19:47 +00:00
|
|
|
}
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1999-03-02 00:19:47 +00:00
|
|
|
|
|
|
|
case D_BMSAFEMAP:
|
|
|
|
/*
|
|
|
|
* This case should never happen if the vnode has
|
|
|
|
* been properly sync'ed. However, if this function
|
|
|
|
* is used at a place where the vnode has not yet
|
|
|
|
* been sync'ed, this dependency can show up. So,
|
|
|
|
* rather than panic, just flush it.
|
|
|
|
*/
|
|
|
|
nbp = WK_BMSAFEMAP(wk)->sm_buf;
|
|
|
|
if (getdirtybuf(&nbp, waitfor) == 0)
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1999-03-02 00:19:47 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
if (waitfor == MNT_NOWAIT) {
|
|
|
|
bawrite(nbp);
|
2000-03-20 11:29:10 +00:00
|
|
|
} else if ((error = BUF_WRITE(nbp)) != 0) {
|
2001-05-08 07:13:00 +00:00
|
|
|
break;
|
1999-03-02 00:19:47 +00:00
|
|
|
}
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-05-08 07:13:00 +00:00
|
|
|
continue;
|
1999-03-02 00:19:47 +00:00
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
default:
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
1998-05-19 19:47:22 +00:00
|
|
|
panic("softdep_sync_metadata: Unknown type %s",
|
|
|
|
TYPENAME(wk->wk_type));
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
2001-05-08 07:13:00 +00:00
|
|
|
/* We reach here only in error and unlocked */
|
|
|
|
if (error == 0)
|
|
|
|
panic("softdep_sync_metadata: zero error");
|
|
|
|
bp->b_lock.lk_flags &= ~LK_CANRECURSE;
|
|
|
|
bawrite(bp);
|
|
|
|
return (error);
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
1998-10-31 15:33:32 +00:00
|
|
|
(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
|
|
|
|
nbp = TAILQ_NEXT(bp, b_vnbufs);
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
2001-05-08 07:13:00 +00:00
|
|
|
bp->b_lock.lk_flags &= ~LK_CANRECURSE;
|
1998-05-19 19:47:22 +00:00
|
|
|
bawrite(bp);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
if (nbp != NULL) {
|
|
|
|
bp = nbp;
|
|
|
|
goto loop;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The brief unlock is to allow any pent up dependency
|
2002-01-11 19:59:27 +00:00
|
|
|
* processing to be done. Then proceed with the second pass.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
|
|
|
if (waitfor == MNT_NOWAIT) {
|
|
|
|
waitfor = MNT_WAIT;
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have managed to get rid of all the dirty buffers,
|
|
|
|
* then we are done. For certain directories and block
|
|
|
|
* devices, we may need to do further work.
|
2002-01-11 19:59:27 +00:00
|
|
|
*
|
|
|
|
* We must wait for any I/O in progress to finish so that
|
|
|
|
* all potential buffers on the dirty list will be visible.
|
1998-05-19 19:47:22 +00:00
|
|
|
*/
|
2002-01-11 19:59:27 +00:00
|
|
|
drain_output(vp, 1);
|
1998-10-31 15:33:32 +00:00
|
|
|
if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
/*
|
|
|
|
* If we are trying to sync a block device, some of its buffers may
|
|
|
|
* contain metadata that cannot be written until the contents of some
|
|
|
|
* partially written files have been written to disk. The only easy
|
|
|
|
* way to accomplish this is to sync the entire filesystem (luckily
|
|
|
|
* this happens rarely).
|
|
|
|
*/
|
2000-01-10 12:04:27 +00:00
|
|
|
if (vn_isdisk(vp, NULL) &&
|
2000-10-09 17:31:39 +00:00
|
|
|
vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
|
|
|
|
(error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
|
2001-09-12 08:38:13 +00:00
|
|
|
ap->a_td)) != 0)
|
1998-05-19 19:47:22 +00:00
|
|
|
return (error);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush the dependencies associated with an inodedep.
|
|
|
|
* Called with splbio blocked.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
flush_inodedep_deps(fs, ino)
|
|
|
|
struct fs *fs;
|
|
|
|
ino_t ino;
|
|
|
|
{
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
int error, waitfor;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This work is done in two passes. The first pass grabs most
|
|
|
|
* of the buffers and begins asynchronously writing them. The
|
|
|
|
* only way to wait for these asynchronous writes is to sleep
|
|
|
|
* on the filesystem vnode which may stay busy for a long time
|
|
|
|
* if the filesystem is active. So, instead, we make a second
|
|
|
|
* pass over the dependencies blocking on each write. In the
|
|
|
|
* usual case we will be blocking against a write that we
|
|
|
|
* initiated, so when it is done the dependency will have been
|
|
|
|
* resolved. Thus the second pass is expected to end quickly.
|
|
|
|
* We give a brief window at the top of the loop to allow
|
|
|
|
* any pending I/O to complete.
|
|
|
|
*/
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
for (error = 0, waitfor = MNT_NOWAIT; ; ) {
|
|
|
|
if (error)
|
|
|
|
return (error);
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
|
|
|
|
return (0);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
|
|
|
|
flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
|
|
|
|
flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
|
|
|
|
flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
|
1998-05-19 19:47:22 +00:00
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* If pass2, we are done, otherwise do pass 2.
|
|
|
|
*/
|
|
|
|
if (waitfor == MNT_WAIT)
|
|
|
|
break;
|
|
|
|
waitfor = MNT_WAIT;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Try freeing inodedep in case all dependencies have been removed.
|
|
|
|
*/
|
|
|
|
if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
|
|
|
|
(void) free_inodedep(inodedep);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
/*
|
|
|
|
* Flush an inode dependency list.
|
|
|
|
* Called with splbio blocked.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
flush_deplist(listhead, waitfor, errorp)
|
|
|
|
struct allocdirectlst *listhead;
|
|
|
|
int waitfor;
|
|
|
|
int *errorp;
|
|
|
|
{
|
|
|
|
struct allocdirect *adp;
|
|
|
|
struct buf *bp;
|
|
|
|
|
|
|
|
TAILQ_FOREACH(adp, listhead, ad_next) {
|
|
|
|
if (adp->ad_state & DEPCOMPLETE)
|
|
|
|
continue;
|
|
|
|
bp = adp->ad_buf;
|
|
|
|
if (getdirtybuf(&bp, waitfor) == 0) {
|
|
|
|
if (waitfor == MNT_NOWAIT)
|
|
|
|
continue;
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
if (waitfor == MNT_NOWAIT) {
|
|
|
|
bawrite(bp);
|
|
|
|
} else if ((*errorp = BUF_WRITE(bp)) != 0) {
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
|
|
|
|
* Called with splbio blocked.
|
|
|
|
*/
|
|
|
|
static int
|
1998-05-19 20:03:29 +00:00
|
|
|
flush_pagedep_deps(pvp, mp, diraddhdp)
|
1998-05-19 19:47:22 +00:00
|
|
|
struct vnode *pvp;
|
1998-05-19 20:03:29 +00:00
|
|
|
struct mount *mp;
|
|
|
|
struct diraddhd *diraddhdp;
|
1998-05-19 19:47:22 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td = curthread;
|
1998-05-19 20:03:29 +00:00
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct ufsmount *ump;
|
1998-05-19 19:47:22 +00:00
|
|
|
struct diradd *dap;
|
|
|
|
struct vnode *vp;
|
1998-05-19 20:18:42 +00:00
|
|
|
int gotit, error = 0;
|
1998-05-19 20:03:29 +00:00
|
|
|
struct buf *bp;
|
1998-05-19 19:47:22 +00:00
|
|
|
ino_t inum;
|
|
|
|
|
1998-05-19 20:03:29 +00:00
|
|
|
ump = VFSTOUFS(mp);
|
|
|
|
while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
|
|
|
|
/*
|
|
|
|
* Flush ourselves if this directory entry
|
|
|
|
* has a MKDIR_PARENT dependency.
|
|
|
|
*/
|
|
|
|
if (dap->da_state & MKDIR_PARENT) {
|
|
|
|
FREE_LOCK(&lk);
|
1999-05-07 05:11:31 +00:00
|
|
|
if ((error = UFS_UPDATE(pvp, 1)) != 0)
|
1998-05-19 20:03:29 +00:00
|
|
|
break;
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
/*
|
|
|
|
* If that cleared dependencies, go on to next.
|
|
|
|
*/
|
|
|
|
if (dap != LIST_FIRST(diraddhdp))
|
|
|
|
continue;
|
2001-02-23 09:01:31 +00:00
|
|
|
if (dap->da_state & MKDIR_PARENT) {
|
|
|
|
FREE_LOCK(&lk);
|
2000-01-18 01:30:03 +00:00
|
|
|
panic("flush_pagedep_deps: MKDIR_PARENT");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 20:03:29 +00:00
|
|
|
}
|
|
|
|
/*
|
2000-01-18 01:30:03 +00:00
|
|
|
* A newly allocated directory must have its "." and
|
|
|
|
* ".." entries written out before its name can be
|
|
|
|
* committed in its parent. We do not want or need
|
|
|
|
* the full semantics of a synchronous VOP_FSYNC as
|
|
|
|
* that may end up here again, once for each directory
|
|
|
|
* level in the filesystem. Instead, we push the blocks
|
|
|
|
* and wait for them to clear. We have to fsync twice
|
|
|
|
* because the first call may choose to defer blocks
|
|
|
|
* that still have dependencies, but deferral will
|
|
|
|
* happen at most once.
|
1998-05-19 20:03:29 +00:00
|
|
|
*/
|
|
|
|
inum = dap->da_newinum;
|
2000-01-18 01:30:03 +00:00
|
|
|
if (dap->da_state & MKDIR_BODY) {
|
1998-05-19 19:47:22 +00:00
|
|
|
FREE_LOCK(&lk);
|
2002-03-17 01:25:47 +00:00
|
|
|
if ((error = VFS_VGET(mp, inum, LK_EXCLUSIVE, &vp)))
|
1998-05-19 19:47:22 +00:00
|
|
|
break;
|
2002-02-27 18:32:23 +00:00
|
|
|
if ((error=VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)) ||
|
|
|
|
(error=VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td))) {
|
1998-05-19 20:03:29 +00:00
|
|
|
vput(vp);
|
|
|
|
break;
|
|
|
|
}
|
1999-05-07 02:26:47 +00:00
|
|
|
drain_output(vp, 0);
|
2000-01-18 01:30:03 +00:00
|
|
|
vput(vp);
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
/*
|
|
|
|
* If that cleared dependencies, go on to next.
|
|
|
|
*/
|
|
|
|
if (dap != LIST_FIRST(diraddhdp))
|
|
|
|
continue;
|
2001-02-23 09:01:31 +00:00
|
|
|
if (dap->da_state & MKDIR_BODY) {
|
|
|
|
FREE_LOCK(&lk);
|
2000-01-18 01:30:03 +00:00
|
|
|
panic("flush_pagedep_deps: MKDIR_BODY");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
2000-01-18 01:30:03 +00:00
|
|
|
/*
|
|
|
|
* Flush the inode on which the directory entry depends.
|
|
|
|
* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
|
|
|
|
* the only remaining dependency is that the updated inode
|
|
|
|
* count must get pushed to disk. The inode has already
|
|
|
|
* been pushed into its inode buffer (via VOP_UPDATE) at
|
|
|
|
* the time of the reference count change. So we need only
|
|
|
|
* locate that buffer, ensure that there will be no rollback
|
|
|
|
* caused by a bitmap dependency, then write the inode buffer.
|
|
|
|
*/
|
2001-02-23 09:01:31 +00:00
|
|
|
if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
|
|
|
|
FREE_LOCK(&lk);
|
2000-01-18 01:30:03 +00:00
|
|
|
panic("flush_pagedep_deps: lost inode");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
2000-01-18 01:30:03 +00:00
|
|
|
/*
|
|
|
|
* If the inode still has bitmap dependencies,
|
|
|
|
* push them to disk.
|
|
|
|
*/
|
|
|
|
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
|
|
|
|
gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT);
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
if (gotit &&
|
2000-03-20 11:29:10 +00:00
|
|
|
(error = BUF_WRITE(inodedep->id_buf)) != 0)
|
2000-01-18 01:30:03 +00:00
|
|
|
break;
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
if (dap != LIST_FIRST(diraddhdp))
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If the inode is still sitting in a buffer waiting
|
|
|
|
* to be written, push it to disk.
|
|
|
|
*/
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
if ((error = bread(ump->um_devvp,
|
|
|
|
fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
|
2002-02-02 01:42:44 +00:00
|
|
|
(int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
|
|
|
|
brelse(bp);
|
2000-01-18 01:30:03 +00:00
|
|
|
break;
|
2002-02-02 01:42:44 +00:00
|
|
|
}
|
2000-03-20 11:29:10 +00:00
|
|
|
if ((error = BUF_WRITE(bp)) != 0)
|
1998-05-19 20:03:29 +00:00
|
|
|
break;
|
2000-01-18 01:30:03 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
1998-05-19 20:03:29 +00:00
|
|
|
/*
|
|
|
|
* If we have failed to get rid of all the dependencies
|
|
|
|
* then something is seriously wrong.
|
|
|
|
*/
|
2001-02-23 09:01:31 +00:00
|
|
|
if (dap == LIST_FIRST(diraddhdp)) {
|
|
|
|
FREE_LOCK(&lk);
|
2000-01-18 01:30:03 +00:00
|
|
|
panic("flush_pagedep_deps: flush failed");
|
2001-02-23 09:01:31 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
if (error)
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1999-05-07 02:26:47 +00:00
|
|
|
/*
|
|
|
|
* A large burst of file addition or deletion activity can drive the
|
2000-12-13 08:30:35 +00:00
|
|
|
* memory load excessively high. First attempt to slow things down
|
|
|
|
* using the techniques below. If that fails, this routine requests
|
|
|
|
* the offending operations to fall back to running synchronously
|
|
|
|
* until the memory load returns to a reasonable level.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
softdep_slowdown(vp)
|
|
|
|
struct vnode *vp;
|
|
|
|
{
|
|
|
|
int max_softdeps_hard;
|
|
|
|
|
|
|
|
max_softdeps_hard = max_softdeps * 11 / 10;
|
|
|
|
if (num_dirrem < max_softdeps_hard / 2 &&
|
2003-01-07 18:23:50 +00:00
|
|
|
num_inodedep < max_softdeps_hard &&
|
|
|
|
VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps)
|
|
|
|
return (0);
|
|
|
|
if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
|
|
|
|
speedup_syncer();
|
2000-12-13 08:30:35 +00:00
|
|
|
stat_sync_limit_hit += 1;
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
2002-01-22 06:17:22 +00:00
|
|
|
/*
|
|
|
|
* Called by the allocation routines when they are about to fail
|
|
|
|
* in the hope that we can free up some disk space.
|
|
|
|
*
|
|
|
|
* First check to see if the work list has anything on it. If it has,
|
|
|
|
* clean up entries until we successfully free some space. Because this
|
|
|
|
* process holds inodes locked, we cannot handle any remove requests
|
|
|
|
* that might block on a locked inode as that could lead to deadlock.
|
|
|
|
* If the worklist yields no free space, encourage the syncer daemon
|
|
|
|
* to help us. In no event will we try for longer than tickdelay seconds.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
softdep_request_cleanup(fs, vp)
|
|
|
|
struct fs *fs;
|
|
|
|
struct vnode *vp;
|
|
|
|
{
|
2002-06-21 06:18:05 +00:00
|
|
|
long starttime;
|
|
|
|
ufs2_daddr_t needed;
|
2002-01-22 06:17:22 +00:00
|
|
|
|
|
|
|
needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
|
|
|
|
starttime = time_second + tickdelay;
|
2002-10-23 21:47:02 +00:00
|
|
|
/*
|
|
|
|
* If we are being called because of a process doing a
|
|
|
|
* copy-on-write, then it is not safe to update the vnode
|
|
|
|
* as we may recurse into the copy-on-write routine.
|
|
|
|
*/
|
|
|
|
if ((curthread->td_proc->p_flag & P_COWINPROGRESS) == 0 &&
|
|
|
|
UFS_UPDATE(vp, 1) != 0)
|
2002-01-22 06:17:22 +00:00
|
|
|
return (0);
|
|
|
|
while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
|
|
|
|
if (time_second > starttime)
|
|
|
|
return (0);
|
|
|
|
if (num_on_worklist > 0 &&
|
|
|
|
process_worklist_item(NULL, LK_NOWAIT) != -1) {
|
|
|
|
stat_worklist_push += 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
request_cleanup(FLUSH_REMOVE_WAIT, 0);
|
|
|
|
}
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
2000-12-13 08:30:35 +00:00
|
|
|
/*
|
|
|
|
* If memory utilization has gotten too high, deliberately slow things
|
|
|
|
* down and speed up the I/O processing.
|
1999-05-07 02:26:47 +00:00
|
|
|
*/
|
|
|
|
static int
|
1999-06-15 23:37:29 +00:00
|
|
|
request_cleanup(resource, islocked)
|
|
|
|
int resource;
|
1999-05-07 02:26:47 +00:00
|
|
|
int islocked;
|
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td = curthread;
|
1999-05-07 02:26:47 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We never hold up the filesystem syncer process.
|
|
|
|
*/
|
2001-09-12 08:38:13 +00:00
|
|
|
if (td == filesys_syncer)
|
1999-05-07 02:26:47 +00:00
|
|
|
return (0);
|
2000-12-13 08:30:35 +00:00
|
|
|
/*
|
|
|
|
* First check to see if the work list has gotten backlogged.
|
|
|
|
* If it has, co-opt this process to help clean up two entries.
|
|
|
|
* Because this process may hold inodes locked, we cannot
|
|
|
|
* handle any remove requests that might block on a locked
|
|
|
|
* inode as that could lead to deadlock.
|
|
|
|
*/
|
|
|
|
if (num_on_worklist > max_softdeps / 10) {
|
2001-02-20 11:14:38 +00:00
|
|
|
if (islocked)
|
|
|
|
FREE_LOCK(&lk);
|
2000-12-13 08:30:35 +00:00
|
|
|
process_worklist_item(NULL, LK_NOWAIT);
|
|
|
|
process_worklist_item(NULL, LK_NOWAIT);
|
|
|
|
stat_worklist_push += 2;
|
2001-02-20 11:14:38 +00:00
|
|
|
if (islocked)
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
return(1);
|
2000-12-13 08:30:35 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Next, we attempt to speed up the syncer process. If that
|
|
|
|
* is successful, then we allow the process to continue.
|
|
|
|
*/
|
2002-01-22 06:17:22 +00:00
|
|
|
if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
|
2000-12-13 08:30:35 +00:00
|
|
|
return(0);
|
1999-05-07 02:26:47 +00:00
|
|
|
/*
|
|
|
|
* If we are resource constrained on inode dependencies, try
|
|
|
|
* flushing some dirty inodes. Otherwise, we are constrained
|
|
|
|
* by file deletions, so try accelerating flushes of directories
|
|
|
|
* with removal dependencies. We would like to do the cleanup
|
|
|
|
* here, but we probably hold an inode locked at this point and
|
|
|
|
* that might deadlock against one that we try to clean. So,
|
1999-06-15 23:37:29 +00:00
|
|
|
* the best that we can do is request the syncer daemon to do
|
|
|
|
* the cleanup for us.
|
1999-05-07 02:26:47 +00:00
|
|
|
*/
|
1999-06-15 23:37:29 +00:00
|
|
|
switch (resource) {
|
|
|
|
|
|
|
|
case FLUSH_INODES:
|
1999-05-14 01:26:46 +00:00
|
|
|
stat_ino_limit_push += 1;
|
2000-11-20 06:22:39 +00:00
|
|
|
req_clear_inodedeps += 1;
|
|
|
|
stat_countp = &stat_ino_limit_hit;
|
1999-06-15 23:37:29 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case FLUSH_REMOVE:
|
2002-01-22 06:17:22 +00:00
|
|
|
case FLUSH_REMOVE_WAIT:
|
1999-05-14 01:26:46 +00:00
|
|
|
stat_blk_limit_push += 1;
|
2000-11-20 06:22:39 +00:00
|
|
|
req_clear_remove += 1;
|
|
|
|
stat_countp = &stat_blk_limit_hit;
|
1999-06-15 23:37:29 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
2001-02-23 09:01:31 +00:00
|
|
|
if (islocked)
|
|
|
|
FREE_LOCK(&lk);
|
1999-06-15 23:37:29 +00:00
|
|
|
panic("request_cleanup: unknown type");
|
1999-05-07 02:26:47 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Hopefully the syncer daemon will catch up and awaken us.
|
|
|
|
* We wait at most tickdelay before proceeding in any case.
|
|
|
|
*/
|
|
|
|
if (islocked == 0)
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2000-12-13 08:30:35 +00:00
|
|
|
proc_waiting += 1;
|
|
|
|
if (handle.callout == NULL)
|
2000-11-20 06:22:39 +00:00
|
|
|
handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
|
2002-08-04 10:29:36 +00:00
|
|
|
interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, NULL, PPAUSE,
|
2002-01-12 20:57:36 +00:00
|
|
|
"softupdate", 0);
|
2000-12-13 08:30:35 +00:00
|
|
|
proc_waiting -= 1;
|
1999-05-07 02:26:47 +00:00
|
|
|
if (islocked == 0)
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1999-06-15 23:37:29 +00:00
|
|
|
* Awaken processes pausing in request_cleanup and clear proc_waiting
|
1999-05-07 02:26:47 +00:00
|
|
|
* to indicate that there is no longer a timer running.
|
|
|
|
*/
|
2002-09-28 17:15:38 +00:00
|
|
|
static void
|
1999-05-07 02:26:47 +00:00
|
|
|
pause_timer(arg)
|
|
|
|
void *arg;
|
|
|
|
{
|
|
|
|
|
2000-11-20 06:22:39 +00:00
|
|
|
*stat_countp += 1;
|
|
|
|
wakeup_one(&proc_waiting);
|
2000-12-13 08:30:35 +00:00
|
|
|
if (proc_waiting > 0)
|
|
|
|
handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
|
|
|
|
else
|
|
|
|
handle.callout = NULL;
|
1999-05-07 02:26:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-01-09 23:35:38 +00:00
|
|
|
* Flush out a directory with at least one removal dependency in an effort to
|
|
|
|
* reduce the number of dirrem, freefile, and freeblks dependency structures.
|
1999-05-07 02:26:47 +00:00
|
|
|
*/
|
|
|
|
static void
|
2001-09-12 08:38:13 +00:00
|
|
|
clear_remove(td)
|
|
|
|
struct thread *td;
|
1999-05-07 02:26:47 +00:00
|
|
|
{
|
|
|
|
struct pagedep_hashhead *pagedephd;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
static int next = 0;
|
|
|
|
struct mount *mp;
|
|
|
|
struct vnode *vp;
|
|
|
|
int error, cnt;
|
|
|
|
ino_t ino;
|
|
|
|
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
for (cnt = 0; cnt < pagedep_hash; cnt++) {
|
|
|
|
pagedephd = &pagedep_hashtbl[next++];
|
|
|
|
if (next >= pagedep_hash)
|
|
|
|
next = 0;
|
2001-02-04 16:08:18 +00:00
|
|
|
LIST_FOREACH(pagedep, pagedephd, pd_hash) {
|
1999-05-07 02:26:47 +00:00
|
|
|
if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
|
|
|
|
continue;
|
|
|
|
mp = pagedep->pd_mnt;
|
|
|
|
ino = pagedep->pd_ino;
|
2000-07-24 05:28:33 +00:00
|
|
|
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
|
|
|
|
continue;
|
2002-10-23 05:14:06 +00:00
|
|
|
FREE_LOCK(&lk);
|
2002-03-17 01:25:47 +00:00
|
|
|
if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp))) {
|
1999-05-07 02:26:47 +00:00
|
|
|
softdep_error("clear_remove: vget", error);
|
2000-07-11 22:07:57 +00:00
|
|
|
vn_finished_write(mp);
|
1999-05-07 02:26:47 +00:00
|
|
|
return;
|
|
|
|
}
|
2002-02-27 18:32:23 +00:00
|
|
|
if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)))
|
1999-05-07 02:26:47 +00:00
|
|
|
softdep_error("clear_remove: fsync", error);
|
|
|
|
drain_output(vp, 0);
|
|
|
|
vput(vp);
|
2000-07-11 22:07:57 +00:00
|
|
|
vn_finished_write(mp);
|
1999-05-07 02:26:47 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear out a block of dirty inodes in an effort to reduce
|
|
|
|
* the number of inodedep dependency structures.
|
|
|
|
*/
|
|
|
|
static void
|
2001-09-12 08:38:13 +00:00
|
|
|
clear_inodedeps(td)
|
|
|
|
struct thread *td;
|
1999-05-07 02:26:47 +00:00
|
|
|
{
|
|
|
|
struct inodedep_hashhead *inodedephd;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
static int next = 0;
|
|
|
|
struct mount *mp;
|
|
|
|
struct vnode *vp;
|
|
|
|
struct fs *fs;
|
|
|
|
int error, cnt;
|
|
|
|
ino_t firstino, lastino, ino;
|
|
|
|
|
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
/*
|
|
|
|
* Pick a random inode dependency to be cleared.
|
|
|
|
* We will then gather up all the inodes in its block
|
|
|
|
* that have dependencies and flush them out.
|
|
|
|
*/
|
|
|
|
for (cnt = 0; cnt < inodedep_hash; cnt++) {
|
|
|
|
inodedephd = &inodedep_hashtbl[next++];
|
|
|
|
if (next >= inodedep_hash)
|
|
|
|
next = 0;
|
|
|
|
if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
|
|
|
|
break;
|
|
|
|
}
|
2001-02-22 10:17:57 +00:00
|
|
|
if (inodedep == NULL)
|
|
|
|
return;
|
1999-05-07 02:26:47 +00:00
|
|
|
/*
|
|
|
|
* Ugly code to find mount point given pointer to superblock.
|
|
|
|
*/
|
|
|
|
fs = inodedep->id_fs;
|
1999-11-20 10:00:46 +00:00
|
|
|
TAILQ_FOREACH(mp, &mountlist, mnt_list)
|
1999-05-07 02:26:47 +00:00
|
|
|
if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
|
|
|
|
break;
|
|
|
|
/*
|
|
|
|
* Find the last inode in the block with dependencies.
|
|
|
|
*/
|
|
|
|
firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
|
|
|
|
for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
|
|
|
|
if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
|
|
|
|
break;
|
|
|
|
/*
|
|
|
|
* Asynchronously push all but the last inode with dependencies.
|
|
|
|
* Synchronously push the last inode with dependencies to ensure
|
|
|
|
* that the inode block gets written to free up the inodedeps.
|
|
|
|
*/
|
|
|
|
for (ino = firstino; ino <= lastino; ino++) {
|
|
|
|
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
|
|
|
|
continue;
|
|
|
|
FREE_LOCK(&lk);
|
2000-07-24 05:28:33 +00:00
|
|
|
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
|
|
|
|
continue;
|
2002-03-17 01:25:47 +00:00
|
|
|
if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
|
1999-05-07 02:26:47 +00:00
|
|
|
softdep_error("clear_inodedeps: vget", error);
|
2000-07-11 22:07:57 +00:00
|
|
|
vn_finished_write(mp);
|
1999-05-07 02:26:47 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (ino == lastino) {
|
2002-02-27 18:32:23 +00:00
|
|
|
if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_WAIT, td)))
|
1999-05-07 02:26:47 +00:00
|
|
|
softdep_error("clear_inodedeps: fsync1", error);
|
|
|
|
} else {
|
2002-02-27 18:32:23 +00:00
|
|
|
if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)))
|
1999-05-07 02:26:47 +00:00
|
|
|
softdep_error("clear_inodedeps: fsync2", error);
|
|
|
|
drain_output(vp, 0);
|
|
|
|
}
|
|
|
|
vput(vp);
|
2000-07-11 22:07:57 +00:00
|
|
|
vn_finished_write(mp);
|
1999-05-07 02:26:47 +00:00
|
|
|
ACQUIRE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
2000-01-10 00:24:24 +00:00
|
|
|
/*
|
|
|
|
* Function to determine if the buffer has outstanding dependencies
|
|
|
|
* that will cause a roll-back if the buffer is written. If wantcount
|
|
|
|
* is set, return number of dependencies, otherwise just yes or no.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
softdep_count_dependencies(bp, wantcount)
|
|
|
|
struct buf *bp;
|
|
|
|
int wantcount;
|
|
|
|
{
|
|
|
|
struct worklist *wk;
|
|
|
|
struct inodedep *inodedep;
|
|
|
|
struct indirdep *indirdep;
|
|
|
|
struct allocindir *aip;
|
|
|
|
struct pagedep *pagedep;
|
|
|
|
struct diradd *dap;
|
|
|
|
int i, retval;
|
|
|
|
|
|
|
|
retval = 0;
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2001-02-04 13:13:25 +00:00
|
|
|
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
|
2000-01-10 00:24:24 +00:00
|
|
|
switch (wk->wk_type) {
|
|
|
|
|
|
|
|
case D_INODEDEP:
|
|
|
|
inodedep = WK_INODEDEP(wk);
|
|
|
|
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
|
|
|
|
/* bitmap allocation dependency */
|
|
|
|
retval += 1;
|
|
|
|
if (!wantcount)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
|
|
|
|
/* direct block pointer dependency */
|
|
|
|
retval += 1;
|
|
|
|
if (!wantcount)
|
|
|
|
goto out;
|
|
|
|
}
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (TAILQ_FIRST(&inodedep->id_extupdt)) {
|
|
|
|
/* direct block pointer dependency */
|
|
|
|
retval += 1;
|
|
|
|
if (!wantcount)
|
|
|
|
goto out;
|
|
|
|
}
|
2000-01-10 00:24:24 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
case D_INDIRDEP:
|
|
|
|
indirdep = WK_INDIRDEP(wk);
|
2001-02-04 12:37:48 +00:00
|
|
|
|
|
|
|
LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
|
2000-01-10 00:24:24 +00:00
|
|
|
/* indirect block pointer dependency */
|
|
|
|
retval += 1;
|
|
|
|
if (!wantcount)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
|
|
|
|
case D_PAGEDEP:
|
|
|
|
pagedep = WK_PAGEDEP(wk);
|
|
|
|
for (i = 0; i < DAHASHSZ; i++) {
|
2001-02-04 12:37:48 +00:00
|
|
|
|
|
|
|
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
|
2000-01-10 00:24:24 +00:00
|
|
|
/* directory entry dependency */
|
|
|
|
retval += 1;
|
|
|
|
if (!wantcount)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
|
|
|
|
case D_BMSAFEMAP:
|
|
|
|
case D_ALLOCDIRECT:
|
|
|
|
case D_ALLOCINDIR:
|
|
|
|
case D_MKDIR:
|
|
|
|
/* never a dependency on these blocks */
|
|
|
|
continue;
|
|
|
|
|
|
|
|
default:
|
2001-02-23 09:01:31 +00:00
|
|
|
FREE_LOCK(&lk);
|
2000-01-10 00:24:24 +00:00
|
|
|
panic("softdep_check_for_rollback: Unexpected type %s",
|
|
|
|
TYPENAME(wk->wk_type));
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Acquire exclusive access to a buffer.
|
|
|
|
* Must be called with splbio blocked.
|
|
|
|
* Return 1 if buffer was acquired.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
getdirtybuf(bpp, waitfor)
|
|
|
|
struct buf **bpp;
|
|
|
|
int waitfor;
|
|
|
|
{
|
|
|
|
struct buf *bp;
|
2002-01-12 20:57:36 +00:00
|
|
|
int error;
|
1998-05-19 19:47:22 +00:00
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
if ((bp = *bpp) == NULL)
|
|
|
|
return (0);
|
2000-01-13 07:20:01 +00:00
|
|
|
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
|
|
|
|
if ((bp->b_xflags & BX_BKGRDINPROG) == 0)
|
|
|
|
break;
|
|
|
|
BUF_UNLOCK(bp);
|
|
|
|
if (waitfor != MNT_WAIT)
|
|
|
|
return (0);
|
|
|
|
bp->b_xflags |= BX_BKGRDWAIT;
|
2002-08-04 10:29:36 +00:00
|
|
|
interlocked_sleep(&lk, SLEEP, &bp->b_xflags, NULL,
|
|
|
|
PRIBIO, "getbuf", 0);
|
2000-01-13 07:20:01 +00:00
|
|
|
continue;
|
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
if (waitfor != MNT_WAIT)
|
|
|
|
return (0);
|
2002-08-04 10:29:36 +00:00
|
|
|
error = interlocked_sleep(&lk, LOCKBUF, bp, NULL,
|
2002-01-12 20:57:36 +00:00
|
|
|
LK_EXCLUSIVE | LK_SLEEPFAIL, 0, 0);
|
|
|
|
if (error != ENOLCK) {
|
|
|
|
FREE_LOCK(&lk);
|
1999-06-26 02:47:16 +00:00
|
|
|
panic("getdirtybuf: inconsistent lock");
|
2002-01-12 20:57:36 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
1999-06-26 02:47:16 +00:00
|
|
|
if ((bp->b_flags & B_DELWRI) == 0) {
|
|
|
|
BUF_UNLOCK(bp);
|
1998-05-19 19:47:22 +00:00
|
|
|
return (0);
|
1999-06-26 02:47:16 +00:00
|
|
|
}
|
1998-05-19 19:47:22 +00:00
|
|
|
bremfree(bp);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
1999-05-07 02:26:47 +00:00
|
|
|
/*
|
|
|
|
* Wait for pending output on a vnode to complete.
|
|
|
|
* Must be called with vnode locked.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
drain_output(vp, islocked)
|
|
|
|
struct vnode *vp;
|
|
|
|
int islocked;
|
|
|
|
{
|
|
|
|
|
|
|
|
if (!islocked)
|
|
|
|
ACQUIRE_LOCK(&lk);
|
2002-08-04 10:29:36 +00:00
|
|
|
VI_LOCK(vp);
|
1999-05-07 02:26:47 +00:00
|
|
|
while (vp->v_numoutput) {
|
2002-08-04 10:29:36 +00:00
|
|
|
vp->v_iflag |= VI_BWAIT;
|
|
|
|
interlocked_sleep(&lk, SLEEP, (caddr_t)&vp->v_numoutput,
|
|
|
|
VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
|
1999-05-07 02:26:47 +00:00
|
|
|
}
|
2002-08-04 10:29:36 +00:00
|
|
|
VI_UNLOCK(vp);
|
1999-05-07 02:26:47 +00:00
|
|
|
if (!islocked)
|
|
|
|
FREE_LOCK(&lk);
|
|
|
|
}
|
|
|
|
|
1998-05-19 19:47:22 +00:00
|
|
|
/*
|
|
|
|
* Called whenever a buffer that is being invalidated or reallocated
|
|
|
|
* contains dependencies. This should only happen if an I/O error has
|
|
|
|
* occurred. The routine is called with the buffer locked.
|
|
|
|
*/
|
2000-01-09 22:40:09 +00:00
|
|
|
static void
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_deallocate_dependencies(bp)
|
|
|
|
struct buf *bp;
|
|
|
|
{
|
1999-05-07 05:11:31 +00:00
|
|
|
|
2000-04-02 15:24:56 +00:00
|
|
|
if ((bp->b_ioflags & BIO_ERROR) == 0)
|
1999-05-07 05:11:31 +00:00
|
|
|
panic("softdep_deallocate_dependencies: dangling deps");
|
1998-05-19 19:47:22 +00:00
|
|
|
softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
|
1999-05-07 05:11:31 +00:00
|
|
|
panic("softdep_deallocate_dependencies: unrecovered I/O error");
|
1998-05-19 19:47:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Report an asynchronous write error in the filesystem.
 *
 * "func" is the label printed in front of the message (callers pass a
 * "function: operation" tag or a mount point name) and "error" is the
 * error number.  For now the error is only logged.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n",
	    func, error);
}
|