freebsd-dev/sys/ufs/ffs/ffs_softdep.c

5966 lines
177 KiB
C
Raw Normal View History

/*
2000-06-22 00:29:53 +00:00
* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
*
* The soft updates code is derived from the appendix of a University
* of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
* "Soft Updates: A Solution to the Metadata Update Problem in File
* Systems", CSE-TR-254-95, August 1995).
*
2000-06-22 00:29:53 +00:00
* Further information about soft updates can be obtained from:
*
2000-06-22 00:29:53 +00:00
* Marshall Kirk McKusick http://www.mckusick.com/softdep/
* 1614 Oxford Street mckusick@mckusick.com
* Berkeley, CA 94709-1608 +1-510-843-9542
* USA
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
2000-06-22 00:29:53 +00:00
* from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
*/
2002-03-15 04:06:10 +00:00
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
*/
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#ifndef DEBUG
#define DEBUG
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kdb.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/softdep.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_extern.h>
/*
* These definitions need to be adapted to the system to which
* this file is being ported.
*/
/*
* malloc types defined for the softdep system.
*/
2000-12-08 20:09:00 +00:00
static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
#define D_PAGEDEP 0
#define D_INODEDEP 1
#define D_NEWBLK 2
#define D_BMSAFEMAP 3
#define D_ALLOCDIRECT 4
#define D_INDIRDEP 5
#define D_ALLOCINDIR 6
#define D_FREEFRAG 7
#define D_FREEBLKS 8
#define D_FREEFILE 9
#define D_DIRADD 10
#define D_MKDIR 11
#define D_DIRREM 12
#define D_NEWDIRBLK 13
#define D_LAST D_NEWDIRBLK
/*
* translate from workitem type to memory type
* MUST match the defines above, such that memtype[D_XXX] == M_XXX
*/
static struct malloc_type *memtype[] = {
M_PAGEDEP,
M_INODEDEP,
M_NEWBLK,
M_BMSAFEMAP,
M_ALLOCDIRECT,
M_INDIRDEP,
M_ALLOCINDIR,
M_FREEFRAG,
M_FREEBLKS,
M_FREEFILE,
M_DIRADD,
M_MKDIR,
M_DIRREM,
M_NEWDIRBLK
};
#define DtoM(type) (memtype[type])
/*
* Names of malloc types.
*/
#define TYPENAME(type) \
((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
/*
* End system adaptaion definitions.
*/
/*
* Internal function prototypes.
*/
2002-03-19 22:40:48 +00:00
static void softdep_error(char *, int);
static void drain_output(struct vnode *, int);
static struct buf *getdirtybuf(struct buf **, struct mtx *, int);
2002-03-19 22:40:48 +00:00
static void clear_remove(struct thread *);
static void clear_inodedeps(struct thread *);
static int flush_pagedep_deps(struct vnode *, struct mount *,
struct diraddhd *);
static int flush_inodedep_deps(struct fs *, ino_t);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
static int flush_deplist(struct allocdirectlst *, int, int *);
2002-03-19 22:40:48 +00:00
static int handle_written_filepage(struct pagedep *, struct buf *);
static void diradd_inode_written(struct diradd *, struct inodedep *);
static int handle_written_inodeblock(struct inodedep *, struct buf *);
static void handle_allocdirect_partdone(struct allocdirect *);
static void handle_allocindir_partdone(struct allocindir *);
static void initiate_write_filepage(struct pagedep *, struct buf *);
static void handle_written_mkdir(struct mkdir *, int);
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
2002-03-19 22:40:48 +00:00
static void handle_workitem_freefile(struct freefile *);
static void handle_workitem_remove(struct dirrem *, struct vnode *);
static struct dirrem *newdirrem(struct buf *, struct inode *,
struct inode *, int, struct dirrem **);
static void free_diradd(struct diradd *);
static void free_allocindir(struct allocindir *, struct inodedep *);
static void free_newdirblk(struct newdirblk *);
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
ufs2_daddr_t *);
2002-03-19 22:40:48 +00:00
static void deallocate_dependencies(struct buf *, struct inodedep *);
static void free_allocdirect(struct allocdirectlst *,
struct allocdirect *, int);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
static void handle_workitem_freeblocks(struct freeblks *, int);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
2002-03-19 22:40:48 +00:00
static void setup_allocindir_phase2(struct buf *, struct inode *,
struct allocindir *);
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
ufs2_daddr_t);
2002-03-19 22:40:48 +00:00
static void handle_workitem_freefrag(struct freefrag *);
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
2002-03-19 22:40:48 +00:00
static void allocdirect_merge(struct allocdirectlst *,
struct allocdirect *, struct allocdirect *);
static struct bmsafemap *bmsafemap_lookup(struct buf *);
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
2002-03-19 22:40:48 +00:00
static int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
static void pause_timer(void *);
static int request_cleanup(int, int);
static int process_worklist_item(struct mount *, int);
static void add_to_worklist(struct worklist *);
/*
* Exported softdep operations.
*/
static int softdep_disk_prewrite(struct vnode *vp, struct buf *bp);
2002-03-19 22:40:48 +00:00
static void softdep_disk_io_initiation(struct buf *);
static void softdep_disk_write_complete(struct buf *);
static void softdep_deallocate_dependencies(struct buf *);
static void softdep_move_dependencies(struct buf *, struct buf *);
static int softdep_count_dependencies(struct buf *bp, int);
/*
* Locking primitives.
*
* For a uniprocessor, all we need to do is protect against disk
* interrupts. For a multiprocessor, this lock would have to be
* a mutex. A single mutex is used throughout this file, though
* finer grain locking could be used if contention warranted it.
*
* For a multiprocessor, the sleep call would accept a lock and
* release it after the sleep processing was complete. In a uniprocessor
* implementation there is no such interlock, so we simple mark
* the places where it needs to be done with the `interlocked' form
* of the lock calls. Since the uniprocessor sleep already interlocks
* the spl, there is nothing that really needs to be done.
*/
#ifndef /* NOT */ DEBUG
static struct lockit {
int lkt_spl;
} lk = { 0 };
#define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
#define FREE_LOCK(lk) splx((lk)->lkt_spl)
#else /* DEBUG */
#define NOHOLDER ((struct thread *)-1)
#define SPECIAL_FLAG ((struct thread *)-2)
static struct lockit {
int lkt_spl;
struct thread *lkt_held;
} lk = { 0, NOHOLDER };
2002-03-19 22:40:48 +00:00
static void acquire_lock(struct lockit *);
static void free_lock(struct lockit *);
void softdep_panic(char *);
#define ACQUIRE_LOCK(lk) acquire_lock(lk)
#define FREE_LOCK(lk) free_lock(lk)
static void
acquire_lock(lk)
struct lockit *lk;
{
struct thread *holder;
if (lk->lkt_held != NOHOLDER) {
holder = lk->lkt_held;
FREE_LOCK(lk);
if (holder == curthread)
panic("softdep_lock: locking against myself");
else
panic("softdep_lock: lock held by %p", holder);
}
lk->lkt_spl = splbio();
lk->lkt_held = curthread;
}
static void
free_lock(lk)
struct lockit *lk;
{
if (lk->lkt_held == NOHOLDER)
panic("softdep_unlock: lock not held");
lk->lkt_held = NOHOLDER;
splx(lk->lkt_spl);
}
/*
* Function to release soft updates lock and panic.
*/
void
softdep_panic(msg)
char *msg;
{
if (lk.lkt_held != NOHOLDER)
FREE_LOCK(&lk);
panic(msg);
}
#endif /* DEBUG */
static int interlocked_sleep(struct lockit *, int, void *, struct mtx *, int,
2002-03-19 22:40:48 +00:00
const char *, int);
/*
* When going to sleep, we must save our SPL so that it does
* not get lost if some other process uses the lock while we
* are sleeping. We restore it after we have slept. This routine
* wraps the interlocking with functions that sleep. The list
* below enumerates the available set of operations.
*/
#define UNKNOWN 0
#define SLEEP 1
#define LOCKBUF 2
static int
interlocked_sleep(lk, op, ident, mtx, flags, wmesg, timo)
struct lockit *lk;
int op;
void *ident;
struct mtx *mtx;
int flags;
const char *wmesg;
int timo;
{
struct thread *holder;
int s, retval;
s = lk->lkt_spl;
# ifdef DEBUG
if (lk->lkt_held == NOHOLDER)
panic("interlocked_sleep: lock not held");
lk->lkt_held = NOHOLDER;
# endif /* DEBUG */
switch (op) {
case SLEEP:
retval = msleep(ident, mtx, flags, wmesg, timo);
break;
case LOCKBUF:
retval = BUF_LOCK((struct buf *)ident, flags, mtx);
break;
default:
panic("interlocked_sleep: unknown operation");
}
# ifdef DEBUG
if (lk->lkt_held != NOHOLDER) {
holder = lk->lkt_held;
FREE_LOCK(lk);
if (holder == curthread)
panic("interlocked_sleep: locking against self");
else
panic("interlocked_sleep: lock held by %p", holder);
}
lk->lkt_held = curthread;
# endif /* DEBUG */
lk->lkt_spl = s;
return (retval);
}
/*
* Place holder for real semaphores.
*/
struct sema {
int value;
struct thread *holder;
char *name;
int prio;
int timo;
};
2002-03-19 22:40:48 +00:00
static void sema_init(struct sema *, char *, int, int);
static int sema_get(struct sema *, struct lockit *);
static void sema_release(struct sema *);
static void
sema_init(semap, name, prio, timo)
struct sema *semap;
char *name;
int prio, timo;
{
semap->holder = NOHOLDER;
semap->value = 0;
semap->name = name;
semap->prio = prio;
semap->timo = timo;
}
static int
sema_get(semap, interlock)
struct sema *semap;
struct lockit *interlock;
{
if (semap->value++ > 0) {
if (interlock != NULL) {
interlocked_sleep(interlock, SLEEP, (caddr_t)semap,
NULL, semap->prio, semap->name,
semap->timo);
FREE_LOCK(interlock);
} else {
tsleep(semap, semap->prio, semap->name,
semap->timo);
}
return (0);
}
semap->holder = curthread;
if (interlock != NULL)
FREE_LOCK(interlock);
return (1);
}
static void
sema_release(semap)
struct sema *semap;
{
if (semap->value <= 0 || semap->holder != curthread) {
if (lk.lkt_held != NOHOLDER)
FREE_LOCK(&lk);
panic("sema_release: not held");
}
if (--semap->value > 0) {
semap->value = 0;
wakeup(semap);
}
semap->holder = NOHOLDER;
}
/*
* Worklist queue management.
* These routines require that the lock be held.
*/
#ifndef /* NOT */ DEBUG
#define WORKLIST_INSERT(head, item) do { \
(item)->wk_state |= ONWORKLIST; \
LIST_INSERT_HEAD(head, item, wk_list); \
} while (0)
#define WORKLIST_REMOVE(item) do { \
(item)->wk_state &= ~ONWORKLIST; \
LIST_REMOVE(item, wk_list); \
} while (0)
#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
#else /* DEBUG */
2002-03-19 22:40:48 +00:00
static void worklist_insert(struct workhead *, struct worklist *);
static void worklist_remove(struct worklist *);
static void workitem_free(struct worklist *, int);
#define WORKLIST_INSERT(head, item) worklist_insert(head, item)
#define WORKLIST_REMOVE(item) worklist_remove(item)
#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
static void
worklist_insert(head, item)
struct workhead *head;
struct worklist *item;
{
if (lk.lkt_held == NOHOLDER)
panic("worklist_insert: lock not held");
if (item->wk_state & ONWORKLIST) {
FREE_LOCK(&lk);
panic("worklist_insert: already on list");
}
item->wk_state |= ONWORKLIST;
LIST_INSERT_HEAD(head, item, wk_list);
}
static void
worklist_remove(item)
struct worklist *item;
{
if (lk.lkt_held == NOHOLDER)
panic("worklist_remove: lock not held");
if ((item->wk_state & ONWORKLIST) == 0) {
FREE_LOCK(&lk);
panic("worklist_remove: not on list");
}
item->wk_state &= ~ONWORKLIST;
LIST_REMOVE(item, wk_list);
}
static void
workitem_free(item, type)
struct worklist *item;
int type;
{
if (item->wk_state & ONWORKLIST) {
if (lk.lkt_held != NOHOLDER)
FREE_LOCK(&lk);
panic("workitem_free: still on list");
}
if (item->wk_type != type) {
if (lk.lkt_held != NOHOLDER)
FREE_LOCK(&lk);
panic("workitem_free: type mismatch");
}
FREE(item, DtoM(type));
}
#endif /* DEBUG */
/*
* Workitem queue management
*/
static struct workhead softdep_workitem_pending;
static struct worklist *worklist_tail;
static int num_on_worklist; /* number of worklist items to be processed */
static int softdep_worklist_busy; /* 1 => trying to do unmount */
static int softdep_worklist_req; /* serialized waiters */
static int max_softdeps; /* maximum number of structs before slowdown */
static int maxindirdeps = 50; /* max number of indirdeps before slowdown */
static int tickdelay = 2; /* number of ticks to pause during slowdown */
static int proc_waiting; /* tracks whether we have a timeout posted */
static int *stat_countp; /* statistic to count in proc_waiting timeout */
static struct callout_handle handle; /* handle on posted proc_waiting timeout */
static struct thread *filesys_syncer; /* proc of filesystem syncer process */
static int req_clear_inodedeps; /* syncer process flush some inodedeps */
#define FLUSH_INODES 1
static int req_clear_remove; /* syncer process flush some freeblks */
#define FLUSH_REMOVE 2
#define FLUSH_REMOVE_WAIT 3
/*
* runtime statistics
*/
static int stat_worklist_push; /* number of worklist cleanups */
static int stat_blk_limit_push; /* number of times block limit neared */
static int stat_ino_limit_push; /* number of times inode limit neared */
static int stat_blk_limit_hit; /* number of times block slowdown imposed */
static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
#endif /* DEBUG */
/*
* Add an item to the end of the work queue.
* This routine requires that the lock be held.
* This is the only routine that adds items to the list.
* The following routine is the only one that removes items
* and does so in order from first to last.
*/
static void
add_to_worklist(wk)
struct worklist *wk;
{
if (wk->wk_state & ONWORKLIST) {
if (lk.lkt_held != NOHOLDER)
FREE_LOCK(&lk);
panic("add_to_worklist: already on list");
}
wk->wk_state |= ONWORKLIST;
if (LIST_FIRST(&softdep_workitem_pending) == NULL)
LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
else
LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
worklist_tail = wk;
num_on_worklist += 1;
}
/*
* Process that runs once per second to handle items in the background queue.
*
* Note that we ensure that everything is done in the order in which they
* appear in the queue. The code below depends on this property to ensure
* that blocks of a file are freed before the inode itself is freed. This
* ordering ensures that no new <vfsid, inum, lbn> triples will be generated
* until all the old ones have been purged from the dependency lists.
*/
int
softdep_process_worklist(matchmnt)
struct mount *matchmnt;
{
struct thread *td = curthread;
int cnt, matchcnt, loopcount;
long starttime;
/*
* Record the process identifier of our caller so that we can give
* this process preferential treatment in request_cleanup below.
*/
filesys_syncer = td;
matchcnt = 0;
/*
* There is no danger of having multiple processes run this
* code, but we have to single-thread it when softdep_flushfiles()
* is in operation to get an accurate count of the number of items
* related to its mount point that are in the list.
*/
if (matchmnt == NULL) {
if (softdep_worklist_busy < 0)
return(-1);
softdep_worklist_busy += 1;
}
/*
* If requested, try removing inode or removal dependencies.
*/
if (req_clear_inodedeps) {
clear_inodedeps(td);
req_clear_inodedeps -= 1;
wakeup_one(&proc_waiting);
}
if (req_clear_remove) {
clear_remove(td);
req_clear_remove -= 1;
wakeup_one(&proc_waiting);
}
loopcount = 1;
starttime = time_second;
while (num_on_worklist > 0) {
if ((cnt = process_worklist_item(matchmnt, 0)) == -1)
break;
else
matchcnt += cnt;
/*
* If a umount operation wants to run the worklist
* accurately, abort.
*/
if (softdep_worklist_req && matchmnt == NULL) {
matchcnt = -1;
break;
}
/*
* If requested, try removing inode or removal dependencies.
*/
if (req_clear_inodedeps) {
clear_inodedeps(td);
req_clear_inodedeps -= 1;
wakeup_one(&proc_waiting);
}
if (req_clear_remove) {
clear_remove(td);
req_clear_remove -= 1;
wakeup_one(&proc_waiting);
}
/*
* We do not generally want to stop for buffer space, but if
* we are really being a buffer hog, we will stop and wait.
*/
if (loopcount++ % 128 == 0)
bwillwrite();
/*
* Never allow processing to run for more than one
* second. Otherwise the other syncer tasks may get
* excessively backlogged.
*/
if (starttime != time_second && matchmnt == NULL) {
matchcnt = -1;
break;
}
}
if (matchmnt == NULL) {
softdep_worklist_busy -= 1;
if (softdep_worklist_req && softdep_worklist_busy == 0)
wakeup(&softdep_worklist_req);
}
return (matchcnt);
}
/*
* Process one item on the worklist.
*/
static int
process_worklist_item(matchmnt, flags)
struct mount *matchmnt;
int flags;
{
struct worklist *wk, *wkend;
struct mount *mp;
struct vnode *vp;
int matchcnt = 0;
/*
* If we are being called because of a process doing a
* copy-on-write, then it is not safe to write as we may
* recurse into the copy-on-write routine.
*/
if (curthread->td_pflags & TDP_COWINPROGRESS)
return (-1);
ACQUIRE_LOCK(&lk);
/*
* Normally we just process each item on the worklist in order.
* However, if we are in a situation where we cannot lock any
* inodes, we have to skip over any dirrem requests whose
* vnodes are resident and locked.
*/
vp = NULL;
LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
if (wk->wk_state & INPROGRESS)
continue;
if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
break;
wk->wk_state |= INPROGRESS;
FREE_LOCK(&lk);
VFS_VGET(WK_DIRREM(wk)->dm_mnt, WK_DIRREM(wk)->dm_oldinum,
LK_NOWAIT | LK_EXCLUSIVE, &vp);
ACQUIRE_LOCK(&lk);
wk->wk_state &= ~INPROGRESS;
if (vp != NULL)
break;
}
if (wk == 0) {
FREE_LOCK(&lk);
return (-1);
}
/*
* Remove the item to be processed. If we are removing the last
* item on the list, we need to recalculate the tail pointer.
* As this happens rarely and usually when the list is short,
* we just run down the list to find it rather than tracking it
* in the above loop.
*/
WORKLIST_REMOVE(wk);
if (wk == worklist_tail) {
LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
if (LIST_NEXT(wkend, wk_list) == NULL)
break;
worklist_tail = wkend;
}
num_on_worklist -= 1;
FREE_LOCK(&lk);
switch (wk->wk_type) {
case D_DIRREM:
/* removal of a directory entry */
mp = WK_DIRREM(wk)->dm_mnt;
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: dirrem on suspended filesystem",
"process_worklist_item");
if (mp == matchmnt)
matchcnt += 1;
handle_workitem_remove(WK_DIRREM(wk), vp);
break;
case D_FREEBLKS:
/* releasing blocks and/or fragments from a file */
mp = WK_FREEBLKS(wk)->fb_mnt;
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freeblks on suspended filesystem",
"process_worklist_item");
if (mp == matchmnt)
matchcnt += 1;
handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
break;
case D_FREEFRAG:
/* releasing a fragment when replaced as a file grows */
mp = WK_FREEFRAG(wk)->ff_mnt;
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freefrag on suspended filesystem",
"process_worklist_item");
if (mp == matchmnt)
matchcnt += 1;
handle_workitem_freefrag(WK_FREEFRAG(wk));
break;
case D_FREEFILE:
/* releasing an inode when its link count drops to 0 */
mp = WK_FREEFILE(wk)->fx_mnt;
if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
panic("%s: freefile on suspended filesystem",
"process_worklist_item");
if (mp == matchmnt)
matchcnt += 1;
handle_workitem_freefile(WK_FREEFILE(wk));
break;
default:
panic("%s_process_worklist: Unknown type %s",
"softdep", TYPENAME(wk->wk_type));
/* NOTREACHED */
}
return (matchcnt);
}
/*
* Move dependencies from one buffer to another.
*/
static void
softdep_move_dependencies(oldbp, newbp)
struct buf *oldbp;
struct buf *newbp;
{
struct worklist *wk, *wktail;
if (LIST_FIRST(&newbp->b_dep) != NULL)
panic("softdep_move_dependencies: need merge code");
wktail = 0;
ACQUIRE_LOCK(&lk);
while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
LIST_REMOVE(wk, wk_list);
if (wktail == 0)
LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
else
LIST_INSERT_AFTER(wktail, wk, wk_list);
wktail = wk;
}
FREE_LOCK(&lk);
}
/*
* Purge the work list of all items associated with a particular mount point.
*/
int
softdep_flushworklist(oldmnt, countp, td)
struct mount *oldmnt;
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
int *countp;
struct thread *td;
{
struct vnode *devvp;
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
int count, error = 0;
/*
* Await our turn to clear out the queue, then serialize access.
*/
while (softdep_worklist_busy) {
softdep_worklist_req += 1;
tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
softdep_worklist_req -= 1;
}
softdep_worklist_busy = -1;
/*
* Alternately flush the block device associated with the mount
* point and process any dependencies that the flushing
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
* creates. We continue until no more worklist dependencies
* are found.
*/
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
*countp = 0;
devvp = VFSTOUFS(oldmnt)->um_devvp;
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
while ((count = softdep_process_worklist(oldmnt)) > 0) {
*countp += count;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
error = VOP_FSYNC(devvp, td->td_ucred, MNT_WAIT, td);
VOP_UNLOCK(devvp, 0, td);
if (error)
break;
}
softdep_worklist_busy = 0;
if (softdep_worklist_req)
wakeup(&softdep_worklist_req);
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
return (error);
}
/*
* Flush all vnodes and worklist items associated with a specified mount point.
*/
int
softdep_flushfiles(oldmnt, flags, td)
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
struct mount *oldmnt;
int flags;
struct thread *td;
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
{
int error, count, loopcnt;
2002-03-15 04:06:10 +00:00
error = 0;
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
/*
* Alternately flush the vnodes associated with the mount
* point and process any dependencies that the flushing
* creates. In theory, this loop can happen at most twice,
* but we give it a few extra just to be sure.
*/
for (loopcnt = 10; loopcnt > 0; loopcnt--) {
/*
* Do another flush in case any vnodes were brought in
* as part of the cleanup operations.
*/
if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
break;
if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
count == 0)
break;
}
/*
* If we are unmounting then it is an error to fail. If we
* are simply trying to downgrade to read-only, then filesystem
* activity can keep us busy forever, so we just fail with EBUSY.
*/
if (loopcnt == 0) {
if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
panic("softdep_flushfiles: looping");
error = EBUSY;
}
return (error);
}
/*
* Structure hashing.
*
* There are three types of structures that can be looked up:
* 1) pagedep structures identified by mount point, inode number,
* and logical block.
* 2) inodedep structures identified by mount point and inode number.
* 3) newblk structures identified by mount point and
* physical block number.
*
* The "pagedep" and "inodedep" dependency structures are hashed
* separately from the file blocks and inodes to which they correspond.
* This separation helps when the in-memory copy of an inode or
* file block must be replaced. It also obviates the need to access
* an inode or file page when simply updating (or de-allocating)
* dependency structures. Lookup of newblk structures is needed to
* find newly allocated blocks when trying to associate them with
* their allocdirect or allocindir structure.
*
* The lookup routines optionally create and hash a new instance when
* an existing entry is not found.
*/
#define DEPALLOC 0x0001 /* allocate structure if lookup fails */
This patch corrects two problems with the rate limiting code that was introduced in revision 1.80. The problem manifested itself with a `locking against myself' panic and could also result in soft updates inconsistences associated with inodedeps. The two problems are: 1) One of the background operations could manipulate the bitmap while holding it locked with intent to create. This held lock results in a `locking against myself' panic, when the background processing that we have been coopted to do tries to lock the bitmap which we are already holding locked. To understand how to fix this problem, first, observe that we can do the background cleanups in inodedep_lookup only when allocating inodedeps (DEPALLOC is set in the call to inodedep_lookup). Second observe that calls to inodedep_lookup with DEPALLOC set can only happen from the following calls into the softdep code: softdep_setup_inomapdep softdep_setup_allocdirect softdep_setup_remove softdep_setup_freeblocks softdep_setup_directory_change softdep_setup_directory_add softdep_change_linkcnt Only the first two of these can come from ffs_alloc.c while holding a bitmap locked. Thus, inodedep_lookup must not go off to do request_cleanups when being called from these functions. This change adds a flag, NODELAY, that can be passed to inodedep_lookup to let it know that it should not do background processing in those cases. 2) The return value from request_cleanup when helping out with the cleanup was 0 instead of 1. This meant that despite the fact that we may have slept while doing the cleanups, the code did not recheck for the appearance of an inodedep (e.g., goto top in inodedep_lookup). This lead to the softdep inconsistency in which we ended up with two inodedep's for the same inode. Reviewed by: Peter Wemm <peter@yahoo-inc.com>, Matt Dillon <dillon@earth.backplane.com>
2001-02-20 11:14:38 +00:00
#define NODELAY 0x0002 /* cannot do background work */
/*
* Structures and routines associated with pagedep caching.
*/
LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
u_long pagedep_hash; /* size of hash table - 1 */
#define PAGEDEP_HASH(mp, inum, lbn) \
(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
pagedep_hash])
static struct sema pagedep_in_progress;
/*
* Look up a pagedep. Return 1 if found, 0 if not found or found
* when asked to allocate but not associated with any buffer.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in pagedeppp.
* This routine must be called with splbio interrupts blocked.
*/
static int
pagedep_lookup(ip, lbn, flags, pagedeppp)
struct inode *ip;
ufs_lbn_t lbn;
int flags;
struct pagedep **pagedeppp;
{
struct pagedep *pagedep;
struct pagedep_hashhead *pagedephd;
struct mount *mp;
int i;
#ifdef DEBUG
if (lk.lkt_held == NOHOLDER)
panic("pagedep_lookup: lock not held");
#endif
mp = ITOV(ip)->v_mount;
pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
top:
LIST_FOREACH(pagedep, pagedephd, pd_hash)
if (ip->i_number == pagedep->pd_ino &&
lbn == pagedep->pd_lbn &&
mp == pagedep->pd_mnt)
break;
if (pagedep) {
*pagedeppp = pagedep;
if ((flags & DEPALLOC) != 0 &&
(pagedep->pd_state & ONWORKLIST) == 0)
return (0);
return (1);
}
if ((flags & DEPALLOC) == 0) {
*pagedeppp = NULL;
return (0);
}
if (sema_get(&pagedep_in_progress, &lk) == 0) {
ACQUIRE_LOCK(&lk);
goto top;
}
MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
M_SOFTDEP_FLAGS|M_ZERO);
pagedep->pd_list.wk_type = D_PAGEDEP;
pagedep->pd_mnt = mp;
pagedep->pd_ino = ip->i_number;
pagedep->pd_lbn = lbn;
LIST_INIT(&pagedep->pd_dirremhd);
LIST_INIT(&pagedep->pd_pendinghd);
for (i = 0; i < DAHASHSZ; i++)
LIST_INIT(&pagedep->pd_diraddhd[i]);
ACQUIRE_LOCK(&lk);
LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
sema_release(&pagedep_in_progress);
*pagedeppp = pagedep;
return (0);
}
/*
* Structures and routines associated with inodedep caching.
*/
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
static u_long inodedep_hash; /* size of hash table - 1 */
static long num_inodedep; /* number of inodedep allocated */
#define INODEDEP_HASH(fs, inum) \
(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;
/*
* Look up an inodedep. Return 1 if found, 0 if not found.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in inodedeppp.
* This routine must be called with splbio interrupts blocked.
*/
static int
inodedep_lookup(fs, inum, flags, inodedeppp)
struct fs *fs;
ino_t inum;
int flags;
struct inodedep **inodedeppp;
{
struct inodedep *inodedep;
struct inodedep_hashhead *inodedephd;
int firsttry;
#ifdef DEBUG
if (lk.lkt_held == NOHOLDER)
panic("inodedep_lookup: lock not held");
#endif
firsttry = 1;
inodedephd = INODEDEP_HASH(fs, inum);
top:
LIST_FOREACH(inodedep, inodedephd, id_hash)
if (inum == inodedep->id_ino && fs == inodedep->id_fs)
break;
if (inodedep) {
*inodedeppp = inodedep;
return (1);
}
if ((flags & DEPALLOC) == 0) {
*inodedeppp = NULL;
return (0);
}
/*
* If we are over our limit, try to improve the situation.
*/
This patch corrects two problems with the rate limiting code that was introduced in revision 1.80. The problem manifested itself with a `locking against myself' panic and could also result in soft updates inconsistences associated with inodedeps. The two problems are: 1) One of the background operations could manipulate the bitmap while holding it locked with intent to create. This held lock results in a `locking against myself' panic, when the background processing that we have been coopted to do tries to lock the bitmap which we are already holding locked. To understand how to fix this problem, first, observe that we can do the background cleanups in inodedep_lookup only when allocating inodedeps (DEPALLOC is set in the call to inodedep_lookup). Second observe that calls to inodedep_lookup with DEPALLOC set can only happen from the following calls into the softdep code: softdep_setup_inomapdep softdep_setup_allocdirect softdep_setup_remove softdep_setup_freeblocks softdep_setup_directory_change softdep_setup_directory_add softdep_change_linkcnt Only the first two of these can come from ffs_alloc.c while holding a bitmap locked. Thus, inodedep_lookup must not go off to do request_cleanups when being called from these functions. This change adds a flag, NODELAY, that can be passed to inodedep_lookup to let it know that it should not do background processing in those cases. 2) The return value from request_cleanup when helping out with the cleanup was 0 instead of 1. This meant that despite the fact that we may have slept while doing the cleanups, the code did not recheck for the appearance of an inodedep (e.g., goto top in inodedep_lookup). This lead to the softdep inconsistency in which we ended up with two inodedep's for the same inode. Reviewed by: Peter Wemm <peter@yahoo-inc.com>, Matt Dillon <dillon@earth.backplane.com>
2001-02-20 11:14:38 +00:00
if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
request_cleanup(FLUSH_INODES, 1)) {
firsttry = 0;
goto top;
}
if (sema_get(&inodedep_in_progress, &lk) == 0) {
ACQUIRE_LOCK(&lk);
goto top;
}
num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_INODEDEP, M_SOFTDEP_FLAGS);
inodedep->id_list.wk_type = D_INODEDEP;
inodedep->id_fs = fs;
inodedep->id_ino = inum;
inodedep->id_state = ALLCOMPLETE;
inodedep->id_nlinkdelta = 0;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
inodedep->id_savedino1 = NULL;
inodedep->id_savedsize = -1;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
inodedep->id_savedextsize = -1;
inodedep->id_buf = NULL;
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
TAILQ_INIT(&inodedep->id_extupdt);
TAILQ_INIT(&inodedep->id_newextupdt);
ACQUIRE_LOCK(&lk);
LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
sema_release(&inodedep_in_progress);
*inodedeppp = inodedep;
return (0);
}
/*
* Structures and routines associated with newblk caching.
*/
LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
u_long newblk_hash; /* size of hash table - 1 */
#define NEWBLK_HASH(fs, inum) \
(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
static struct sema newblk_in_progress;
/*
* Look up a newblk. Return 1 if found, 0 if not found.
* If not found, allocate if DEPALLOC flag is passed.
* Found or allocated entry is returned in newblkpp.
*/
static int
newblk_lookup(fs, newblkno, flags, newblkpp)
struct fs *fs;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t newblkno;
int flags;
struct newblk **newblkpp;
{
struct newblk *newblk;
struct newblk_hashhead *newblkhd;
newblkhd = NEWBLK_HASH(fs, newblkno);
top:
LIST_FOREACH(newblk, newblkhd, nb_hash)
if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
break;
if (newblk) {
*newblkpp = newblk;
return (1);
}
if ((flags & DEPALLOC) == 0) {
*newblkpp = NULL;
return (0);
}
if (sema_get(&newblk_in_progress, 0) == 0)
goto top;
MALLOC(newblk, struct newblk *, sizeof(struct newblk),
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_NEWBLK, M_SOFTDEP_FLAGS);
newblk->nb_state = 0;
newblk->nb_fs = fs;
newblk->nb_newblkno = newblkno;
LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
sema_release(&newblk_in_progress);
*newblkpp = newblk;
return (0);
}
/*
* Executed during filesystem system initialization before
2002-05-16 21:28:32 +00:00
* mounting any filesystems.
*/
void
softdep_initialize()
{
LIST_INIT(&mkdirlisthd);
LIST_INIT(&softdep_workitem_pending);
max_softdeps = desiredvnodes * 4;
pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
&pagedep_hash);
sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
/* hooks through which the main kernel code calls us */
softdep_process_worklist_hook = softdep_process_worklist;
softdep_fsync_hook = softdep_fsync;
/* initialise bioops hack */
bioops.io_prewrite = softdep_disk_prewrite;
bioops.io_start = softdep_disk_io_initiation;
bioops.io_complete = softdep_disk_write_complete;
bioops.io_deallocate = softdep_deallocate_dependencies;
bioops.io_movedeps = softdep_move_dependencies;
bioops.io_countdeps = softdep_count_dependencies;
}
/*
* Executed after all filesystems have been unmounted during
* filesystem module unload.
*/
void
softdep_uninitialize()
{
softdep_process_worklist_hook = NULL;
softdep_fsync_hook = NULL;
hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
}
/*
* Called at mount time to notify the dependency code that a
* filesystem wishes to use it.
*/
int
softdep_mount(devvp, mp, fs, cred)
struct vnode *devvp;
struct mount *mp;
struct fs *fs;
struct ucred *cred;
{
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
struct csum_total cstotal;
struct cg *cgp;
struct buf *bp;
int error, cyl;
mp->mnt_flag &= ~MNT_ASYNC;
mp->mnt_flag |= MNT_SOFTDEP;
/*
* When doing soft updates, the counters in the
* superblock may have gotten out of sync, so we have
* to scan the cylinder groups and recalculate them.
*/
if (fs->fs_clean != 0)
return (0);
bzero(&cstotal, sizeof cstotal);
for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
fs->fs_cgsize, cred, &bp)) != 0) {
brelse(bp);
return (error);
}
cgp = (struct cg *)bp->b_data;
cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
fs->fs_cs(fs, cyl) = cgp->cg_cs;
brelse(bp);
}
#ifdef DEBUG
if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
#endif
bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
return (0);
}
/*
* Protecting the freemaps (or bitmaps).
*
2002-05-16 21:28:32 +00:00
* To eliminate the need to execute fsck before mounting a filesystem
* after a power failure, one must (conservatively) guarantee that the
* on-disk copy of the bitmaps never indicate that a live inode or block is
* free. So, when a block or inode is allocated, the bitmap should be
* updated (on disk) before any new pointers. When a block or inode is
* freed, the bitmap should not be updated until all pointers have been
* reset. The latter dependency is handled by the delayed de-allocation
* approach described below for block and inode de-allocation. The former
* dependency is handled by calling the following procedure when a block or
* inode is allocated. When an inode is allocated an "inodedep" is created
* with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
* Each "inodedep" is also inserted into the hash indexing structure so
* that any additional link additions can be made dependent on the inode
* allocation.
*
2002-05-16 21:28:32 +00:00
* The ufs filesystem maintains a number of free block counts (e.g., per
* cylinder group, per cylinder and per <cylinder, rotational position> pair)
* in addition to the bitmaps. These counts are used to improve efficiency
* during allocation and therefore must be consistent with the bitmaps.
* There is no convenient way to guarantee post-crash consistency of these
* counts with simple update ordering, for two main reasons: (1) The counts
* and bitmaps for a single cylinder group block are not in the same disk
* sector. If a disk write is interrupted (e.g., by power failure), one may
* be written and the other not. (2) Some of the counts are located in the
* superblock rather than the cylinder group block. So, we focus our soft
* updates implementation on protecting the bitmaps. When mounting a
* filesystem, we recompute the auxiliary counts from the bitmaps.
*/
/*
* Called just after updating the cylinder group block to allocate an inode.
*/
void
softdep_setup_inomapdep(bp, ip, newinum)
struct buf *bp; /* buffer for cylgroup block with inode map */
struct inode *ip; /* inode related to allocation */
ino_t newinum; /* new inode number being allocated */
{
struct inodedep *inodedep;
struct bmsafemap *bmsafemap;
/*
* Create a dependency for the newly allocated inode.
* Panic if it already exists as something is seriously wrong.
* Otherwise add it to the dependency list for the buffer holding
* the cylinder group map from which it was allocated.
*/
ACQUIRE_LOCK(&lk);
if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep))) {
FREE_LOCK(&lk);
panic("softdep_setup_inomapdep: found inode");
}
inodedep->id_buf = bp;
inodedep->id_state &= ~DEPCOMPLETE;
bmsafemap = bmsafemap_lookup(bp);
LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
FREE_LOCK(&lk);
}
/*
* Called just after updating the cylinder group block to
* allocate block or fragment.
*/
void
softdep_setup_blkmapdep(bp, fs, newblkno)
struct buf *bp; /* buffer for cylgroup block with block map */
struct fs *fs; /* filesystem doing allocation */
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t newblkno; /* number of newly allocated block */
{
struct newblk *newblk;
struct bmsafemap *bmsafemap;
/*
* Create a dependency for the newly allocated block.
* Add it to the dependency list for the buffer holding
* the cylinder group map from which it was allocated.
*/
if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
panic("softdep_setup_blkmapdep: found block");
ACQUIRE_LOCK(&lk);
newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
FREE_LOCK(&lk);
}
/*
* Find the bmsafemap associated with a cylinder group buffer.
* If none exists, create one. The buffer must be locked when
* this routine is called and this routine must be called with
* splbio interrupts blocked.
*/
static struct bmsafemap *
bmsafemap_lookup(bp)
struct buf *bp;
{
struct bmsafemap *bmsafemap;
struct worklist *wk;
#ifdef DEBUG
if (lk.lkt_held == NOHOLDER)
panic("bmsafemap_lookup: lock not held");
#endif
LIST_FOREACH(wk, &bp->b_dep, wk_list)
if (wk->wk_type == D_BMSAFEMAP)
return (WK_BMSAFEMAP(wk));
FREE_LOCK(&lk);
MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_BMSAFEMAP, M_SOFTDEP_FLAGS);
bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
bmsafemap->sm_list.wk_state = 0;
bmsafemap->sm_buf = bp;
LIST_INIT(&bmsafemap->sm_allocdirecthd);
LIST_INIT(&bmsafemap->sm_allocindirhd);
LIST_INIT(&bmsafemap->sm_inodedephd);
LIST_INIT(&bmsafemap->sm_newblkhd);
ACQUIRE_LOCK(&lk);
WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
return (bmsafemap);
}
/*
* Direct block allocation dependencies.
*
* When a new block is allocated, the corresponding disk locations must be
* initialized (with zeros or new data) before the on-disk inode points to
* them. Also, the freemap from which the block was allocated must be
* updated (on disk) before the inode's pointer. These two dependencies are
* independent of each other and are needed for all file blocks and indirect
* blocks that are pointed to directly by the inode. Just before the
* "in-core" version of the inode is updated with a newly allocated block
* number, a procedure (below) is called to setup allocation dependency
* structures. These structures are removed when the corresponding
* dependencies are satisfied or when the block allocation becomes obsolete
* (i.e., the file is deleted, the block is de-allocated, or the block is a
* fragment that gets upgraded). All of these cases are handled in
* procedures described later.
*
* When a file extension causes a fragment to be upgraded, either to a larger
* fragment or to a full block, the on-disk location may change (if the
* previous fragment could not simply be extended). In this case, the old
* fragment must be de-allocated, but not until after the inode's pointer has
* been updated. In most cases, this is handled by later procedures, which
* will construct a "freefrag" structure to be added to the workitem queue
* when the inode update is complete (or obsolete). The main exception to
* this is when an allocation occurs while a pending allocation dependency
* (for the same block pointer) remains. This case is handled in the main
* allocation dependency setup procedure by immediately freeing the
* unreferenced fragments.
*/
void
softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
struct inode *ip; /* inode to which block is being added */
ufs_lbn_t lbn; /* block pointer within inode */
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
long newsize; /* size of new block */
long oldsize; /* size of new block */
struct buf *bp; /* bp for allocated block */
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
struct bmsafemap *bmsafemap;
struct inodedep *inodedep;
struct pagedep *pagedep;
struct newblk *newblk;
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
adp->ad_list.wk_type = D_ALLOCDIRECT;
adp->ad_lbn = lbn;
adp->ad_newblkno = newblkno;
adp->ad_oldblkno = oldblkno;
adp->ad_newsize = newsize;
adp->ad_oldsize = oldsize;
adp->ad_state = ATTACHED;
LIST_INIT(&adp->ad_newdirblk);
if (newblkno == oldblkno)
adp->ad_freefrag = NULL;
else
adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocdirect: lost block");
ACQUIRE_LOCK(&lk);
This patch corrects two problems with the rate limiting code that was introduced in revision 1.80. The problem manifested itself with a `locking against myself' panic and could also result in soft updates inconsistences associated with inodedeps. The two problems are: 1) One of the background operations could manipulate the bitmap while holding it locked with intent to create. This held lock results in a `locking against myself' panic, when the background processing that we have been coopted to do tries to lock the bitmap which we are already holding locked. To understand how to fix this problem, first, observe that we can do the background cleanups in inodedep_lookup only when allocating inodedeps (DEPALLOC is set in the call to inodedep_lookup). Second observe that calls to inodedep_lookup with DEPALLOC set can only happen from the following calls into the softdep code: softdep_setup_inomapdep softdep_setup_allocdirect softdep_setup_remove softdep_setup_freeblocks softdep_setup_directory_change softdep_setup_directory_add softdep_change_linkcnt Only the first two of these can come from ffs_alloc.c while holding a bitmap locked. Thus, inodedep_lookup must not go off to do request_cleanups when being called from these functions. This change adds a flag, NODELAY, that can be passed to inodedep_lookup to let it know that it should not do background processing in those cases. 2) The return value from request_cleanup when helping out with the cleanup was 0 instead of 1. This meant that despite the fact that we may have slept while doing the cleanups, the code did not recheck for the appearance of an inodedep (e.g., goto top in inodedep_lookup). This lead to the softdep inconsistency in which we ended up with two inodedep's for the same inode. Reviewed by: Peter Wemm <peter@yahoo-inc.com>, Matt Dillon <dillon@earth.backplane.com>
2001-02-20 11:14:38 +00:00
inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
if (newblk->nb_state == DEPCOMPLETE) {
adp->ad_state |= DEPCOMPLETE;
adp->ad_buf = NULL;
} else {
bmsafemap = newblk->nb_bmsafemap;
adp->ad_buf = bmsafemap->sm_buf;
LIST_REMOVE(newblk, nb_deps);
LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
}
LIST_REMOVE(newblk, nb_hash);
FREE(newblk, M_NEWBLK);
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
if (lbn >= NDADDR) {
/* allocating an indirect block */
if (oldblkno != 0) {
FREE_LOCK(&lk);
panic("softdep_setup_allocdirect: non-zero indir");
}
} else {
/*
* Allocating a direct block.
*
* If we are allocating a directory block, then we must
* allocate an associated pagedep to track additions and
* deletions.
*/
if ((ip->i_mode & IFMT) == IFDIR &&
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
}
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
* first uncommitted block (the size of the file stored on disk
* ends at the end of the lowest committed fragment, or if there
* are no fragments, at the end of the highest committed block).
* Since files generally grow, the typical case is that the new
* block is to be added at the end of the list. We speed this
* special case by checking against the last allocdirect in the
* list before laboriously traversing the list looking for the
* insertion point.
*/
adphead = &inodedep->id_newinoupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
if (oldadp != NULL && oldadp->ad_lbn == lbn)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
if (oldadp->ad_lbn >= lbn)
break;
}
if (oldadp == NULL) {
FREE_LOCK(&lk);
panic("softdep_setup_allocdirect: lost entry");
}
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
if (oldadp->ad_lbn == lbn)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
}
/*
* Replace an old allocdirect dependency with a newer one.
* This routine must be called with splbio interrupts blocked.
*/
static void
allocdirect_merge(adphead, newadp, oldadp)
struct allocdirectlst *adphead; /* head of list holding allocdirects */
struct allocdirect *newadp; /* allocdirect being added */
struct allocdirect *oldadp; /* existing allocdirect being checked */
{
struct worklist *wk;
struct freefrag *freefrag;
struct newdirblk *newdirblk;
#ifdef DEBUG
if (lk.lkt_held == NOHOLDER)
panic("allocdirect_merge: lock not held");
#endif
if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
newadp->ad_oldsize != oldadp->ad_newsize ||
newadp->ad_lbn >= NDADDR) {
FREE_LOCK(&lk);
panic("%s %jd != new %jd || old size %ld != new %ld",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"allocdirect_merge: old blkno",
(intmax_t)newadp->ad_oldblkno,
(intmax_t)oldadp->ad_newblkno,
newadp->ad_oldsize, oldadp->ad_newsize);
}
newadp->ad_oldblkno = oldadp->ad_oldblkno;
newadp->ad_oldsize = oldadp->ad_oldsize;
/*
* If the old dependency had a fragment to free or had never
* previously had a block allocated, then the new dependency
* can immediately post its freefrag and adopt the old freefrag.
* This action is done by swapping the freefrag dependencies.
* The new dependency gains the old one's freefrag, and the
* old one gets the new one and then immediately puts it on
* the worklist when it is freed by free_allocdirect. It is
* not possible to do this swap when the old dependency had a
* non-zero size but no previous fragment to free. This condition
* arises when the new block is an extension of the old block.
* Here, the first part of the fragment allocated to the new
* dependency is part of the block currently claimed on disk by
* the old dependency, so cannot legitimately be freed until the
* conditions for the new dependency are fulfilled.
*/
if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
freefrag = newadp->ad_freefrag;
newadp->ad_freefrag = oldadp->ad_freefrag;
oldadp->ad_freefrag = freefrag;
}
/*
* If we are tracking a new directory-block allocation,
* move it from the old allocdirect to the new allocdirect.
*/
if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
panic("allocdirect_merge: extra newdirblk");
WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
}
free_allocdirect(adphead, oldadp, 0);
}
/*
* Allocate a new freefrag structure if needed.
*/
static struct freefrag *
newfreefrag(ip, blkno, size)
struct inode *ip;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t blkno;
long size;
{
struct freefrag *freefrag;
struct fs *fs;
if (blkno == 0)
return (NULL);
fs = ip->i_fs;
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_FREEFRAG, M_SOFTDEP_FLAGS);
freefrag->ff_list.wk_type = D_FREEFRAG;
freefrag->ff_state = 0;
freefrag->ff_inum = ip->i_number;
freefrag->ff_mnt = ITOV(ip)->v_mount;
freefrag->ff_blkno = blkno;
freefrag->ff_fragsize = size;
return (freefrag);
}
/*
* This workitem de-allocates fragments that were replaced during
* file block allocation.
*/
static void
handle_workitem_freefrag(freefrag)
struct freefrag *freefrag;
{
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
struct ufsmount *ump = VFSTOUFS(freefrag->ff_mnt);
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ffs_blkfree(ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
freefrag->ff_fragsize, freefrag->ff_inum);
FREE(freefrag, M_FREEFRAG);
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
/*
* Set up a dependency structure for an external attributes data block.
* This routine follows much of the structure of softdep_setup_allocdirect.
* See the description of softdep_setup_allocdirect above for details.
*/
void
softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
struct inode *ip;
ufs_lbn_t lbn;
ufs2_daddr_t newblkno;
ufs2_daddr_t oldblkno;
long newsize;
long oldsize;
struct buf *bp;
{
struct allocdirect *adp, *oldadp;
struct allocdirectlst *adphead;
struct bmsafemap *bmsafemap;
struct inodedep *inodedep;
struct newblk *newblk;
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
adp->ad_list.wk_type = D_ALLOCDIRECT;
adp->ad_lbn = lbn;
adp->ad_newblkno = newblkno;
adp->ad_oldblkno = oldblkno;
adp->ad_newsize = newsize;
adp->ad_oldsize = oldsize;
adp->ad_state = ATTACHED | EXTDATA;
LIST_INIT(&adp->ad_newdirblk);
if (newblkno == oldblkno)
adp->ad_freefrag = NULL;
else
adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
panic("softdep_setup_allocext: lost block");
ACQUIRE_LOCK(&lk);
inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
adp->ad_inodedep = inodedep;
if (newblk->nb_state == DEPCOMPLETE) {
adp->ad_state |= DEPCOMPLETE;
adp->ad_buf = NULL;
} else {
bmsafemap = newblk->nb_bmsafemap;
adp->ad_buf = bmsafemap->sm_buf;
LIST_REMOVE(newblk, nb_deps);
LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
}
LIST_REMOVE(newblk, nb_hash);
FREE(newblk, M_NEWBLK);
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
if (lbn >= NXADDR) {
FREE_LOCK(&lk);
panic("softdep_setup_allocext: lbn %lld > NXADDR",
(long long)lbn);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
}
/*
* The list of allocdirects must be kept in sorted and ascending
* order so that the rollback routines can quickly determine the
* first uncommitted block (the size of the file stored on disk
* ends at the end of the lowest committed fragment, or if there
* are no fragments, at the end of the highest committed block).
* Since files generally grow, the typical case is that the new
* block is to be added at the end of the list. We speed this
* special case by checking against the last allocdirect in the
* list before laboriously traversing the list looking for the
* insertion point.
*/
adphead = &inodedep->id_newextupdt;
oldadp = TAILQ_LAST(adphead, allocdirectlst);
if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
/* insert at end of list */
TAILQ_INSERT_TAIL(adphead, adp, ad_next);
if (oldadp != NULL && oldadp->ad_lbn == lbn)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
return;
}
TAILQ_FOREACH(oldadp, adphead, ad_next) {
if (oldadp->ad_lbn >= lbn)
break;
}
if (oldadp == NULL) {
FREE_LOCK(&lk);
panic("softdep_setup_allocext: lost entry");
}
/* insert in middle of list */
TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
if (oldadp->ad_lbn == lbn)
allocdirect_merge(adphead, adp, oldadp);
FREE_LOCK(&lk);
}
/*
* Indirect block allocation dependencies.
*
* The same dependencies that exist for a direct block also exist when
* a new block is allocated and pointed to by an entry in a block of
* indirect pointers. The undo/redo states described above are also
* used here. Because an indirect block contains many pointers that
* may have dependencies, a second copy of the entire in-memory indirect
* block is kept. The buffer cache copy is always completely up-to-date.
* The second copy, which is used only as a source for disk writes,
* contains only the safe pointers (i.e., those that have no remaining
* update dependencies). The second copy is freed when all pointers
* are safe. The cache is not allowed to replace indirect blocks with
* pending update dependencies. If a buffer containing an indirect
* block with dependencies is written, these routines will mark it
* dirty again. It can only be successfully written once all the
* dependencies are removed. The ffs_fsync routine in conjunction with
* softdep_sync_metadata work together to get all the dependencies
* removed so that a file can be successfully written to disk. Three
* procedures are used when setting up indirect block pointer
* dependencies. The division is necessary because of the organization
* of the "balloc" routine and because of the distinction between file
* pages and file metadata blocks.
*/
/*
* Allocate a new allocindir structure.
*/
static struct allocindir *
newallocindir(ip, ptrno, newblkno, oldblkno)
struct inode *ip; /* inode for file being extended */
int ptrno; /* offset of pointer in indirect block */
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
{
struct allocindir *aip;
MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
aip->ai_list.wk_type = D_ALLOCINDIR;
aip->ai_state = ATTACHED;
aip->ai_offset = ptrno;
aip->ai_newblkno = newblkno;
aip->ai_oldblkno = oldblkno;
aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
return (aip);
}
/*
* Called just before setting an indirect block pointer
* to a newly allocated file page.
*/
void
softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
struct inode *ip; /* inode for file being extended */
ufs_lbn_t lbn; /* allocated block number within file */
struct buf *bp; /* buffer with indirect blk referencing page */
int ptrno; /* offset of pointer in indirect block */
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t newblkno; /* disk block number being added */
ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
struct buf *nbp; /* buffer holding allocated page */
{
struct allocindir *aip;
struct pagedep *pagedep;
aip = newallocindir(ip, ptrno, newblkno, oldblkno);
ACQUIRE_LOCK(&lk);
/*
* If we are allocating a directory page, then we must
* allocate an associated pagedep to track additions and
* deletions.
*/
if ((ip->i_mode & IFMT) == IFDIR &&
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
FREE_LOCK(&lk);
setup_allocindir_phase2(bp, ip, aip);
}
/*
* Called just before setting an indirect block pointer to a
* newly allocated indirect block.
*/
void
softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
struct buf *nbp; /* newly allocated indirect block */
struct inode *ip; /* inode for file being extended */
struct buf *bp; /* indirect block referencing allocated block */
int ptrno; /* offset of pointer in indirect block */
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t newblkno; /* disk block number being added */
{
struct allocindir *aip;
aip = newallocindir(ip, ptrno, newblkno, 0);
ACQUIRE_LOCK(&lk);
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
FREE_LOCK(&lk);
setup_allocindir_phase2(bp, ip, aip);
}
/*
* Called to finish the allocation of the "aip" allocated
* by one of the two routines above.
*/
static void
setup_allocindir_phase2(bp, ip, aip)
struct buf *bp; /* in-memory copy of the indirect block */
struct inode *ip; /* inode for file being extended */
struct allocindir *aip; /* allocindir allocated by the above routines */
{
struct worklist *wk;
struct indirdep *indirdep, *newindirdep;
struct bmsafemap *bmsafemap;
struct allocindir *oldaip;
struct freefrag *freefrag;
struct newblk *newblk;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t blkno;
if (bp->b_lblkno >= 0)
panic("setup_allocindir_phase2: not indir blk");
for (indirdep = NULL, newindirdep = NULL; ; ) {
ACQUIRE_LOCK(&lk);
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
if (wk->wk_type != D_INDIRDEP)
continue;
indirdep = WK_INDIRDEP(wk);
break;
}
if (indirdep == NULL && newindirdep) {
indirdep = newindirdep;
WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
newindirdep = NULL;
}
FREE_LOCK(&lk);
if (indirdep) {
if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
&newblk) == 0)
panic("setup_allocindir: lost block");
ACQUIRE_LOCK(&lk);
if (newblk->nb_state == DEPCOMPLETE) {
aip->ai_state |= DEPCOMPLETE;
aip->ai_buf = NULL;
} else {
bmsafemap = newblk->nb_bmsafemap;
aip->ai_buf = bmsafemap->sm_buf;
LIST_REMOVE(newblk, nb_deps);
LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
aip, ai_deps);
}
LIST_REMOVE(newblk, nb_hash);
FREE(newblk, M_NEWBLK);
aip->ai_indirdep = indirdep;
/*
* Check to see if there is an existing dependency
* for this block. If there is, merge the old
* dependency into the new one.
*/
if (aip->ai_oldblkno == 0)
oldaip = NULL;
else
2001-02-04 12:37:48 +00:00
LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
if (oldaip->ai_offset == aip->ai_offset)
break;
freefrag = NULL;
if (oldaip != NULL) {
if (oldaip->ai_newblkno != aip->ai_oldblkno) {
FREE_LOCK(&lk);
panic("setup_allocindir_phase2: blkno");
}
aip->ai_oldblkno = oldaip->ai_oldblkno;
freefrag = aip->ai_freefrag;
aip->ai_freefrag = oldaip->ai_freefrag;
oldaip->ai_freefrag = NULL;
free_allocindir(oldaip, NULL);
}
LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (ip->i_ump->um_fstype == UFS1)
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
[aip->ai_offset] = aip->ai_oldblkno;
else
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
[aip->ai_offset] = aip->ai_oldblkno;
FREE_LOCK(&lk);
if (freefrag != NULL)
handle_workitem_freefrag(freefrag);
}
if (newindirdep) {
brelse(newindirdep->ir_savebp);
WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
}
if (indirdep)
break;
MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_INDIRDEP, M_SOFTDEP_FLAGS);
newindirdep->ir_list.wk_type = D_INDIRDEP;
newindirdep->ir_state = ATTACHED;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (ip->i_ump->um_fstype == UFS1)
newindirdep->ir_state |= UFS1FMT;
LIST_INIT(&newindirdep->ir_deplisthd);
LIST_INIT(&newindirdep->ir_donehd);
if (bp->b_blkno == bp->b_lblkno) {
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
NULL, NULL);
bp->b_blkno = blkno;
}
newindirdep->ir_savebp =
getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
BUF_KERNPROC(newindirdep->ir_savebp);
bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
}
}
/*
* Block de-allocation dependencies.
*
* When blocks are de-allocated, the on-disk pointers must be nullified before
* the blocks are made available for use by other files. (The true
* requirement is that old pointers must be nullified before new on-disk
* pointers are set. We chose this slightly more stringent requirement to
* reduce complexity.) Our implementation handles this dependency by updating
* the inode (or indirect block) appropriately but delaying the actual block
* de-allocation (i.e., freemap and free space count manipulation) until
* after the updated versions reach stable storage. After the disk is
* updated, the blocks can be safely de-allocated whenever it is convenient.
* This implementation handles only the common case of reducing a file's
* length to zero. Other cases are handled by the conventional synchronous
* write approach.
*
* The ffs implementation with which we worked double-checks
* the state of the block pointers and file size as it reduces
* a file's length. Some of this code is replicated here in our
* soft updates implementation. The freeblks->fb_chkcnt field is
* used to transfer a part of this information to the procedure
* that eventually de-allocates the blocks.
*
* This routine should be called from the routine that shortens
* a file's length, before the inode's size or block pointers
* are modified. It will save the block pointer information for
* later release and zero the inode so that the calling routine
* can release it.
*/
void
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
softdep_setup_freeblocks(ip, length, flags)
struct inode *ip; /* The inode whose length is to be reduced */
off_t length; /* The new length for the file */
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
int flags; /* IO_EXT and/or IO_NORMAL */
{
struct freeblks *freeblks;
struct inodedep *inodedep;
struct allocdirect *adp;
struct vnode *vp;
struct buf *bp;
struct fs *fs;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
ufs2_daddr_t extblocks, datablocks;
int i, delay, error;
fs = ip->i_fs;
if (length != 0)
panic("softdep_setup_freeblocks: non-zero length");
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
freeblks->fb_list.wk_type = D_FREEBLKS;
freeblks->fb_uid = ip->i_uid;
freeblks->fb_previousinum = ip->i_number;
freeblks->fb_devvp = ip->i_devvp;
freeblks->fb_mnt = ITOV(ip)->v_mount;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
extblocks = 0;
if (fs->fs_magic == FS_UFS2_MAGIC)
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
datablocks = DIP(ip, i_blocks) - extblocks;
if ((flags & IO_NORMAL) == 0) {
freeblks->fb_oldsize = 0;
freeblks->fb_chkcnt = 0;
} else {
freeblks->fb_oldsize = ip->i_size;
ip->i_size = 0;
DIP_SET(ip, i_size, 0);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
freeblks->fb_chkcnt = datablocks;
for (i = 0; i < NDADDR; i++) {
freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
DIP_SET(ip, i_db[i], 0);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
}
for (i = 0; i < NIADDR; i++) {
freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
DIP_SET(ip, i_ib[i], 0);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
}
/*
* If the file was removed, then the space being freed was
* accounted for then (see softdep_filereleased()). If the
* file is merely being truncated, then we account for it now.
*/
if ((ip->i_flag & IN_SPACECOUNTED) == 0)
fs->fs_pendingblocks += datablocks;
}
if ((flags & IO_EXT) == 0) {
freeblks->fb_oldextsize = 0;
} else {
freeblks->fb_oldextsize = ip->i_din2->di_extsize;
ip->i_din2->di_extsize = 0;
freeblks->fb_chkcnt += extblocks;
for (i = 0; i < NXADDR; i++) {
freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
ip->i_din2->di_extb[i] = 0;
}
}
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
/*
* Push the zero'ed inode to to its disk buffer so that we are free
* to delete its dependencies below. Once the dependencies are gone
* the buffer can be safely released.
*/
if ((error = bread(ip->i_devvp,
fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, NOCRED, &bp)) != 0) {
brelse(bp);
softdep_error("softdep_setup_freeblocks", error);
}
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (ip->i_ump->um_fstype == UFS1)
*((struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
else
*((struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
/*
* Find and eliminate any inode dependencies.
*/
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
if ((inodedep->id_state & IOSTARTED) != 0) {
FREE_LOCK(&lk);
panic("softdep_setup_freeblocks: inode busy");
}
When deleting a file, the ordering of events imposed by soft updates is to first write the deleted directory entry to disk, second write the zero'ed inode to disk, and finally to release the freed blocks and the inode back to the cylinder-group map. As this ordering requires two disk writes to occur which are normally spaced about 30 seconds apart (except when memory is under duress), it takes about a minute from the time that a file is deleted until its inode and data blocks show up in the cylinder-group map for reallocation. If a file has had only a brief lifetime (less than 30 seconds from creation to deletion), neither its inode nor its directory entry may have been written to disk. If its directory entry has not been written to disk, then we need not wait for that directory block to be written as the on-disk directory block does not reference the inode. Similarly, if the allocated inode has never been written to disk, we do not have to wait for it to be written back either as its on-disk representation is still zero'ed out. Thus, in the case of a short lived file, we can simply release the blocks and inode to the cylinder-group map immediately. As the inode and its blocks are released immediately, they are immediately available for other uses. If they are not released for a minute, then other inodes and blocks must be allocated for short lived files, cluttering up the vnode and buffer caches. The previous code was a bit too aggressive in trying to release the blocks and inode back to the cylinder-group map resulting in their being made available when in fact the inode on disk had not yet been zero'ed. This patch takes a more conservative approach to doing the release which avoids doing the release prematurely.
2000-11-14 09:00:25 +00:00
/*
* Add the freeblks structure to the list of operations that
* must await the zero'ed inode being written to disk. If we
* still have a bitmap dependency (delay == 0), then the inode
* has never been written to disk, so we can process the
* freeblks below once we have deleted the dependencies.
*/
delay = (inodedep->id_state & DEPCOMPLETE);
if (delay)
WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
* with this inode are obsolete and can simply be de-allocated.
* We must first merge the two dependency lists to get rid of
* any duplicate freefrag structures, then purge the merged list.
* If we still have a bitmap dependency, then the inode has never
* been written to disk, so we can free any fragments without delay.
*/
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (flags & IO_NORMAL) {
merge_inode_lists(&inodedep->id_newinoupdt,
&inodedep->id_inoupdt);
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
free_allocdirect(&inodedep->id_inoupdt, adp, delay);
}
if (flags & IO_EXT) {
merge_inode_lists(&inodedep->id_newextupdt,
&inodedep->id_extupdt);
while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
free_allocdirect(&inodedep->id_extupdt, adp, delay);
}
FREE_LOCK(&lk);
bdwrite(bp);
/*
* We must wait for any I/O in progress to finish so that
* all potential buffers on the dirty list will be visible.
* Once they are all there, walk the list and get rid of
* any dependencies.
*/
vp = ITOV(ip);
ACQUIRE_LOCK(&lk);
VI_LOCK(vp);
drain_output(vp, 1);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
restart:
TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
((flags & IO_NORMAL) == 0 &&
(bp->b_xflags & BX_ALTDATA) == 0))
continue;
if ((bp = getdirtybuf(&bp, VI_MTX(vp), MNT_WAIT)) == NULL)
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
goto restart;
(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
deallocate_dependencies(bp, inodedep);
bp->b_flags |= B_INVAL | B_NOCACHE;
FREE_LOCK(&lk);
brelse(bp);
ACQUIRE_LOCK(&lk);
VI_LOCK(vp);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
goto restart;
}
VI_UNLOCK(vp);
When deleting a file, the ordering of events imposed by soft updates is to first write the deleted directory entry to disk, second write the zero'ed inode to disk, and finally to release the freed blocks and the inode back to the cylinder-group map. As this ordering requires two disk writes to occur which are normally spaced about 30 seconds apart (except when memory is under duress), it takes about a minute from the time that a file is deleted until its inode and data blocks show up in the cylinder-group map for reallocation. If a file has had only a brief lifetime (less than 30 seconds from creation to deletion), neither its inode nor its directory entry may have been written to disk. If its directory entry has not been written to disk, then we need not wait for that directory block to be written as the on-disk directory block does not reference the inode. Similarly, if the allocated inode has never been written to disk, we do not have to wait for it to be written back either as its on-disk representation is still zero'ed out. Thus, in the case of a short lived file, we can simply release the blocks and inode to the cylinder-group map immediately. As the inode and its blocks are released immediately, they are immediately available for other uses. If they are not released for a minute, then other inodes and blocks must be allocated for short lived files, cluttering up the vnode and buffer caches. The previous code was a bit too aggressive in trying to release the blocks and inode back to the cylinder-group map resulting in their being made available when in fact the inode on disk had not yet been zero'ed. This patch takes a more conservative approach to doing the release which avoids doing the release prematurely.
2000-11-14 09:00:25 +00:00
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
(void) free_inodedep(inodedep);
FREE_LOCK(&lk);
/*
When deleting a file, the ordering of events imposed by soft updates is to first write the deleted directory entry to disk, second write the zero'ed inode to disk, and finally to release the freed blocks and the inode back to the cylinder-group map. As this ordering requires two disk writes to occur which are normally spaced about 30 seconds apart (except when memory is under duress), it takes about a minute from the time that a file is deleted until its inode and data blocks show up in the cylinder-group map for reallocation. If a file has had only a brief lifetime (less than 30 seconds from creation to deletion), neither its inode nor its directory entry may have been written to disk. If its directory entry has not been written to disk, then we need not wait for that directory block to be written as the on-disk directory block does not reference the inode. Similarly, if the allocated inode has never been written to disk, we do not have to wait for it to be written back either as its on-disk representation is still zero'ed out. Thus, in the case of a short lived file, we can simply release the blocks and inode to the cylinder-group map immediately. As the inode and its blocks are released immediately, they are immediately available for other uses. If they are not released for a minute, then other inodes and blocks must be allocated for short lived files, cluttering up the vnode and buffer caches. The previous code was a bit too aggressive in trying to release the blocks and inode back to the cylinder-group map resulting in their being made available when in fact the inode on disk had not yet been zero'ed. This patch takes a more conservative approach to doing the release which avoids doing the release prematurely.
2000-11-14 09:00:25 +00:00
* If the inode has never been written to disk (delay == 0),
* then we can process the freeblks now that we have deleted
* the dependencies.
*/
When deleting a file, the ordering of events imposed by soft updates is to first write the deleted directory entry to disk, second write the zero'ed inode to disk, and finally to release the freed blocks and the inode back to the cylinder-group map. As this ordering requires two disk writes to occur which are normally spaced about 30 seconds apart (except when memory is under duress), it takes about a minute from the time that a file is deleted until its inode and data blocks show up in the cylinder-group map for reallocation. If a file has had only a brief lifetime (less than 30 seconds from creation to deletion), neither its inode nor its directory entry may have been written to disk. If its directory entry has not been written to disk, then we need not wait for that directory block to be written as the on-disk directory block does not reference the inode. Similarly, if the allocated inode has never been written to disk, we do not have to wait for it to be written back either as its on-disk representation is still zero'ed out. Thus, in the case of a short lived file, we can simply release the blocks and inode to the cylinder-group map immediately. As the inode and its blocks are released immediately, they are immediately available for other uses. If they are not released for a minute, then other inodes and blocks must be allocated for short lived files, cluttering up the vnode and buffer caches. The previous code was a bit too aggressive in trying to release the blocks and inode back to the cylinder-group map resulting in their being made available when in fact the inode on disk had not yet been zero'ed. This patch takes a more conservative approach to doing the release which avoids doing the release prematurely.
2000-11-14 09:00:25 +00:00
if (!delay)
handle_workitem_freeblocks(freeblks, 0);
}
/*
* Reclaim any dependency structures from a buffer that is about to
* be reallocated to a new vnode. The buffer must be locked, thus,
* no I/O completion operations can occur while we are manipulating
* its associated dependencies. The mutex is held so that other I/O's
* associated with related dependencies do not occur.
*/
static void
deallocate_dependencies(bp, inodedep)
struct buf *bp;
struct inodedep *inodedep;
{
struct worklist *wk;
struct indirdep *indirdep;
struct allocindir *aip;
struct pagedep *pagedep;
struct dirrem *dirrem;
struct diradd *dap;
int i;
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
switch (wk->wk_type) {
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
/*
* None of the indirect pointers will ever be visible,
* so they can simply be tossed. GOINGAWAY ensures
* that allocated pointers will be saved in the buffer
* cache until they are freed. Note that they will
* only be able to be found by their physical address
* since the inode mapping the logical address will
* be gone. The save buffer used for the safe copy
* was allocated in setup_allocindir_phase2 using
* the physical address so it could be used for this
* purpose. Hence we swap the safe copy with the real
* copy, allowing the safe copy to be freed and holding
* on to the real copy for later use in indir_trunc.
*/
if (indirdep->ir_state & GOINGAWAY) {
FREE_LOCK(&lk);
panic("deallocate_dependencies: already gone");
}
indirdep->ir_state |= GOINGAWAY;
VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
free_allocindir(aip, inodedep);
if (bp->b_lblkno >= 0 ||
bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
FREE_LOCK(&lk);
panic("deallocate_dependencies: not indir");
}
bcopy(bp->b_data, indirdep->ir_savebp->b_data,
bp->b_bcount);
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
continue;
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
/*
* None of the directory additions will ever be
* visible, so they can simply be tossed.
*/
for (i = 0; i < DAHASHSZ; i++)
while ((dap =
LIST_FIRST(&pagedep->pd_diraddhd[i])))
free_diradd(dap);
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
free_diradd(dap);
/*
* Copy any directory remove dependencies to the list
* to be processed after the zero'ed inode is written.
* If the inode has already been written, then they
* can be dumped directly onto the work list.
*/
LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
LIST_REMOVE(dirrem, dm_next);
dirrem->dm_dirinum = pagedep->pd_ino;
if (inodedep == NULL ||
(inodedep->id_state & ALLCOMPLETE) ==
ALLCOMPLETE)
add_to_worklist(&dirrem->dm_list);
else
WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
if ((pagedep->pd_state & NEWBLOCK) != 0) {
LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
if (wk->wk_type == D_NEWDIRBLK &&
WK_NEWDIRBLK(wk)->db_pagedep ==
pagedep)
break;
if (wk != NULL) {
WORKLIST_REMOVE(wk);
free_newdirblk(WK_NEWDIRBLK(wk));
} else {
FREE_LOCK(&lk);
panic("deallocate_dependencies: "
"lost pagedep");
}
}
WORKLIST_REMOVE(&pagedep->pd_list);
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
continue;
case D_ALLOCINDIR:
free_allocindir(WK_ALLOCINDIR(wk), inodedep);
continue;
case D_ALLOCDIRECT:
case D_INODEDEP:
FREE_LOCK(&lk);
panic("deallocate_dependencies: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
default:
FREE_LOCK(&lk);
panic("deallocate_dependencies: Unknown type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
}
/*
* Free an allocdirect. Generate a new freefrag work request if appropriate.
* This routine must be called with splbio interrupts blocked.
*/
static void
free_allocdirect(adphead, adp, delay)
struct allocdirectlst *adphead;
struct allocdirect *adp;
int delay;
{
struct newdirblk *newdirblk;
struct worklist *wk;
#ifdef DEBUG
if (lk.lkt_held == NOHOLDER)
panic("free_allocdirect: lock not held");
#endif
if ((adp->ad_state & DEPCOMPLETE) == 0)
LIST_REMOVE(adp, ad_deps);
TAILQ_REMOVE(adphead, adp, ad_next);
if ((adp->ad_state & COMPLETE) == 0)
WORKLIST_REMOVE(&adp->ad_list);
if (adp->ad_freefrag != NULL) {
if (delay)
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
&adp->ad_freefrag->ff_list);
else
add_to_worklist(&adp->ad_freefrag->ff_list);
}
if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
panic("free_allocdirect: extra newdirblk");
if (delay)
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
&newdirblk->db_list);
else
free_newdirblk(newdirblk);
}
WORKITEM_FREE(adp, D_ALLOCDIRECT);
}
/*
* Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
* This routine must be called with splbio interrupts blocked.
*/
static void
free_newdirblk(newdirblk)
struct newdirblk *newdirblk;
{
struct pagedep *pagedep;
struct diradd *dap;
int i;
#ifdef DEBUG
if (lk.lkt_held == NOHOLDER)
panic("free_newdirblk: lock not held");
#endif
/*
* If the pagedep is still linked onto the directory buffer
* dependency chain, then some of the entries on the
* pd_pendinghd list may not be committed to disk yet. In
* this case, we will simply clear the NEWBLOCK flag and
* let the pd_pendinghd list be processed when the pagedep
* is next written. If the pagedep is no longer on the buffer
* dependency chain, then all the entries on the pd_pending
* list are committed to disk and we can free them here.
*/
pagedep = newdirblk->db_pagedep;
pagedep->pd_state &= ~NEWBLOCK;
if ((pagedep->pd_state & ONWORKLIST) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
free_diradd(dap);
/*
* If no dependencies remain, the pagedep will be freed.
*/
for (i = 0; i < DAHASHSZ; i++)
if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
break;
if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}
/*
* Prepare an inode to be freed. The actual free operation is not
* done until the zero'ed inode has been written to disk.
*/
void
softdep_freefile(pvp, ino, mode)
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
struct vnode *pvp;
ino_t ino;
int mode;
{
struct inode *ip = VTOI(pvp);
struct inodedep *inodedep;
struct freefile *freefile;
/*
* This sets up the inode de-allocation dependency.
*/
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_FREEFILE, M_SOFTDEP_FLAGS);
freefile->fx_list.wk_type = D_FREEFILE;
freefile->fx_list.wk_state = 0;
freefile->fx_mode = mode;
freefile->fx_oldinum = ino;
freefile->fx_devvp = ip->i_devvp;
freefile->fx_mnt = ITOV(ip)->v_mount;
if ((ip->i_flag & IN_SPACECOUNTED) == 0)
ip->i_fs->fs_pendinginodes += 1;
/*
* If the inodedep does not exist, then the zero'ed inode has
* been written to disk. If the allocated inode has never been
* written to disk, then the on-disk inode is zero'ed. In either
* case we can free the file immediately.
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
check_inode_unwritten(inodedep)) {
FREE_LOCK(&lk);
handle_workitem_freefile(freefile);
return;
}
WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
FREE_LOCK(&lk);
}
/*
* Check to see if an inode has never been written to disk. If
* so free the inodedep and return success, otherwise return failure.
* This routine must be called with splbio interrupts blocked.
*
* If we still have a bitmap dependency, then the inode has never
* been written to disk. Drop the dependency as it is no longer
* necessary since the inode is being deallocated. We set the
* ALLCOMPLETE flags since the bitmap now properly shows that the
* inode is not allocated. Even if the inode is actively being
* written, it has been rolled back to its zero'ed state, so we
* are ensured that a zero inode is what is on the disk. For short
* lived files, this change will usually result in removing all the
* dependencies from the inode so that it can be freed immediately.
*/
static int
check_inode_unwritten(inodedep)
struct inodedep *inodedep;
{
if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
LIST_FIRST(&inodedep->id_inowait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
inodedep->id_nlinkdelta != 0)
return (0);
inodedep->id_state |= ALLCOMPLETE;
LIST_REMOVE(inodedep, id_deps);
inodedep->id_buf = NULL;
if (inodedep->id_state & ONWORKLIST)
WORKLIST_REMOVE(&inodedep->id_list);
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (inodedep->id_savedino1 != NULL) {
FREE(inodedep->id_savedino1, M_INODEDEP);
inodedep->id_savedino1 = NULL;
}
if (free_inodedep(inodedep) == 0) {
FREE_LOCK(&lk);
panic("check_inode_unwritten: busy inode");
}
return (1);
}
/*
* Try to free an inodedep structure. Return 1 if it could be freed.
*/
static int
free_inodedep(inodedep)
struct inodedep *inodedep;
{
if ((inodedep->id_state & ONWORKLIST) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
LIST_FIRST(&inodedep->id_inowait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
return (0);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
num_inodedep -= 1;
return (1);
}
/*
* This workitem routine performs the block de-allocation.
* The workitem is added to the pending list after the updated
* inode block has been written to disk. As mentioned above,
* checks regarding the number of blocks de-allocated (compared
* to the number of blocks allocated for the file) are also
* performed in this function.
*/
static void
handle_workitem_freeblocks(freeblks, flags)
struct freeblks *freeblks;
int flags;
{
struct inode *ip;
struct vnode *vp;
struct fs *fs;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
int i, nblocks, level, bsize;
ufs2_daddr_t bn, blocksreleased = 0;
int error, allerror = 0;
ufs_lbn_t baselbns[NIADDR], tmpval;
fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
tmpval = 1;
baselbns[0] = NDADDR;
for (i = 1; i < NIADDR; i++) {
tmpval *= NINDIR(fs);
baselbns[i] = baselbns[i - 1] + tmpval;
}
nblocks = btodb(fs->fs_bsize);
blocksreleased = 0;
/*
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
* Release all extended attribute blocks or frags.
*/
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (freeblks->fb_oldextsize > 0) {
for (i = (NXADDR - 1); i >= 0; i--) {
if ((bn = freeblks->fb_eblks[i]) == 0)
continue;
bsize = sblksize(fs, freeblks->fb_oldextsize, i);
ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize,
freeblks->fb_previousinum);
blocksreleased += btodb(bsize);
}
}
/*
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
* Release all data blocks or frags.
*/
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (freeblks->fb_oldsize > 0) {
/*
* Indirect blocks first.
*/
for (level = (NIADDR - 1); level >= 0; level--) {
if ((bn = freeblks->fb_iblks[level]) == 0)
continue;
if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
level, baselbns[level], &blocksreleased)) == 0)
allerror = error;
ffs_blkfree(fs, freeblks->fb_devvp, bn, fs->fs_bsize,
freeblks->fb_previousinum);
fs->fs_pendingblocks -= nblocks;
blocksreleased += nblocks;
}
/*
* All direct blocks or frags.
*/
for (i = (NDADDR - 1); i >= 0; i--) {
if ((bn = freeblks->fb_dblks[i]) == 0)
continue;
bsize = sblksize(fs, freeblks->fb_oldsize, i);
ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize,
freeblks->fb_previousinum);
fs->fs_pendingblocks -= btodb(bsize);
blocksreleased += btodb(bsize);
}
}
/*
* If we still have not finished background cleanup, then check
* to see if the block count needs to be adjusted.
*/
if (freeblks->fb_chkcnt != blocksreleased &&
(fs->fs_flags & FS_UNCLEAN) != 0 &&
VFS_VGET(freeblks->fb_mnt, freeblks->fb_previousinum,
(flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
ip = VTOI(vp);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
freeblks->fb_chkcnt - blocksreleased);
ip->i_flag |= IN_CHANGE;
vput(vp);
}
#ifdef DIAGNOSTIC
if (freeblks->fb_chkcnt != blocksreleased &&
((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
printf("handle_workitem_freeblocks: block count\n");
if (allerror)
softdep_error("handle_workitem_freeblks", allerror);
#endif /* DIAGNOSTIC */
WORKITEM_FREE(freeblks, D_FREEBLKS);
}
/*
* Release blocks associated with the inode ip and stored in the indirect
* block dbn. If level is greater than SINGLE, the block is an indirect block
* and recursive calls to indirtrunc must be used to cleanse other indirect
* blocks.
*/
static int
indir_trunc(freeblks, dbn, level, lbn, countp)
struct freeblks *freeblks;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t dbn;
int level;
ufs_lbn_t lbn;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs2_daddr_t *countp;
{
struct buf *bp;
struct fs *fs;
struct worklist *wk;
struct indirdep *indirdep;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs1_daddr_t *bap1 = 0;
ufs2_daddr_t nb, *bap2 = 0;
ufs_lbn_t lbnadd;
int i, nblocks, ufs1fmt;
int error, allerror = 0;
fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
lbnadd = 1;
for (i = level; i > 0; i--)
lbnadd *= NINDIR(fs);
/*
* Get buffer of block pointers to be freed. This routine is not
* called until the zero'ed inode has been written, so it is safe
* to free blocks as they are encountered. Because the inode has
* been zero'ed, calls to bmap on these blocks will fail. So, we
* have to use the on-disk address and the block device for the
* filesystem to look them up. If the file was deleted before its
* indirect blocks were all written to disk, the routine that set
* us up (deallocate_dependencies) will have arranged to leave
* a complete copy of the indirect block in memory for our use.
* Otherwise we have to read the blocks in from the disk.
*/
#ifdef notyet
bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
GB_NOCREAT);
#else
bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
#endif
ACQUIRE_LOCK(&lk);
if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
if (wk->wk_type != D_INDIRDEP ||
(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
(indirdep->ir_state & GOINGAWAY) == 0) {
FREE_LOCK(&lk);
panic("indir_trunc: lost indirdep");
}
WORKLIST_REMOVE(wk);
WORKITEM_FREE(indirdep, D_INDIRDEP);
if (LIST_FIRST(&bp->b_dep) != NULL) {
FREE_LOCK(&lk);
panic("indir_trunc: dangling dep");
}
VFSTOUFS(freeblks->fb_mnt)->um_numindirdeps -= 1;
FREE_LOCK(&lk);
} else {
#ifdef notyet
if (bp)
brelse(bp);
#endif
FREE_LOCK(&lk);
error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
NOCRED, &bp);
if (error) {
brelse(bp);
return (error);
}
}
/*
* Recursively free indirect blocks.
*/
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UFS1) {
ufs1fmt = 1;
bap1 = (ufs1_daddr_t *)bp->b_data;
} else {
ufs1fmt = 0;
bap2 = (ufs2_daddr_t *)bp->b_data;
}
nblocks = btodb(fs->fs_bsize);
for (i = NINDIR(fs) - 1; i >= 0; i--) {
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (ufs1fmt)
nb = bap1[i];
else
nb = bap2[i];
if (nb == 0)
continue;
if (level != 0) {
if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
level - 1, lbn + (i * lbnadd), countp)) != 0)
allerror = error;
}
ffs_blkfree(fs, freeblks->fb_devvp, nb, fs->fs_bsize,
freeblks->fb_previousinum);
fs->fs_pendingblocks -= nblocks;
*countp += nblocks;
}
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
return (allerror);
}
/*
* Free an allocindir.
* This routine must be called with splbio interrupts blocked.
*/
static void
free_allocindir(aip, inodedep)
struct allocindir *aip;
struct inodedep *inodedep;
{
struct freefrag *freefrag;
#ifdef DEBUG
if (lk.lkt_held == NOHOLDER)
panic("free_allocindir: lock not held");
#endif
if ((aip->ai_state & DEPCOMPLETE) == 0)
LIST_REMOVE(aip, ai_deps);
if (aip->ai_state & ONWORKLIST)
WORKLIST_REMOVE(&aip->ai_list);
LIST_REMOVE(aip, ai_next);
if ((freefrag = aip->ai_freefrag) != NULL) {
if (inodedep == NULL)
add_to_worklist(&freefrag->ff_list);
else
WORKLIST_INSERT(&inodedep->id_bufwait,
&freefrag->ff_list);
}
WORKITEM_FREE(aip, D_ALLOCINDIR);
}
/*
* Directory entry addition dependencies.
*
* When adding a new directory entry, the inode (with its incremented link
* count) must be written to disk before the directory entry's pointer to it.
* Also, if the inode is newly allocated, the corresponding freemap must be
* updated (on disk) before the directory entry's pointer. These requirements
* are met via undo/redo on the directory entry's pointer, which consists
* simply of the inode number.
*
* As directory entries are added and deleted, the free space within a
2002-05-16 21:28:32 +00:00
* directory block can become fragmented. The ufs filesystem will compact
* a fragmented directory block to make space for a new entry. When this
* occurs, the offsets of previously added entries change. Any "diradd"
* dependency structures corresponding to these entries must be updated with
* the new offsets.
*/
/*
* This routine is called after the in-memory inode's link
* count has been incremented, but before the directory entry's
* pointer to the inode has been set.
*/
int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
struct buf *bp; /* buffer containing directory block */
struct inode *dp; /* inode for directory */
off_t diroffset; /* offset of new entry in directory */
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ino_t newinum; /* inode referenced by new directory entry */
struct buf *newdirbp; /* non-NULL => contents of new mkdir */
int isnewblk; /* entry is in a newly allocated block */
{
int offset; /* offset of new entry within directory block */
ufs_lbn_t lbn; /* block in directory containing new entry */
struct fs *fs;
struct diradd *dap;
struct allocdirect *adp;
struct pagedep *pagedep;
struct inodedep *inodedep;
struct newdirblk *newdirblk = 0;
struct mkdir *mkdir1, *mkdir2;
/*
* Whiteouts have no dependencies.
*/
if (newinum == WINO) {
if (newdirbp != NULL)
bdwrite(newdirbp);
return (0);
}
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
offset = blkoff(fs, diroffset);
MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
M_SOFTDEP_FLAGS|M_ZERO);
dap->da_list.wk_type = D_DIRADD;
dap->da_offset = offset;
dap->da_newinum = newinum;
dap->da_state = ATTACHED;
if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
M_NEWDIRBLK, M_SOFTDEP_FLAGS);
newdirblk->db_list.wk_type = D_NEWDIRBLK;
newdirblk->db_state = 0;
}
if (newdirbp == NULL) {
dap->da_state |= DEPCOMPLETE;
ACQUIRE_LOCK(&lk);
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_SOFTDEP_FLAGS);
mkdir1->md_list.wk_type = D_MKDIR;
mkdir1->md_state = MKDIR_BODY;
mkdir1->md_diradd = dap;
MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_SOFTDEP_FLAGS);
mkdir2->md_list.wk_type = D_MKDIR;
mkdir2->md_state = MKDIR_PARENT;
mkdir2->md_diradd = dap;
/*
* Dependency on "." and ".." being written to disk.
*/
mkdir1->md_buf = newdirbp;
ACQUIRE_LOCK(&lk);
LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
FREE_LOCK(&lk);
bdwrite(newdirbp);
/*
* Dependency on link count increase for parent directory
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
|| (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
dap->da_state &= ~MKDIR_PARENT;
WORKITEM_FREE(mkdir2, D_MKDIR);
} else {
LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
}
/*
* Link into parent directory pagedep to await its being written.
*/
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
/*
* Link into its inodedep. Put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
diradd_inode_written(dap, inodedep);
else
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
if (isnewblk) {
/*
* Directories growing into indirect blocks are rare
* enough and the frequency of new block allocation
* in those cases even more rare, that we choose not
* to bother tracking them. Rather we simply force the
* new directory entry to disk.
*/
if (lbn >= NDADDR) {
FREE_LOCK(&lk);
/*
* We only have a new allocation when at the
* beginning of a new block, not when we are
* expanding into an existing block.
*/
if (blkoff(fs, diroffset) == 0)
return (1);
return (0);
}
/*
* We only have a new allocation when at the beginning
* of a new fragment, not when we are expanding into an
* existing fragment. Also, there is nothing to do if we
* are already tracking this block.
*/
if (fragoff(fs, diroffset) != 0) {
FREE_LOCK(&lk);
return (0);
}
if ((pagedep->pd_state & NEWBLOCK) != 0) {
WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
FREE_LOCK(&lk);
return (0);
}
/*
* Find our associated allocdirect and have it track us.
*/
if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
panic("softdep_setup_directory_add: lost inodedep");
adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
if (adp == NULL || adp->ad_lbn != lbn) {
FREE_LOCK(&lk);
panic("softdep_setup_directory_add: lost entry");
}
pagedep->pd_state |= NEWBLOCK;
newdirblk->db_pagedep = pagedep;
WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
}
FREE_LOCK(&lk);
return (0);
}
/*
* This procedure is called to change the offset of a directory
* entry when compacting a directory block which must be owned
* exclusively by the caller. Note that the actual entry movement
* must be done in this procedure to ensure that no I/O completions
* occur while the move is in progress.
*/
void
softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
struct inode *dp; /* inode for directory */
caddr_t base; /* address of dp->i_offset */
caddr_t oldloc; /* address of old directory location */
caddr_t newloc; /* address of new directory location */
int entrysize; /* size of directory entry */
{
int offset, oldoffset, newoffset;
struct pagedep *pagedep;
struct diradd *dap;
ufs_lbn_t lbn;
ACQUIRE_LOCK(&lk);
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
goto done;
oldoffset = offset + (oldloc - base);
newoffset = offset + (newloc - base);
2001-02-04 12:37:48 +00:00
LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
if (dap->da_offset != oldoffset)
continue;
dap->da_offset = newoffset;
if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
break;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
dap, da_pdlist);
break;
}
if (dap == NULL) {
2001-02-04 12:37:48 +00:00
LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
if (dap->da_offset == oldoffset) {
dap->da_offset = newoffset;
break;
}
}
}
done:
bcopy(oldloc, newloc, entrysize);
FREE_LOCK(&lk);
}
/*
* Free a diradd dependency structure. This routine must be called
* with splbio interrupts blocked.
*/
static void
free_diradd(dap)
struct diradd *dap;
{
struct dirrem *dirrem;
struct pagedep *pagedep;
struct inodedep *inodedep;
struct mkdir *mkdir, *nextmd;
#ifdef DEBUG
if (lk.lkt_held == NOHOLDER)
panic("free_diradd: lock not held");
#endif
WORKLIST_REMOVE(&dap->da_list);
LIST_REMOVE(dap, da_pdlist);
if ((dap->da_state & DIRCHG) == 0) {
pagedep = dap->da_pagedep;
} else {
dirrem = dap->da_previous;
pagedep = dirrem->dm_pagedep;
dirrem->dm_dirinum = pagedep->pd_ino;
add_to_worklist(&dirrem->dm_list);
}
if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
0, &inodedep) != 0)
(void) free_inodedep(inodedep);
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
nextmd = LIST_NEXT(mkdir, md_mkdirs);
if (mkdir->md_diradd != dap)
continue;
dap->da_state &= ~mkdir->md_state;
WORKLIST_REMOVE(&mkdir->md_list);
LIST_REMOVE(mkdir, md_mkdirs);
WORKITEM_FREE(mkdir, D_MKDIR);
}
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
FREE_LOCK(&lk);
panic("free_diradd: unfound ref");
}
}
WORKITEM_FREE(dap, D_DIRADD);
}
/*
* Directory entry removal dependencies.
*
* When removing a directory entry, the entry's inode pointer must be
* zero'ed on disk before the corresponding inode's link count is decremented
* (possibly freeing the inode for re-use). This dependency is handled by
* updating the directory entry but delaying the inode count reduction until
* after the directory block has been written to disk. After this point, the
* inode count can be decremented whenever it is convenient.
*/
/*
* This routine should be called immediately after removing
* a directory entry. The inode's link count should not be
* decremented by the calling procedure -- the soft updates
* code will do this task when it is safe.
*/
void
softdep_setup_remove(bp, dp, ip, isrmdir)
struct buf *bp; /* buffer containing directory block */
struct inode *dp; /* inode for the directory being modified */
struct inode *ip; /* inode for directory entry being removed */
int isrmdir; /* indicates if doing RMDIR */
{
struct dirrem *dirrem, *prevdirrem;
/*
* Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
*/
dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
/*
* If the COMPLETE flag is clear, then there were no active
* entries and we want to roll back to a zeroed entry until
* the new inode is committed to disk. If the COMPLETE flag is
* set then we have deleted an entry that never made it to
* disk. If the entry we deleted resulted from a name change,
* then the old name still resides on disk. We cannot delete
* its inode (returned to us in prevdirrem) until the zeroed
* directory entry gets to disk. The new inode has never been
* referenced on the disk, so can be deleted immediately.
*/
if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
dm_next);
FREE_LOCK(&lk);
} else {
if (prevdirrem != NULL)
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
prevdirrem, dm_next);
dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
FREE_LOCK(&lk);
handle_workitem_remove(dirrem, NULL);
}
}
/*
* Allocate a new dirrem if appropriate and return it along with
* its associated pagedep. Called without a lock, returns with lock.
*/
static long num_dirrem; /* number of dirrem allocated */
static struct dirrem *
newdirrem(bp, dp, ip, isrmdir, prevdirremp)
struct buf *bp; /* buffer containing directory block */
struct inode *dp; /* inode for the directory being modified */
struct inode *ip; /* inode for directory entry being removed */
int isrmdir; /* indicates if doing RMDIR */
struct dirrem **prevdirremp; /* previously referenced inode, if any */
{
int offset;
ufs_lbn_t lbn;
struct diradd *dap;
struct dirrem *dirrem;
struct pagedep *pagedep;
/*
* Whiteouts have no deletion dependencies.
*/
if (ip == NULL)
panic("newdirrem: whiteout");
/*
* If we are over our limit, try to improve the situation.
* Limiting the number of dirrem structures will also limit
* the number of freefile and freeblks structures.
*/
if (num_dirrem > max_softdeps / 2)
(void) request_cleanup(FLUSH_REMOVE, 0);
num_dirrem += 1;
MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
dirrem->dm_list.wk_type = D_DIRREM;
dirrem->dm_state = isrmdir ? RMDIR : 0;
dirrem->dm_mnt = ITOV(ip)->v_mount;
dirrem->dm_oldinum = ip->i_number;
*prevdirremp = NULL;
ACQUIRE_LOCK(&lk);
lbn = lblkno(dp->i_fs, dp->i_offset);
offset = blkoff(dp->i_fs, dp->i_offset);
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dirrem->dm_pagedep = pagedep;
/*
* Check for a diradd dependency for the same directory entry.
* If present, then both dependencies become obsolete and can
* be de-allocated. Check for an entry on both the pd_dirraddhd
* list and the pd_pendinghd list.
*/
2001-02-04 12:37:48 +00:00
LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
if (dap->da_offset == offset)
break;
if (dap == NULL) {
2001-02-04 12:37:48 +00:00
LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
if (dap->da_offset == offset)
break;
if (dap == NULL)
return (dirrem);
}
/*
* Must be ATTACHED at this point.
*/
if ((dap->da_state & ATTACHED) == 0) {
FREE_LOCK(&lk);
panic("newdirrem: not ATTACHED");
}
if (dap->da_newinum != ip->i_number) {
FREE_LOCK(&lk);
panic("newdirrem: inum %d should be %d",
ip->i_number, dap->da_newinum);
}
/*
* If we are deleting a changed name that never made it to disk,
* then return the dirrem describing the previous inode (which
* represents the inode currently referenced from this entry on disk).
*/
if ((dap->da_state & DIRCHG) != 0) {
*prevdirremp = dap->da_previous;
dap->da_state &= ~DIRCHG;
dap->da_pagedep = pagedep;
}
/*
* We are deleting an entry that never made it to disk.
* Mark it COMPLETE so we can delete its inode immediately.
*/
dirrem->dm_state |= COMPLETE;
free_diradd(dap);
return (dirrem);
}
/*
* Directory entry change dependencies.
*
* Changing an existing directory entry requires that an add operation
* be completed first followed by a deletion. The semantics for the addition
* are identical to the description of adding a new entry above except
* that the rollback is to the old inode number rather than zero. Once
* the addition dependency is completed, the removal is done as described
* in the removal routine above.
*/
/*
* This routine should be called immediately after changing
* a directory entry. The inode's link count should not be
* decremented by the calling procedure -- the soft updates
* code will perform this task when it is safe.
*/
void
softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
struct buf *bp; /* buffer containing directory block */
struct inode *dp; /* inode for the directory being modified */
struct inode *ip; /* inode for directory entry being removed */
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ino_t newinum; /* new inode number for changed entry */
int isrmdir; /* indicates if doing RMDIR */
{
int offset;
struct diradd *dap = NULL;
struct dirrem *dirrem, *prevdirrem;
struct pagedep *pagedep;
struct inodedep *inodedep;
offset = blkoff(dp->i_fs, dp->i_offset);
/*
* Whiteouts do not need diradd dependencies.
*/
if (newinum != WINO) {
MALLOC(dap, struct diradd *, sizeof(struct diradd),
M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
dap->da_list.wk_type = D_DIRADD;
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
dap->da_offset = offset;
dap->da_newinum = newinum;
}
/*
* Allocate a new dirrem and ACQUIRE_LOCK.
*/
dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
pagedep = dirrem->dm_pagedep;
/*
* The possible values for isrmdir:
* 0 - non-directory file rename
* 1 - directory rename within same directory
* inum - directory rename to new directory of given inode number
* When renaming to a new directory, we are both deleting and
* creating a new directory entry, so the link count on the new
* directory should not change. Thus we do not need the followup
* dirrem which is usually done in handle_workitem_remove. We set
* the DIRCHG flag to tell handle_workitem_remove to skip the
* followup dirrem.
*/
if (isrmdir > 1)
dirrem->dm_state |= DIRCHG;
/*
* Whiteouts have no additional dependencies,
* so just put the dirrem on the correct list.
*/
if (newinum == WINO) {
if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
dm_next);
} else {
dirrem->dm_dirinum = pagedep->pd_ino;
add_to_worklist(&dirrem->dm_list);
}
FREE_LOCK(&lk);
return;
}
/*
* If the COMPLETE flag is clear, then there were no active
* entries and we want to roll back to the previous inode until
* the new inode is committed to disk. If the COMPLETE flag is
* set, then we have deleted an entry that never made it to disk.
* If the entry we deleted resulted from a name change, then the old
* inode reference still resides on disk. Any rollback that we do
* needs to be to that old inode (returned to us in prevdirrem). If
* the entry we deleted resulted from a create, then there is
* no entry on the disk, so we want to roll back to zero rather
* than the uncommitted inode. In either of the COMPLETE cases we
* want to immediately free the unwritten and unreferenced inode.
*/
if ((dirrem->dm_state & COMPLETE) == 0) {
dap->da_previous = dirrem;
} else {
if (prevdirrem != NULL) {
dap->da_previous = prevdirrem;
} else {
dap->da_state &= ~DIRCHG;
dap->da_pagedep = pagedep;
}
dirrem->dm_dirinum = pagedep->pd_ino;
add_to_worklist(&dirrem->dm_list);
}
/*
* Link into its inodedep. Put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
dap->da_state |= COMPLETE;
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
} else {
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
}
FREE_LOCK(&lk);
}
/*
* Called whenever the link count on an inode is changed.
* It creates an inode dependency so that the new reference(s)
* to the inode cannot be committed to disk until the updated
* inode has been written.
*/
void
softdep_change_linkcnt(ip)
struct inode *ip; /* the inode with the increased link count */
{
struct inodedep *inodedep;
ACQUIRE_LOCK(&lk);
(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
if (ip->i_nlink < ip->i_effnlink) {
FREE_LOCK(&lk);
panic("softdep_change_linkcnt: bad delta");
}
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
FREE_LOCK(&lk);
}
/*
* Called when the effective link count and the reference count
* on an inode drops to zero. At this point there are no names
* referencing the file in the filesystem and no active file
* references. The space associated with the file will be freed
* as soon as the necessary soft dependencies are cleared.
*/
void
softdep_releasefile(ip)
struct inode *ip; /* inode with the zero effective link count */
{
struct inodedep *inodedep;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
struct fs *fs;
int extblocks;
if (ip->i_effnlink > 0)
panic("softdep_filerelease: file still referenced");
/*
* We may be called several times as the real reference count
* drops to zero. We only want to account for the space once.
*/
if (ip->i_flag & IN_SPACECOUNTED)
return;
/*
* We have to deactivate a snapshot otherwise copyonwrites may
* add blocks and the cleanup may remove blocks after we have
* tried to account for them.
*/
if ((ip->i_flags & SF_SNAPSHOT) != 0)
ffs_snapremove(ITOV(ip));
/*
* If we are tracking an nlinkdelta, we have to also remember
* whether we accounted for the freed space yet.
*/
ACQUIRE_LOCK(&lk);
if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep)))
inodedep->id_state |= SPACECOUNTED;
FREE_LOCK(&lk);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
fs = ip->i_fs;
extblocks = 0;
if (fs->fs_magic == FS_UFS2_MAGIC)
extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
ip->i_fs->fs_pendinginodes += 1;
ip->i_flag |= IN_SPACECOUNTED;
}
/*
* This workitem decrements the inode's link count.
* If the link count reaches zero, the file is removed.
*/
static void
handle_workitem_remove(dirrem, xp)
struct dirrem *dirrem;
struct vnode *xp;
{
struct thread *td = curthread;
struct inodedep *inodedep;
struct vnode *vp;
struct inode *ip;
ino_t oldinum;
int error;
if ((vp = xp) == NULL &&
(error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, LK_EXCLUSIVE,
&vp)) != 0) {
softdep_error("handle_workitem_remove: vget", error);
return;
}
ip = VTOI(vp);
ACQUIRE_LOCK(&lk);
if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0){
FREE_LOCK(&lk);
panic("handle_workitem_remove: lost inodedep");
}
/*
* Normal file deletion.
*/
if ((dirrem->dm_state & RMDIR) == 0) {
ip->i_nlink--;
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink) {
FREE_LOCK(&lk);
panic("handle_workitem_remove: bad file delta");
}
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
FREE_LOCK(&lk);
vput(vp);
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
return;
}
/*
* Directory deletion. Decrement reference count for both the
* just deleted parent directory entry and the reference for ".".
* Next truncate the directory to length zero. When the
* truncation completes, arrange to have the reference count on
* the parent decremented to account for the loss of "..".
*/
ip->i_nlink -= 2;
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
if (ip->i_nlink < ip->i_effnlink) {
FREE_LOCK(&lk);
panic("handle_workitem_remove: bad dir delta");
}
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
FREE_LOCK(&lk);
if ((error = UFS_TRUNCATE(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
softdep_error("handle_workitem_remove: truncate", error);
/*
* Rename a directory to a new parent. Since, we are both deleting
* and creating a new directory entry, the link count on the new
* directory should not change. Thus we skip the followup dirrem.
*/
if (dirrem->dm_state & DIRCHG) {
vput(vp);
num_dirrem -= 1;
WORKITEM_FREE(dirrem, D_DIRREM);
return;
}
/*
* If the inodedep does not exist, then the zero'ed inode has
* been written to disk. If the allocated inode has never been
* written to disk, then the on-disk inode is zero'ed. In either
* case we can remove the file immediately.
*/
ACQUIRE_LOCK(&lk);
dirrem->dm_state = 0;
oldinum = dirrem->dm_oldinum;
dirrem->dm_oldinum = dirrem->dm_dirinum;
if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
check_inode_unwritten(inodedep)) {
FREE_LOCK(&lk);
vput(vp);
handle_workitem_remove(dirrem, NULL);
return;
}
WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
FREE_LOCK(&lk);
vput(vp);
}
/*
* Inode de-allocation dependencies.
*
* When an inode's link count is reduced to zero, it can be de-allocated. We
* found it convenient to postpone de-allocation until after the inode is
* written to disk with its new link count (zero). At this point, all of the
* on-disk inode's block pointers are nullified and, with careful dependency
* list ordering, all dependencies related to the inode will be satisfied and
* the corresponding dependency structures de-allocated. So, if/when the
* inode is reused, there will be no mixing of old dependencies with new
* ones. This artificial dependency is set up by the block de-allocation
* procedure above (softdep_setup_freeblocks) and completed by the
* following procedure.
*/
static void
handle_workitem_freefile(freefile)
struct freefile *freefile;
{
struct fs *fs;
struct inodedep *idp;
int error;
fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
#ifdef DEBUG
ACQUIRE_LOCK(&lk);
error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
FREE_LOCK(&lk);
if (error)
panic("handle_workitem_freefile: inodedep survived");
#endif
fs->fs_pendinginodes -= 1;
if ((error = ffs_freefile(fs, freefile->fx_devvp, freefile->fx_oldinum,
freefile->fx_mode)) != 0)
softdep_error("handle_workitem_freefile", error);
WORKITEM_FREE(freefile, D_FREEFILE);
}
static int
softdep_disk_prewrite(struct vnode *vp, struct buf *bp)
{
int error;
if (bp->b_iocmd != BIO_WRITE)
return (0);
if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
(bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
panic("softdep_disk_prewrite: bad I/O");
bp->b_flags &= ~B_VALIDSUSPWRT;
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_start(bp);
mp_fixme("This should require the vnode lock.");
if ((vp->v_vflag & VV_COPYONWRITE) &&
vp->v_rdev->si_snapdata != NULL &&
(error = (ffs_copyonwrite)(vp, bp)) != 0 &&
error != EOPNOTSUPP) {
2004-08-08 13:23:05 +00:00
bp->b_error = error;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
return (1);
}
return (0);
}
/*
* Disk writes.
*
* The dependency structures constructed above are most actively used when file
* system blocks are written to disk. No constraints are placed on when a
* block can be written, but unsatisfied update dependencies are made safe by
* modifying (or replacing) the source memory for the duration of the disk
* write. When the disk write completes, the memory block is again brought
* up-to-date.
*
* In-core inode structure reclamation.
*
* Because there are a finite number of "in-core" inode structures, they are
* reused regularly. By transferring all inode-related dependencies to the
* in-memory inode block and indexing them separately (via "inodedep"s), we
* can allow "in-core" inode structures to be reused at any time and avoid
* any increase in contention.
*
* Called just before entering the device driver to initiate a new disk I/O.
* The buffer must be locked, thus, no I/O completion operations can occur
* while we are manipulating its associated dependencies.
*/
static void
softdep_disk_io_initiation(bp)
struct buf *bp; /* structure describing disk write to occur */
{
struct worklist *wk, *nextwk;
struct indirdep *indirdep;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
struct inodedep *inodedep;
/*
* We only care about write operations. There should never
* be dependencies for reads.
*/
if (bp->b_iocmd == BIO_READ)
panic("softdep_disk_io_initiation: read");
/*
* Do any necessary pre-I/O processing.
*/
for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
nextwk = LIST_NEXT(wk, wk_list);
switch (wk->wk_type) {
case D_PAGEDEP:
initiate_write_filepage(WK_PAGEDEP(wk), bp);
continue;
case D_INODEDEP:
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
inodedep = WK_INODEDEP(wk);
if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
initiate_write_inodeblock_ufs1(inodedep, bp);
else
initiate_write_inodeblock_ufs2(inodedep, bp);
continue;
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
if (indirdep->ir_state & GOINGAWAY)
panic("disk_io_initiation: indirdep gone");
/*
* If there are no remaining dependencies, this
* will be writing the real pointers, so the
* dependency can be freed.
*/
if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
indirdep->ir_savebp->b_flags |=
B_INVAL | B_NOCACHE;
brelse(indirdep->ir_savebp);
/* inline expand WORKLIST_REMOVE(wk); */
wk->wk_state &= ~ONWORKLIST;
LIST_REMOVE(wk, wk_list);
WORKITEM_FREE(indirdep, D_INDIRDEP);
continue;
}
/*
* Replace up-to-date version with safe version.
*/
MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
Implement a low-memory deadlock solution. Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low memory situation but rather then block they instead continue to operate, then return resources to the memory pool instead of cache them or leave them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmatic. Better to properly extend PAGE_MASK first before inverting it for the 64 bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather then continue to corrupt the filesystem. The problem does not occur very often.. it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
M_INDIRDEP, M_SOFTDEP_FLAGS);
ACQUIRE_LOCK(&lk);
indirdep->ir_state &= ~ATTACHED;
indirdep->ir_state |= UNDONE;
bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
bcopy(indirdep->ir_savebp->b_data, bp->b_data,
bp->b_bcount);
FREE_LOCK(&lk);
continue;
case D_MKDIR:
case D_BMSAFEMAP:
case D_ALLOCDIRECT:
case D_ALLOCINDIR:
continue;
default:
panic("handle_disk_io_initiation: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
}
/*
* Called from within the procedure above to deal with unsatisfied
* allocation dependencies in a directory. The buffer must be locked,
* thus, no I/O completion operations can occur while we are
* manipulating its associated dependencies.
*/
static void
initiate_write_filepage(pagedep, bp)
struct pagedep *pagedep;
struct buf *bp;
{
struct diradd *dap;
struct direct *ep;
int i;
if (pagedep->pd_state & IOSTARTED) {
/*
* This can only happen if there is a driver that does not
* understand chaining. Here biodone will reissue the call
* to strategy for the incomplete buffers.
*/
printf("initiate_write_filepage: already started\n");
return;
}
pagedep->pd_state |= IOSTARTED;
ACQUIRE_LOCK(&lk);
for (i = 0; i < DAHASHSZ; i++) {
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
ep = (struct direct *)
((char *)bp->b_data + dap->da_offset);
if (ep->d_ino != dap->da_newinum) {
FREE_LOCK(&lk);
panic("%s: dir inum %d != new %d",
"initiate_write_filepage",
ep->d_ino, dap->da_newinum);
}
if (dap->da_state & DIRCHG)
ep->d_ino = dap->da_previous->dm_oldinum;
else
ep->d_ino = 0;
dap->da_state &= ~ATTACHED;
dap->da_state |= UNDONE;
}
}
FREE_LOCK(&lk);
}
/*
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
* Version of initiate_write_inodeblock that handles UFS1 dinodes.
* Note that any bug fixes made to this routine must be done in the
* version found below.
*
* Called from within the procedure above to deal with unsatisfied
* allocation dependencies in an inodeblock. The buffer must be
* locked, thus, no I/O completion operations can occur while we
* are manipulating its associated dependencies.
*/
static void
initiate_write_inodeblock_ufs1(inodedep, bp)
struct inodedep *inodedep;
struct buf *bp; /* The inode block */
{
struct allocdirect *adp, *lastadp;
struct ufs1_dinode *dp;
struct fs *fs;
ufs_lbn_t i, prevlbn = 0;
int deplist;
if (inodedep->id_state & IOSTARTED)
panic("initiate_write_inodeblock_ufs1: already started");
inodedep->id_state |= IOSTARTED;
fs = inodedep->id_fs;
dp = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
*/
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
if (inodedep->id_savedino1 != NULL)
panic("initiate_write_inodeblock_ufs1: I/O underway");
MALLOC(inodedep->id_savedino1, struct ufs1_dinode *,
sizeof(struct ufs1_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
*inodedep->id_savedino1 = *dp;
bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
return;
}
/*
* If no dependencies, then there is nothing to roll back.
*/
inodedep->id_savedsize = dp->di_size;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
inodedep->id_savedextsize = 0;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
return;
/*
* Set the dependencies to busy.
*/
ACQUIRE_LOCK(&lk);
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lbn order");
}
prevlbn = adp->ad_lbn;
if (adp->ad_lbn < NDADDR &&
dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
FREE_LOCK(&lk);
panic("%s: direct pointer #%jd mismatch %d != %jd",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"softdep_write_inodeblock",
(intmax_t)adp->ad_lbn,
dp->di_db[adp->ad_lbn],
(intmax_t)adp->ad_newblkno);
}
if (adp->ad_lbn >= NDADDR &&
dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
FREE_LOCK(&lk);
panic("%s: indirect pointer #%jd mismatch %d != %jd",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"softdep_write_inodeblock",
(intmax_t)adp->ad_lbn - NDADDR,
dp->di_ib[adp->ad_lbn - NDADDR],
(intmax_t)adp->ad_newblkno);
}
deplist |= 1 << adp->ad_lbn;
if ((adp->ad_state & ATTACHED) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
}
#endif /* DIAGNOSTIC */
adp->ad_state &= ~ATTACHED;
adp->ad_state |= UNDONE;
}
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
* might have fragments that were not the last block in the file
* which would corrupt the filesystem.
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
if (adp->ad_lbn >= NDADDR)
break;
dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lost dep1");
}
#endif /* DIAGNOSTIC */
dp->di_db[i] = 0;
}
for (i = 0; i < NIADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_ib[i] != 0 &&
(deplist & ((1 << NDADDR) << i)) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lost dep2");
}
#endif /* DIAGNOSTIC */
dp->di_ib[i] = 0;
}
FREE_LOCK(&lk);
return;
}
/*
* If we have zero'ed out the last allocated block of the file,
* roll back the size to the last currently allocated block.
* We know that this last allocated block is a full-sized as
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
for (i = lastadp->ad_lbn; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
}
/*
* The only dependencies are for indirect blocks.
*
* The file size for indirect block additions is not guaranteed.
* Such a guarantee would be non-trivial to achieve. The conventional
* synchronous write implementation also does not make this guarantee.
* Fsck should catch and fix discrepancies. Arguably, the file size
* can be over-estimated without destroying integrity when the file
* moves into the indirect blocks (i.e., is large). If we want to
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
dp->di_ib[adp->ad_lbn - NDADDR] = 0;
FREE_LOCK(&lk);
}
/*
* Version of initiate_write_inodeblock that handles UFS2 dinodes.
* Note that any bug fixes made to this routine must be done in the
* version found above.
*
* Called from within the procedure above to deal with unsatisfied
* allocation dependencies in an inodeblock. The buffer must be
* locked, thus, no I/O completion operations can occur while we
* are manipulating its associated dependencies.
*/
static void
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
initiate_write_inodeblock_ufs2(inodedep, bp)
struct inodedep *inodedep;
struct buf *bp; /* The inode block */
{
struct allocdirect *adp, *lastadp;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
struct ufs2_dinode *dp;
struct fs *fs;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
ufs_lbn_t i, prevlbn = 0;
int deplist;
if (inodedep->id_state & IOSTARTED)
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
panic("initiate_write_inodeblock_ufs2: already started");
inodedep->id_state |= IOSTARTED;
fs = inodedep->id_fs;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
dp = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, inodedep->id_ino);
/*
* If the bitmap is not yet written, then the allocated
* inode cannot be written to disk.
*/
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (inodedep->id_savedino2 != NULL)
panic("initiate_write_inodeblock_ufs2: I/O underway");
MALLOC(inodedep->id_savedino2, struct ufs2_dinode *,
sizeof(struct ufs2_dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
*inodedep->id_savedino2 = *dp;
bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
return;
}
/*
* If no dependencies, then there is nothing to roll back.
*/
inodedep->id_savedsize = dp->di_size;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
inodedep->id_savedextsize = dp->di_extsize;
if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
return;
/*
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
* Set the ext data dependencies to busy.
*/
ACQUIRE_LOCK(&lk);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lbn order");
}
prevlbn = adp->ad_lbn;
if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) {
FREE_LOCK(&lk);
panic("%s: direct pointer #%jd mismatch %jd != %jd",
"softdep_write_inodeblock",
(intmax_t)adp->ad_lbn,
(intmax_t)dp->di_extb[adp->ad_lbn],
(intmax_t)adp->ad_newblkno);
}
deplist |= 1 << adp->ad_lbn;
if ((adp->ad_state & ATTACHED) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
}
#endif /* DIAGNOSTIC */
adp->ad_state &= ~ATTACHED;
adp->ad_state |= UNDONE;
}
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
* might have fragments that were not the last block in the ext
* data which would corrupt the filesystem.
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lost dep1");
}
#endif /* DIAGNOSTIC */
dp->di_extb[i] = 0;
}
lastadp = NULL;
break;
}
/*
* If we have zero'ed out the last allocated block of the ext
* data, roll back the size to the last currently allocated block.
* We know that this last allocated block is a full-sized as
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
for (i = lastadp->ad_lbn; i >= 0; i--)
if (dp->di_extb[i] != 0)
break;
dp->di_extsize = (i + 1) * fs->fs_bsize;
}
/*
* Set the file data dependencies to busy.
*/
for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
#ifdef DIAGNOSTIC
if (deplist != 0 && prevlbn >= adp->ad_lbn) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: lbn order");
}
prevlbn = adp->ad_lbn;
if (adp->ad_lbn < NDADDR &&
dp->di_db[adp->ad_lbn] != adp->ad_newblkno) {
FREE_LOCK(&lk);
panic("%s: direct pointer #%jd mismatch %jd != %jd",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"softdep_write_inodeblock",
(intmax_t)adp->ad_lbn,
(intmax_t)dp->di_db[adp->ad_lbn],
(intmax_t)adp->ad_newblkno);
}
if (adp->ad_lbn >= NDADDR &&
dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) {
FREE_LOCK(&lk);
panic("%s indirect pointer #%jd mismatch %jd != %jd",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"softdep_write_inodeblock:",
(intmax_t)adp->ad_lbn - NDADDR,
(intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
(intmax_t)adp->ad_newblkno);
}
deplist |= 1 << adp->ad_lbn;
if ((adp->ad_state & ATTACHED) == 0) {
FREE_LOCK(&lk);
panic("softdep_write_inodeblock: Unknown state 0x%x",
adp->ad_state);
}
#endif /* DIAGNOSTIC */
adp->ad_state &= ~ATTACHED;
adp->ad_state |= UNDONE;
}
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
* might have fragments that were not the last block in the file
* which would corrupt the filesystem.
*/
for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
if (adp->ad_lbn >= NDADDR)
break;
dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
/* keep going until hitting a rollback to a frag */
if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
continue;
dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
FREE_LOCK(&lk);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
panic("softdep_write_inodeblock: lost dep2");
}
#endif /* DIAGNOSTIC */
dp->di_db[i] = 0;
}
for (i = 0; i < NIADDR; i++) {
#ifdef DIAGNOSTIC
if (dp->di_ib[i] != 0 &&
(deplist & ((1 << NDADDR) << i)) == 0) {
FREE_LOCK(&lk);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
panic("softdep_write_inodeblock: lost dep3");
}
#endif /* DIAGNOSTIC */
dp->di_ib[i] = 0;
}
FREE_LOCK(&lk);
return;
}
/*
* If we have zero'ed out the last allocated block of the file,
* roll back the size to the last currently allocated block.
* We know that this last allocated block is a full-sized as
* we already checked for fragments in the loop above.
*/
if (lastadp != NULL &&
dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
for (i = lastadp->ad_lbn; i >= 0; i--)
if (dp->di_db[i] != 0)
break;
dp->di_size = (i + 1) * fs->fs_bsize;
}
/*
* The only dependencies are for indirect blocks.
*
* The file size for indirect block additions is not guaranteed.
* Such a guarantee would be non-trivial to achieve. The conventional
* synchronous write implementation also does not make this guarantee.
* Fsck should catch and fix discrepancies. Arguably, the file size
* can be over-estimated without destroying integrity when the file
* moves into the indirect blocks (i.e., is large). If we want to
* postpone fsck, we are stuck with this argument.
*/
for (; adp; adp = TAILQ_NEXT(adp, ad_next))
dp->di_ib[adp->ad_lbn - NDADDR] = 0;
FREE_LOCK(&lk);
}
/*
* This routine is called during the completion interrupt
* service routine for a disk write (from the procedure called
2002-05-16 21:28:32 +00:00
* by the device driver to inform the filesystem caches of
* a request completion). It should be called early in this
* procedure, before the block is made available to other
* processes or other routines are called.
*/
static void
softdep_disk_write_complete(bp)
struct buf *bp; /* describes the completed disk write */
{
struct worklist *wk;
struct workhead reattach;
struct newblk *newblk;
struct allocindir *aip;
struct allocdirect *adp;
struct indirdep *indirdep;
struct inodedep *inodedep;
struct bmsafemap *bmsafemap;
/*
* If an error occurred while doing the write, then the data
* has not hit the disk and the dependencies cannot be unrolled.
*/
if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
return;
#ifdef DEBUG
if (lk.lkt_held != NOHOLDER)
panic("softdep_disk_write_complete: lock is held");
lk.lkt_held = SPECIAL_FLAG;
#endif
LIST_INIT(&reattach);
while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
WORKLIST_REMOVE(wk);
switch (wk->wk_type) {
case D_PAGEDEP:
if (handle_written_filepage(WK_PAGEDEP(wk), bp))
WORKLIST_INSERT(&reattach, wk);
continue;
case D_INODEDEP:
if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
WORKLIST_INSERT(&reattach, wk);
continue;
case D_BMSAFEMAP:
bmsafemap = WK_BMSAFEMAP(wk);
while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
newblk->nb_state |= DEPCOMPLETE;
newblk->nb_bmsafemap = NULL;
LIST_REMOVE(newblk, nb_deps);
}
while ((adp =
LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
adp->ad_state |= DEPCOMPLETE;
adp->ad_buf = NULL;
LIST_REMOVE(adp, ad_deps);
handle_allocdirect_partdone(adp);
}
while ((aip =
LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
aip->ai_state |= DEPCOMPLETE;
aip->ai_buf = NULL;
LIST_REMOVE(aip, ai_deps);
handle_allocindir_partdone(aip);
}
while ((inodedep =
LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
inodedep->id_state |= DEPCOMPLETE;
LIST_REMOVE(inodedep, id_deps);
inodedep->id_buf = NULL;
}
WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
continue;
case D_MKDIR:
handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
continue;
case D_ALLOCDIRECT:
adp = WK_ALLOCDIRECT(wk);
adp->ad_state |= COMPLETE;
handle_allocdirect_partdone(adp);
continue;
case D_ALLOCINDIR:
aip = WK_ALLOCINDIR(wk);
aip->ai_state |= COMPLETE;
handle_allocindir_partdone(aip);
continue;
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
if (indirdep->ir_state & GOINGAWAY) {
lk.lkt_held = NOHOLDER;
panic("disk_write_complete: indirdep gone");
}
bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
FREE(indirdep->ir_saveddata, M_INDIRDEP);
indirdep->ir_saveddata = 0;
indirdep->ir_state &= ~UNDONE;
indirdep->ir_state |= ATTACHED;
while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
handle_allocindir_partdone(aip);
if (aip == LIST_FIRST(&indirdep->ir_donehd)) {
lk.lkt_held = NOHOLDER;
panic("disk_write_complete: not gone");
}
}
WORKLIST_INSERT(&reattach, wk);
if ((bp->b_flags & B_DELWRI) == 0)
stat_indir_blk_ptrs++;
bdirty(bp);
continue;
default:
lk.lkt_held = NOHOLDER;
panic("handle_disk_write_complete: Unknown type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
/*
* Reattach any requests that must be redone.
*/
while ((wk = LIST_FIRST(&reattach)) != NULL) {
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&bp->b_dep, wk);
}
#ifdef DEBUG
if (lk.lkt_held != SPECIAL_FLAG)
panic("softdep_disk_write_complete: lock lost");
lk.lkt_held = NOHOLDER;
#endif
}
/*
* Called from within softdep_disk_write_complete above. Note that
* this routine is always called from interrupt level with further
* splbio interrupts blocked.
*/
static void
handle_allocdirect_partdone(adp)
struct allocdirect *adp; /* the completed allocdirect */
{
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
struct allocdirectlst *listhead;
struct allocdirect *listadp;
struct inodedep *inodedep;
long bsize, delay;
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
if (adp->ad_buf != NULL) {
lk.lkt_held = NOHOLDER;
panic("handle_allocdirect_partdone: dangling dep");
}
/*
* The on-disk inode cannot claim to be any larger than the last
* fragment that has been written. Otherwise, the on-disk inode
* might have fragments that were not the last block in the file
* which would corrupt the filesystem. Thus, we cannot free any
* allocdirects after one whose ad_oldblkno claims a fragment as
* these blocks must be rolled back to zero before writing the inode.
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
* We check the currently active set of allocdirects in id_inoupdt
* or id_extupdt as appropriate.
*/
inodedep = adp->ad_inodedep;
bsize = inodedep->id_fs->fs_bsize;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (adp->ad_state & EXTDATA)
listhead = &inodedep->id_extupdt;
else
listhead = &inodedep->id_inoupdt;
TAILQ_FOREACH(listadp, listhead, ad_next) {
/* found our block */
if (listadp == adp)
break;
/* continue if ad_oldlbn is not a fragment */
if (listadp->ad_oldsize == 0 ||
listadp->ad_oldsize == bsize)
continue;
/* hit a fragment */
return;
}
/*
* If we have reached the end of the current list without
* finding the just finished dependency, then it must be
* on the future dependency list. Future dependencies cannot
* be freed until they are moved to the current list.
*/
if (listadp == NULL) {
#ifdef DEBUG
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (adp->ad_state & EXTDATA)
listhead = &inodedep->id_newextupdt;
else
listhead = &inodedep->id_newinoupdt;
TAILQ_FOREACH(listadp, listhead, ad_next)
/* found our block */
if (listadp == adp)
break;
if (listadp == NULL) {
lk.lkt_held = NOHOLDER;
panic("handle_allocdirect_partdone: lost dep");
}
#endif /* DEBUG */
return;
}
/*
* If we have found the just finished dependency, then free
* it along with anything that follows it that is complete.
* If the inode still has a bitmap dependency, then it has
* never been written to disk, hence the on-disk inode cannot
* reference the old fragment so we can free it without delay.
*/
delay = (inodedep->id_state & DEPCOMPLETE);
for (; adp; adp = listadp) {
listadp = TAILQ_NEXT(adp, ad_next);
if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
free_allocdirect(listhead, adp, delay);
}
}
/*
* Called from within softdep_disk_write_complete above. Note that
* this routine is always called from interrupt level with further
* splbio interrupts blocked.
*/
static void
handle_allocindir_partdone(aip)
struct allocindir *aip; /* the completed allocindir */
{
struct indirdep *indirdep;
if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
return;
if (aip->ai_buf != NULL) {
lk.lkt_held = NOHOLDER;
panic("handle_allocindir_partdone: dangling dependency");
}
indirdep = aip->ai_indirdep;
if (indirdep->ir_state & UNDONE) {
LIST_REMOVE(aip, ai_next);
LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
return;
}
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (indirdep->ir_state & UFS1FMT)
((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
aip->ai_newblkno;
else
((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
aip->ai_newblkno;
LIST_REMOVE(aip, ai_next);
if (aip->ai_freefrag != NULL)
add_to_worklist(&aip->ai_freefrag->ff_list);
WORKITEM_FREE(aip, D_ALLOCINDIR);
}
/*
* Called from within softdep_disk_write_complete above to restore
* in-memory inode block contents to their most up-to-date state. Note
* that this routine is always called from interrupt level with further
* splbio interrupts blocked.
*/
static int
handle_written_inodeblock(inodedep, bp)
struct inodedep *inodedep;
struct buf *bp; /* buffer containing the inode block */
{
struct worklist *wk, *filefree;
struct allocdirect *adp, *nextadp;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
struct ufs1_dinode *dp1 = NULL;
struct ufs2_dinode *dp2 = NULL;
int hadchanges, fstype;
if ((inodedep->id_state & IOSTARTED) == 0) {
lk.lkt_held = NOHOLDER;
panic("handle_written_inodeblock: not started");
}
inodedep->id_state &= ~IOSTARTED;
inodedep->id_state |= COMPLETE;
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
fstype = UFS1;
dp1 = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
} else {
fstype = UFS2;
dp2 = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
}
/*
* If we had to rollback the inode allocation because of
* bitmaps being incomplete, then simply restore it.
* Keep the block dirty so that it will not be reclaimed until
* all associated dependencies have been cleared and the
* corresponding updates written to disk.
*/
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (inodedep->id_savedino1 != NULL) {
if (fstype == UFS1)
*dp1 = *inodedep->id_savedino1;
else
*dp2 = *inodedep->id_savedino2;
FREE(inodedep->id_savedino1, M_INODEDEP);
inodedep->id_savedino1 = NULL;
if ((bp->b_flags & B_DELWRI) == 0)
stat_inode_bitmap++;
bdirty(bp);
return (1);
}
/*
* Roll forward anything that had to be rolled back before
* the inode could be updated.
*/
hadchanges = 0;
for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED) {
lk.lkt_held = NOHOLDER;
panic("handle_written_inodeblock: new entry");
}
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (fstype == UFS1) {
if (adp->ad_lbn < NDADDR) {
if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) {
lk.lkt_held = NOHOLDER;
panic("%s %s #%jd mismatch %d != %jd",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"handle_written_inodeblock:",
"direct pointer",
(intmax_t)adp->ad_lbn,
dp1->di_db[adp->ad_lbn],
(intmax_t)adp->ad_oldblkno);
}
dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
} else {
if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) {
lk.lkt_held = NOHOLDER;
panic("%s: %s #%jd allocated as %d",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"handle_written_inodeblock",
"indirect pointer",
(intmax_t)adp->ad_lbn - NDADDR,
dp1->di_ib[adp->ad_lbn - NDADDR]);
}
dp1->di_ib[adp->ad_lbn - NDADDR] =
adp->ad_newblkno;
}
} else {
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (adp->ad_lbn < NDADDR) {
if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) {
lk.lkt_held = NOHOLDER;
panic("%s: %s #%jd %s %jd != %jd",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"handle_written_inodeblock",
"direct pointer",
(intmax_t)adp->ad_lbn, "mismatch",
(intmax_t)dp2->di_db[adp->ad_lbn],
(intmax_t)adp->ad_oldblkno);
}
dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
} else {
if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) {
lk.lkt_held = NOHOLDER;
panic("%s: %s #%jd allocated as %jd",
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
"handle_written_inodeblock",
"indirect pointer",
(intmax_t)adp->ad_lbn - NDADDR,
(intmax_t)
dp2->di_ib[adp->ad_lbn - NDADDR]);
}
dp2->di_ib[adp->ad_lbn - NDADDR] =
adp->ad_newblkno;
}
}
adp->ad_state &= ~UNDONE;
adp->ad_state |= ATTACHED;
hadchanges = 1;
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
nextadp = TAILQ_NEXT(adp, ad_next);
if (adp->ad_state & ATTACHED) {
lk.lkt_held = NOHOLDER;
panic("handle_written_inodeblock: new entry");
}
if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) {
lk.lkt_held = NOHOLDER;
panic("%s: direct pointers #%jd %s %jd != %jd",
"handle_written_inodeblock",
(intmax_t)adp->ad_lbn, "mismatch",
(intmax_t)dp2->di_extb[adp->ad_lbn],
(intmax_t)adp->ad_oldblkno);
}
dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
adp->ad_state &= ~UNDONE;
adp->ad_state |= ATTACHED;
hadchanges = 1;
}
if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
stat_direct_blk_ptrs++;
/*
* Reset the file size to its most up-to-date value.
*/
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) {
lk.lkt_held = NOHOLDER;
panic("handle_written_inodeblock: bad size");
}
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
if (fstype == UFS1) {
if (dp1->di_size != inodedep->id_savedsize) {
dp1->di_size = inodedep->id_savedsize;
hadchanges = 1;
}
} else {
if (dp2->di_size != inodedep->id_savedsize) {
dp2->di_size = inodedep->id_savedsize;
hadchanges = 1;
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (dp2->di_extsize != inodedep->id_savedextsize) {
dp2->di_extsize = inodedep->id_savedextsize;
hadchanges = 1;
}
}
inodedep->id_savedsize = -1;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
inodedep->id_savedextsize = -1;
/*
* If there were any rollbacks in the inode block, then it must be
* marked dirty so that its will eventually get written back in
* its correct form.
*/
if (hadchanges)
bdirty(bp);
/*
* Process any allocdirects that completed during the update.
*/
if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
handle_allocdirect_partdone(adp);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
handle_allocdirect_partdone(adp);
/*
* Process deallocations that were held pending until the
* inode had been written to disk. Freeing of the inode
* is delayed until after all blocks have been freed to
* avoid creation of new <vfsid, inum, lbn> triples
* before the old ones have been deleted.
*/
filefree = NULL;
while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
WORKLIST_REMOVE(wk);
switch (wk->wk_type) {
case D_FREEFILE:
/*
* We defer adding filefree to the worklist until
* all other additions have been made to ensure
* that it will be done after all the old blocks
* have been freed.
*/
if (filefree != NULL) {
lk.lkt_held = NOHOLDER;
panic("handle_written_inodeblock: filefree");
}
filefree = wk;
continue;
case D_MKDIR:
handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
continue;
case D_DIRADD:
diradd_inode_written(WK_DIRADD(wk), inodedep);
continue;
case D_FREEBLKS:
case D_FREEFRAG:
case D_DIRREM:
add_to_worklist(wk);
continue;
case D_NEWDIRBLK:
free_newdirblk(WK_NEWDIRBLK(wk));
continue;
default:
lk.lkt_held = NOHOLDER;
panic("handle_written_inodeblock: Unknown type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
if (filefree != NULL) {
if (free_inodedep(inodedep) == 0) {
lk.lkt_held = NOHOLDER;
panic("handle_written_inodeblock: live inodedep");
}
add_to_worklist(filefree);
return (0);
}
/*
* If no outstanding dependencies, free it.
*/
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (free_inodedep(inodedep) ||
(TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
TAILQ_FIRST(&inodedep->id_extupdt) == 0))
return (0);
return (hadchanges);
}
/*
* Process a diradd entry after its dependent inode has been written.
* This routine must be called with splbio interrupts blocked.
*/
static void
diradd_inode_written(dap, inodedep)
struct diradd *dap;
struct inodedep *inodedep;
{
struct pagedep *pagedep;
dap->da_state |= COMPLETE;
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
}
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
/*
* Handle the completion of a mkdir dependency.
*/
static void
handle_written_mkdir(mkdir, type)
struct mkdir *mkdir;
int type;
{
struct diradd *dap;
struct pagedep *pagedep;
if (mkdir->md_state != type) {
lk.lkt_held = NOHOLDER;
panic("handle_written_mkdir: bad type");
}
dap = mkdir->md_diradd;
dap->da_state &= ~type;
if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
dap->da_state |= DEPCOMPLETE;
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
}
LIST_REMOVE(mkdir, md_mkdirs);
WORKITEM_FREE(mkdir, D_MKDIR);
}
/*
* Called from within softdep_disk_write_complete above.
* A write operation was just completed. Removed inodes can
* now be freed and associated block pointers may be committed.
* Note that this routine is always called from interrupt level
* with further splbio interrupts blocked.
*/
static int
handle_written_filepage(pagedep, bp)
struct pagedep *pagedep;
struct buf *bp; /* buffer containing the written page */
{
struct dirrem *dirrem;
struct diradd *dap, *nextdap;
struct direct *ep;
int i, chgs;
if ((pagedep->pd_state & IOSTARTED) == 0) {
lk.lkt_held = NOHOLDER;
panic("handle_written_filepage: not started");
}
pagedep->pd_state &= ~IOSTARTED;
/*
* Process any directory removals that have been committed.
*/
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
LIST_REMOVE(dirrem, dm_next);
dirrem->dm_dirinum = pagedep->pd_ino;
add_to_worklist(&dirrem->dm_list);
}
/*
* Free any directory additions that have been committed.
* If it is a newly allocated block, we have to wait until
* the on-disk directory inode claims the new block.
*/
if ((pagedep->pd_state & NEWBLOCK) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
free_diradd(dap);
/*
* Uncommitted directory entries must be restored.
*/
for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
dap = nextdap) {
nextdap = LIST_NEXT(dap, da_pdlist);
if (dap->da_state & ATTACHED) {
lk.lkt_held = NOHOLDER;
panic("handle_written_filepage: attached");
}
ep = (struct direct *)
((char *)bp->b_data + dap->da_offset);
ep->d_ino = dap->da_newinum;
dap->da_state &= ~UNDONE;
dap->da_state |= ATTACHED;
chgs = 1;
/*
* If the inode referenced by the directory has
* been written out, then the dependency can be
* moved to the pending list.
*/
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
da_pdlist);
}
}
}
/*
* If there were any rollbacks in the directory, then it must be
* marked dirty so that its will eventually get written back in
* its correct form.
*/
if (chgs) {
if ((bp->b_flags & B_DELWRI) == 0)
stat_dir_entry++;
bdirty(bp);
return (1);
}
/*
* If we are not waiting for a new directory block to be
* claimed by its inode, then the pagedep will be freed.
* Otherwise it will remain to track any new entries on
* the page in case they are fsync'ed.
*/
if ((pagedep->pd_state & NEWBLOCK) == 0) {
LIST_REMOVE(pagedep, pd_hash);
WORKITEM_FREE(pagedep, D_PAGEDEP);
}
return (0);
}
/*
* Writing back in-core inode structures.
*
2002-05-16 21:28:32 +00:00
* The filesystem only accesses an inode's contents when it occupies an
* "in-core" inode structure. These "in-core" structures are separate from
* the page frames used to cache inode blocks. Only the latter are
* transferred to/from the disk. So, when the updated contents of the
* "in-core" inode structure are copied to the corresponding in-memory inode
* block, the dependencies are also transferred. The following procedure is
* called when copying a dirty "in-core" inode to a cached inode block.
*/
/*
* Called when an inode is loaded from disk. If the effective link count
* differed from the actual link count when it was last flushed, then we
* need to ensure that the correct effective link count is put back.
*/
void
softdep_load_inodeblock(ip)
struct inode *ip; /* the "in_core" copy of the inode */
{
struct inodedep *inodedep;
/*
* Check for alternate nlink count.
*/
ip->i_effnlink = ip->i_nlink;
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
return;
}
ip->i_effnlink -= inodedep->id_nlinkdelta;
if (inodedep->id_state & SPACECOUNTED)
ip->i_flag |= IN_SPACECOUNTED;
FREE_LOCK(&lk);
}
/*
* This routine is called just before the "in-core" inode
* information is to be copied to the in-memory inode block.
* Recall that an inode block contains several inodes. If
* the force flag is set, then the dependencies will be
* cleared so that the update can always be made. Note that
* the buffer is locked when this routine is called, so we
* will never be in the middle of writing the inode block
* to disk.
*/
void
softdep_update_inodeblock(ip, bp, waitfor)
struct inode *ip; /* the "in_core" copy of the inode */
struct buf *bp; /* the buffer containing the inode block */
int waitfor; /* nonzero => update must be allowed */
{
struct inodedep *inodedep;
struct worklist *wk;
struct buf *ibp;
int error;
/*
* If the effective link count is not equal to the actual link
* count, then we must track the difference in an inodedep while
* the inode is (potentially) tossed out of the cache. Otherwise,
* if there is no existing inodedep, then there are no dependencies
* to track.
*/
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
if (ip->i_effnlink != ip->i_nlink)
panic("softdep_update_inodeblock: bad link count");
return;
}
if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) {
FREE_LOCK(&lk);
panic("softdep_update_inodeblock: bad delta");
}
/*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
*/
inodedep->id_state &= ~COMPLETE;
if ((inodedep->id_state & ONWORKLIST) == 0)
WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
/*
* Any new dependencies associated with the incore inode must
* now be moved to the list associated with the buffer holding
* the in-memory copy of the inode. Once merged process any
* allocdirects that are completed by the merger.
*/
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL)
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
/*
* Now that the inode has been pushed into the buffer, the
* operations dependent on the inode being written to disk
* can be moved to the id_bufwait so that they will be
* processed when the buffer I/O completes.
*/
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&inodedep->id_bufwait, wk);
}
/*
* Newly allocated inodes cannot be written until the bitmap
* that allocates them have been written (indicated by
* DEPCOMPLETE being set in id_state). If we are doing a
* forced sync (e.g., an fsync on a file), we force the bitmap
* to be written so that the update can be done.
*/
if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
FREE_LOCK(&lk);
return;
}
ibp = inodedep->id_buf;
ibp = getdirtybuf(&ibp, NULL, MNT_WAIT);
FREE_LOCK(&lk);
if (ibp && (error = bwrite(ibp)) != 0)
softdep_error("softdep_update_inodeblock: bwrite", error);
if ((inodedep->id_state & DEPCOMPLETE) == 0)
panic("softdep_update_inodeblock: update failed");
}
/*
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
* Merge the a new inode dependency list (such as id_newinoupdt) into an
* old inode dependency list (such as id_inoupdt). This routine must be
* called with splbio interrupts blocked.
*/
static void
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
merge_inode_lists(newlisthead, oldlisthead)
struct allocdirectlst *newlisthead;
struct allocdirectlst *oldlisthead;
{
struct allocdirect *listadp, *newadp;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
newadp = TAILQ_FIRST(newlisthead);
for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
if (listadp->ad_lbn < newadp->ad_lbn) {
listadp = TAILQ_NEXT(listadp, ad_next);
continue;
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
TAILQ_REMOVE(newlisthead, newadp, ad_next);
TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
if (listadp->ad_lbn == newadp->ad_lbn) {
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
allocdirect_merge(oldlisthead, newadp,
listadp);
listadp = newadp;
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
newadp = TAILQ_FIRST(newlisthead);
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
TAILQ_REMOVE(newlisthead, newadp, ad_next);
TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
}
}
/*
* If we are doing an fsync, then we must ensure that any directory
* entries for the inode have been written after the inode gets to disk.
*/
int
softdep_fsync(vp)
struct vnode *vp; /* the "in_core" copy of the inode */
{
struct inodedep *inodedep;
struct pagedep *pagedep;
struct worklist *wk;
struct diradd *dap;
struct mount *mnt;
struct vnode *pvp;
struct inode *ip;
struct buf *bp;
struct fs *fs;
struct thread *td = curthread;
int error, flushparent;
ino_t parentino;
ufs_lbn_t lbn;
ip = VTOI(vp);
fs = ip->i_fs;
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
return (0);
}
if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
FREE_LOCK(&lk);
panic("softdep_fsync: pending ops");
}
for (error = 0, flushparent = 0; ; ) {
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
break;
if (wk->wk_type != D_DIRADD) {
FREE_LOCK(&lk);
panic("softdep_fsync: Unexpected type %s",
TYPENAME(wk->wk_type));
}
dap = WK_DIRADD(wk);
/*
* Flush our parent if this directory entry has a MKDIR_PARENT
* dependency or is contained in a newly allocated block.
*/
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
mnt = pagedep->pd_mnt;
parentino = pagedep->pd_ino;
lbn = pagedep->pd_lbn;
if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
FREE_LOCK(&lk);
panic("softdep_fsync: dirty");
}
if ((dap->da_state & MKDIR_PARENT) ||
(pagedep->pd_state & NEWBLOCK))
flushparent = 1;
else
flushparent = 0;
/*
* If we are being fsync'ed as part of vgone'ing this vnode,
* then we will not be able to release and recover the
* vnode below, so we just have to give up on writing its
* directory entry out. It will eventually be written, just
* not now, but then the user was not asking to have it
* written, so we are not breaking any promises.
*/
if (vp->v_iflag & VI_XLOCK)
break;
/*
* We prevent deadlock by always fetching inodes from the
* root, moving down the directory tree. Thus, when fetching
* our parent directory, we first try to get the lock. If
* that fails, we must unlock ourselves before requesting
* the lock on our parent. See the comment in ufs_lookup
* for details on possible races.
*/
FREE_LOCK(&lk);
if (VFS_VGET(mnt, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
VOP_UNLOCK(vp, 0, td);
error = VFS_VGET(mnt, parentino, LK_EXCLUSIVE, &pvp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
if (error != 0)
return (error);
}
/*
* All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
* that are contained in direct blocks will be resolved by
* doing a UFS_UPDATE. Pagedeps contained in indirect blocks
* may require a complete sync'ing of the directory. So, we
* try the cheap and fast UFS_UPDATE first, and if that fails,
* then we do the slower VOP_FSYNC of the directory.
*/
if (flushparent) {
if ((error = UFS_UPDATE(pvp, 1)) != 0) {
vput(pvp);
return (error);
}
if ((pagedep->pd_state & NEWBLOCK) &&
(error = VOP_FSYNC(pvp, td->td_ucred, MNT_WAIT, td))) {
vput(pvp);
return (error);
}
}
/*
* Flush directory page containing the inode's name.
*/
error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
&bp);
if (error == 0)
error = bwrite(bp);
else
brelse(bp);
vput(pvp);
if (error != 0)
return (error);
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
break;
}
FREE_LOCK(&lk);
return (0);
}
/*
* Flush all the dirty bitmaps associated with the block device
* before flushing the rest of the dirty blocks so as to reduce
* the number of dependencies that will have to be rolled back.
*/
void
softdep_fsync_mountdev(vp)
struct vnode *vp;
{
struct buf *bp, *nbp;
struct worklist *wk;
if (!vn_isdisk(vp, NULL))
panic("softdep_fsync_mountdev: vnode not a disk");
ACQUIRE_LOCK(&lk);
VI_LOCK(vp);
TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
/*
* If it is already scheduled, skip to the next buffer.
*/
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
continue;
if ((bp->b_flags & B_DELWRI) == 0) {
FREE_LOCK(&lk);
panic("softdep_fsync_mountdev: not dirty");
}
/*
* We are only interested in bitmaps with outstanding
* dependencies.
*/
if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
wk->wk_type != D_BMSAFEMAP ||
(bp->b_vflags & BV_BKGRDINPROG)) {
BUF_UNLOCK(bp);
continue;
}
VI_UNLOCK(vp);
bremfree(bp);
FREE_LOCK(&lk);
(void) bawrite(bp);
ACQUIRE_LOCK(&lk);
/*
* Since we may have slept during the I/O, we need
* to start from a known point.
*/
VI_LOCK(vp);
nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd);
}
drain_output(vp, 1);
VI_UNLOCK(vp);
FREE_LOCK(&lk);
}
/*
* This routine is called when we are trying to synchronously flush a
* file. This routine must eliminate any filesystem metadata dependencies
* so that the syncing routine can succeed by pushing the dirty blocks
* associated with the file. If any I/O errors occur, they are returned.
*/
int
softdep_sync_metadata(ap)
struct vop_fsync_args /* {
struct vnode *a_vp;
struct ucred *a_cred;
int a_waitfor;
struct thread *a_td;
} */ *ap;
{
struct vnode *vp = ap->a_vp;
struct pagedep *pagedep;
struct allocdirect *adp;
struct allocindir *aip;
struct buf *bp, *nbp;
struct worklist *wk;
int i, error, waitfor;
/*
* Check whether this vnode is involved in a filesystem
* that is doing soft dependency processing.
*/
if (!vn_isdisk(vp, NULL)) {
if (!DOINGSOFTDEP(vp))
return (0);
} else
if (vp->v_rdev->si_mountpoint == NULL ||
(vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
return (0);
/*
* Ensure that any direct block dependencies have been cleared.
*/
ACQUIRE_LOCK(&lk);
if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
FREE_LOCK(&lk);
return (error);
}
/*
* For most files, the only metadata dependencies are the
* cylinder group maps that allocate their inode or blocks.
* The block allocation dependencies can be found by traversing
* the dependency lists for any buffers that remain on their
* dirty buffer list. The inode allocation dependency will
* be resolved when the inode is updated with MNT_WAIT.
* This work is done in two passes. The first pass grabs most
* of the buffers and begins asynchronously writing them. The
* only way to wait for these asynchronous writes is to sleep
* on the filesystem vnode which may stay busy for a long time
* if the filesystem is active. So, instead, we make a second
* pass over the dependencies blocking on each write. In the
* usual case we will be blocking against a write that we
* initiated, so when it is done the dependency will have been
* resolved. Thus the second pass is expected to end quickly.
*/
waitfor = MNT_NOWAIT;
top:
/*
* We must wait for any I/O in progress to finish so that
* all potential buffers on the dirty list will be visible.
*/
VI_LOCK(vp);
drain_output(vp, 1);
bp = getdirtybuf(&TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd),
VI_MTX(vp), MNT_WAIT);
if (bp == NULL) {
VI_UNLOCK(vp);
FREE_LOCK(&lk);
return (0);
}
/* While syncing snapshots, we must allow recursive lookups */
bp->b_lock.lk_flags |= LK_CANRECURSE;
loop:
/*
* As we hold the buffer locked, none of its dependencies
* will disappear.
*/
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
switch (wk->wk_type) {
case D_ALLOCDIRECT:
adp = WK_ALLOCDIRECT(wk);
if (adp->ad_state & DEPCOMPLETE)
continue;
nbp = adp->ad_buf;
nbp = getdirtybuf(&nbp, NULL, waitfor);
if (nbp == NULL)
continue;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
} else if ((error = bwrite(nbp)) != 0) {
break;
}
ACQUIRE_LOCK(&lk);
continue;
case D_ALLOCINDIR:
aip = WK_ALLOCINDIR(wk);
if (aip->ai_state & DEPCOMPLETE)
continue;
nbp = aip->ai_buf;
nbp = getdirtybuf(&nbp, NULL, waitfor);
if (nbp == NULL)
continue;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
} else if ((error = bwrite(nbp)) != 0) {
break;
}
ACQUIRE_LOCK(&lk);
continue;
case D_INDIRDEP:
restart:
2001-02-04 12:37:48 +00:00
LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
if (aip->ai_state & DEPCOMPLETE)
continue;
nbp = aip->ai_buf;
nbp = getdirtybuf(&nbp, NULL, MNT_WAIT);
if (nbp == NULL)
goto restart;
FREE_LOCK(&lk);
if ((error = bwrite(nbp)) != 0) {
break;
}
ACQUIRE_LOCK(&lk);
goto restart;
}
continue;
case D_INODEDEP:
if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
WK_INODEDEP(wk)->id_ino)) != 0) {
FREE_LOCK(&lk);
break;
}
continue;
case D_PAGEDEP:
/*
* We are trying to sync a directory that may
* have dependencies on both its own metadata
* and/or dependencies on the inodes of any
* recently allocated files. We walk its diradd
* lists pushing out the associated inode.
*/
pagedep = WK_PAGEDEP(wk);
for (i = 0; i < DAHASHSZ; i++) {
if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
continue;
if ((error =
flush_pagedep_deps(vp, pagedep->pd_mnt,
&pagedep->pd_diraddhd[i]))) {
FREE_LOCK(&lk);
break;
}
}
continue;
case D_MKDIR:
/*
* This case should never happen if the vnode has
* been properly sync'ed. However, if this function
* is used at a place where the vnode has not yet
* been sync'ed, this dependency can show up. So,
* rather than panic, just flush it.
*/
nbp = WK_MKDIR(wk)->md_buf;
nbp = getdirtybuf(&nbp, NULL, waitfor);
if (nbp == NULL)
continue;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
} else if ((error = bwrite(nbp)) != 0) {
break;
}
ACQUIRE_LOCK(&lk);
continue;
case D_BMSAFEMAP:
/*
* This case should never happen if the vnode has
* been properly sync'ed. However, if this function
* is used at a place where the vnode has not yet
* been sync'ed, this dependency can show up. So,
* rather than panic, just flush it.
*/
nbp = WK_BMSAFEMAP(wk)->sm_buf;
nbp = getdirtybuf(&nbp, NULL, waitfor);
if (nbp == NULL)
continue;
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
} else if ((error = bwrite(nbp)) != 0) {
break;
}
ACQUIRE_LOCK(&lk);
continue;
default:
FREE_LOCK(&lk);
panic("softdep_sync_metadata: Unknown type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
/* We reach here only in error and unlocked */
if (error == 0)
panic("softdep_sync_metadata: zero error");
bp->b_lock.lk_flags &= ~LK_CANRECURSE;
bawrite(bp);
return (error);
}
VI_LOCK(vp);
nbp = getdirtybuf(&TAILQ_NEXT(bp, b_bobufs), VI_MTX(vp), MNT_WAIT);
if (nbp == NULL)
VI_UNLOCK(vp);
FREE_LOCK(&lk);
bp->b_lock.lk_flags &= ~LK_CANRECURSE;
bawrite(bp);
ACQUIRE_LOCK(&lk);
if (nbp != NULL) {
bp = nbp;
goto loop;
}
/*
* The brief unlock is to allow any pent up dependency
* processing to be done. Then proceed with the second pass.
*/
if (waitfor == MNT_NOWAIT) {
waitfor = MNT_WAIT;
FREE_LOCK(&lk);
ACQUIRE_LOCK(&lk);
goto top;
}
/*
* If we have managed to get rid of all the dirty buffers,
* then we are done. For certain directories and block
* devices, we may need to do further work.
*
* We must wait for any I/O in progress to finish so that
* all potential buffers on the dirty list will be visible.
*/
VI_LOCK(vp);
drain_output(vp, 1);
if (vp->v_bufobj.bo_dirty.bv_cnt == 0) {
VI_UNLOCK(vp);
FREE_LOCK(&lk);
return (0);
}
VI_UNLOCK(vp);
FREE_LOCK(&lk);
/*
* If we are trying to sync a block device, some of its buffers may
* contain metadata that cannot be written until the contents of some
* partially written files have been written to disk. The only easy
* way to accomplish this is to sync the entire filesystem (luckily
* this happens rarely).
*/
if (vn_isdisk(vp, NULL) &&
vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
(error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
ap->a_td)) != 0)
return (error);
return (0);
}
/*
* Flush the dependencies associated with an inodedep.
* Called with splbio blocked.
*/
static int
flush_inodedep_deps(fs, ino)
struct fs *fs;
ino_t ino;
{
struct inodedep *inodedep;
int error, waitfor;
/*
* This work is done in two passes. The first pass grabs most
* of the buffers and begins asynchronously writing them. The
* only way to wait for these asynchronous writes is to sleep
* on the filesystem vnode which may stay busy for a long time
* if the filesystem is active. So, instead, we make a second
* pass over the dependencies blocking on each write. In the
* usual case we will be blocking against a write that we
* initiated, so when it is done the dependency will have been
* resolved. Thus the second pass is expected to end quickly.
* We give a brief window at the top of the loop to allow
* any pending I/O to complete.
*/
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
for (error = 0, waitfor = MNT_NOWAIT; ; ) {
if (error)
return (error);
FREE_LOCK(&lk);
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
return (0);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
continue;
/*
* If pass2, we are done, otherwise do pass 2.
*/
if (waitfor == MNT_WAIT)
break;
waitfor = MNT_WAIT;
}
/*
* Try freeing inodedep in case all dependencies have been removed.
*/
if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
(void) free_inodedep(inodedep);
return (0);
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
/*
* Flush an inode dependency list.
* Called with splbio blocked.
*/
static int
flush_deplist(listhead, waitfor, errorp)
struct allocdirectlst *listhead;
int waitfor;
int *errorp;
{
struct allocdirect *adp;
struct buf *bp;
TAILQ_FOREACH(adp, listhead, ad_next) {
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
bp = getdirtybuf(&bp, NULL, waitfor);
if (bp == NULL) {
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (waitfor == MNT_NOWAIT)
continue;
return (1);
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);
} else if ((*errorp = bwrite(bp)) != 0) {
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
ACQUIRE_LOCK(&lk);
return (1);
}
ACQUIRE_LOCK(&lk);
return (1);
}
return (0);
}
/*
* Eliminate a pagedep dependency by flushing out all its diradd dependencies.
* Called with splbio blocked.
*/
static int
flush_pagedep_deps(pvp, mp, diraddhdp)
struct vnode *pvp;
struct mount *mp;
struct diraddhd *diraddhdp;
{
struct thread *td = curthread;
struct inodedep *inodedep;
struct ufsmount *ump;
struct diradd *dap;
struct vnode *vp;
int error = 0;
struct buf *bp;
ino_t inum;
ump = VFSTOUFS(mp);
while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
/*
* Flush ourselves if this directory entry
* has a MKDIR_PARENT dependency.
*/
if (dap->da_state & MKDIR_PARENT) {
FREE_LOCK(&lk);
if ((error = UFS_UPDATE(pvp, 1)) != 0)
break;
ACQUIRE_LOCK(&lk);
/*
* If that cleared dependencies, go on to next.
*/
if (dap != LIST_FIRST(diraddhdp))
continue;
if (dap->da_state & MKDIR_PARENT) {
FREE_LOCK(&lk);
panic("flush_pagedep_deps: MKDIR_PARENT");
}
}
/*
* A newly allocated directory must have its "." and
* ".." entries written out before its name can be
* committed in its parent. We do not want or need
* the full semantics of a synchronous VOP_FSYNC as
* that may end up here again, once for each directory
* level in the filesystem. Instead, we push the blocks
* and wait for them to clear. We have to fsync twice
* because the first call may choose to defer blocks
* that still have dependencies, but deferral will
* happen at most once.
*/
inum = dap->da_newinum;
if (dap->da_state & MKDIR_BODY) {
FREE_LOCK(&lk);
if ((error = VFS_VGET(mp, inum, LK_EXCLUSIVE, &vp)))
break;
if ((error=VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)) ||
(error=VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td))) {
vput(vp);
break;
}
VI_LOCK(vp);
drain_output(vp, 0);
VI_UNLOCK(vp);
vput(vp);
ACQUIRE_LOCK(&lk);
/*
* If that cleared dependencies, go on to next.
*/
if (dap != LIST_FIRST(diraddhdp))
continue;
if (dap->da_state & MKDIR_BODY) {
FREE_LOCK(&lk);
panic("flush_pagedep_deps: MKDIR_BODY");
}
}
/*
* Flush the inode on which the directory entry depends.
* Having accounted for MKDIR_PARENT and MKDIR_BODY above,
* the only remaining dependency is that the updated inode
* count must get pushed to disk. The inode has already
* been pushed into its inode buffer (via VOP_UPDATE) at
* the time of the reference count change. So we need only
* locate that buffer, ensure that there will be no rollback
* caused by a bitmap dependency, then write the inode buffer.
*/
if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
FREE_LOCK(&lk);
panic("flush_pagedep_deps: lost inode");
}
/*
* If the inode still has bitmap dependencies,
* push them to disk.
*/
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
bp = inodedep->id_buf;
bp = getdirtybuf(&bp, NULL, MNT_WAIT);
FREE_LOCK(&lk);
if (bp && (error = bwrite(bp)) != 0)
break;
ACQUIRE_LOCK(&lk);
if (dap != LIST_FIRST(diraddhdp))
continue;
}
/*
* If the inode is still sitting in a buffer waiting
* to be written, push it to disk.
*/
FREE_LOCK(&lk);
if ((error = bread(ump->um_devvp,
fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
(int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
brelse(bp);
break;
}
if ((error = bwrite(bp)) != 0)
break;
ACQUIRE_LOCK(&lk);
/*
* If we have failed to get rid of all the dependencies
* then something is seriously wrong.
*/
if (dap == LIST_FIRST(diraddhdp)) {
FREE_LOCK(&lk);
panic("flush_pagedep_deps: flush failed");
}
}
if (error)
ACQUIRE_LOCK(&lk);
return (error);
}
/*
* A large burst of file addition or deletion activity can drive the
* memory load excessively high. First attempt to slow things down
* using the techniques below. If that fails, this routine requests
* the offending operations to fall back to running synchronously
* until the memory load returns to a reasonable level.
*/
int
softdep_slowdown(vp)
struct vnode *vp;
{
int max_softdeps_hard;
max_softdeps_hard = max_softdeps * 11 / 10;
if (num_dirrem < max_softdeps_hard / 2 &&
num_inodedep < max_softdeps_hard &&
VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps)
return (0);
if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
speedup_syncer();
stat_sync_limit_hit += 1;
return (1);
}
/*
* Called by the allocation routines when they are about to fail
* in the hope that we can free up some disk space.
*
* First check to see if the work list has anything on it. If it has,
* clean up entries until we successfully free some space. Because this
* process holds inodes locked, we cannot handle any remove requests
* that might block on a locked inode as that could lead to deadlock.
* If the worklist yields no free space, encourage the syncer daemon
* to help us. In no event will we try for longer than tickdelay seconds.
*/
int
softdep_request_cleanup(fs, vp)
struct fs *fs;
struct vnode *vp;
{
This commit adds basic support for the UFS2 filesystem. The UFS2 filesystem expands the inode to 256 bytes to make space for 64-bit block pointers. It also adds a file-creation time field, an ability to use jumbo blocks per inode to allow extent like pointer density, and space for extended attributes (up to twice the filesystem block size worth of attributes, e.g., on a 16K filesystem, there is space for 32K of attributes). UFS2 fully supports and runs existing UFS1 filesystems. New filesystems built using newfs can be built in either UFS1 or UFS2 format using the -O option. In this commit UFS1 is the default format, so if you want to build UFS2 format filesystems, you must specify -O 2. This default will be changed to UFS2 when UFS2 proves itself to be stable. In this commit the boot code for reading UFS2 filesystems is not compiled (see /sys/boot/common/ufsread.c) as there is insufficient space in the boot block. Once the size of the boot block is increased, this code can be defined. Things to note: the definition of SBSIZE has changed to SBLOCKSIZE. The header file <ufs/ufs/dinode.h> must be included before <ufs/ffs/fs.h> so as to get the definitions of ufs2_daddr_t and ufs_lbn_t. Still TODO: Verify that the first level bootstraps work for all the architectures. Convert the utility ffsinfo to understand UFS2 and test growfs. Add support for the extended attribute storage. Update soft updates to ensure integrity of extended attribute storage. Switch the current extended attribute interfaces to use the extended attribute storage. Add the extent like functionality (framework is there, but is currently never used). Sponsored by: DARPA & NAI Labs. Reviewed by: Poul-Henning Kamp <phk@freebsd.org>
2002-06-21 06:18:05 +00:00
long starttime;
ufs2_daddr_t needed;
needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
starttime = time_second + tickdelay;
/*
* If we are being called because of a process doing a
* copy-on-write, then it is not safe to update the vnode
* as we may recurse into the copy-on-write routine.
*/
if (!(curthread->td_pflags & TDP_COWINPROGRESS) &&
UFS_UPDATE(vp, 1) != 0)
return (0);
while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
if (time_second > starttime)
return (0);
if (num_on_worklist > 0 &&
process_worklist_item(NULL, LK_NOWAIT) != -1) {
stat_worklist_push += 1;
continue;
}
request_cleanup(FLUSH_REMOVE_WAIT, 0);
}
return (1);
}
/*
* If memory utilization has gotten too high, deliberately slow things
* down and speed up the I/O processing.
*/
static int
request_cleanup(resource, islocked)
int resource;
int islocked;
{
struct thread *td = curthread;
/*
* We never hold up the filesystem syncer process.
*/
if (td == filesys_syncer)
return (0);
/*
* First check to see if the work list has gotten backlogged.
* If it has, co-opt this process to help clean up two entries.
* Because this process may hold inodes locked, we cannot
* handle any remove requests that might block on a locked
* inode as that could lead to deadlock.
*/
if (num_on_worklist > max_softdeps / 10) {
This patch corrects two problems with the rate limiting code that was introduced in revision 1.80. The problem manifested itself with a `locking against myself' panic and could also result in soft updates inconsistences associated with inodedeps. The two problems are: 1) One of the background operations could manipulate the bitmap while holding it locked with intent to create. This held lock results in a `locking against myself' panic, when the background processing that we have been coopted to do tries to lock the bitmap which we are already holding locked. To understand how to fix this problem, first, observe that we can do the background cleanups in inodedep_lookup only when allocating inodedeps (DEPALLOC is set in the call to inodedep_lookup). Second observe that calls to inodedep_lookup with DEPALLOC set can only happen from the following calls into the softdep code: softdep_setup_inomapdep softdep_setup_allocdirect softdep_setup_remove softdep_setup_freeblocks softdep_setup_directory_change softdep_setup_directory_add softdep_change_linkcnt Only the first two of these can come from ffs_alloc.c while holding a bitmap locked. Thus, inodedep_lookup must not go off to do request_cleanups when being called from these functions. This change adds a flag, NODELAY, that can be passed to inodedep_lookup to let it know that it should not do background processing in those cases. 2) The return value from request_cleanup when helping out with the cleanup was 0 instead of 1. This meant that despite the fact that we may have slept while doing the cleanups, the code did not recheck for the appearance of an inodedep (e.g., goto top in inodedep_lookup). This lead to the softdep inconsistency in which we ended up with two inodedep's for the same inode. Reviewed by: Peter Wemm <peter@yahoo-inc.com>, Matt Dillon <dillon@earth.backplane.com>
2001-02-20 11:14:38 +00:00
if (islocked)
FREE_LOCK(&lk);
process_worklist_item(NULL, LK_NOWAIT);
process_worklist_item(NULL, LK_NOWAIT);
stat_worklist_push += 2;
This patch corrects two problems with the rate limiting code that was introduced in revision 1.80. The problem manifested itself with a `locking against myself' panic and could also result in soft updates inconsistences associated with inodedeps. The two problems are: 1) One of the background operations could manipulate the bitmap while holding it locked with intent to create. This held lock results in a `locking against myself' panic, when the background processing that we have been coopted to do tries to lock the bitmap which we are already holding locked. To understand how to fix this problem, first, observe that we can do the background cleanups in inodedep_lookup only when allocating inodedeps (DEPALLOC is set in the call to inodedep_lookup). Second observe that calls to inodedep_lookup with DEPALLOC set can only happen from the following calls into the softdep code: softdep_setup_inomapdep softdep_setup_allocdirect softdep_setup_remove softdep_setup_freeblocks softdep_setup_directory_change softdep_setup_directory_add softdep_change_linkcnt Only the first two of these can come from ffs_alloc.c while holding a bitmap locked. Thus, inodedep_lookup must not go off to do request_cleanups when being called from these functions. This change adds a flag, NODELAY, that can be passed to inodedep_lookup to let it know that it should not do background processing in those cases. 2) The return value from request_cleanup when helping out with the cleanup was 0 instead of 1. This meant that despite the fact that we may have slept while doing the cleanups, the code did not recheck for the appearance of an inodedep (e.g., goto top in inodedep_lookup). This lead to the softdep inconsistency in which we ended up with two inodedep's for the same inode. Reviewed by: Peter Wemm <peter@yahoo-inc.com>, Matt Dillon <dillon@earth.backplane.com>
2001-02-20 11:14:38 +00:00
if (islocked)
ACQUIRE_LOCK(&lk);
return(1);
}
/*
* Next, we attempt to speed up the syncer process. If that
* is successful, then we allow the process to continue.
*/
if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
return(0);
/*
* If we are resource constrained on inode dependencies, try
* flushing some dirty inodes. Otherwise, we are constrained
* by file deletions, so try accelerating flushes of directories
* with removal dependencies. We would like to do the cleanup
* here, but we probably hold an inode locked at this point and
* that might deadlock against one that we try to clean. So,
* the best that we can do is request the syncer daemon to do
* the cleanup for us.
*/
switch (resource) {
case FLUSH_INODES:
stat_ino_limit_push += 1;
req_clear_inodedeps += 1;
stat_countp = &stat_ino_limit_hit;
break;
case FLUSH_REMOVE:
case FLUSH_REMOVE_WAIT:
stat_blk_limit_push += 1;
req_clear_remove += 1;
stat_countp = &stat_blk_limit_hit;
break;
default:
if (islocked)
FREE_LOCK(&lk);
panic("request_cleanup: unknown type");
}
/*
* Hopefully the syncer daemon will catch up and awaken us.
* We wait at most tickdelay before proceeding in any case.
*/
if (islocked == 0)
ACQUIRE_LOCK(&lk);
proc_waiting += 1;
if (handle.callout == NULL)
handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
interlocked_sleep(&lk, SLEEP, (caddr_t)&proc_waiting, NULL, PPAUSE,
"softupdate", 0);
proc_waiting -= 1;
if (islocked == 0)
FREE_LOCK(&lk);
return (1);
}
/*
* Awaken processes pausing in request_cleanup and clear proc_waiting
* to indicate that there is no longer a timer running.
*/
static void
pause_timer(arg)
void *arg;
{
*stat_countp += 1;
wakeup_one(&proc_waiting);
if (proc_waiting > 0)
handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
else
handle.callout = NULL;
}
/*
* Flush out a directory with at least one removal dependency in an effort to
* reduce the number of dirrem, freefile, and freeblks dependency structures.
*/
static void
clear_remove(td)
struct thread *td;
{
struct pagedep_hashhead *pagedephd;
struct pagedep *pagedep;
static int next = 0;
struct mount *mp;
struct vnode *vp;
int error, cnt;
ino_t ino;
ACQUIRE_LOCK(&lk);
for (cnt = 0; cnt < pagedep_hash; cnt++) {
pagedephd = &pagedep_hashtbl[next++];
if (next >= pagedep_hash)
next = 0;
LIST_FOREACH(pagedep, pagedephd, pd_hash) {
if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
continue;
mp = pagedep->pd_mnt;
ino = pagedep->pd_ino;
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
continue;
FREE_LOCK(&lk);
if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp))) {
softdep_error("clear_remove: vget", error);
vn_finished_write(mp);
return;
}
if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)))
softdep_error("clear_remove: fsync", error);
VI_LOCK(vp);
drain_output(vp, 0);
VI_UNLOCK(vp);
vput(vp);
vn_finished_write(mp);
return;
}
}
FREE_LOCK(&lk);
}
/*
* Clear out a block of dirty inodes in an effort to reduce
* the number of inodedep dependency structures.
*/
static void
clear_inodedeps(td)
struct thread *td;
{
struct inodedep_hashhead *inodedephd;
struct inodedep *inodedep;
static int next = 0;
struct mount *mp;
struct vnode *vp;
struct fs *fs;
int error, cnt;
ino_t firstino, lastino, ino;
ACQUIRE_LOCK(&lk);
/*
* Pick a random inode dependency to be cleared.
* We will then gather up all the inodes in its block
* that have dependencies and flush them out.
*/
for (cnt = 0; cnt < inodedep_hash; cnt++) {
inodedephd = &inodedep_hashtbl[next++];
if (next >= inodedep_hash)
next = 0;
if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
break;
}
if (inodedep == NULL)
return;
/*
* Ugly code to find mount point given pointer to superblock.
*/
fs = inodedep->id_fs;
TAILQ_FOREACH(mp, &mountlist, mnt_list)
if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
break;
/*
* Find the last inode in the block with dependencies.
*/
firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
break;
/*
* Asynchronously push all but the last inode with dependencies.
* Synchronously push the last inode with dependencies to ensure
* that the inode block gets written to free up the inodedeps.
*/
for (ino = firstino; ino <= lastino; ino++) {
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
continue;
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
continue;
FREE_LOCK(&lk);
if ((error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
softdep_error("clear_inodedeps: vget", error);
vn_finished_write(mp);
return;
}
if (ino == lastino) {
if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_WAIT, td)))
softdep_error("clear_inodedeps: fsync1", error);
} else {
if ((error = VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td)))
softdep_error("clear_inodedeps: fsync2", error);
VI_LOCK(vp);
drain_output(vp, 0);
VI_UNLOCK(vp);
}
vput(vp);
vn_finished_write(mp);
ACQUIRE_LOCK(&lk);
}
FREE_LOCK(&lk);
}
/*
* Function to determine if the buffer has outstanding dependencies
* that will cause a roll-back if the buffer is written. If wantcount
* is set, return number of dependencies, otherwise just yes or no.
*/
static int
softdep_count_dependencies(bp, wantcount)
struct buf *bp;
int wantcount;
{
struct worklist *wk;
struct inodedep *inodedep;
struct indirdep *indirdep;
struct allocindir *aip;
struct pagedep *pagedep;
struct diradd *dap;
int i, retval;
retval = 0;
ACQUIRE_LOCK(&lk);
LIST_FOREACH(wk, &bp->b_dep, wk_list) {
switch (wk->wk_type) {
case D_INODEDEP:
inodedep = WK_INODEDEP(wk);
if ((inodedep->id_state & DEPCOMPLETE) == 0) {
/* bitmap allocation dependency */
retval += 1;
if (!wantcount)
goto out;
}
if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
/* direct block pointer dependency */
retval += 1;
if (!wantcount)
goto out;
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
if (TAILQ_FIRST(&inodedep->id_extupdt)) {
/* direct block pointer dependency */
retval += 1;
if (!wantcount)
goto out;
}
continue;
case D_INDIRDEP:
indirdep = WK_INDIRDEP(wk);
2001-02-04 12:37:48 +00:00
LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
/* indirect block pointer dependency */
retval += 1;
if (!wantcount)
goto out;
}
continue;
case D_PAGEDEP:
pagedep = WK_PAGEDEP(wk);
for (i = 0; i < DAHASHSZ; i++) {
2001-02-04 12:37:48 +00:00
LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
/* directory entry dependency */
retval += 1;
if (!wantcount)
goto out;
}
}
continue;
case D_BMSAFEMAP:
case D_ALLOCDIRECT:
case D_ALLOCINDIR:
case D_MKDIR:
/* never a dependency on these blocks */
continue;
default:
FREE_LOCK(&lk);
panic("softdep_check_for_rollback: Unexpected type %s",
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
}
out:
FREE_LOCK(&lk);
return retval;
}
/*
* Acquire exclusive access to a buffer.
* Must be called with splbio blocked.
* Return acquired buffer or NULL on failure. mtx, if provided, will be
* released on success but held on failure.
*/
static struct buf *
getdirtybuf(bpp, mtx, waitfor)
struct buf **bpp;
struct mtx *mtx;
int waitfor;
{
struct buf *bp;
int error;
/*
* XXX This code and the code that calls it need to be reviewed to
* verify its use of the vnode interlock.
*/
for (;;) {
if ((bp = *bpp) == NULL)
return (0);
if (bp->b_vp == NULL)
kdb_backtrace();
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
if ((bp->b_vflags & BV_BKGRDINPROG) == 0)
break;
BUF_UNLOCK(bp);
if (waitfor != MNT_WAIT)
return (NULL);
/*
* The mtx argument must be bp->b_vp's mutex in
* this case.
*/
#ifdef DEBUG_VFS_LOCKS
if (bp->b_vp->v_type != VCHR)
ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
#endif
bp->b_vflags |= BV_BKGRDWAIT;
interlocked_sleep(&lk, SLEEP, &bp->b_xflags, mtx,
PRIBIO, "getbuf", 0);
continue;
}
if (waitfor != MNT_WAIT)
return (NULL);
if (mtx) {
error = interlocked_sleep(&lk, LOCKBUF, bp, mtx,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 0, 0);
mtx_lock(mtx);
} else
error = interlocked_sleep(&lk, LOCKBUF, bp, NULL,
LK_EXCLUSIVE | LK_SLEEPFAIL, 0, 0);
if (error != ENOLCK) {
FREE_LOCK(&lk);
panic("getdirtybuf: inconsistent lock");
}
}
if ((bp->b_flags & B_DELWRI) == 0) {
BUF_UNLOCK(bp);
return (NULL);
}
if (mtx)
mtx_unlock(mtx);
bremfree(bp);
return (bp);
}
/*
* Wait for pending output on a vnode to complete.
* Must be called with vnode lock and interlock locked.
*/
static void
drain_output(vp, islocked)
struct vnode *vp;
int islocked;
{
ASSERT_VOP_LOCKED(vp, "drain_output");
ASSERT_VI_LOCKED(vp, "drain_output");
if (!islocked)
ACQUIRE_LOCK(&lk);
while (vp->v_bufobj.bo_numoutput) {
vp->v_bufobj.bo_flag |= BO_WWAIT;
interlocked_sleep(&lk, SLEEP,
(caddr_t)&vp->v_bufobj.bo_numoutput,
VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
}
if (!islocked)
FREE_LOCK(&lk);
}
/*
* Called whenever a buffer that is being invalidated or reallocated
* contains dependencies. This should only happen if an I/O error has
* occurred. The routine is called with the buffer locked.
*/
static void
softdep_deallocate_dependencies(bp)
struct buf *bp;
{
if ((bp->b_ioflags & BIO_ERROR) == 0)
panic("softdep_deallocate_dependencies: dangling deps");
softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
panic("softdep_deallocate_dependencies: unrecovered I/O error");
}
/*
* Function to handle asynchronous write errors in the filesystem.
*/
static void
softdep_error(func, error)
char *func;
int error;
{
/* XXX should do something better! */
printf("%s: got error %d while accessing filesystem\n", func, error);
}