Implement fully asynchronous partial truncation with softupdates
journaling to resolve errors which can cause corruption on recovery
with the old synchronous mechanism.

 - Append partial truncation freework structures to indirdeps while
   truncation is proceeding.  These prevent new block pointers from
   becoming valid until truncation completes and serialize truncations.
 - On completion of a partial truncate, journal work waits for the
   zeroed pointers to hit the indirects.
 - softdep_journal_freeblocks() handles last frag allocation and last
   block zeroing.
 - vtruncbuf/ffs_pages_remove moved into softdep_*_freeblocks() so the
   cleanup is implemented in only one place.
 - Block allocation failure handling moved up one level so it does not
   proceed with buf locks held.  This permits us to do more extensive
   reclaims when filesystem space is exhausted.
 - softdep_sync_metadata() is broken into two parts; the first executes
   once at the start of ffs_syncvnode() and flushes truncations and
   inode dependencies, while the second is called on each locked buf.
   This eliminates excessive looping and rollbacks.
 - Improve the mechanism in process_worklist_item() that handles
   acquiring vnode locks for handle_workitem_remove() so that it works
   more generally and does not loop excessively over the same worklist
   items on each call.
 - Don't corrupt directories by zeroing their tail blocks in fsck; tail
   zeroing is now done only for regular files.
 - Push an fsync complete record for files that need it so the checker
   knows a truncation in the journal is no longer valid (a sketch of
   the checker-side rule follows this list).
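
For illustration, here is a minimal stand-alone sketch of the checker-side
rule these records imply. The type and field names are simplified stand-ins
for the real jtrncrec/suj_ino definitions shown in the diffs below, not the
actual fsck_ffs code:

    #include <stdint.h>
    #include <stddef.h>

    #define JOP_TRUNC 6     /* partial truncation record */
    #define JOP_SYNC  7     /* fsync() complete record */

    struct trunc_rec {
        uint32_t jt_op;     /* JOP_TRUNC or JOP_SYNC */
        uint32_t jt_ino;    /* inode being truncated */
        int64_t  jt_size;   /* final file size */
    };

    /*
     * Fold one replayed record into the pending truncation for an
     * inode: JOP_SYNC invalidates any earlier truncation, and among
     * JOP_TRUNC records the smallest size wins, matching the
     * ino_build_trunc() change below.
     */
    static void
    fold_trunc(struct trunc_rec **pending, struct trunc_rec *rec)
    {
        if (rec->jt_op == JOP_SYNC) {
            *pending = NULL;
            return;
        }
        if (*pending == NULL || (*pending)->jt_size > rec->jt_size)
            *pending = rec;
    }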

Discussed with:	mckusick, kib (ffs_pages_remove and ffs_truncate parts)
Tested by:	pho
jeff 2011-06-10 22:48:35 +00:00
commit 6ba8b7f04c (parent 742a97ee0a)
13 changed files with 2675 additions and 1574 deletions


@@ -1604,7 +1604,7 @@ ino_trunc(ino_t ino, off_t size)
* uninitialized space later.
*/
off = blkoff(fs, size);
if (off) {
if (off && DIP(ip, di_mode) != IFDIR) {
uint8_t *buf;
long clrsize;
@@ -1775,13 +1775,18 @@ cg_trunc(struct suj_cg *sc)
struct suj_ino *sino;
int i;
for (i = 0; i < SUJ_HASHSIZE; i++)
LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
for (i = 0; i < SUJ_HASHSIZE; i++) {
LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) {
if (sino->si_trunc) {
ino_trunc(sino->si_ino,
sino->si_trunc->jt_size);
sino->si_blkadj = 0;
sino->si_trunc = NULL;
}
if (sino->si_blkadj)
ino_adjblks(sino);
}
}
}
/*
@@ -1791,7 +1796,6 @@ cg_trunc(struct suj_cg *sc)
static void
cg_check_blk(struct suj_cg *sc)
{
struct suj_ino *sino;
struct suj_blk *sblk;
int i;
@@ -1799,15 +1803,6 @@ cg_check_blk(struct suj_cg *sc)
for (i = 0; i < SUJ_HASHSIZE; i++)
LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next)
blk_check(sblk);
/*
* Now that we've freed blocks which are not referenced we
* make a second pass over all inodes to adjust their block
* counts.
*/
for (i = 0; i < SUJ_HASHSIZE; i++)
LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
if (sino->si_blkadj)
ino_adjblks(sino);
}
/*
@@ -1961,14 +1956,7 @@ ino_append(union jrec *rec)
"parent %d, diroff %jd\n",
refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
refrec->jr_parent, refrec->jr_diroff);
/*
* Lookup the ino and clear truncate if one is found. Partial
* truncates are always done synchronously so if we discover
* an operation that requires a lock the truncation has completed
* and can be discarded.
*/
sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1);
sino->si_trunc = NULL;
sino->si_hasrecs = 1;
srec = errmalloc(sizeof(*srec));
srec->sr_rec = rec;
@@ -2174,9 +2162,7 @@ blk_build(struct jblkrec *blkrec)
struct suj_rec *srec;
struct suj_blk *sblk;
struct jblkrec *blkrn;
struct suj_ino *sino;
ufs2_daddr_t blk;
off_t foff;
int frag;
if (debug)
@@ -2185,17 +2171,6 @@ blk_build(struct jblkrec *blkrec)
blkrec->jb_op, blkrec->jb_blkno, blkrec->jb_frags,
blkrec->jb_oldfrags, blkrec->jb_ino, blkrec->jb_lbn);
/*
* Look up the inode and clear the truncate if any lbns after the
* truncate lbn are freed or allocated.
*/
sino = ino_lookup(blkrec->jb_ino, 0);
if (sino && sino->si_trunc) {
foff = lblktosize(fs, blkrec->jb_lbn);
foff += lfragtosize(fs, blkrec->jb_frags);
if (foff > sino->si_trunc->jt_size)
sino->si_trunc = NULL;
}
blk = blknum(fs, blkrec->jb_blkno);
frag = fragnum(fs, blkrec->jb_blkno);
sblk = blk_lookup(blk, 1);
@@ -2242,10 +2217,15 @@ ino_build_trunc(struct jtrncrec *rec)
struct suj_ino *sino;
if (debug)
printf("ino_build_trunc: ino %d, size %jd\n",
rec->jt_ino, rec->jt_size);
printf("ino_build_trunc: op %d ino %d, size %jd\n",
rec->jt_op, rec->jt_ino, rec->jt_size);
sino = ino_lookup(rec->jt_ino, 1);
sino->si_trunc = rec;
if (rec->jt_op == JOP_SYNC) {
sino->si_trunc = NULL;
return;
}
if (sino->si_trunc == NULL || sino->si_trunc->jt_size > rec->jt_size)
sino->si_trunc = rec;
}
/*


@@ -302,6 +302,7 @@ struct vattr {
#define IO_EXT 0x0400 /* operate on external attributes */
#define IO_NORMAL 0x0800 /* operate on regular data */
#define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */
#define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */
#define IO_SEQMAX 0x7F /* seq heuristic max value */
#define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */

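IO_BUFLOCKED is consumed by ffs_alloc()/ffs_realloccg() below: when it is
set, a failing allocation returns immediately instead of calling
softdep_request_cleanup() with the caller's indirect buffer still locked.
A condensed sketch of the caller-side idiom the ffs_balloc hunks adopt
(a fragment using names from this diff, not a self-contained function):

    error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
        flags | IO_BUFLOCKED, cred, &newb);
    if (error != 0) {
        brelse(bp);                     /* drop the indirect buf lock */
        if (++reclaimed == 1) {         /* reclaim at most once */
            UFS_LOCK(ump);
            softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
            UFS_UNLOCK(ump);
            goto retry;                 /* re-read the indirect chain */
        }
        goto fail;
    }

Running the reclaim with no buf locks held is what permits the more
extensive flushing mentioned in the commit message.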

@@ -217,7 +217,7 @@ ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp)
(void) chkdq(ip, -btodb(size), cred, FORCE);
UFS_LOCK(ump);
#endif
if (reclaimed == 0) {
if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
reclaimed = 1;
softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
goto retry;
@@ -418,7 +418,7 @@ ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
/*
* no space available
*/
if (reclaimed == 0) {
if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
reclaimed = 1;
UFS_UNLOCK(ump);
if (bp) {


@@ -105,6 +105,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
int unwindidx = -1;
int saved_inbdflush;
int reclaimed;
ip = VTOI(vp);
dp = ip->i_din1;
@@ -112,6 +113,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
ump = ip->i_ump;
lbn = lblkno(fs, startoffset);
size = blkoff(fs, startoffset) + size;
reclaimed = 0;
if (size > fs->fs_bsize)
panic("ffs_balloc_ufs1: blk too big");
*bpp = NULL;
@@ -276,6 +278,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
/*
* Fetch through the indirect blocks, allocating as necessary.
*/
retry:
for (i = 1;;) {
error = bread(vp,
indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
@@ -296,8 +299,15 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
if (pref == 0)
pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
flags, cred, &newb)) != 0) {
flags | IO_BUFLOCKED, cred, &newb)) != 0) {
brelse(bp);
if (++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
UFS_UNLOCK(ump);
goto retry;
}
goto fail;
}
nb = newb;
@@ -349,10 +359,17 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
if (nb == 0) {
UFS_LOCK(ump);
pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
error = ffs_alloc(ip,
lbn, pref, (int)fs->fs_bsize, flags, cred, &newb);
error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
flags | IO_BUFLOCKED, cred, &newb);
if (error) {
brelse(bp);
if (++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
UFS_UNLOCK(ump);
goto retry;
}
goto fail;
}
nb = newb;
@@ -506,6 +523,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
int deallocated, osize, nsize, num, i, error;
int unwindidx = -1;
int saved_inbdflush;
int reclaimed;
ip = VTOI(vp);
dp = ip->i_din2;
@@ -513,6 +531,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
ump = ip->i_ump;
lbn = lblkno(fs, startoffset);
size = blkoff(fs, startoffset) + size;
reclaimed = 0;
if (size > fs->fs_bsize)
panic("ffs_balloc_ufs2: blk too big");
*bpp = NULL;
@@ -787,6 +806,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
/*
* Fetch through the indirect blocks, allocating as necessary.
*/
retry:
for (i = 1;;) {
error = bread(vp,
indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
@@ -807,8 +827,15 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
if (pref == 0)
pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
flags, cred, &newb)) != 0) {
flags | IO_BUFLOCKED, cred, &newb)) != 0) {
brelse(bp);
if (++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
UFS_UNLOCK(ump);
goto retry;
}
goto fail;
}
nb = newb;
@@ -860,10 +887,17 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
if (nb == 0) {
UFS_LOCK(ump);
pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
error = ffs_alloc(ip,
lbn, pref, (int)fs->fs_bsize, flags, cred, &newb);
error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
flags | IO_BUFLOCKED, cred, &newb);
if (error) {
brelse(bp);
if (++reclaimed == 1) {
UFS_LOCK(ump);
softdep_request_cleanup(fs, vp, cred,
FLUSH_BLOCKS_WAIT);
UFS_UNLOCK(ump);
goto retry;
}
goto fail;
}
nb = newb;


@@ -74,6 +74,7 @@ int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
int ffs_mountroot(void);
void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
void ffs_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
int ffs_reallocblks(struct vop_reallocblks_args *);
int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
ufs2_daddr_t, int, int, int, struct ucred *, struct buf **);
@@ -107,7 +108,6 @@ extern struct vop_vector ffs_fifoops2;
int softdep_check_suspend(struct mount *, struct vnode *,
int, int, int, int);
int softdep_complete_trunc(struct vnode *, void *);
void softdep_get_depcounts(struct mount *, int *, int *);
void softdep_initialize(void);
void softdep_uninitialize(void);
@@ -139,14 +139,17 @@ void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int,
void softdep_setup_inofree(struct mount *, struct buf *, ino_t,
struct workhead *);
void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *);
void *softdep_setup_trunc(struct vnode *vp, off_t length, int flags);
void softdep_fsync_mountdev(struct vnode *);
int softdep_sync_metadata(struct vnode *);
int softdep_sync_buf(struct vnode *, struct buf *, int);
int softdep_process_worklist(struct mount *, int);
int softdep_fsync(struct vnode *);
int softdep_waitidle(struct mount *);
int softdep_prealloc(struct vnode *, int);
int softdep_journal_lookup(struct mount *, struct vnode **);
void softdep_journal_freeblocks(struct inode *, struct ucred *, off_t, int);
void softdep_journal_fsync(struct inode *);
/*
* Things to request flushing in softdep_request_cleanup()


@@ -120,7 +120,7 @@ ffs_update(vp, waitfor)
}
}
static void
void
ffs_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
{
vm_object_t object;
@@ -151,12 +151,12 @@ ffs_truncate(vp, length, flags, cred, td)
ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR];
ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
ufs2_daddr_t count, blocksreleased = 0, datablocks;
void *cookie;
struct bufobj *bo;
struct fs *fs;
struct buf *bp;
struct ufsmount *ump;
int needextclean, softdepslowdown, extblocks;
int softdeptrunc, journaltrunc;
int needextclean, extblocks;
int offset, size, level, nblocks;
int i, error, allerror;
off_t osize;
@@ -165,7 +165,6 @@ ffs_truncate(vp, length, flags, cred, td)
fs = ip->i_fs;
ump = ip->i_ump;
bo = &vp->v_bufobj;
cookie = NULL;
ASSERT_VOP_LOCKED(vp, "ffs_truncate");
@@ -173,6 +172,11 @@ ffs_truncate(vp, length, flags, cred, td)
return (EINVAL);
if (length > fs->fs_maxfilesize)
return (EFBIG);
#ifdef QUOTA
error = getinoquota(ip);
if (error)
return (error);
#endif
/*
* Historically clients did not have to specify which data
* they were truncating. So, if not specified, we assume
@@ -191,7 +195,10 @@ ffs_truncate(vp, length, flags, cred, td)
*/
allerror = 0;
needextclean = 0;
softdepslowdown = DOINGSOFTDEP(vp) && softdep_slowdown(vp);
softdeptrunc = 0;
journaltrunc = DOINGSUJ(vp);
if (journaltrunc == 0 && DOINGSOFTDEP(vp) && length == 0)
softdeptrunc = !softdep_slowdown(vp);
extblocks = 0;
datablocks = DIP(ip, i_blocks);
if (fs->fs_magic == FS_UFS2_MAGIC && ip->i_din2->di_extsize > 0) {
@@ -199,27 +206,23 @@ ffs_truncate(vp, length, flags, cred, td)
datablocks -= extblocks;
}
if ((flags & IO_EXT) && extblocks > 0) {
if (DOINGSOFTDEP(vp) && softdepslowdown == 0 && length == 0) {
if ((flags & IO_NORMAL) == 0) {
softdep_setup_freeblocks(ip, length, IO_EXT);
return (0);
}
if (length != 0)
panic("ffs_truncate: partial trunc of extdata");
if (softdeptrunc || journaltrunc) {
if ((flags & IO_NORMAL) == 0)
goto extclean;
needextclean = 1;
} else {
if (length != 0)
panic("ffs_truncate: partial trunc of extdata");
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
return (error);
if (DOINGSUJ(vp))
cookie = softdep_setup_trunc(vp, length, flags);
osize = ip->i_din2->di_extsize;
ip->i_din2->di_blocks -= extblocks;
#ifdef QUOTA
(void) chkdq(ip, -extblocks, NOCRED, 0);
#endif
vinvalbuf(vp, V_ALT, 0, 0);
ffs_pages_remove(vp,
OFF_TO_IDX(lblktosize(fs, -extblocks)), 0);
osize = ip->i_din2->di_extsize;
ip->i_din2->di_blocks -= extblocks;
ip->i_din2->di_extsize = 0;
for (i = 0; i < NXADDR; i++) {
oldblks[i] = ip->i_din2->di_extb[i];
@@ -227,7 +230,7 @@ ffs_truncate(vp, length, flags, cred, td)
}
ip->i_flag |= IN_CHANGE;
if ((error = ffs_update(vp, 1)))
goto out;
return (error);
for (i = 0; i < NXADDR; i++) {
if (oldblks[i] == 0)
continue;
@@ -236,10 +239,8 @@ ffs_truncate(vp, length, flags, cred, td)
}
}
}
if ((flags & IO_NORMAL) == 0) {
error = 0;
goto out;
}
if ((flags & IO_NORMAL) == 0)
return (0);
if (vp->v_type == VLNK &&
(ip->i_size < vp->v_mount->mnt_maxsymlinklen ||
datablocks == 0)) {
@@ -252,24 +253,17 @@ ffs_truncate(vp, length, flags, cred, td)
DIP_SET(ip, i_size, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (needextclean)
softdep_setup_freeblocks(ip, length, IO_EXT);
error = ffs_update(vp, 1);
goto out;
goto extclean;
return ffs_update(vp, 1);
}
if (ip->i_size == length) {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (needextclean)
softdep_setup_freeblocks(ip, length, IO_EXT);
error = ffs_update(vp, 0);
goto out;
goto extclean;
return ffs_update(vp, 0);
}
if (fs->fs_ronly)
panic("ffs_truncate: read-only filesystem");
#ifdef QUOTA
error = getinoquota(ip);
if (error)
goto out;
#endif
if ((ip->i_flags & SF_SNAPSHOT) != 0)
ffs_snapremove(vp);
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
@@ -285,7 +279,7 @@ ffs_truncate(vp, length, flags, cred, td)
error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
if (error) {
vnode_pager_setsize(vp, osize);
goto out;
return (error);
}
ip->i_size = length;
DIP_SET(ip, i_size, length);
@@ -296,11 +290,10 @@ ffs_truncate(vp, length, flags, cred, td)
else
bawrite(bp);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
error = ffs_update(vp, 1);
goto out;
return ffs_update(vp, 1);
}
if (DOINGSOFTDEP(vp)) {
if (length > 0 || softdepslowdown) {
if (softdeptrunc == 0 && journaltrunc == 0) {
/*
* If a file is only partially truncated, then
* we have to clean up the data structures
@@ -311,29 +304,20 @@ ffs_truncate(vp, length, flags, cred, td)
* so that it will have no data structures left.
*/
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
goto out;
/*
* We have to journal the truncation before we change
* any blocks so we don't leave the file partially
* truncated.
*/
if (DOINGSUJ(vp) && cookie == NULL)
cookie = softdep_setup_trunc(vp, length, flags);
return (error);
} else {
#ifdef QUOTA
(void) chkdq(ip, -datablocks, NOCRED, 0);
#endif
softdep_setup_freeblocks(ip, length, needextclean ?
IO_EXT | IO_NORMAL : IO_NORMAL);
flags = IO_NORMAL | (needextclean ? IO_EXT: 0);
if (journaltrunc)
softdep_journal_freeblocks(ip, cred, length,
flags);
else
softdep_setup_freeblocks(ip, length, flags);
ASSERT_VOP_LOCKED(vp, "ffs_truncate1");
vinvalbuf(vp, needextclean ? 0 : V_NORMAL, 0, 0);
if (!needextclean)
ffs_pages_remove(vp, 0,
OFF_TO_IDX(lblktosize(fs, -extblocks)));
vnode_pager_setsize(vp, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
error = ffs_update(vp, 0);
goto out;
if (journaltrunc == 0) {
ip->i_flag |= IN_CHANGE | IN_UPDATE;
error = ffs_update(vp, 0);
}
return (error);
}
}
/*
@@ -353,7 +337,7 @@ ffs_truncate(vp, length, flags, cred, td)
flags |= BA_CLRBUF;
error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
if (error)
goto out;
return (error);
/*
* When we are doing soft updates and the UFS_BALLOC
* above fills in a direct block hole with a full sized
@ -365,7 +349,7 @@ ffs_truncate(vp, length, flags, cred, td)
if (DOINGSOFTDEP(vp) && lbn < NDADDR &&
fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
(error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
goto out;
return (error);
ip->i_size = length;
DIP_SET(ip, i_size, length);
size = blksize(fs, ip, lbn);
@@ -411,13 +395,7 @@ ffs_truncate(vp, length, flags, cred, td)
DIP_SET(ip, i_db[i], 0);
}
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* When doing softupdate journaling we must preserve the size along
* with the old pointers until they are freed or we might not
* know how many fragments remain.
*/
if (!DOINGSUJ(vp))
allerror = ffs_update(vp, 1);
allerror = ffs_update(vp, 1);
/*
* Having written the new inode to disk, save its new configuration
@@ -541,14 +519,14 @@ ffs_truncate(vp, length, flags, cred, td)
#ifdef QUOTA
(void) chkdq(ip, -blocksreleased, NOCRED, 0);
#endif
error = allerror;
out:
if (cookie) {
allerror = softdep_complete_trunc(vp, cookie);
if (allerror != 0 && error == 0)
error = allerror;
}
return (error);
return (allerror);
extclean:
if (journaltrunc)
softdep_journal_freeblocks(ip, cred, length, IO_EXT);
else
softdep_setup_freeblocks(ip, length, IO_EXT);
return ffs_update(vp, MNT_WAIT);
}
/*

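Summarizing the ffs_truncate() change above, the truncation path now
dispatches on the journaling mode. A condensed sketch, using names from
the diff rather than verbatim code:

    journaltrunc = DOINGSUJ(vp);
    softdeptrunc = 0;
    if (journaltrunc == 0 && DOINGSOFTDEP(vp) && length == 0)
        softdeptrunc = !softdep_slowdown(vp);
    /* ... */
    if (DOINGSOFTDEP(vp)) {
        if (softdeptrunc == 0 && journaltrunc == 0) {
            /* Neither async path applies: flush dependencies,
             * then truncate in place as before. */
            error = ffs_syncvnode(vp, MNT_WAIT);
        } else {
            flags = IO_NORMAL | (needextclean ? IO_EXT : 0);
            if (journaltrunc)   /* SUJ: fully asynchronous */
                softdep_journal_freeblocks(ip, cred, length, flags);
            else                /* softdep: whole-file truncate */
                softdep_setup_freeblocks(ip, length, flags);
        }
    }
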
(File diff suppressed because it is too large.)


@@ -2034,12 +2034,10 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
static void
db_print_ffs(struct ufsmount *ump)
{
db_printf("mp %p %s devvp %p fs %p su_wl %d su_wl_in %d su_deps %d "
"su_req %d\n",
db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
ump->softdep_on_worklist_inprogress, ump->softdep_deps,
ump->softdep_req);
ump->softdep_deps, ump->softdep_req);
}
DB_SHOW_COMMAND(ffs, db_show_ffs)


@@ -212,26 +212,32 @@ ffs_fsync(struct vop_fsync_args *ap)
int
ffs_syncvnode(struct vnode *vp, int waitfor)
{
struct inode *ip = VTOI(vp);
struct inode *ip;
struct bufobj *bo;
struct buf *bp;
struct buf *nbp;
int s, error, wait, passes, skipmeta;
ufs_lbn_t lbn;
int error, wait, passes;
wait = (waitfor == MNT_WAIT);
lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
bo = &vp->v_bufobj;
ip = VTOI(vp);
ip->i_flag &= ~IN_NEEDSYNC;
bo = &vp->v_bufobj;
/*
* When doing MNT_WAIT we must first flush all dependencies
* on the inode.
*/
if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
(error = softdep_sync_metadata(vp)) != 0)
return (error);
/*
* Flush all dirty buffers associated with a vnode.
*/
passes = NIADDR + 1;
skipmeta = 0;
if (wait)
skipmeta = 1;
s = splbio();
error = 0;
passes = 0;
wait = 0; /* Always do an async pass first. */
lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
BO_LOCK(bo);
loop:
TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
@@ -239,70 +245,53 @@ ffs_syncvnode(struct vnode *vp, int waitfor)
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
/*
* Reasons to skip this buffer: it has already been considered
* on this pass, this pass is the first time through on a
* synchronous flush request and the buffer being considered
* is metadata, the buffer has dependencies that will cause
* on this pass, the buffer has dependencies that will cause
* it to be redirtied and it has not already been deferred,
* or it is already being written.
*/
if ((bp->b_vflags & BV_SCANNED) != 0)
continue;
bp->b_vflags |= BV_SCANNED;
if ((skipmeta == 1 && bp->b_lblkno < 0))
/* Flush indirects in order. */
if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
lbn_level(bp->b_lblkno) >= passes)
continue;
if (bp->b_lblkno > lbn)
panic("ffs_syncvnode: syncing truncated data.");
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
continue;
BO_UNLOCK(bo);
if (!wait && !LIST_EMPTY(&bp->b_dep) &&
(bp->b_flags & B_DEFERRED) == 0 &&
buf_countdeps(bp, 0)) {
bp->b_flags |= B_DEFERRED;
BUF_UNLOCK(bp);
BO_LOCK(bo);
continue;
}
if ((bp->b_flags & B_DELWRI) == 0)
panic("ffs_fsync: not dirty");
/*
* If this is a synchronous flush request, or it is not a
* file or device, start the write on this buffer immediately.
* Check for dependencies and potentially complete them.
*/
if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
/*
* On our final pass through, do all I/O synchronously
* so that we can find out if our flush is failing
* because of write errors.
*/
if (passes > 0 || !wait) {
if ((bp->b_flags & B_CLUSTEROK) && !wait) {
(void) vfs_bio_awrite(bp);
} else {
bremfree(bp);
splx(s);
(void) bawrite(bp);
s = splbio();
}
} else {
bremfree(bp);
splx(s);
if ((error = bwrite(bp)) != 0)
return (error);
s = splbio();
if (!LIST_EMPTY(&bp->b_dep) &&
(error = softdep_sync_buf(vp, bp,
wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
/* I/O error. */
if (error != EBUSY) {
BUF_UNLOCK(bp);
return (error);
}
} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
/*
* If the buffer is for data that has been truncated
* off the file, then throw it away.
*/
/* If we deferred once, don't defer again. */
if ((bp->b_flags & B_DEFERRED) == 0) {
bp->b_flags |= B_DEFERRED;
BUF_UNLOCK(bp);
goto next;
}
}
if (wait) {
bremfree(bp);
bp->b_flags |= B_INVAL | B_NOCACHE;
splx(s);
brelse(bp);
s = splbio();
} else
vfs_bio_awrite(bp);
if ((error = bwrite(bp)) != 0)
return (error);
} else if ((bp->b_flags & B_CLUSTEROK)) {
(void) vfs_bio_awrite(bp);
} else {
bremfree(bp);
(void) bawrite(bp);
}
next:
/*
* Since we may have slept during the I/O, we need
* to start from a known point.
@@ -310,51 +299,44 @@ ffs_syncvnode(struct vnode *vp, int waitfor)
BO_LOCK(bo);
nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
}
/*
* If we were asked to do this synchronously, then go back for
* another pass, this time doing the metadata.
*/
if (skipmeta) {
skipmeta = 0;
goto loop;
}
if (wait) {
bufobj_wwait(bo, 3, 0);
if (waitfor != MNT_WAIT) {
BO_UNLOCK(bo);
/*
* Ensure that any filesystem metatdata associated
* with the vnode has been written.
*/
splx(s);
if ((error = softdep_sync_metadata(vp)) != 0)
return (error);
s = splbio();
BO_LOCK(bo);
if (bo->bo_dirty.bv_cnt > 0) {
/*
* Block devices associated with filesystems may
* have new I/O requests posted for them even if
* the vnode is locked, so no amount of trying will
* get them clean. Thus we give block devices a
* good effort, then just give up. For all other file
* types, go around and try again until it is clean.
*/
if (passes > 0) {
passes -= 1;
goto loop;
}
#ifdef INVARIANTS
if (!vn_isdisk(vp, NULL))
vprint("ffs_fsync: dirty", vp);
#endif
return (ffs_update(vp, waitfor));
}
/* Drain IO to see if we're done. */
bufobj_wwait(bo, 0, 0);
/*
* Block devices associated with filesystems may have new I/O
* requests posted for them even if the vnode is locked, so no
* amount of trying will get them clean. We make several passes
* as a best effort.
*
* Regular files may need multiple passes to flush all dependency
* work as it is possible that we must write once per indirect
* level, once for the leaf, and once for the inode and each of
* these will be done with one sync and one async pass.
*/
if (bo->bo_dirty.bv_cnt > 0) {
/* Write the inode after sync passes to flush deps. */
if (wait && DOINGSOFTDEP(vp)) {
BO_UNLOCK(bo);
ffs_update(vp, MNT_WAIT);
BO_LOCK(bo);
}
/* switch between sync/async. */
wait = !wait;
if (wait == 1 || ++passes < NIADDR + 2)
goto loop;
#ifdef INVARIANTS
if (!vn_isdisk(vp, NULL))
vprint("ffs_fsync: dirty", vp);
#endif
}
BO_UNLOCK(bo);
splx(s);
return (ffs_update(vp, wait));
error = ffs_update(vp, MNT_WAIT);
if (DOINGSUJ(vp))
softdep_journal_fsync(VTOI(vp));
return (error);
}
static int

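The control flow of the rewritten ffs_syncvnode() is easier to see in
isolation. A condensed sketch with names from the diff (not verbatim):

    wait = 0;               /* always do an async pass first */
    passes = 0;
    loop:
    /*
     * Scan bo_dirty, writing each buf; on synchronous passes an
     * indirect buffer is skipped until the pass counter reaches its
     * level:
     *     waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
     *     lbn_level(bp->b_lblkno) >= passes
     * so zeroed pointers propagate from the leaves up, one level per
     * sync/async pass pair.
     */
    bufobj_wwait(bo, 0, 0);         /* drain I/O to see if we're done */
    if (bo->bo_dirty.bv_cnt > 0) {
        if (wait && DOINGSOFTDEP(vp))
            ffs_update(vp, MNT_WAIT);   /* write inode to retire deps */
        wait = !wait;                   /* alternate sync/async */
        if (wait == 1 || ++passes < NIADDR + 2)
            goto loop;
    }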

@@ -664,6 +664,7 @@ lbn_offset(struct fs *fs, int level)
#define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */
#define JOP_MVREF 5 /* Move a reference from one off to another. */
#define JOP_TRUNC 6 /* Partial truncation record. */
#define JOP_SYNC 7 /* fsync() complete record. */
#define JREC_SIZE 32 /* Record and segment header size. */
@@ -729,7 +730,7 @@ struct jblkrec {
/*
* Truncation record. Records a partial truncation so that it may be
* completed later.
* completed at check time. Also used for sync records.
*/
struct jtrncrec {
uint32_t jt_op;


@@ -127,7 +127,7 @@
#define DIRCHG 0x000080 /* diradd, dirrem only */
#define GOINGAWAY 0x000100 /* indirdep, jremref only */
#define IOSTARTED 0x000200 /* inodedep, pagedep, bmsafemap only */
#define UNUSED400 0x000400 /* currently available. */
#define DELAYEDFREE 0x000400 /* allocindirect free delayed. */
#define NEWBLOCK 0x000800 /* pagedep, jaddref only */
#define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */
#define UFS1FMT 0x002000 /* indirdep only */
@@ -195,8 +195,9 @@ struct worklist {
#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk))
#define WK_FREEDEP(wk) ((struct freedep *)(wk))
#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk))
#define WK_SBDEP(wk) ((struct sbdep *)wk)
#define WK_SBDEP(wk) ((struct sbdep *)(wk))
#define WK_JTRUNC(wk) ((struct jtrunc *)(wk))
#define WK_JFSYNC(wk) ((struct jfsync *)(wk))
/*
* Various types of lists
@@ -213,10 +214,12 @@ LIST_HEAD(jaddrefhd, jaddref);
LIST_HEAD(jremrefhd, jremref);
LIST_HEAD(jmvrefhd, jmvref);
LIST_HEAD(jnewblkhd, jnewblk);
LIST_HEAD(jfreeblkhd, jfreeblk);
LIST_HEAD(jblkdephd, jblkdep);
LIST_HEAD(freeworkhd, freework);
TAILQ_HEAD(freeworklst, freework);
TAILQ_HEAD(jseglst, jseg);
TAILQ_HEAD(inoreflst, inoref);
TAILQ_HEAD(freeblklst, freeblks);
/*
* The "pagedep" structure tracks the various dependencies related to
@@ -321,6 +324,7 @@ struct inodedep {
struct allocdirectlst id_newinoupdt; /* updates when inode written */
struct allocdirectlst id_extupdt; /* extdata updates pre-inode write */
struct allocdirectlst id_newextupdt; /* extdata updates at ino write */
struct freeblklst id_freeblklst; /* List of partial truncates. */
union {
struct ufs1_dinode *idu_savedino1; /* saved ufs1_dinode contents */
struct ufs2_dinode *idu_savedino2; /* saved ufs2_dinode contents */
@@ -342,8 +346,9 @@ struct inodedep {
struct bmsafemap {
struct worklist sm_list; /* cylgrp buffer */
# define sm_state sm_list.wk_state
int sm_cg;
LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */
LIST_ENTRY(bmsafemap) sm_next; /* Mount list. */
int sm_cg;
struct buf *sm_buf; /* associated buffer */
struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */
struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */
@@ -355,6 +360,8 @@ struct bmsafemap {
struct newblkhd sm_newblkwr; /* writing newblk deps */
struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */
struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. */
struct workhead sm_freehd; /* Freedep deps. */
struct workhead sm_freewr; /* Written freedeps. */
};
/*
@@ -442,14 +449,15 @@ struct indirdep {
struct worklist ir_list; /* buffer holding indirect block */
# define ir_state ir_list.wk_state /* indirect block pointer state */
LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */
TAILQ_HEAD(, freework) ir_trunc; /* List of truncations. */
caddr_t ir_saveddata; /* buffer cache contents */
struct buf *ir_savebp; /* buffer holding safe copy */
struct buf *ir_bp; /* buffer holding live copy */
struct allocindirhd ir_completehd; /* waiting for indirdep complete */
struct allocindirhd ir_writehd; /* Waiting for the pointer write. */
struct allocindirhd ir_donehd; /* done waiting to update safecopy */
struct allocindirhd ir_deplisthd; /* allocindir deps for this block */
struct jnewblkhd ir_jnewblkhd; /* Canceled block allocations. */
struct workhead ir_jwork; /* Journal work pending. */
struct freeblks *ir_freeblks; /* Freeblks that frees this indir. */
};
/*
@@ -471,6 +479,7 @@ struct allocindir {
LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */
struct indirdep *ai_indirdep; /* address of associated indirdep */
ufs2_daddr_t ai_oldblkno; /* old value of block pointer */
ufs_lbn_t ai_lbn; /* Logical block number. */
int ai_offset; /* Pointer offset in parent. */
};
#define ai_newblkno ai_block.nb_newblkno
@@ -516,14 +525,22 @@ struct freefrag {
struct freeblks {
struct worklist fb_list; /* id_inowait or delayed worklist */
# define fb_state fb_list.wk_state /* inode and dirty block state */
struct jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */
TAILQ_ENTRY(freeblks) fb_next; /* List of inode truncates. */
struct jblkdephd fb_jblkdephd; /* Journal entries pending */
struct workhead fb_freeworkhd; /* Work items pending */
struct workhead fb_jwork; /* Journal work pending */
ino_t fb_previousinum; /* inode of previous owner of blocks */
uid_t fb_uid; /* uid of previous owner of blocks */
struct vnode *fb_devvp; /* filesystem device vnode */
ufs2_daddr_t fb_chkcnt; /* used to check cnt of blks released */
#ifdef QUOTA
struct dquot *fb_quota[MAXQUOTAS]; /* quotas to be adjusted */
#endif
uint64_t fb_modrev; /* Inode revision at start of trunc. */
off_t fb_len; /* Length we're truncating to. */
ufs2_daddr_t fb_chkcnt; /* Expected blks released. */
ufs2_daddr_t fb_freecnt; /* Actual blocks released. */
ino_t fb_inum; /* inode owner of blocks */
uid_t fb_uid; /* uid of previous owner of blocks */
int fb_ref; /* Children outstanding. */
int fb_cgwait; /* cg writes outstanding. */
};
/*
@@ -538,16 +555,18 @@ struct freeblks {
struct freework {
struct worklist fw_list; /* Delayed worklist. */
# define fw_state fw_list.wk_state
LIST_ENTRY(freework) fw_next; /* For seg journal list. */
struct jnewblk *fw_jnewblk; /* Journal entry to cancel. */
LIST_ENTRY(freework) fw_segs; /* Seg list. */
TAILQ_ENTRY(freework) fw_next; /* Hash/Trunc list. */
struct jnewblk *fw_jnewblk; /* Journal entry to cancel. */
struct freeblks *fw_freeblks; /* Root of operation. */
struct freework *fw_parent; /* Parent indirect. */
struct indirdep *fw_indir; /* indirect block. */
ufs2_daddr_t fw_blkno; /* Our block #. */
ufs_lbn_t fw_lbn; /* Original lbn before free. */
int fw_frags; /* Number of frags. */
int fw_ref; /* Number of children out. */
int fw_off; /* Current working position. */
struct workhead fw_jwork; /* Journal work pending. */
uint16_t fw_frags; /* Number of frags. */
uint16_t fw_ref; /* Number of children out. */
uint16_t fw_off; /* Current working position. */
uint16_t fw_start; /* Start of partial truncate. */
};
/*
@@ -674,6 +693,7 @@ struct dirrem {
LIST_ENTRY(dirrem) dm_inonext; /* inodedep's list of dirrem's */
struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */
ino_t dm_oldinum; /* inum of the removed dir entry */
doff_t dm_offset; /* offset of removed dir entry in blk */
union {
struct pagedep *dmu_pagedep; /* pagedep dependency for remove */
ino_t dmu_dirinum; /* parent inode number (for rmdir) */
@@ -707,7 +727,7 @@ struct dirrem {
*/
struct newdirblk {
struct worklist db_list; /* id_inowait or pg_newdirblk */
# define db_state db_list.wk_state /* unused */
# define db_state db_list.wk_state
struct pagedep *db_pagedep; /* associated pagedep */
struct workhead db_mkdir;
};
@@ -807,29 +827,36 @@ struct jnewblk {
# define jn_state jn_list.wk_state
struct jsegdep *jn_jsegdep; /* Will track our journal record. */
LIST_ENTRY(jnewblk) jn_deps; /* Jnewblks on sm_jnewblkhd. */
LIST_ENTRY(jnewblk) jn_indirdeps; /* Jnewblks on ir_jnewblkhd. */
struct worklist *jn_dep; /* Dependency to ref completed seg. */
ino_t jn_ino; /* Ino to which allocated. */
ufs_lbn_t jn_lbn; /* Lbn to which allocated. */
ufs2_daddr_t jn_blkno; /* Blkno allocated */
ino_t jn_ino; /* Ino to which allocated. */
int jn_oldfrags; /* Previous fragments when extended. */
int jn_frags; /* Number of fragments. */
};
/*
* A "jblkdep" structure tracks jfreeblk and jtrunc records attached to a
* freeblks structure.
*/
struct jblkdep {
struct worklist jb_list; /* For softdep journal pending. */
struct jsegdep *jb_jsegdep; /* Reference to the jseg. */
struct freeblks *jb_freeblks; /* Back pointer to freeblks. */
LIST_ENTRY(jblkdep) jb_deps; /* Dep list on freeblks. */
};
/*
* A "jfreeblk" structure tracks the journal write for freeing a block
* or tree of blocks. The block pointer must not be cleared in the inode
* or indirect prior to the jfreeblk being written to the journal.
*/
struct jfreeblk {
struct worklist jf_list; /* Linked to softdep_journal_pending. */
# define jf_state jf_list.wk_state
struct jsegdep *jf_jsegdep; /* Will track our journal record. */
struct freeblks *jf_freeblks; /* Back pointer to freeblks. */
LIST_ENTRY(jfreeblk) jf_deps; /* Jfreeblk on fb_jfreeblkhd. */
ino_t jf_ino; /* Ino from which blocks freed. */
struct jblkdep jf_dep; /* freeblks linkage. */
ufs_lbn_t jf_lbn; /* Lbn from which blocks freed. */
ufs2_daddr_t jf_blkno; /* Blkno being freed. */
ino_t jf_ino; /* Ino from which blocks freed. */
int jf_frags; /* Number of frags being freed. */
};
@@ -843,24 +870,31 @@ struct jfreefrag {
# define fr_state fr_list.wk_state
struct jsegdep *fr_jsegdep; /* Will track our journal record. */
struct freefrag *fr_freefrag; /* Back pointer to freefrag. */
ino_t fr_ino; /* Ino from which frag freed. */
ufs_lbn_t fr_lbn; /* Lbn from which frag freed. */
ufs2_daddr_t fr_blkno; /* Blkno being freed. */
ino_t fr_ino; /* Ino from which frag freed. */
int fr_frags; /* Size of frag being freed. */
};
/*
* A "jtrunc" journals the intent to truncate an inode to a non-zero
* value. This is done synchronously prior to the synchronous partial
* truncation process. The jsegdep is not released until the truncation
* is complete and the truncated inode is fsync'd.
* A "jtrunc" journals the intent to truncate an inode's data or extent area.
*/
struct jtrunc {
struct worklist jt_list; /* Linked to softdep_journal_pending. */
struct jsegdep *jt_jsegdep; /* Will track our journal record. */
ino_t jt_ino; /* Ino being truncated. */
off_t jt_size; /* Final file size. */
int jt_extsize; /* Final extent size. */
struct jblkdep jt_dep; /* freeblks linkage. */
off_t jt_size; /* Final file size. */
int jt_extsize; /* Final extent size. */
ino_t jt_ino; /* Ino being truncated. */
};
/*
* A "jfsync" journals the completion of an fsync which invalidates earlier
* jtrunc records in the journal.
*/
struct jfsync {
struct worklist jfs_list; /* For softdep journal pending. */
off_t jfs_size; /* Sync file size. */
int jfs_extsize; /* Sync extent size. */
ino_t jfs_ino; /* ino being synced. */
};
/*

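The jblkdep refactoring above replaces the separate jfreeblk list with a
common header embedded first in both jtrunc and jfreeblk, so freeblks code
can track either record type through one fb_jblkdephd list. A stand-alone,
simplified illustration of the embedding pattern (the types here are
reduced stand-ins, not the kernel definitions):

    #include <sys/queue.h>

    struct jblkdep {
        LIST_ENTRY(jblkdep) jb_deps;    /* fb_jblkdephd linkage */
    };
    LIST_HEAD(jblkdephd, jblkdep);

    struct jtrunc {                     /* truncation intent */
        struct jblkdep jt_dep;          /* must be the first member */
        long jt_size;
    };

    struct jfreeblk {                   /* block-free intent */
        struct jblkdep jf_dep;          /* must be the first member */
        long jf_blkno;
    };

    /* Either record can be queued and completed via its header alone:
     * LIST_INSERT_HEAD(&fb_jblkdephd, &jtrunc->jt_dep, jb_deps); */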

@@ -127,6 +127,8 @@ struct inode {
#define IN_EA_LOCKED 0x0200
#define IN_EA_LOCKWAIT 0x0400
#define IN_TRUNCATED 0x0800 /* Journaled truncation pending. */
#define i_devvp i_ump->um_devvp
#define i_umbufobj i_ump->um_bo
#define i_dirhash i_un.dirhash

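IN_TRUNCATED is set when a journaled partial truncation is recorded and is
consumed at the end of ffs_syncvnode() via softdep_journal_fsync() (see the
fsync diff above). A sketch of the flag's consumer, with the work-queue
helpers reduced to hypothetical stand-ins (alloc_jfsync and
queue_journal_work are illustrative names, not the real softdep internals):

    void
    softdep_journal_fsync(struct inode *ip)
    {
        struct jfsync *jfsync;

        if ((ip->i_flag & IN_TRUNCATED) == 0)
            return;                 /* no journaled truncation pending */
        ip->i_flag &= ~IN_TRUNCATED;
        /*
         * Queue a jfsync work item; once written to the journal as a
         * JOP_SYNC record it invalidates earlier JOP_TRUNC records
         * for this inode.
         */
        jfsync = alloc_jfsync(ip->i_number, ip->i_size);    /* assumed */
        queue_journal_work(ip->i_ump, &jfsync->jfs_list);   /* assumed */
    }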

@@ -61,6 +61,7 @@ struct jblocks;
struct inodedep;
TAILQ_HEAD(inodedeplst, inodedep);
LIST_HEAD(bmsafemaphd, bmsafemap);
/* This structure describes the UFS specific mount structure data. */
struct ufsmount {
@@ -82,10 +83,10 @@ struct ufsmount {
struct workhead softdep_journal_pending; /* journal work queue */
struct worklist *softdep_journal_tail; /* Tail pointer for above */
struct jblocks *softdep_jblocks; /* Journal block information */
struct inodedeplst softdep_unlinked; /* Unlinked inodes */
struct inodedeplst softdep_unlinked; /* Unlinked inodes */
struct bmsafemaphd softdep_dirtycg; /* Dirty CGs */
int softdep_on_journal; /* Items on the journal list */
int softdep_on_worklist; /* Items on the worklist */
int softdep_on_worklist_inprogress; /* Busy items on worklist */
int softdep_deps; /* Total dependency count */
int softdep_accdeps; /* accumulated dep count */
int softdep_req; /* Wakeup when deps hits 0. */