- Fix a truncation bug with softdep journaling that could leak blocks on
  crash. When truncating a file that never made it to disk, we use the
  canceled allocation dependencies to hold the journal records until the
  truncation completes. Previously, allocdirect dependencies on the
  id_bufwait list were not considered and their journal space could
  expire before the bitmaps were written. Cancel them and attach them to
  the freeblks as we do for other allocdirects.
- Add KTR traces that were used to debug this problem.
- When adding jsegdeps, always use jwork_insert() so we don't have more
  than one segdep on a given jwork list.

Sponsored by: EMC / Isilon Storage Division
This commit is contained in:
parent
2db62a6b1f
commit
ad9cdc05ba
@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/buf.h>
|
||||
#include <sys/kdb.h>
|
||||
#include <sys/kthread.h>
|
||||
#include <sys/ktr.h>
|
||||
#include <sys/limits.h>
|
||||
#include <sys/lock.h>
|
||||
#include <sys/malloc.h>
|
||||
@ -92,6 +93,8 @@ __FBSDID("$FreeBSD$");
|
||||
|
||||
#include <ddb/ddb.h>
|
||||
|
||||
#define KTR_SUJ 0 /* Define to KTR_SPARE. */
|
||||
|
||||
#ifndef SOFTUPDATES
|
||||
|
||||
int
|
||||
@ -769,6 +772,34 @@ struct newblk_hashhead;
|
||||
struct pagedep_hashhead;
|
||||
struct bmsafemap_hashhead;
|
||||
|
||||
/*
|
||||
* Private journaling structures.
|
||||
*/
|
||||
/*
 * Journal bookkeeping: the list of journal segments together with the
 * extent array and the counters used to hand out journal space.
 * indirblk_insert() reaches an instance through the ufsmount's
 * softdep_jblocks pointer and reads jb_segs from it.
 * NOTE(review): the units of jb_min/jb_low/jb_age are not visible in
 * this view -- confirm against the allocator code before relying on them.
 */
struct jblocks {
|
||||
struct jseglst jb_segs; /* TAILQ of current segments. */
|
||||
struct jseg *jb_writeseg; /* Next write to complete. */
|
||||
struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */
|
||||
struct jextent *jb_extent; /* Extent array. */
|
||||
uint64_t jb_nextseq; /* Next sequence number. */
|
||||
uint64_t jb_oldestwrseq; /* Oldest written sequence number. */
|
||||
uint8_t jb_needseg; /* Need a forced segment. */
|
||||
uint8_t jb_suspended; /* Did journal suspend writes? */
|
||||
int jb_avail; /* Available extents. */
|
||||
int jb_used; /* Last used extent. */
|
||||
int jb_head; /* Allocator head. */
|
||||
int jb_off; /* Allocator extent offset. */
|
||||
int jb_blocks; /* Total disk blocks covered. */
|
||||
int jb_free; /* Total disk blocks free. */
|
||||
int jb_min; /* Minimum free space. */
|
||||
int jb_low; /* Low on space. */
|
||||
int jb_age; /* Insertion time of oldest rec. */
|
||||
};
|
||||
|
||||
/*
 * One journal extent: a disk block address and a block count.
 * struct jblocks holds a pointer to an array of these in jb_extent.
 * NOTE(review): presumably a contiguous run starting at je_daddr --
 * confirm against the code that fills the array.
 */
struct jextent {
|
||||
ufs2_daddr_t je_daddr; /* Disk block address. */
|
||||
int je_blocks; /* Disk block count. */
|
||||
};
|
||||
|
||||
/*
|
||||
* Internal function prototypes.
|
||||
*/
|
||||
@ -2268,19 +2299,15 @@ static void
|
||||
indirblk_insert(freework)
|
||||
struct freework *freework;
|
||||
{
|
||||
struct freeblks *freeblks;
|
||||
struct jsegdep *jsegdep;
|
||||
struct worklist *wk;
|
||||
struct jblocks *jblocks;
|
||||
struct jseg *jseg;
|
||||
|
||||
freeblks = freework->fw_freeblks;
|
||||
LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list)
|
||||
if (wk->wk_type == D_JSEGDEP)
|
||||
break;
|
||||
if (wk == NULL)
|
||||
jblocks = VFSTOUFS(freework->fw_list.wk_mp)->softdep_jblocks;
|
||||
jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
|
||||
if (jseg == NULL)
|
||||
return;
|
||||
|
||||
jsegdep = WK_JSEGDEP(wk);
|
||||
LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs);
|
||||
LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
|
||||
TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp,
|
||||
freework->fw_blkno), freework, fw_next);
|
||||
freework->fw_state &= ~DEPCOMPLETE;
|
||||
@ -2433,31 +2460,6 @@ softdep_unmount(mp)
|
||||
journal_unmount(mp);
|
||||
}
|
||||
|
||||
/*
 * Journal bookkeeping: the list of journal segments together with the
 * extent array and the counters used to hand out journal space.
 * NOTE(review): the units of jb_min/jb_low/jb_age are not visible in
 * this view -- confirm against the allocator code before relying on them.
 */
struct jblocks {
|
||||
struct jseglst jb_segs; /* TAILQ of current segments. */
|
||||
struct jseg *jb_writeseg; /* Next write to complete. */
|
||||
struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */
|
||||
struct jextent *jb_extent; /* Extent array. */
|
||||
uint64_t jb_nextseq; /* Next sequence number. */
|
||||
uint64_t jb_oldestwrseq; /* Oldest written sequence number. */
|
||||
uint8_t jb_needseg; /* Need a forced segment. */
|
||||
uint8_t jb_suspended; /* Did journal suspend writes? */
|
||||
int jb_avail; /* Available extents. */
|
||||
int jb_used; /* Last used extent. */
|
||||
int jb_head; /* Allocator head. */
|
||||
int jb_off; /* Allocator extent offset. */
|
||||
int jb_blocks; /* Total disk blocks covered. */
|
||||
int jb_free; /* Total disk blocks free. */
|
||||
int jb_min; /* Minimum free space. */
|
||||
int jb_low; /* Low on space. */
|
||||
int jb_age; /* Insertion time of oldest rec. */
|
||||
};
|
||||
|
||||
/*
 * One journal extent: a disk block address and a block count.
 * NOTE(review): presumably a contiguous run starting at je_daddr --
 * confirm against the code that fills these in.
 */
struct jextent {
|
||||
ufs2_daddr_t je_daddr; /* Disk block address. */
|
||||
int je_blocks; /* Disk block count. */
|
||||
};
|
||||
|
||||
static struct jblocks *
|
||||
jblocks_create(void)
|
||||
{
|
||||
@ -3663,7 +3665,7 @@ handle_written_jnewblk(jnewblk)
|
||||
*/
|
||||
freefrag = WK_FREEFRAG(jnewblk->jn_dep);
|
||||
freefrag->ff_jdep = NULL;
|
||||
WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
|
||||
jwork_insert(&freefrag->ff_jwork, jsegdep);
|
||||
break;
|
||||
case D_FREEWORK:
|
||||
/*
|
||||
@ -3671,8 +3673,7 @@ handle_written_jnewblk(jnewblk)
|
||||
*/
|
||||
freework = WK_FREEWORK(jnewblk->jn_dep);
|
||||
freework->fw_jnewblk = NULL;
|
||||
WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork,
|
||||
&jsegdep->jd_list);
|
||||
jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
|
||||
break;
|
||||
default:
|
||||
panic("handle_written_jnewblk: Unknown type %d.",
|
||||
@ -3702,6 +3703,7 @@ cancel_jfreefrag(jfreefrag)
|
||||
jfreefrag->fr_freefrag = NULL;
|
||||
free_jfreefrag(jfreefrag);
|
||||
freefrag->ff_state |= DEPCOMPLETE;
|
||||
CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3765,7 +3767,7 @@ handle_written_jblkdep(jblkdep)
|
||||
jblkdep->jb_jsegdep = NULL;
|
||||
freeblks = jblkdep->jb_freeblks;
|
||||
LIST_REMOVE(jblkdep, jb_deps);
|
||||
WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
|
||||
jwork_insert(&freeblks->fb_jwork, jsegdep);
|
||||
/*
|
||||
* If the freeblks is all journaled, we can add it to the worklist.
|
||||
*/
|
||||
@ -3968,6 +3970,7 @@ cancel_jfreeblk(freeblks, blkno)
|
||||
}
|
||||
if (jblkdep == NULL)
|
||||
return;
|
||||
CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
|
||||
free_jsegdep(jblkdep->jb_jsegdep);
|
||||
LIST_REMOVE(jblkdep, jb_deps);
|
||||
WORKITEM_FREE(jfreeblk, D_JFREEBLK);
|
||||
@ -4208,6 +4211,7 @@ cancel_jnewblk(jnewblk, wkhd)
|
||||
{
|
||||
struct jsegdep *jsegdep;
|
||||
|
||||
CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
|
||||
jsegdep = jnewblk->jn_jsegdep;
|
||||
if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
|
||||
panic("cancel_jnewblk: Invalid state");
|
||||
@ -4899,6 +4903,10 @@ softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
CTR3(KTR_SUJ,
|
||||
"softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
|
||||
newblkno, frags, oldfrags);
|
||||
ACQUIRE_LOCK(&lk);
|
||||
if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
|
||||
panic("softdep_setup_blkmapdep: found block");
|
||||
@ -5060,6 +5068,10 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
|
||||
else
|
||||
freefrag = NULL;
|
||||
|
||||
CTR6(KTR_SUJ,
|
||||
"softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
|
||||
"off %jd newsize %ld oldsize %d",
|
||||
ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
|
||||
ACQUIRE_LOCK(&lk);
|
||||
if (off >= NDADDR) {
|
||||
if (lbn > 0)
|
||||
@ -5338,6 +5350,8 @@ newfreefrag(ip, blkno, size, lbn)
|
||||
struct freefrag *freefrag;
|
||||
struct fs *fs;
|
||||
|
||||
CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
|
||||
ip->i_number, blkno, size, lbn);
|
||||
fs = ip->i_fs;
|
||||
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
|
||||
panic("newfreefrag: frag size");
|
||||
@ -5373,6 +5387,9 @@ handle_workitem_freefrag(freefrag)
|
||||
struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
|
||||
struct workhead wkhd;
|
||||
|
||||
CTR3(KTR_SUJ,
|
||||
"handle_workitem_freefrag: ino %d blkno %jd size %ld",
|
||||
freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
|
||||
/*
|
||||
* It would be illegal to add new completion items to the
|
||||
* freefrag after it was scheduled to be done so it must be
|
||||
@ -5591,6 +5608,9 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
|
||||
if (lbn != nbp->b_lblkno)
|
||||
panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
|
||||
lbn, bp->b_lblkno);
|
||||
CTR4(KTR_SUJ,
|
||||
"softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
|
||||
"lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
|
||||
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
|
||||
mp = UFSTOVFS(ip->i_ump);
|
||||
aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
|
||||
@ -5629,6 +5649,9 @@ softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
|
||||
ufs_lbn_t lbn;
|
||||
int dflags;
|
||||
|
||||
CTR3(KTR_SUJ,
|
||||
"softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
|
||||
ip->i_number, newblkno, ptrno);
|
||||
lbn = nbp->b_lblkno;
|
||||
ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
|
||||
aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
|
||||
@ -6233,6 +6256,7 @@ softdep_journal_freeblocks(ip, cred, length, flags)
|
||||
int flags; /* IO_EXT and/or IO_NORMAL */
|
||||
{
|
||||
struct freeblks *freeblks, *fbn;
|
||||
struct worklist *wk, *wkn;
|
||||
struct inodedep *inodedep;
|
||||
struct jblkdep *jblkdep;
|
||||
struct allocdirect *adp, *adpn;
|
||||
@ -6267,6 +6291,8 @@ softdep_journal_freeblocks(ip, cred, length, flags)
|
||||
if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
|
||||
length == 0)
|
||||
needj = 0;
|
||||
CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
|
||||
ip->i_number, length, needj);
|
||||
FREE_LOCK(&lk);
|
||||
/*
|
||||
* Calculate the lbn that we are truncating to. This results in -1
|
||||
@ -6419,6 +6445,21 @@ softdep_journal_freeblocks(ip, cred, length, flags)
|
||||
while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
|
||||
cancel_allocdirect(&inodedep->id_extupdt, adp,
|
||||
freeblks);
|
||||
/*
|
||||
* Scan the bufwait list for newblock dependencies that will never
|
||||
* make it to disk.
|
||||
*/
|
||||
LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
|
||||
if (wk->wk_type != D_ALLOCDIRECT)
|
||||
continue;
|
||||
adp = WK_ALLOCDIRECT(wk);
|
||||
if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
|
||||
((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
|
||||
cancel_jfreeblk(freeblks, adp->ad_newblkno);
|
||||
cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
|
||||
WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Add journal work.
|
||||
*/
|
||||
@ -6558,6 +6599,8 @@ softdep_setup_freeblocks(ip, length, flags)
|
||||
ufs_lbn_t tmpval;
|
||||
ufs_lbn_t lbn;
|
||||
|
||||
CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
|
||||
ip->i_number, length);
|
||||
fs = ip->i_fs;
|
||||
mp = UFSTOVFS(ip->i_ump);
|
||||
if (length != 0)
|
||||
@ -7083,6 +7126,8 @@ cancel_newblk(newblk, wk, wkhd)
|
||||
{
|
||||
struct jnewblk *jnewblk;
|
||||
|
||||
CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
|
||||
|
||||
newblk->nb_state |= GOINGAWAY;
|
||||
/*
|
||||
* Previously we traversed the completedhd on each indirdep
|
||||
@ -7451,6 +7496,9 @@ freework_freeblock(freework)
|
||||
}
|
||||
FREE_LOCK(&lk);
|
||||
freeblks_free(ump, freeblks, btodb(bsize));
|
||||
CTR4(KTR_SUJ,
|
||||
"freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
|
||||
freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
|
||||
ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
|
||||
freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
|
||||
ACQUIRE_LOCK(&lk);
|
||||
@ -7884,6 +7932,9 @@ indir_trunc(freework, dbn, lbn)
|
||||
&freedep->fd_list);
|
||||
freedeps++;
|
||||
}
|
||||
CTR3(KTR_SUJ,
|
||||
"indir_trunc: ino %d blkno %jd size %ld",
|
||||
freeblks->fb_inum, nb, fs->fs_bsize);
|
||||
ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
|
||||
fs->fs_bsize, freeblks->fb_inum,
|
||||
freeblks->fb_vtype, &wkhd);
|
||||
@ -7919,6 +7970,9 @@ indir_trunc(freework, dbn, lbn)
|
||||
* If we're not journaling we can free the indirect now.
|
||||
*/
|
||||
dbn = dbtofsb(fs, dbn);
|
||||
CTR3(KTR_SUJ,
|
||||
"indir_trunc 2: ino %d blkno %jd size %ld",
|
||||
freeblks->fb_inum, dbn, fs->fs_bsize);
|
||||
ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
|
||||
freeblks->fb_inum, freeblks->fb_vtype, NULL);
|
||||
/* Non SUJ softdep does single-threaded truncations. */
|
||||
@ -10356,6 +10410,10 @@ softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
|
||||
int i;
|
||||
#endif
|
||||
|
||||
CTR3(KTR_SUJ,
|
||||
"softdep_setup_blkfree: blkno %jd frags %d wk head %p",
|
||||
blkno, frags, wkhd);
|
||||
|
||||
ACQUIRE_LOCK(&lk);
|
||||
/* Lookup the bmsafemap so we track when it is dirty. */
|
||||
fs = VFSTOUFS(mp)->um_fs;
|
||||
@ -10367,6 +10425,9 @@ softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
|
||||
*/
|
||||
if (wkhd) {
|
||||
while ((wk = LIST_FIRST(wkhd)) != NULL) {
|
||||
CTR2(KTR_SUJ,
|
||||
"softdep_setup_blkfree: blkno %jd wk type %d",
|
||||
blkno, wk->wk_type);
|
||||
WORKLIST_REMOVE(wk);
|
||||
if (wk->wk_type != D_JNEWBLK) {
|
||||
WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
|
||||
|
Loading…
Reference in New Issue
Block a user