Merge in Kirk's changes to stop softupdates from hogging all of memory.

This commit is contained in:
Julian Elischer 1998-05-19 21:45:53 +00:00
parent aebde78243
commit 62e12c760c
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=36225
9 changed files with 882 additions and 201 deletions

View File

@ -249,3 +249,69 @@ code and installed the updated utilities, do the following:
it and run `./doit'. You may want to check out each of the
three subtests individually first: doit1 - andrew benchmarks,
doit2 - copy and removal of /etc, doit3 - find from /.
====
Additional notes from Feb 13
When removing huge directories of files, it is possible to get
the incore state arbitrarily far ahead of the disk. Maintaining
all the associated dependency information can exhaust the kernel
malloc arena. To avoid this scenario, I have put some limits on
the soft update code so that it will not be allowed to rampage
through all of the kernel memory. I enclose below the relevant
patches to vnode.h and vfs_subr.c (which allow the soft update
code to speed up the filesystem syncer process). I have also
included the diffs for ffs_softdep.c. I hope to make a pass over
ffs_softdep.c to isolate the differences with my standard version
so that these diffs are less painful to incorporate.
Since I know you like to play with tuning, I have put the relevant
knobs on sysctl debug variables. The tuning knobs can be viewed
with `sysctl debug' and set with `sysctl -w debug.<name>=value'.
The knobs are as follows:
debug.max_softdeps - limit on any given resource
debug.tickdelay - ticks to delay before allocating
debug.max_limit_hit - number of times tickdelay imposed
debug.rush_requests - number of rush requests to filesystem syncer
The max_softdeps limit is derived from desiredvnodes which in
turn is sized based on the amount of memory on the machine.
When the limit is hit, a process requesting a resource first
tries to speed up the filesystem syncer process. Such a
request is recorded as a rush_request. After syncdelay / 2
unserviced rush requests (typically 15) are in the filesystem
syncers queue (i.e., it is more than 15 seconds behind in its
work), the process requesting the memory is put to sleep for
tickdelay seconds. Such a delay is recorded in max_limit_hit.
Following this delay it is granted its memory without further
delay. I have tried the following experiments in which I
delete an MH directory containing 16,703 files:
Run # 1 2 3
max_softdeps 4496 4496 4496
tickdelay 100 == 1 sec 20 == 0.2 sec 2 == 0.02 sec
max_limit_hit 16 == 16 sec 27 == 5.4 sec 203 == 4.1 sec
rush_requests 147 102 93
run time 57 sec 46 sec 45 sec
I/O's 781 859 936
When run with no limits, it completes in 40 seconds. So, the
time spent in delay is directly added to the bottom line.
Shortening the tick delay does cut down the total running time,
but at the expense of generating more total I/O operations
due to the rush orders being sent to the filesystem syncer.
Although the number of rush orders decreases with a shorter
tick delay, there are more requests in each order, hence the
increase in I/O count. Also, although the I/O count does rise
with a shorter delay, it is still at least an order of magnitude
less than without soft updates. Anyway, you may want to play
around with these values to see what works best and to see if
you can get an insight into how best to tune them. If you get
out-of-memory panics, then you have max_softdeps set too high.
The max_limit_hit and rush_requests should be reset to zero
before each run. The minimum legal value for tickdelay is 2
(if you set it below that, the code will use 2).

View File

@ -53,7 +53,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_softdep.c 9.14 (McKusick) 1/15/98
* @(#)ffs_softdep.c 9.21 (McKusick) 2/15/98
*/
/*
@ -95,6 +95,7 @@ static int flush_pagedep_deps __P((struct vnode *, struct mount *,
struct diraddhd *));
static int flush_inodedep_deps __P((struct fs *, ino_t));
static int handle_written_filepage __P((struct pagedep *, struct buf *));
static void diradd_inode_written __P((struct diradd *, struct inodedep *));
static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static void handle_allocdirect_partdone __P((struct allocdirect *));
static void handle_allocindir_partdone __P((struct allocindir *));
@ -129,6 +130,8 @@ static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
struct pagedep **));
static void pause_timer __P((void *));
static int checklimit __P((long *, int));
static void add_to_worklist __P((struct worklist *));
/*
@ -427,6 +430,28 @@ workitem_free(item, type)
*/
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;
static int max_softdeps; /* maximum number of structs before slowdown */
static int tickdelay = 2; /* number of ticks to pause during slowdown */
static int max_limit_hit; /* number of times slowdown imposed */
static int rush_requests; /* number of times I/O speeded up */
static int proc_waiting; /* tracks whether we have a timeout posted */
static pid_t filesys_syncer_pid;/* records pid of filesystem syncer process */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
#if defined(__FreeBSD__)
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, max_limit_hit, CTLFLAG_RW, &max_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, "");
#else /* !__FreeBSD__ */
struct ctldebug debug4 = { "max_softdeps", &max_softdeps };
struct ctldebug debug5 = { "tickdelay", &tickdelay };
struct ctldebug debug6 = { "max_limit_hit", &max_limit_hit };
struct ctldebug debug7 = { "rush_requests", &rush_requests };
#endif /* !__FreeBSD__ */
#endif /* DEBUG */
/*
* Add an item to the end of the work queue.
@ -465,10 +490,16 @@ int
softdep_process_worklist(matchmnt)
struct mount *matchmnt;
{
struct proc *p = curproc;
struct worklist *wk;
struct fs *matchfs;
int matchcnt;
/*
* Record the process identifier of our caller so that we can
* give this process preferential treatment in checklimit below.
*/
filesys_syncer_pid = p->p_pid;
matchcnt = 0;
matchfs = NULL;
if (matchmnt != NULL)
@ -591,6 +622,71 @@ softdep_flushfiles(oldmnt, flags, p)
return (error);
}
/*
* A large burst of file addition or deletion activity can drive the
* memory load excessively high. Therefore we deliberately slow things
* down and speed up the I/O processing if we find ourselves with too
* many dependencies in progress.
*/
static int
checklimit(resource, islocked)
	long *resource;		/* count of outstanding structures of this type */
	int islocked;		/* non-zero if the caller already holds lk */
{
	struct proc *p = curproc;

	/*
	 * If we are under our limit, just proceed.
	 */
	if (*resource < max_softdeps)
		return (0);
	/*
	 * We never hold up the filesystem syncer process.
	 */
	if (p->p_pid == filesys_syncer_pid)
		return (0);
	/*
	 * Our first approach is to speed up the syncer process.
	 * We never push it to speed up more than half of its
	 * normal turn time, otherwise it could take over the cpu.
	 */
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		rush_requests += 1;
		return (0);
	}
	/*
	 * Every trick has failed, so we pause momentarily to let
	 * the filesystem syncer process catch up.
	 */
	if (islocked == 0)
		ACQUIRE_LOCK(&lk);
	/* Post at most one pause_timer timeout at a time. */
	if (proc_waiting == 0) {
		proc_waiting = 1;
		/* Enforce the documented minimum tickdelay of 2 ticks. */
		timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2);
	}
	/*
	 * Release the interlock while sleeping; pause_timer's wakeup
	 * on &proc_waiting terminates the nap.
	 */
	FREE_LOCK_INTERLOCKED(&lk);
	(void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0);
	ACQUIRE_LOCK_INTERLOCKED(&lk);
	if (islocked == 0)
		FREE_LOCK(&lk);
	/* Record the imposed delay; return 1 so the caller knows we slept. */
	max_limit_hit += 1;
	return (1);
}
/*
* Awaken processes pausing in checklimit and clear proc_waiting
* to indicate that there is no longer a timer running.
*/
/*
 * Timeout handler: clear proc_waiting (no timer is running any longer)
 * and awaken any process paused in checklimit.  Declared static to
 * match the pause_timer prototype given earlier in this file.
 */
static void
pause_timer(arg)
	void *arg;	/* timeout(9) argument; unused */
{

	proc_waiting = 0;
	wakeup(&proc_waiting);
}
/*
* Structure hashing.
*
@ -690,7 +786,8 @@ pagedep_lookup(ip, lbn, flags, pagedeppp)
* Structures and routines associated with inodedep caching.
*/
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
u_long inodedep_hash; /* size of hash table - 1 */
static u_long inodedep_hash; /* size of hash table - 1 */
static long num_inodedep; /* number of inodedep allocated */
#define INODEDEP_HASH(fs, inum) \
(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;
@ -710,11 +807,13 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
{
struct inodedep *inodedep;
struct inodedep_hashhead *inodedephd;
int firsttry;
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("inodedep_lookup: lock not held");
#endif
firsttry = 1;
inodedephd = INODEDEP_HASH(fs, inum);
top:
for (inodedep = LIST_FIRST(inodedephd); inodedep;
@ -729,10 +828,15 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
*inodedeppp = NULL;
return (0);
}
if (firsttry && checklimit(&num_inodedep, 1) == 1) {
firsttry = 0;
goto top;
}
if (sema_get(&inodedep_in_progress, &lk) == 0) {
ACQUIRE_LOCK(&lk);
goto top;
}
num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
M_INODEDEP, M_WAITOK);
inodedep->id_list.wk_type = D_INODEDEP;
@ -745,6 +849,7 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
inodedep->id_buf = NULL;
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
ACQUIRE_LOCK(&lk);
@ -815,11 +920,11 @@ softdep_initialize()
LIST_INIT(&mkdirlisthd);
LIST_INIT(&softdep_workitem_pending);
pagedep_hashtbl = hashinit(desiredvnodes / 10, M_PAGEDEP,
max_softdeps = desiredvnodes * 8;
pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
&pagedep_hash);
sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
inodedep_hashtbl = hashinit(desiredvnodes / 2, M_INODEDEP,
&inodedep_hash);
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
@ -1452,6 +1557,7 @@ setup_allocindir_phase2(bp, ip, aip)
* later release and zero the inode so that the calling routine
* can release it.
*/
static long num_freeblks; /* number of freeblks allocated */
void
softdep_setup_freeblocks(ip, length)
struct inode *ip; /* The inode whose length is to be reduced */
@ -1468,6 +1574,8 @@ softdep_setup_freeblocks(ip, length)
fs = ip->i_fs;
if (length != 0)
panic("softde_setup_freeblocks: non-zero length");
(void) checklimit(&num_freeblks, 0);
num_freeblks += 1;
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
M_FREEBLKS, M_WAITOK);
bzero(freeblks, sizeof(struct freeblks));
@ -1511,7 +1619,7 @@ softdep_setup_freeblocks(ip, length)
* Add the freeblks structure to the list of operations that
* must await the zero'ed inode being written to disk.
*/
WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
@ -1630,7 +1738,7 @@ deallocate_dependencies(bp, inodedep)
if (inodedep == NULL)
add_to_worklist(&dirrem->dm_list);
else
WORKLIST_INSERT(&inodedep->id_inowait,
WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
WORKLIST_REMOVE(&pagedep->pd_list);
@ -1678,7 +1786,7 @@ free_allocdirect(adphead, adp, delay)
WORKLIST_REMOVE(&adp->ad_list);
if (adp->ad_freefrag != NULL) {
if (delay)
WORKLIST_INSERT(&adp->ad_inodedep->id_inowait,
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
&adp->ad_freefrag->ff_list);
else
add_to_worklist(&adp->ad_freefrag->ff_list);
@ -1690,6 +1798,7 @@ free_allocdirect(adphead, adp, delay)
* Prepare an inode to be freed. The actual free operation is not
* done until the zero'ed inode has been written to disk.
*/
static long num_freefile; /* number of freefile allocated */
void
softdep_freefile(pvp, ino, mode)
struct vnode *pvp;
@ -1703,6 +1812,8 @@ softdep_freefile(pvp, ino, mode)
/*
* This sets up the inode de-allocation dependency.
*/
(void) checklimit(&num_freefile, 0);
num_freefile += 1;
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
M_FREEFILE, M_WAITOK);
freefile->fx_list.wk_type = D_FREEFILE;
@ -1761,6 +1872,7 @@ free_inodedep(inodedep)
if ((inodedep->id_state & ONWORKLIST) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
LIST_FIRST(&inodedep->id_inowait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
@ -1768,6 +1880,7 @@ free_inodedep(inodedep)
return (0);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
num_inodedep -= 1;
return (1);
}
@ -1836,6 +1949,7 @@ handle_workitem_freeblocks(freeblks)
softdep_error("handle_workitem_freeblks", allerror);
#endif /* DIAGNOSTIC */
WORKITEM_FREE(freeblks, D_FREEBLKS);
num_freeblks -= 1;
}
/*
@ -1940,7 +2054,7 @@ free_allocindir(aip, inodedep)
if (inodedep == NULL)
add_to_worklist(&freefrag->ff_list);
else
WORKLIST_INSERT(&inodedep->id_inowait,
WORKLIST_INSERT(&inodedep->id_bufwait,
&freefrag->ff_list);
}
WORKITEM_FREE(aip, D_ALLOCINDIR);
@ -2038,23 +2152,27 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
WORKITEM_FREE(mkdir2, D_MKDIR);
} else {
LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list);
WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
}
/*
* Link into parent directory pagedep and new inode inodedep
* structures to await its being written.
* Link into parent directory pagedep to await its being written.
*/
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 &&
(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
/*
* Link into its inodedep. Put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
diradd_inode_written(dap, inodedep);
else
WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
FREE_LOCK(&lk);
}
@ -2314,7 +2432,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
LIST_INSERT_HEAD(
&dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)],
dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
} else if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
dm_next);
@ -2366,12 +2484,8 @@ handle_workitem_remove(dirrem)
*/
if ((dirrem->dm_state & RMDIR) == 0) {
ip->i_nlink--;
if (ip->i_nlink < ip->i_effnlink) {
#ifdef DIAGNOSTIC
vprint("handle_workitem_remove: bad file delta", vp);
#endif
ip->i_effnlink = ip->i_nlink;
}
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad file delta");
ip->i_flag |= IN_CHANGE;
vput(vp);
WORKITEM_FREE(dirrem, D_DIRREM);
@ -2436,6 +2550,7 @@ handle_workitem_freefile(freefile)
if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
softdep_error("handle_workitem_freefile", error);
WORKITEM_FREE(freefile, D_FREEFILE);
num_freefile -= 1;
}
/*
@ -3022,7 +3137,7 @@ handle_written_inodeblock(inodedep, bp)
* before the old ones have been deleted.
*/
filefree = NULL;
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
WORKLIST_REMOVE(wk);
switch (wk->wk_type) {
@ -3043,18 +3158,7 @@ handle_written_inodeblock(inodedep, bp)
continue;
case D_DIRADD:
dap = WK_DIRADD(wk);
dap->da_state |= COMPLETE;
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
da_pdlist);
}
WORKLIST_INSERT(&inodedep->id_pendinghd, wk);
diradd_inode_written(WK_DIRADD(wk), inodedep);
continue;
case D_FREEBLKS:
@ -3069,8 +3173,12 @@ handle_written_inodeblock(inodedep, bp)
/* NOTREACHED */
}
}
if (filefree != NULL)
if (filefree != NULL) {
if (free_inodedep(inodedep) == 0)
panic("handle_written_inodeblock: live inodedep");
add_to_worklist(filefree);
return (0);
}
/*
* If no outstanding dependencies, free it.
@ -3080,6 +3188,29 @@ handle_written_inodeblock(inodedep, bp)
return (hadchanges);
}
/*
* Process a diradd entry after its dependent inode has been written.
* This routine must be called with splbio interrupts blocked.
*/
/*
 * Process a diradd entry after its dependent inode has been written.
 * This routine must be called with splbio interrupts blocked.
 */
static void
diradd_inode_written(dap, inodedep)
	struct diradd *dap;
	struct inodedep *inodedep;
{
	struct pagedep *pd;

	dap->da_state |= COMPLETE;
	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/* Directory-change entries hang off the previous dirrem. */
		pd = (dap->da_state & DIRCHG) ?
		    dap->da_previous->dm_pagedep : dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pd->pd_pendinghd, dap, da_pdlist);
	}
	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
/*
* Handle the completion of a mkdir dependency.
*/
@ -3229,6 +3360,7 @@ softdep_load_inodeblock(ip)
}
if (inodedep->id_nlinkdelta != 0) {
ip->i_effnlink -= inodedep->id_nlinkdelta;
ip->i_flag |= IN_MODIFIED;
inodedep->id_nlinkdelta = 0;
(void) free_inodedep(inodedep);
}
@ -3252,6 +3384,7 @@ softdep_update_inodeblock(ip, bp, waitfor)
int waitfor; /* 1 => update must be allowed */
{
struct inodedep *inodedep;
struct worklist *wk;
int error, gotit;
/*
@ -3272,15 +3405,6 @@ softdep_update_inodeblock(ip, bp, waitfor)
if (ip->i_nlink < ip->i_effnlink)
panic("softdep_update_inodeblock: bad delta");
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
/*
* If the last remaining use for the inodedep was to track the
* link count, and there is no difference between the effective
* and actual link count, then we can free the inodedep.
*/
if (free_inodedep(inodedep)) {
FREE_LOCK(&lk);
return;
}
/*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
@ -3297,6 +3421,16 @@ softdep_update_inodeblock(ip, bp, waitfor)
merge_inode_lists(inodedep);
if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
/*
* Now that the inode has been pushed into the buffer, the
* operations dependent on the inode being written to disk
* can be moved to the id_bufwait so that they will be
* processed when the buffer I/O completes.
*/
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&inodedep->id_bufwait, wk);
}
/*
* Newly allocated inodes cannot be written until the bitmap
* that allocates them have been written (indicated by
@ -3378,6 +3512,7 @@ softdep_fsync(vp)
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
break;
if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
panic("softdep_fsync: pending ops");
@ -3444,8 +3579,8 @@ softdep_fsync(vp)
*/
error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
&bp);
vput(pvp);
ret = VOP_BWRITE(bp);
vput(pvp);
if (error != 0)
return (error);
if (ret != 0)
@ -3535,8 +3670,13 @@ softdep_sync_metadata(ap)
if (adp->ad_state & DEPCOMPLETE)
break;
nbp = adp->ad_buf;
if (getdirtybuf(&nbp, waitfor) == 0)
if (getdirtybuf(&nbp, waitfor) == 0) {
#if 0 /* [JRE] I suspect this should be here XXX */
if (waitfor == MNT_NOWAIT)
continue;
#endif
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
@ -3713,8 +3853,11 @@ flush_inodedep_deps(fs, ino)
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
if (getdirtybuf(&bp, waitfor) == 0)
if (getdirtybuf(&bp, waitfor) == 0) {
if (waitfor == MNT_NOWAIT)
continue;
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);
@ -3732,8 +3875,11 @@ flush_inodedep_deps(fs, ino)
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
if (getdirtybuf(&bp, waitfor) == 0)
if (getdirtybuf(&bp, waitfor) == 0) {
if (waitfor == MNT_NOWAIT)
continue;
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);

View File

@ -52,7 +52,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)softdep.h 9.4 (McKusick) 1/15/98
* @(#)softdep.h 9.5 (McKusick) 2/11/98
*/
#include <sys/queue.h>
@ -201,31 +201,45 @@ struct pagedep {
/*
* The "inodedep" structure tracks the set of dependencies associated
* with an inode. Each block that is allocated is represented by an
* with an inode. One task that it must manage is delayed operations
* (i.e., work requests that must be held until the inodedep's associated
* inode has been written to disk). Getting an inode from its incore
* state to the disk requires two steps to be taken by the filesystem
* in this order: first the inode must be copied to its disk buffer by
* the VOP_UPDATE operation; second the inode's buffer must be written
* to disk. To ensure that both operations have happened in the required
* order, the inodedep maintains two lists. Delayed operations are
* placed on the id_inowait list. When the VOP_UPDATE is done, all
* operations on the id_inowait list are moved to the id_bufwait list.
* When the buffer is written, the items on the id_bufwait list can be
* safely moved to the work queue to be processed. A second task of the
* inodedep structure is to track the status of block allocation within
* the inode. Each block that is allocated is represented by an
* "allocdirect" structure (see below). It is linked onto the id_newinoupdt
* list until both its contents and its allocation in the cylinder
* group map have been written to disk. Once the dependencies have been
* group map have been written to disk. Once these dependencies have been
* satisfied, it is removed from the id_newinoupdt list and any followup
* actions such as releasing the previous block or fragment are placed
* on the id_inowait list. When an inode is updated (copied from the
* in-core inode structure to a disk buffer containing its on-disk
* copy), the "inodedep" structure is linked onto the buffer through
* its worklist. Thus it will be notified when the buffer is about
* on the id_inowait list. When an inode is updated (a VOP_UPDATE is
* done), the "inodedep" structure is linked onto the buffer through
* its worklist. Thus, it will be notified when the buffer is about
* to be written and when it is done. At the update time, all the
* elements on the id_newinoupdt list are moved to the id_inoupdt list
* since those changes are now relevant to the copy of the inode in the
* buffer. When the buffer containing the inode is written to disk, any
* updates listed on the id_inoupdt list are rolled back as they are
* not yet safe. Following the write, the changes are once again rolled
* forward and any actions on the id_inowait list are processed (since
* the previously allocated blocks are no longer claimed on the disk).
* buffer. Also at update time, the tasks on the id_inowait list are
* moved to the id_bufwait list so that they will be executed when
* the updated inode has been written to disk. When the buffer containing
* the inode is written to disk, any updates listed on the id_inoupdt
* list are rolled back as they are not yet safe. Following the write,
* the changes are once again rolled forward and any actions on the
* id_bufwait list are processed (since those actions are now safe).
* The entries on the id_inoupdt and id_newinoupdt lists must be kept
* sorted by logical block number to speed the calculation of the size
* of the rolled back inode (see explanation in initiate_write_inodeblock).
* When a directory entry is created, it is represented by a diradd.
* The diradd is added to the id_inowait list and is not permitted to be
* written to disk until the inode that it represents is written. After
* the inode is written, the id_inowait list is processed and the diradd
* The diradd is added to the id_inowait list as it cannot be safely
* written to disk until the inode that it represents is on disk. After
* the inode is written, the id_bufwait list is processed and the diradd
* entries are moved to the id_pendinghd list where they remain until
* the directory block containing the name has been written to disk.
* The purpose of keeping the entries on the id_pendinghd list is so that
@ -244,7 +258,8 @@ struct inodedep {
struct buf *id_buf; /* related bmsafemap (if pending) */
off_t id_savedsize; /* file size saved during rollback */
struct workhead id_pendinghd; /* entries awaiting directory write */
struct workhead id_inowait; /* operations after inode written */
struct workhead id_bufwait; /* operations after inode written */
struct workhead id_inowait; /* operations waiting inode update */
struct allocdirectlst id_inoupdt; /* updates before inode written */
struct allocdirectlst id_newinoupdt; /* updates when inode written */
};
@ -460,7 +475,7 @@ struct freefile {
* if appropriate and is never cleared.
*/
struct diradd {
struct worklist da_list; /* id_inowait and id_pendinghd list */
struct worklist da_list; /* id_inowait or id_pendinghd list */
# define da_state da_list.wk_state /* state of the new directory entry */
LIST_ENTRY(diradd) da_pdlist; /* pagedep holding directory block */
doff_t da_offset; /* offset of new dir entry in dir blk */

View File

@ -249,3 +249,69 @@ code and installed the updated utilities, do the following:
it and run `./doit'. You may want to check out each of the
three subtests individually first: doit1 - andrew benchmarks,
doit2 - copy and removal of /etc, doit3 - find from /.
====
Additional notes from Feb 13
When removing huge directories of files, it is possible to get
the incore state arbitrarily far ahead of the disk. Maintaining
all the associated dependency information can exhaust the kernel
malloc arena. To avoid this scenario, I have put some limits on
the soft update code so that it will not be allowed to rampage
through all of the kernel memory. I enclose below the relevant
patches to vnode.h and vfs_subr.c (which allow the soft update
code to speed up the filesystem syncer process). I have also
included the diffs for ffs_softdep.c. I hope to make a pass over
ffs_softdep.c to isolate the differences with my standard version
so that these diffs are less painful to incorporate.
Since I know you like to play with tuning, I have put the relevant
knobs on sysctl debug variables. The tuning knobs can be viewed
with `sysctl debug' and set with `sysctl -w debug.<name>=value'.
The knobs are as follows:
debug.max_softdeps - limit on any given resource
debug.tickdelay - ticks to delay before allocating
debug.max_limit_hit - number of times tickdelay imposed
debug.rush_requests - number of rush requests to filesystem syncer
The max_softdeps limit is derived from desiredvnodes which in
turn is sized based on the amount of memory on the machine.
When the limit is hit, a process requesting a resource first
tries to speed up the filesystem syncer process. Such a
request is recorded as a rush_request. After syncdelay / 2
unserviced rush requests (typically 15) are in the filesystem
syncers queue (i.e., it is more than 15 seconds behind in its
work), the process requesting the memory is put to sleep for
tickdelay seconds. Such a delay is recorded in max_limit_hit.
Following this delay it is granted its memory without further
delay. I have tried the following experiments in which I
delete an MH directory containing 16,703 files:
Run # 1 2 3
max_softdeps 4496 4496 4496
tickdelay 100 == 1 sec 20 == 0.2 sec 2 == 0.02 sec
max_limit_hit 16 == 16 sec 27 == 5.4 sec 203 == 4.1 sec
rush_requests 147 102 93
run time 57 sec 46 sec 45 sec
I/O's 781 859 936
When run with no limits, it completes in 40 seconds. So, the
time spent in delay is directly added to the bottom line.
Shortening the tick delay does cut down the total running time,
but at the expense of generating more total I/O operations
due to the rush orders being sent to the filesystem syncer.
Although the number of rush orders decreases with a shorter
tick delay, there are more requests in each order, hence the
increase in I/O count. Also, although the I/O count does rise
with a shorter delay, it is still at least an order of magnitude
less than without soft updates. Anyway, you may want to play
around with these values to see what works best and to see if
you can get an insight into how best to tune them. If you get
out-of-memory panics, then you have max_softdeps set too high.
The max_limit_hit and rush_requests should be reset to zero
before each run. The minimum legal value for tickdelay is 2
(if you set it below that, the code will use 2).

View File

@ -53,7 +53,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_softdep.c 9.14 (McKusick) 1/15/98
* @(#)ffs_softdep.c 9.21 (McKusick) 2/15/98
*/
/*
@ -95,6 +95,7 @@ static int flush_pagedep_deps __P((struct vnode *, struct mount *,
struct diraddhd *));
static int flush_inodedep_deps __P((struct fs *, ino_t));
static int handle_written_filepage __P((struct pagedep *, struct buf *));
static void diradd_inode_written __P((struct diradd *, struct inodedep *));
static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static void handle_allocdirect_partdone __P((struct allocdirect *));
static void handle_allocindir_partdone __P((struct allocindir *));
@ -129,6 +130,8 @@ static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
struct pagedep **));
static void pause_timer __P((void *));
static int checklimit __P((long *, int));
static void add_to_worklist __P((struct worklist *));
/*
@ -427,6 +430,28 @@ workitem_free(item, type)
*/
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;
static int max_softdeps; /* maximum number of structs before slowdown */
static int tickdelay = 2; /* number of ticks to pause during slowdown */
static int max_limit_hit; /* number of times slowdown imposed */
static int rush_requests; /* number of times I/O speeded up */
static int proc_waiting; /* tracks whether we have a timeout posted */
static pid_t filesys_syncer_pid;/* records pid of filesystem syncer process */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
#if defined(__FreeBSD__)
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, max_limit_hit, CTLFLAG_RW, &max_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, "");
#else /* !__FreeBSD__ */
struct ctldebug debug4 = { "max_softdeps", &max_softdeps };
struct ctldebug debug5 = { "tickdelay", &tickdelay };
struct ctldebug debug6 = { "max_limit_hit", &max_limit_hit };
struct ctldebug debug7 = { "rush_requests", &rush_requests };
#endif /* !__FreeBSD__ */
#endif /* DEBUG */
/*
* Add an item to the end of the work queue.
@ -465,10 +490,16 @@ int
softdep_process_worklist(matchmnt)
struct mount *matchmnt;
{
struct proc *p = curproc;
struct worklist *wk;
struct fs *matchfs;
int matchcnt;
/*
* Record the process identifier of our caller so that we can
* give this process preferential treatment in checklimit below.
*/
filesys_syncer_pid = p->p_pid;
matchcnt = 0;
matchfs = NULL;
if (matchmnt != NULL)
@ -591,6 +622,71 @@ softdep_flushfiles(oldmnt, flags, p)
return (error);
}
/*
* A large burst of file addition or deletion activity can drive the
* memory load excessively high. Therefore we deliberately slow things
* down and speed up the I/O processing if we find ourselves with too
* many dependencies in progress.
*/
static int
checklimit(resource, islocked)
long *resource;
int islocked;
{
struct proc *p = curproc;
/*
* If we are under our limit, just proceed.
*/
if (*resource < max_softdeps)
return (0);
/*
* We never hold up the filesystem syncer process.
*/
if (p->p_pid == filesys_syncer_pid)
return (0);
/*
* Our first approach is to speed up the syncer process.
* We never push it to speed up more than half of its
* normal turn time, otherwise it could take over the cpu.
*/
if (rushjob < syncdelay / 2) {
rushjob += 1;
rush_requests += 1;
return (0);
}
/*
* Every trick has failed, so we pause momentarily to let
* the filesystem syncer process catch up.
*/
if (islocked == 0)
ACQUIRE_LOCK(&lk);
if (proc_waiting == 0) {
proc_waiting = 1;
timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2);
}
FREE_LOCK_INTERLOCKED(&lk);
(void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0);
ACQUIRE_LOCK_INTERLOCKED(&lk);
if (islocked == 0)
FREE_LOCK(&lk);
max_limit_hit += 1;
return (1);
}
/*
* Awaken processes pausing in checklimit and clear proc_waiting
* to indicate that there is no longer a timer running.
*/
void
pause_timer(arg)
void *arg;
{
proc_waiting = 0;
wakeup(&proc_waiting);
}
/*
* Structure hashing.
*
@ -690,7 +786,8 @@ pagedep_lookup(ip, lbn, flags, pagedeppp)
* Structures and routines associated with inodedep caching.
*/
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
u_long inodedep_hash; /* size of hash table - 1 */
static u_long inodedep_hash; /* size of hash table - 1 */
static long num_inodedep; /* number of inodedep allocated */
#define INODEDEP_HASH(fs, inum) \
(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;
@ -710,11 +807,13 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
{
struct inodedep *inodedep;
struct inodedep_hashhead *inodedephd;
int firsttry;
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("inodedep_lookup: lock not held");
#endif
firsttry = 1;
inodedephd = INODEDEP_HASH(fs, inum);
top:
for (inodedep = LIST_FIRST(inodedephd); inodedep;
@ -729,10 +828,15 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
*inodedeppp = NULL;
return (0);
}
if (firsttry && checklimit(&num_inodedep, 1) == 1) {
firsttry = 0;
goto top;
}
if (sema_get(&inodedep_in_progress, &lk) == 0) {
ACQUIRE_LOCK(&lk);
goto top;
}
num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
M_INODEDEP, M_WAITOK);
inodedep->id_list.wk_type = D_INODEDEP;
@ -745,6 +849,7 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
inodedep->id_buf = NULL;
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
ACQUIRE_LOCK(&lk);
@ -815,11 +920,11 @@ softdep_initialize()
LIST_INIT(&mkdirlisthd);
LIST_INIT(&softdep_workitem_pending);
pagedep_hashtbl = hashinit(desiredvnodes / 10, M_PAGEDEP,
max_softdeps = desiredvnodes * 8;
pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
&pagedep_hash);
sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
inodedep_hashtbl = hashinit(desiredvnodes / 2, M_INODEDEP,
&inodedep_hash);
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
@ -1452,6 +1557,7 @@ setup_allocindir_phase2(bp, ip, aip)
* later release and zero the inode so that the calling routine
* can release it.
*/
static long num_freeblks; /* number of freeblks allocated */
void
softdep_setup_freeblocks(ip, length)
struct inode *ip; /* The inode whose length is to be reduced */
@ -1468,6 +1574,8 @@ softdep_setup_freeblocks(ip, length)
fs = ip->i_fs;
if (length != 0)
panic("softde_setup_freeblocks: non-zero length");
(void) checklimit(&num_freeblks, 0);
num_freeblks += 1;
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
M_FREEBLKS, M_WAITOK);
bzero(freeblks, sizeof(struct freeblks));
@ -1511,7 +1619,7 @@ softdep_setup_freeblocks(ip, length)
* Add the freeblks structure to the list of operations that
* must await the zero'ed inode being written to disk.
*/
WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
@ -1630,7 +1738,7 @@ deallocate_dependencies(bp, inodedep)
if (inodedep == NULL)
add_to_worklist(&dirrem->dm_list);
else
WORKLIST_INSERT(&inodedep->id_inowait,
WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
WORKLIST_REMOVE(&pagedep->pd_list);
@ -1678,7 +1786,7 @@ free_allocdirect(adphead, adp, delay)
WORKLIST_REMOVE(&adp->ad_list);
if (adp->ad_freefrag != NULL) {
if (delay)
WORKLIST_INSERT(&adp->ad_inodedep->id_inowait,
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
&adp->ad_freefrag->ff_list);
else
add_to_worklist(&adp->ad_freefrag->ff_list);
@ -1690,6 +1798,7 @@ free_allocdirect(adphead, adp, delay)
* Prepare an inode to be freed. The actual free operation is not
* done until the zero'ed inode has been written to disk.
*/
static long num_freefile; /* number of freefile allocated */
void
softdep_freefile(pvp, ino, mode)
struct vnode *pvp;
@ -1703,6 +1812,8 @@ softdep_freefile(pvp, ino, mode)
/*
* This sets up the inode de-allocation dependency.
*/
(void) checklimit(&num_freefile, 0);
num_freefile += 1;
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
M_FREEFILE, M_WAITOK);
freefile->fx_list.wk_type = D_FREEFILE;
@ -1761,6 +1872,7 @@ free_inodedep(inodedep)
if ((inodedep->id_state & ONWORKLIST) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
LIST_FIRST(&inodedep->id_inowait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
@ -1768,6 +1880,7 @@ free_inodedep(inodedep)
return (0);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
num_inodedep -= 1;
return (1);
}
@ -1836,6 +1949,7 @@ handle_workitem_freeblocks(freeblks)
softdep_error("handle_workitem_freeblks", allerror);
#endif /* DIAGNOSTIC */
WORKITEM_FREE(freeblks, D_FREEBLKS);
num_freeblks -= 1;
}
/*
@ -1940,7 +2054,7 @@ free_allocindir(aip, inodedep)
if (inodedep == NULL)
add_to_worklist(&freefrag->ff_list);
else
WORKLIST_INSERT(&inodedep->id_inowait,
WORKLIST_INSERT(&inodedep->id_bufwait,
&freefrag->ff_list);
}
WORKITEM_FREE(aip, D_ALLOCINDIR);
@ -2038,23 +2152,27 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
WORKITEM_FREE(mkdir2, D_MKDIR);
} else {
LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list);
WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
}
/*
* Link into parent directory pagedep and new inode inodedep
* structures to await its being written.
* Link into parent directory pagedep to await its being written.
*/
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 &&
(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
/*
* Link into its inodedep. Put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
diradd_inode_written(dap, inodedep);
else
WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
FREE_LOCK(&lk);
}
@ -2314,7 +2432,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
LIST_INSERT_HEAD(
&dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)],
dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
} else if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
dm_next);
@ -2366,12 +2484,8 @@ handle_workitem_remove(dirrem)
*/
if ((dirrem->dm_state & RMDIR) == 0) {
ip->i_nlink--;
if (ip->i_nlink < ip->i_effnlink) {
#ifdef DIAGNOSTIC
vprint("handle_workitem_remove: bad file delta", vp);
#endif
ip->i_effnlink = ip->i_nlink;
}
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad file delta");
ip->i_flag |= IN_CHANGE;
vput(vp);
WORKITEM_FREE(dirrem, D_DIRREM);
@ -2436,6 +2550,7 @@ handle_workitem_freefile(freefile)
if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
softdep_error("handle_workitem_freefile", error);
WORKITEM_FREE(freefile, D_FREEFILE);
num_freefile -= 1;
}
/*
@ -3022,7 +3137,7 @@ handle_written_inodeblock(inodedep, bp)
* before the old ones have been deleted.
*/
filefree = NULL;
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
WORKLIST_REMOVE(wk);
switch (wk->wk_type) {
@ -3043,18 +3158,7 @@ handle_written_inodeblock(inodedep, bp)
continue;
case D_DIRADD:
dap = WK_DIRADD(wk);
dap->da_state |= COMPLETE;
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
da_pdlist);
}
WORKLIST_INSERT(&inodedep->id_pendinghd, wk);
diradd_inode_written(WK_DIRADD(wk), inodedep);
continue;
case D_FREEBLKS:
@ -3069,8 +3173,12 @@ handle_written_inodeblock(inodedep, bp)
/* NOTREACHED */
}
}
if (filefree != NULL)
if (filefree != NULL) {
if (free_inodedep(inodedep) == 0)
panic("handle_written_inodeblock: live inodedep");
add_to_worklist(filefree);
return (0);
}
/*
* If no outstanding dependencies, free it.
@ -3080,6 +3188,29 @@ handle_written_inodeblock(inodedep, bp)
return (hadchanges);
}
/*
* Process a diradd entry after its dependent inode has been written.
* This routine must be called with splbio interrupts blocked.
*/
static void
diradd_inode_written(dap, inodedep)
struct diradd *dap;
struct inodedep *inodedep;
{
struct pagedep *pagedep;
dap->da_state |= COMPLETE;
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
}
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
/*
* Handle the completion of a mkdir dependency.
*/
@ -3229,6 +3360,7 @@ softdep_load_inodeblock(ip)
}
if (inodedep->id_nlinkdelta != 0) {
ip->i_effnlink -= inodedep->id_nlinkdelta;
ip->i_flag |= IN_MODIFIED;
inodedep->id_nlinkdelta = 0;
(void) free_inodedep(inodedep);
}
@ -3252,6 +3384,7 @@ softdep_update_inodeblock(ip, bp, waitfor)
int waitfor; /* 1 => update must be allowed */
{
struct inodedep *inodedep;
struct worklist *wk;
int error, gotit;
/*
@ -3272,15 +3405,6 @@ softdep_update_inodeblock(ip, bp, waitfor)
if (ip->i_nlink < ip->i_effnlink)
panic("softdep_update_inodeblock: bad delta");
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
/*
* If the last remaining use for the inodedep was to track the
* link count, and there is no difference between the effective
* and actual link count, then we can free the inodedep.
*/
if (free_inodedep(inodedep)) {
FREE_LOCK(&lk);
return;
}
/*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
@ -3297,6 +3421,16 @@ softdep_update_inodeblock(ip, bp, waitfor)
merge_inode_lists(inodedep);
if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
/*
* Now that the inode has been pushed into the buffer, the
* operations dependent on the inode being written to disk
* can be moved to the id_bufwait so that they will be
* processed when the buffer I/O completes.
*/
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&inodedep->id_bufwait, wk);
}
/*
* Newly allocated inodes cannot be written until the bitmap
* that allocates them have been written (indicated by
@ -3378,6 +3512,7 @@ softdep_fsync(vp)
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
break;
if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
panic("softdep_fsync: pending ops");
@ -3444,8 +3579,8 @@ softdep_fsync(vp)
*/
error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
&bp);
vput(pvp);
ret = VOP_BWRITE(bp);
vput(pvp);
if (error != 0)
return (error);
if (ret != 0)
@ -3535,8 +3670,13 @@ softdep_sync_metadata(ap)
if (adp->ad_state & DEPCOMPLETE)
break;
nbp = adp->ad_buf;
if (getdirtybuf(&nbp, waitfor) == 0)
if (getdirtybuf(&nbp, waitfor) == 0) {
#if 0 /* [JRE] I suspect this should be here XXX */
if (waitfor == MNT_NOWAIT)
continue;
#endif
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
@ -3713,8 +3853,11 @@ flush_inodedep_deps(fs, ino)
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
if (getdirtybuf(&bp, waitfor) == 0)
if (getdirtybuf(&bp, waitfor) == 0) {
if (waitfor == MNT_NOWAIT)
continue;
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);
@ -3732,8 +3875,11 @@ flush_inodedep_deps(fs, ino)
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
if (getdirtybuf(&bp, waitfor) == 0)
if (getdirtybuf(&bp, waitfor) == 0) {
if (waitfor == MNT_NOWAIT)
continue;
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);

View File

@ -52,7 +52,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)softdep.h 9.4 (McKusick) 1/15/98
* @(#)softdep.h 9.5 (McKusick) 2/11/98
*/
#include <sys/queue.h>
@ -201,31 +201,45 @@ struct pagedep {
/*
* The "inodedep" structure tracks the set of dependencies associated
* with an inode. Each block that is allocated is represented by an
* with an inode. One task that it must manage is delayed operations
* (i.e., work requests that must be held until the inodedep's associated
* inode has been written to disk). Getting an inode from its incore
* state to the disk requires two steps to be taken by the filesystem
* in this order: first the inode must be copied to its disk buffer by
* the VOP_UPDATE operation; second the inode's buffer must be written
* to disk. To ensure that both operations have happened in the required
* order, the inodedep maintains two lists. Delayed operations are
* placed on the id_inowait list. When the VOP_UPDATE is done, all
* operations on the id_inowait list are moved to the id_bufwait list.
* When the buffer is written, the items on the id_bufwait list can be
* safely moved to the work queue to be processed. A second task of the
* inodedep structure is to track the status of block allocation within
* the inode. Each block that is allocated is represented by an
* "allocdirect" structure (see below). It is linked onto the id_newinoupdt
* list until both its contents and its allocation in the cylinder
* group map have been written to disk. Once the dependencies have been
* group map have been written to disk. Once these dependencies have been
* satisfied, it is removed from the id_newinoupdt list and any followup
* actions such as releasing the previous block or fragment are placed
* on the id_inowait list. When an inode is updated (copied from the
* in-core inode structure to a disk buffer containing its on-disk
* copy), the "inodedep" structure is linked onto the buffer through
* its worklist. Thus it will be notified when the buffer is about
* on the id_inowait list. When an inode is updated (a VOP_UPDATE is
* done), the "inodedep" structure is linked onto the buffer through
* its worklist. Thus, it will be notified when the buffer is about
* to be written and when it is done. At the update time, all the
* elements on the id_newinoupdt list are moved to the id_inoupdt list
* since those changes are now relevant to the copy of the inode in the
* buffer. When the buffer containing the inode is written to disk, any
* updates listed on the id_inoupdt list are rolled back as they are
* not yet safe. Following the write, the changes are once again rolled
* forward and any actions on the id_inowait list are processed (since
* the previously allocated blocks are no longer claimed on the disk).
* buffer. Also at update time, the tasks on the id_inowait list are
* moved to the id_bufwait list so that they will be executed when
* the updated inode has been written to disk. When the buffer containing
* the inode is written to disk, any updates listed on the id_inoupdt
* list are rolled back as they are not yet safe. Following the write,
* the changes are once again rolled forward and any actions on the
* id_bufwait list are processed (since those actions are now safe).
* The entries on the id_inoupdt and id_newinoupdt lists must be kept
* sorted by logical block number to speed the calculation of the size
* of the rolled back inode (see explanation in initiate_write_inodeblock).
* When a directory entry is created, it is represented by a diradd.
* The diradd is added to the id_inowait list and is not permitted to be
* written to disk until the inode that it represents is written. After
* the inode is written, the id_inowait list is processed and the diradd
* The diradd is added to the id_inowait list as it cannot be safely
* written to disk until the inode that it represents is on disk. After
* the inode is written, the id_bufwait list is processed and the diradd
* entries are moved to the id_pendinghd list where they remain until
* the directory block containing the name has been written to disk.
* The purpose of keeping the entries on the id_pendinghd list is so that
@ -244,7 +258,8 @@ struct inodedep {
struct buf *id_buf; /* related bmsafemap (if pending) */
off_t id_savedsize; /* file size saved during rollback */
struct workhead id_pendinghd; /* entries awaiting directory write */
struct workhead id_inowait; /* operations after inode written */
struct workhead id_bufwait; /* operations after inode written */
struct workhead id_inowait; /* operations waiting inode update */
struct allocdirectlst id_inoupdt; /* updates before inode written */
struct allocdirectlst id_newinoupdt; /* updates when inode written */
};
@ -460,7 +475,7 @@ struct freefile {
* if appropriate and is never cleared.
*/
struct diradd {
struct worklist da_list; /* id_inowait and id_pendinghd list */
struct worklist da_list; /* id_inowait or id_pendinghd list */
# define da_state da_list.wk_state /* state of the new directory entry */
LIST_ENTRY(diradd) da_pdlist; /* pagedep holding directory block */
doff_t da_offset; /* offset of new dir entry in dir blk */

View File

@ -249,3 +249,69 @@ code and installed the updated utilities, do the following:
it and run `./doit'. You may want to check out each of the
three subtests individually first: doit1 - andrew benchmarks,
doit2 - copy and removal of /etc, doit3 - find from /.
====
Additional notes from Feb 13
When removing huge directories of files, it is possible to get
the incore state arbitrarily far ahead of the disk. Maintaining
all the associated dependency information can exhaust the kernel
malloc arena. To avoid this scenario, I have put some limits on
the soft update code so that it will not be allowed to rampage
through all of the kernel memory. I enclose below the relevant
patches to vnode.h and vfs_subr.c (which allow the soft update
code to speed up the filesystem syncer process). I have also
included the diffs for ffs_softdep.c. I hope to make a pass over
ffs_softdep.c to isolate the differences with my standard version
so that these diffs are less painful to incorporate.
Since I know you like to play with tuning, I have put the relevant
knobs on sysctl debug variables. The tuning knobs can be viewed
with `sysctl debug' and set with `sysctl -w debug.<name>=value'.
The knobs are as follows:
debug.max_softdeps - limit on any given resource
debug.tickdelay - ticks to delay before allocating
debug.max_limit_hit - number of times tickdelay imposed
debug.rush_requests - number of rush requests to filesystem syncer
The max_softdeps limit is derived from desiredvnodes which in
turn is sized based on the amount of memory on the machine.
When the limit is hit, a process requesting a resource first
tries to speed up the filesystem syncer process. Such a
request is recorded as a rush_request. After syncdelay / 2
unserviced rush requests (typically 15) are in the filesystem
syncers queue (i.e., it is more than 15 seconds behind in its
work), the process requesting the memory is put to sleep for
tickdelay seconds. Such a delay is recorded in max_limit_hit.
Following this delay it is granted its memory without further
delay. I have tried the following experiments in which I
delete an MH directory containing 16,703 files:
Run # 1 2 3
max_softdeps 4496 4496 4496
tickdelay 100 == 1 sec 20 == 0.2 sec 2 == 0.02 sec
max_limit_hit 16 == 16 sec 27 == 5.4 sec 203 == 4.1 sec
rush_requests 147 102 93
run time 57 sec 46 sec 45 sec
I/O's 781 859 936
When run with no limits, it completes in 40 seconds. So, the
time spent in delay is directly added to the bottom line.
Shortening the tick delay does cut down the total running time,
but at the expense of generating more total I/O operations
due to the rush orders being sent to the filesystem syncer.
Although the number of rush orders decreases with a shorter
tick delay, there are more requests in each order, hence the
increase in I/O count. Also, although the I/O count does rise
with a shorter delay, it is still at least an order of magnitude
less than without soft updates. Anyway, you may want to play
around with these values to see what works best and to see if
you can get an insight into how best to tune them. If you get
out of memory panic's, then you have max_softdeps set too high.
The max_limit_hit and rush_requests should be reset to zero
before each run. The minimum legal value for tickdelay is 2
(if you set it below that, the code will use 2).

View File

@ -53,7 +53,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_softdep.c 9.14 (McKusick) 1/15/98
* @(#)ffs_softdep.c 9.21 (McKusick) 2/15/98
*/
/*
@ -95,6 +95,7 @@ static int flush_pagedep_deps __P((struct vnode *, struct mount *,
struct diraddhd *));
static int flush_inodedep_deps __P((struct fs *, ino_t));
static int handle_written_filepage __P((struct pagedep *, struct buf *));
static void diradd_inode_written __P((struct diradd *, struct inodedep *));
static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static void handle_allocdirect_partdone __P((struct allocdirect *));
static void handle_allocindir_partdone __P((struct allocindir *));
@ -129,6 +130,8 @@ static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
struct pagedep **));
static void pause_timer __P((void *));
static int checklimit __P((long *, int));
static void add_to_worklist __P((struct worklist *));
/*
@ -427,6 +430,28 @@ workitem_free(item, type)
*/
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;
static int max_softdeps; /* maximum number of structs before slowdown */
static int tickdelay = 2; /* number of ticks to pause during slowdown */
static int max_limit_hit; /* number of times slowdown imposed */
static int rush_requests; /* number of times I/O speeded up */
static int proc_waiting; /* tracks whether we have a timeout posted */
static pid_t filesys_syncer_pid;/* records pid of filesystem syncer process */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
#if defined(__FreeBSD__)
SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
SYSCTL_INT(_debug, OID_AUTO, max_limit_hit, CTLFLAG_RW, &max_limit_hit, 0, "");
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, "");
#else /* !__FreeBSD__ */
struct ctldebug debug4 = { "max_softdeps", &max_softdeps };
struct ctldebug debug5 = { "tickdelay", &tickdelay };
struct ctldebug debug6 = { "max_limit_hit", &max_limit_hit };
struct ctldebug debug7 = { "rush_requests", &rush_requests };
#endif /* !__FreeBSD__ */
#endif /* DEBUG */
/*
* Add an item to the end of the work queue.
@ -465,10 +490,16 @@ int
softdep_process_worklist(matchmnt)
struct mount *matchmnt;
{
struct proc *p = curproc;
struct worklist *wk;
struct fs *matchfs;
int matchcnt;
/*
* Record the process identifier of our caller so that we can
* give this process preferential treatment in checklimit below.
*/
filesys_syncer_pid = p->p_pid;
matchcnt = 0;
matchfs = NULL;
if (matchmnt != NULL)
@ -591,6 +622,71 @@ softdep_flushfiles(oldmnt, flags, p)
return (error);
}
/*
* A large burst of file addition or deletion activity can drive the
* memory load excessively high. Therefore we deliberately slow things
* down and speed up the I/O processing if we find ourselves with too
* many dependencies in progress.
*/
static int
checklimit(resource, islocked)
long *resource;
int islocked;
{
struct proc *p = curproc;
/*
* If we are under our limit, just proceed.
*/
if (*resource < max_softdeps)
return (0);
/*
* We never hold up the filesystem syncer process.
*/
if (p->p_pid == filesys_syncer_pid)
return (0);
/*
* Our first approach is to speed up the syncer process.
* We never push it to speed up more than half of its
* normal turn time, otherwise it could take over the cpu.
*/
if (rushjob < syncdelay / 2) {
rushjob += 1;
rush_requests += 1;
return (0);
}
/*
* Every trick has failed, so we pause momentarily to let
* the filesystem syncer process catch up.
*/
if (islocked == 0)
ACQUIRE_LOCK(&lk);
if (proc_waiting == 0) {
proc_waiting = 1;
timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2);
}
FREE_LOCK_INTERLOCKED(&lk);
(void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0);
ACQUIRE_LOCK_INTERLOCKED(&lk);
if (islocked == 0)
FREE_LOCK(&lk);
max_limit_hit += 1;
return (1);
}
/*
* Awaken processes pausing in checklimit and clear proc_waiting
* to indicate that there is no longer a timer running.
*/
void
pause_timer(arg)
void *arg;
{
proc_waiting = 0;
wakeup(&proc_waiting);
}
/*
* Structure hashing.
*
@ -690,7 +786,8 @@ pagedep_lookup(ip, lbn, flags, pagedeppp)
* Structures and routines associated with inodedep caching.
*/
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
u_long inodedep_hash; /* size of hash table - 1 */
static u_long inodedep_hash; /* size of hash table - 1 */
static long num_inodedep; /* number of inodedep allocated */
#define INODEDEP_HASH(fs, inum) \
(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;
@ -710,11 +807,13 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
{
struct inodedep *inodedep;
struct inodedep_hashhead *inodedephd;
int firsttry;
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("inodedep_lookup: lock not held");
#endif
firsttry = 1;
inodedephd = INODEDEP_HASH(fs, inum);
top:
for (inodedep = LIST_FIRST(inodedephd); inodedep;
@ -729,10 +828,15 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
*inodedeppp = NULL;
return (0);
}
if (firsttry && checklimit(&num_inodedep, 1) == 1) {
firsttry = 0;
goto top;
}
if (sema_get(&inodedep_in_progress, &lk) == 0) {
ACQUIRE_LOCK(&lk);
goto top;
}
num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
M_INODEDEP, M_WAITOK);
inodedep->id_list.wk_type = D_INODEDEP;
@ -745,6 +849,7 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
inodedep->id_buf = NULL;
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
LIST_INIT(&inodedep->id_bufwait);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
ACQUIRE_LOCK(&lk);
@ -815,11 +920,11 @@ softdep_initialize()
LIST_INIT(&mkdirlisthd);
LIST_INIT(&softdep_workitem_pending);
pagedep_hashtbl = hashinit(desiredvnodes / 10, M_PAGEDEP,
max_softdeps = desiredvnodes * 8;
pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
&pagedep_hash);
sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
inodedep_hashtbl = hashinit(desiredvnodes / 2, M_INODEDEP,
&inodedep_hash);
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
@ -1452,6 +1557,7 @@ setup_allocindir_phase2(bp, ip, aip)
* later release and zero the inode so that the calling routine
* can release it.
*/
static long num_freeblks; /* number of freeblks allocated */
void
softdep_setup_freeblocks(ip, length)
struct inode *ip; /* The inode whose length is to be reduced */
@ -1468,6 +1574,8 @@ softdep_setup_freeblocks(ip, length)
fs = ip->i_fs;
if (length != 0)
panic("softde_setup_freeblocks: non-zero length");
(void) checklimit(&num_freeblks, 0);
num_freeblks += 1;
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
M_FREEBLKS, M_WAITOK);
bzero(freeblks, sizeof(struct freeblks));
@ -1511,7 +1619,7 @@ softdep_setup_freeblocks(ip, length)
* Add the freeblks structure to the list of operations that
* must await the zero'ed inode being written to disk.
*/
WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
@ -1630,7 +1738,7 @@ deallocate_dependencies(bp, inodedep)
if (inodedep == NULL)
add_to_worklist(&dirrem->dm_list);
else
WORKLIST_INSERT(&inodedep->id_inowait,
WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
WORKLIST_REMOVE(&pagedep->pd_list);
@ -1678,7 +1786,7 @@ free_allocdirect(adphead, adp, delay)
WORKLIST_REMOVE(&adp->ad_list);
if (adp->ad_freefrag != NULL) {
if (delay)
WORKLIST_INSERT(&adp->ad_inodedep->id_inowait,
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
&adp->ad_freefrag->ff_list);
else
add_to_worklist(&adp->ad_freefrag->ff_list);
@ -1690,6 +1798,7 @@ free_allocdirect(adphead, adp, delay)
* Prepare an inode to be freed. The actual free operation is not
* done until the zero'ed inode has been written to disk.
*/
static long num_freefile; /* number of freefile allocated */
void
softdep_freefile(pvp, ino, mode)
struct vnode *pvp;
@ -1703,6 +1812,8 @@ softdep_freefile(pvp, ino, mode)
/*
* This sets up the inode de-allocation dependency.
*/
(void) checklimit(&num_freefile, 0);
num_freefile += 1;
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
M_FREEFILE, M_WAITOK);
freefile->fx_list.wk_type = D_FREEFILE;
@ -1761,6 +1872,7 @@ free_inodedep(inodedep)
if ((inodedep->id_state & ONWORKLIST) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
LIST_FIRST(&inodedep->id_inowait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
@ -1768,6 +1880,7 @@ free_inodedep(inodedep)
return (0);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
num_inodedep -= 1;
return (1);
}
@ -1836,6 +1949,7 @@ handle_workitem_freeblocks(freeblks)
softdep_error("handle_workitem_freeblks", allerror);
#endif /* DIAGNOSTIC */
WORKITEM_FREE(freeblks, D_FREEBLKS);
num_freeblks -= 1;
}
/*
@ -1940,7 +2054,7 @@ free_allocindir(aip, inodedep)
if (inodedep == NULL)
add_to_worklist(&freefrag->ff_list);
else
WORKLIST_INSERT(&inodedep->id_inowait,
WORKLIST_INSERT(&inodedep->id_bufwait,
&freefrag->ff_list);
}
WORKITEM_FREE(aip, D_ALLOCINDIR);
@ -2038,23 +2152,27 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
WORKITEM_FREE(mkdir2, D_MKDIR);
} else {
LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list);
WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
}
/*
* Link into parent directory pagedep and new inode inodedep
* structures to await its being written.
* Link into parent directory pagedep to await its being written.
*/
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 &&
(inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
/*
* Link into its inodedep. Put it on the id_bufwait list if the inode
* is not yet written. If it is written, do the post-inode write
* processing to put it on the id_pendinghd list.
*/
(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
diradd_inode_written(dap, inodedep);
else
WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
FREE_LOCK(&lk);
}
@ -2314,7 +2432,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
LIST_INSERT_HEAD(
&dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)],
dap, da_pdlist);
WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
} else if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
dm_next);
@ -2366,12 +2484,8 @@ handle_workitem_remove(dirrem)
*/
if ((dirrem->dm_state & RMDIR) == 0) {
ip->i_nlink--;
if (ip->i_nlink < ip->i_effnlink) {
#ifdef DIAGNOSTIC
vprint("handle_workitem_remove: bad file delta", vp);
#endif
ip->i_effnlink = ip->i_nlink;
}
if (ip->i_nlink < ip->i_effnlink)
panic("handle_workitem_remove: bad file delta");
ip->i_flag |= IN_CHANGE;
vput(vp);
WORKITEM_FREE(dirrem, D_DIRREM);
@ -2436,6 +2550,7 @@ handle_workitem_freefile(freefile)
if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
softdep_error("handle_workitem_freefile", error);
WORKITEM_FREE(freefile, D_FREEFILE);
num_freefile -= 1;
}
/*
@ -3022,7 +3137,7 @@ handle_written_inodeblock(inodedep, bp)
* before the old ones have been deleted.
*/
filefree = NULL;
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
WORKLIST_REMOVE(wk);
switch (wk->wk_type) {
@ -3043,18 +3158,7 @@ handle_written_inodeblock(inodedep, bp)
continue;
case D_DIRADD:
dap = WK_DIRADD(wk);
dap->da_state |= COMPLETE;
if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
if (dap->da_state & DIRCHG)
pagedep = dap->da_previous->dm_pagedep;
else
pagedep = dap->da_pagedep;
LIST_REMOVE(dap, da_pdlist);
LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
da_pdlist);
}
WORKLIST_INSERT(&inodedep->id_pendinghd, wk);
diradd_inode_written(WK_DIRADD(wk), inodedep);
continue;
case D_FREEBLKS:
@ -3069,8 +3173,12 @@ handle_written_inodeblock(inodedep, bp)
/* NOTREACHED */
}
}
if (filefree != NULL)
if (filefree != NULL) {
if (free_inodedep(inodedep) == 0)
panic("handle_written_inodeblock: live inodedep");
add_to_worklist(filefree);
return (0);
}
/*
* If no outstanding dependencies, free it.
@ -3080,6 +3188,29 @@ handle_written_inodeblock(inodedep, bp)
return (hadchanges);
}
/*
 * Process a diradd entry after its dependent inode has been written.
 * This routine must be called with splbio interrupts blocked.
 *
 * Once the inode that a new directory entry references is safely on
 * disk, the entry itself may be committed: the diradd is marked
 * COMPLETE and, if no other dependencies remain, is moved from the
 * pagedep's pending-write hash (pd_diraddhd) to the pd_pendinghd list,
 * where it waits only for the directory block containing the name to
 * be written.  In all cases the diradd is placed on the inodedep's
 * id_pendinghd list.
 */
static void
diradd_inode_written(dap, inodedep)
	struct diradd *dap;		/* directory-add dependency to process */
	struct inodedep *inodedep;	/* inodedep of the newly written inode */
{
	struct pagedep *pagedep;

	/* The referenced inode is now on disk. */
	dap->da_state |= COMPLETE;
	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		/*
		 * All dependencies are satisfied; move the entry to the
		 * pagedep's pending list.  For a changed entry (DIRCHG),
		 * the pagedep lives on the associated dirrem rather than
		 * on the diradd itself.
		 */
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	}
	/* Track the entry on the inodedep until the directory write completes. */
	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
}
/*
* Handle the completion of a mkdir dependency.
*/
@ -3229,6 +3360,7 @@ softdep_load_inodeblock(ip)
}
if (inodedep->id_nlinkdelta != 0) {
ip->i_effnlink -= inodedep->id_nlinkdelta;
ip->i_flag |= IN_MODIFIED;
inodedep->id_nlinkdelta = 0;
(void) free_inodedep(inodedep);
}
@ -3252,6 +3384,7 @@ softdep_update_inodeblock(ip, bp, waitfor)
int waitfor; /* 1 => update must be allowed */
{
struct inodedep *inodedep;
struct worklist *wk;
int error, gotit;
/*
@ -3272,15 +3405,6 @@ softdep_update_inodeblock(ip, bp, waitfor)
if (ip->i_nlink < ip->i_effnlink)
panic("softdep_update_inodeblock: bad delta");
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
/*
* If the last remaining use for the inodedep was to track the
* link count, and there is no difference between the effective
* and actual link count, then we can free the inodedep.
*/
if (free_inodedep(inodedep)) {
FREE_LOCK(&lk);
return;
}
/*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
@ -3297,6 +3421,16 @@ softdep_update_inodeblock(ip, bp, waitfor)
merge_inode_lists(inodedep);
if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
/*
* Now that the inode has been pushed into the buffer, the
* operations dependent on the inode being written to disk
* can be moved to the id_bufwait so that they will be
* processed when the buffer I/O completes.
*/
while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
WORKLIST_REMOVE(wk);
WORKLIST_INSERT(&inodedep->id_bufwait, wk);
}
/*
* Newly allocated inodes cannot be written until the bitmap
* that allocates them have been written (indicated by
@ -3378,6 +3512,7 @@ softdep_fsync(vp)
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
break;
if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
LIST_FIRST(&inodedep->id_bufwait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
panic("softdep_fsync: pending ops");
@ -3444,8 +3579,8 @@ softdep_fsync(vp)
*/
error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
&bp);
vput(pvp);
ret = VOP_BWRITE(bp);
vput(pvp);
if (error != 0)
return (error);
if (ret != 0)
@ -3535,8 +3670,13 @@ softdep_sync_metadata(ap)
if (adp->ad_state & DEPCOMPLETE)
break;
nbp = adp->ad_buf;
if (getdirtybuf(&nbp, waitfor) == 0)
if (getdirtybuf(&nbp, waitfor) == 0) {
#if 0 /* [JRE] I suspect this should be here XXX */
if (waitfor == MNT_NOWAIT)
continue;
#endif
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
@ -3713,8 +3853,11 @@ flush_inodedep_deps(fs, ino)
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
if (getdirtybuf(&bp, waitfor) == 0)
if (getdirtybuf(&bp, waitfor) == 0) {
if (waitfor == MNT_NOWAIT)
continue;
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);
@ -3732,8 +3875,11 @@ flush_inodedep_deps(fs, ino)
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
if (getdirtybuf(&bp, waitfor) == 0)
if (getdirtybuf(&bp, waitfor) == 0) {
if (waitfor == MNT_NOWAIT)
continue;
break;
}
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);

View File

@ -52,7 +52,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)softdep.h 9.4 (McKusick) 1/15/98
* @(#)softdep.h 9.5 (McKusick) 2/11/98
*/
#include <sys/queue.h>
@ -201,31 +201,45 @@ struct pagedep {
/*
* The "inodedep" structure tracks the set of dependencies associated
* with an inode. Each block that is allocated is represented by an
* with an inode. One task that it must manage is delayed operations
* (i.e., work requests that must be held until the inodedep's associated
* inode has been written to disk). Getting an inode from its incore
* state to the disk requires two steps to be taken by the filesystem
* in this order: first the inode must be copied to its disk buffer by
* the VOP_UPDATE operation; second the inode's buffer must be written
* to disk. To ensure that both operations have happened in the required
* order, the inodedep maintains two lists. Delayed operations are
* placed on the id_inowait list. When the VOP_UPDATE is done, all
* operations on the id_inowait list are moved to the id_bufwait list.
* When the buffer is written, the items on the id_bufwait list can be
* safely moved to the work queue to be processed. A second task of the
* inodedep structure is to track the status of block allocation within
* the inode. Each block that is allocated is represented by an
* "allocdirect" structure (see below). It is linked onto the id_newinoupdt
* list until both its contents and its allocation in the cylinder
* group map have been written to disk. Once the dependencies have been
* group map have been written to disk. Once these dependencies have been
* satisfied, it is removed from the id_newinoupdt list and any followup
* actions such as releasing the previous block or fragment are placed
* on the id_inowait list. When an inode is updated (copied from the
* in-core inode structure to a disk buffer containing its on-disk
* copy), the "inodedep" structure is linked onto the buffer through
* its worklist. Thus it will be notified when the buffer is about
* on the id_inowait list. When an inode is updated (a VOP_UPDATE is
* done), the "inodedep" structure is linked onto the buffer through
* its worklist. Thus, it will be notified when the buffer is about
* to be written and when it is done. At the update time, all the
* elements on the id_newinoupdt list are moved to the id_inoupdt list
* since those changes are now relevant to the copy of the inode in the
* buffer. When the buffer containing the inode is written to disk, any
* updates listed on the id_inoupdt list are rolled back as they are
* not yet safe. Following the write, the changes are once again rolled
* forward and any actions on the id_inowait list are processed (since
* the previously allocated blocks are no longer claimed on the disk).
* buffer. Also at update time, the tasks on the id_inowait list are
* moved to the id_bufwait list so that they will be executed when
* the updated inode has been written to disk. When the buffer containing
* the inode is written to disk, any updates listed on the id_inoupdt
* list are rolled back as they are not yet safe. Following the write,
* the changes are once again rolled forward and any actions on the
* id_bufwait list are processed (since those actions are now safe).
* The entries on the id_inoupdt and id_newinoupdt lists must be kept
* sorted by logical block number to speed the calculation of the size
* of the rolled back inode (see explanation in initiate_write_inodeblock).
* When a directory entry is created, it is represented by a diradd.
* The diradd is added to the id_inowait list and is not permitted to be
* written to disk until the inode that it represents is written. After
* the inode is written, the id_inowait list is processed and the diradd
* The diradd is added to the id_inowait list as it cannot be safely
* written to disk until the inode that it represents is on disk. After
* the inode is written, the id_bufwait list is processed and the diradd
* entries are moved to the id_pendinghd list where they remain until
* the directory block containing the name has been written to disk.
* The purpose of keeping the entries on the id_pendinghd list is so that
@ -244,7 +258,8 @@ struct inodedep {
struct buf *id_buf; /* related bmsafemap (if pending) */
off_t id_savedsize; /* file size saved during rollback */
struct workhead id_pendinghd; /* entries awaiting directory write */
struct workhead id_inowait; /* operations after inode written */
struct workhead id_bufwait; /* operations after inode written */
struct workhead id_inowait; /* operations waiting inode update */
struct allocdirectlst id_inoupdt; /* updates before inode written */
struct allocdirectlst id_newinoupdt; /* updates when inode written */
};
@ -460,7 +475,7 @@ struct freefile {
* if appropriate and is never cleared.
*/
struct diradd {
struct worklist da_list; /* id_inowait and id_pendinghd list */
struct worklist da_list; /* id_inowait or id_pendinghd list */
# define da_state da_list.wk_state /* state of the new directory entry */
LIST_ENTRY(diradd) da_pdlist; /* pagedep holding directory block */
doff_t da_offset; /* offset of new dir entry in dir blk */