From b1897c197c06ebd09ab26a462489bd331c96ce2e Mon Sep 17 00:00:00 2001
From: Julian Elischer <julian@FreeBSD.org>
Date: Sun, 8 Mar 1998 09:59:44 +0000
Subject: [PATCH] Reviewed by:	dyson@freebsd.org (John Dyson), dg@root.com
 (David Greenman) Submitted by:	Kirk McKusick (mcKusick@mckusick.com) Obtained
 from:  Whistle development tree

---
 sbin/fsck/dir.c                  |   9 +-
 sbin/fsck/fsck.h                 |   2 +
 sbin/fsck/inode.c                |  11 +
 sbin/fsck/main.c                 |  22 +-
 sbin/fsck/pass1.c                |  16 +-
 sbin/fsck/pass2.c                |  31 ++-
 sbin/fsck/pass5.c                |  60 +++--
 sbin/fsck/setup.c                |   8 +-
 sbin/fsck/utilities.c            |  41 +++-
 sbin/fsck_ffs/dir.c              |   9 +-
 sbin/fsck_ffs/fsck.h             |   2 +
 sbin/fsck_ffs/inode.c            |  11 +
 sbin/fsck_ffs/main.c             |  22 +-
 sbin/fsck_ffs/pass1.c            |  16 +-
 sbin/fsck_ffs/pass2.c            |  31 ++-
 sbin/fsck_ffs/pass5.c            |  60 +++--
 sbin/fsck_ffs/setup.c            |   8 +-
 sbin/fsck_ffs/utilities.c        |  41 +++-
 sbin/fsck_ifs/dir.c              |   9 +-
 sbin/fsck_ifs/fsck.h             |   2 +
 sbin/fsck_ifs/inode.c            |  11 +
 sbin/fsck_ifs/main.c             |  22 +-
 sbin/fsck_ifs/pass1.c            |  16 +-
 sbin/fsck_ifs/pass2.c            |  31 ++-
 sbin/fsck_ifs/pass5.c            |  60 +++--
 sbin/fsck_ifs/setup.c            |   8 +-
 sbin/fsck_ifs/utilities.c        |  41 +++-
 sbin/mount/mount.c               |   8 +-
 sbin/mount_ifs/mount.c           |   8 +-
 sbin/tunefs/tunefs.8             |   1 +
 sbin/tunefs/tunefs.c             |  23 +-
 sys/conf/NOTES                   |   9 +-
 sys/conf/files                   |   2 +
 sys/conf/options                 |   8 +-
 sys/dev/de/if_de.c               |   4 +-
 sys/fs/cd9660/cd9660_vfsops.c    |  11 +-
 sys/fs/msdosfs/msdosfs_vfsops.c  |  19 +-
 sys/fs/specfs/spec_vnops.c       |  10 +-
 sys/gnu/ext2fs/inode.h           |  11 +-
 sys/gnu/fs/ext2fs/inode.h        |  11 +-
 sys/i386/conf/LINT               |   9 +-
 sys/i386/conf/NOTES              |   9 +-
 sys/isofs/cd9660/cd9660_vfsops.c |  11 +-
 sys/kern/kern_malloc.c           |   6 +-
 sys/kern/kern_shutdown.c         |  14 +-
 sys/kern/kern_synch.c            |   4 +-
 sys/kern/vfs_bio.c               |  78 ++++++-
 sys/kern/vfs_cluster.c           |  11 +-
 sys/kern/vfs_export.c            | 370 ++++++++++++++++++++++++++++++-
 sys/kern/vfs_extattr.c           |  29 ++-
 sys/kern/vfs_subr.c              | 370 ++++++++++++++++++++++++++++++-
 sys/kern/vfs_syscalls.c          |  29 ++-
 sys/kern/vnode_if.src            |  14 +-
 sys/miscfs/specfs/spec_vnops.c   |  10 +-
 sys/miscfs/specfs/specdev.h      |  11 +-
 sys/msdosfs/msdosfs_vfsops.c     |  19 +-
 sys/nfs/nfs_bio.c                |   6 +-
 sys/nfs/nfs_vnops.c              |   9 +-
 sys/nfsclient/nfs_bio.c          |   6 +-
 sys/nfsclient/nfs_vnops.c        |   9 +-
 sys/pci/if_de.c                  |   4 +-
 sys/sys/bio.h                    |  22 +-
 sys/sys/buf.h                    |  22 +-
 sys/sys/malloc.h                 |   8 +-
 sys/sys/mount.h                  |  15 +-
 sys/sys/vnode.h                  |  12 +-
 sys/ufs/ffs/ffs_alloc.c          |  93 ++++++--
 sys/ufs/ffs/ffs_balloc.c         |  92 +++++---
 sys/ufs/ffs/ffs_extern.h         |  34 ++-
 sys/ufs/ffs/ffs_inode.c          |  56 ++++-
 sys/ufs/ffs/ffs_subr.c           |  26 ++-
 sys/ufs/ffs/ffs_vfsops.c         |  85 +++++--
 sys/ufs/ffs/ffs_vnops.c          |  97 +++++---
 sys/ufs/ffs/fs.h                 |  16 +-
 sys/ufs/ufs/inode.h              |  11 +-
 sys/ufs/ufs/ufs_extern.h         |  27 ++-
 sys/ufs/ufs/ufs_lookup.c         | 265 ++++++++++++++--------
 sys/ufs/ufs/ufs_quota.c          |   8 +-
 sys/ufs/ufs/ufs_readwrite.c      |  10 +-
 sys/ufs/ufs/ufs_vnops.c          | 340 ++++++++++++++--------------
 80 files changed, 2349 insertions(+), 643 deletions(-)

diff --git a/sbin/fsck/dir.c b/sbin/fsck/dir.c
index 4b6999b013a9..6ab67d33ca32 100644
--- a/sbin/fsck/dir.c
+++ b/sbin/fsck/dir.c
@@ -315,12 +315,13 @@ adjust(idesc, lcnt)
 		pinode(idesc->id_number);
 		printf(" COUNT %d SHOULD BE %d",
 			dp->di_nlink, dp->di_nlink - lcnt);
-		if (preen) {
+		if (preen || usedsoftdep) {
 			if (lcnt < 0) {
 				printf("\n");
 				pfatal("LINK COUNT INCREASING");
 			}
-			printf(" (ADJUSTED)\n");
+			if (preen)
+				printf(" (ADJUSTED)\n");
 		}
 		if (preen || reply("ADJUST") == 1) {
 			dp->di_nlink -= lcnt;
@@ -406,13 +407,15 @@ linkup(orphan, parentdir)
 	lostdir = (dp->di_mode & IFMT) == IFDIR;
 	pwarn("UNREF %s ", lostdir ? "DIR" : "FILE");
 	pinode(orphan);
-	if (preen && dp->di_size == 0)
+	if ((preen || usedsoftdep) && dp->di_size == 0)
 		return (0);
 	if (preen)
 		printf(" (RECONNECTED)\n");
 	else
 		if (reply("RECONNECT") == 0)
 			return (0);
+	if (parentdir != 0)
+		lncntp[parentdir]++;
 	if (lfdir == 0) {
 		dp = ginode(ROOTINO);
 		idesc.id_name = lfname;
diff --git a/sbin/fsck/fsck.h b/sbin/fsck/fsck.h
index 1967691e989c..4e0271d815dd 100644
--- a/sbin/fsck/fsck.h
+++ b/sbin/fsck/fsck.h
@@ -176,6 +176,8 @@ int	cvtlevel;		/* convert to newer file system format */
 int	doinglevel1;		/* converting to new cylinder group format */
 int	doinglevel2;		/* converting to new inode format */
 int	newinofmt;		/* filesystem has new inode format */
+char	usedsoftdep;		/* just fix soft dependency inconsistencies */
+char	resolved;		/* cleared if unresolved changes => not clean */
 char	preen;			/* just fix normal inconsistencies */
 char	hotroot;		/* checking root device */
 char	havesb;			/* superblock has been read */
diff --git a/sbin/fsck/inode.c b/sbin/fsck/inode.c
index 429dd3b2267b..74561c8eb1b4 100644
--- a/sbin/fsck/inode.c
+++ b/sbin/fsck/inode.c
@@ -559,6 +559,8 @@ allocino(request, type)
 {
 	register ino_t ino;
 	register struct dinode *dp;
+	struct cg *cgp = &cgrp;
+	int cg;
 
 	if (request == 0)
 		request = ROOTINO;
@@ -569,9 +571,16 @@ allocino(request, type)
 			break;
 	if (ino == maxino)
 		return (0);
+	cg = ino_to_cg(&sblock, ino);
+	getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
+	if (!cg_chkmagic(cgp))
+		pfatal("CG %d: BAD MAGIC NUMBER\n", cg);
+	setbit(cg_inosused(cgp), ino % sblock.fs_ipg);
+	cgp->cg_cs.cs_nifree--;
 	switch (type & IFMT) {
 	case IFDIR:
 		statemap[ino] = DSTATE;
+		cgp->cg_cs.cs_ndir++;
 		break;
 	case IFREG:
 	case IFLNK:
@@ -580,12 +589,14 @@ allocino(request, type)
 	default:
 		return (0);
 	}
+	cgdirty();
 	dp = ginode(ino);
 	dp->di_db[0] = allocblk((long)1);
 	if (dp->di_db[0] == 0) {
 		statemap[ino] = USTATE;
 		return (0);
 	}
+	dp->di_flags = 0;
 	dp->di_mode = type;
 	dp->di_atime = time(NULL);
 	dp->di_mtime = dp->di_ctime = dp->di_atime;
diff --git a/sbin/fsck/main.c b/sbin/fsck/main.c
index dcb7006125cb..b4bc2c9caaaf 100644
--- a/sbin/fsck/main.c
+++ b/sbin/fsck/main.c
@@ -42,7 +42,7 @@ static const char copyright[] =
 static char sccsid[] = "@(#)main.c	8.6 (Berkeley) 5/14/95";
 #endif
 static const char rcsid[] =
-	"$Id$";
+	"$Id: main.c,v 1.12 1997/12/20 22:24:32 bde Exp $";
 #endif /* not lint */
 
 #include <sys/param.h>
@@ -209,6 +209,11 @@ checkfilesys(filesys, mntpt, auxdata, child)
 		return (0);
 	}
 
+	/*
+	 * Cleared if any questions answered no. Used to decide if
+	 * the superblock should be marked clean.
+	 */
+	resolved = 1;
 	/*
 	 * 1: scan inodes tallying blocks used
 	 */
@@ -224,7 +229,7 @@ checkfilesys(filesys, mntpt, auxdata, child)
 	 * 1b: locate first references to duplicates, if any
 	 */
 	if (duplist) {
-		if (preen)
+		if (preen || usedsoftdep)
 			pfatal("INTERNAL ERROR: dups with -p");
 		printf("** Phase 1b - Rescan For More DUPS\n");
 		pass1b();
@@ -306,19 +311,20 @@ checkfilesys(filesys, mntpt, auxdata, child)
 			bwrite(fswritefd, (char *)&sblock,
 			    fsbtodb(&sblock, cgsblock(&sblock, cylno)), SBSIZE);
 	}
-	if (!hotroot) {
-		ckfini(1);
-	} else {
+	if (rerun)
+		resolved = 0;
+	flags = 0;
+	if (hotroot) {
 		struct statfs stfs_buf;
 		/*
 		 * Check to see if root is mounted read-write.
 		 */
 		if (statfs("/", &stfs_buf) == 0)
 			flags = stfs_buf.f_flags;
-		else
-			flags = 0;
-		ckfini(flags & MNT_RDONLY);
+		if ((flags & MNT_RDONLY) == 0)
+			resolved = 0;
 	}
+	ckfini(resolved);
 	free(blockmap);
 	free(statemap);
 	free((char *)lncntp);
diff --git a/sbin/fsck/pass1.c b/sbin/fsck/pass1.c
index 99582777186c..181f858184bc 100644
--- a/sbin/fsck/pass1.c
+++ b/sbin/fsck/pass1.c
@@ -200,8 +200,10 @@ checkinode(inumber, idesc)
 		zlnp = (struct zlncnt *)malloc(sizeof *zlnp);
 		if (zlnp == NULL) {
 			pfatal("LINK COUNT TABLE OVERFLOW");
-			if (reply("CONTINUE") == 0)
+			if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 		} else {
 			zlnp->zlncnt = inumber;
 			zlnp->next = zlnhead;
@@ -270,8 +272,10 @@ pass1check(idesc)
 				idesc->id_number);
 			if (preen)
 				printf(" (SKIPPING)\n");
-			else if (reply("CONTINUE") == 0)
+			else if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 			return (STOP);
 		}
 	}
@@ -288,15 +292,19 @@ pass1check(idesc)
 					idesc->id_number);
 				if (preen)
 					printf(" (SKIPPING)\n");
-				else if (reply("CONTINUE") == 0)
+				else if (reply("CONTINUE") == 0) {
+					ckfini(0);
 					exit(EEXIT);
+				}
 				return (STOP);
 			}
 			new = (struct dups *)malloc(sizeof(struct dups));
 			if (new == NULL) {
 				pfatal("DUP TABLE OVERFLOW.");
-				if (reply("CONTINUE") == 0)
+				if (reply("CONTINUE") == 0) {
+					ckfini(0);
 					exit(EEXIT);
+				}
 				return (STOP);
 			}
 			new->dup = blkno;
diff --git a/sbin/fsck/pass2.c b/sbin/fsck/pass2.c
index 445f6f1682b9..ebc33b8a650f 100644
--- a/sbin/fsck/pass2.c
+++ b/sbin/fsck/pass2.c
@@ -66,8 +66,10 @@ pass2()
 
 	case USTATE:
 		pfatal("ROOT INODE UNALLOCATED");
-		if (reply("ALLOCATE") == 0)
+		if (reply("ALLOCATE") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		if (allocdir(ROOTINO, ROOTINO, 0755) != ROOTINO)
 			errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 		break;
@@ -80,8 +82,10 @@ pass2()
 				errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 			break;
 		}
-		if (reply("CONTINUE") == 0)
+		if (reply("CONTINUE") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		break;
 
 	case FSTATE:
@@ -93,8 +97,10 @@ pass2()
 				errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 			break;
 		}
-		if (reply("FIX") == 0)
+		if (reply("FIX") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		dp = ginode(ROOTINO);
 		dp->di_mode &= ~IFMT;
 		dp->di_mode |= IFDIR;
@@ -139,8 +145,14 @@ pass2()
 			}
 		} else if ((inp->i_isize & (DIRBLKSIZ - 1)) != 0) {
 			getpathname(pathbuf, inp->i_number, inp->i_number);
-			pwarn("DIRECTORY %s: LENGTH %d NOT MULTIPLE OF %d",
-				pathbuf, inp->i_isize, DIRBLKSIZ);
+			if (usedsoftdep)
+				pfatal("%s %s: LENGTH %d NOT MULTIPLE OF %d",
+					"DIRECTORY", pathbuf, inp->i_isize,
+					DIRBLKSIZ);
+			else
+				pwarn("%s %s: LENGTH %d NOT MULTIPLE OF %d",
+					"DIRECTORY", pathbuf, inp->i_isize,
+					DIRBLKSIZ);
 			if (preen)
 				printf(" (ADJUSTED)\n");
 			inp->i_isize = roundup(inp->i_isize, DIRBLKSIZ);
@@ -394,7 +406,7 @@ pass2check(idesc)
 				break;
 			if (statemap[dirp->d_ino] == FCLEAR)
 				errmsg = "DUP/BAD";
-			else if (!preen)
+			else if (!preen && !usedsoftdep)
 				errmsg = "ZERO LENGTH DIRECTORY";
 			else {
 				n = 1;
@@ -423,8 +435,11 @@ pass2check(idesc)
 				pwarn("%s %s %s\n", pathbuf,
 				    "IS AN EXTRANEOUS HARD LINK TO DIRECTORY",
 				    namebuf);
-				if (preen)
-					printf(" (IGNORED)\n");
+				if (preen) {
+					printf(" (REMOVED)\n");
+  					n = 1;
+  					break;
+				}
 				else if ((n = reply("REMOVE")) == 1)
 					break;
 			}
diff --git a/sbin/fsck/pass5.c b/sbin/fsck/pass5.c
index 3dd0c1aac237..873f008b8c78 100644
--- a/sbin/fsck/pass5.c
+++ b/sbin/fsck/pass5.c
@@ -50,11 +50,12 @@ void
 pass5()
 {
 	int c, blk, frags, basesize, sumsize, mapsize, savednrpos;
+	int inomapsize, blkmapsize;
 	struct fs *fs = &sblock;
 	struct cg *cg = &cgrp;
 	ufs_daddr_t dbase, dmax;
 	ufs_daddr_t d;
-	long i, j;
+	long i, j, k;
 	struct csum *cs;
 	struct csum cstotal;
 	struct inodesc idesc[3];
@@ -112,6 +113,8 @@ pass5()
 		sumsize = &ocg->cg_iused[0] - (u_int8_t *)(&ocg->cg_btot[0]);
 		mapsize = &ocg->cg_free[howmany(fs->fs_fpg, NBBY)] -
 			(u_char *)&ocg->cg_iused[0];
+		blkmapsize = howmany(fs->fs_fpg, NBBY);
+		inomapsize = &ocg->cg_free[0] - (u_char *)&ocg->cg_iused[0];
 		ocg->cg_magic = CG_MAGIC;
 		savednrpos = fs->fs_nrpos;
 		fs->fs_nrpos = 8;
@@ -126,12 +129,12 @@ pass5()
 		    fs->fs_cpg * fs->fs_nrpos * sizeof(short);
 		newcg->cg_freeoff =
 		    newcg->cg_iusedoff + howmany(fs->fs_ipg, NBBY);
-		if (fs->fs_contigsumsize <= 0) {
-			newcg->cg_nextfreeoff = newcg->cg_freeoff +
-			    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY);
-		} else {
-			newcg->cg_clustersumoff = newcg->cg_freeoff +
-			    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY) -
+		inomapsize = newcg->cg_freeoff - newcg->cg_iusedoff;
+		newcg->cg_nextfreeoff = newcg->cg_freeoff +
+		    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY);
+		blkmapsize = newcg->cg_nextfreeoff - newcg->cg_freeoff;
+		if (fs->fs_contigsumsize > 0) {
+			newcg->cg_clustersumoff = newcg->cg_nextfreeoff -
 			    sizeof(long);
 			newcg->cg_clustersumoff =
 			    roundup(newcg->cg_clustersumoff, sizeof(long));
@@ -148,7 +151,7 @@ pass5()
 		break;
 
 	default:
-		sumsize = 0;	/* keep lint happy */
+		inomapsize = blkmapsize = sumsize = 0;	/* keep lint happy */
 		errx(EEXIT, "UNKNOWN ROTATIONAL TABLE FORMAT %d",
 			fs->fs_postblformat);
 	}
@@ -299,13 +302,6 @@ pass5()
 			cgdirty();
 			continue;
 		}
-		if (memcmp(cg_inosused(newcg),
-			 cg_inosused(cg), mapsize) != 0 &&
-		    dofix(&idesc[1], "BLK(S) MISSING IN BIT MAPS")) {
-			memmove(cg_inosused(cg), cg_inosused(newcg),
-			      (size_t)mapsize);
-			cgdirty();
-		}
 		if ((memcmp(newcg, cg, basesize) != 0 ||
 		     memcmp(&cg_blktot(newcg)[0],
 			  &cg_blktot(cg)[0], sumsize) != 0) &&
@@ -315,6 +311,40 @@ pass5()
 			       &cg_blktot(newcg)[0], (size_t)sumsize);
 			cgdirty();
 		}
+		if (usedsoftdep) {
+			/* In use per recomputed newcg, free in cg. */
+			for (i = 0; i < inomapsize; i++) {
+				j = cg_inosused(newcg)[i];
+				if ((cg_inosused(cg)[i] & j) == j)
+					continue;
+				for (k = 0; k < NBBY; k++) {
+					if ((j & (1 << k)) == 0 ||
+					    (cg_inosused(cg)[i] & (1 << k)))
+						continue;
+					pwarn("ALLOCATED INODE %ld MARKED FREE\n",
+					    c * fs->fs_ipg + i * NBBY + k);
+				}
+			}
+			/* Free in cg's map, not free in recomputed newcg. */
+			for (i = 0; i < blkmapsize; i++) {
+				j = cg_blksfree(cg)[i];
+				if ((cg_blksfree(newcg)[i] & j) == j)
+					continue;
+				for (k = 0; k < NBBY; k++) {
+					if ((j & (1 << k)) == 0 ||
+					    (cg_blksfree(newcg)[i] & (1 << k)))
+						continue;
+					pwarn("ALLOCATED FRAG %ld MARKED FREE\n",
+					    c * fs->fs_fpg + i * NBBY + k);
+				}
+			}
+		}
+		if (memcmp(cg_inosused(newcg), cg_inosused(cg), mapsize) != 0 &&
+		    dofix(&idesc[1], "BLK(S) MISSING IN BIT MAPS")) {
+			memmove(cg_inosused(cg), cg_inosused(newcg),
+			      (size_t)mapsize);
+			cgdirty();
+		}
 	}
 	if (fs->fs_postblformat == FS_42POSTBLFMT)
 		fs->fs_nrpos = savednrpos;
diff --git a/sbin/fsck/setup.c b/sbin/fsck/setup.c
index 28e7e4b4a46e..f464b63e53ba 100644
--- a/sbin/fsck/setup.c
+++ b/sbin/fsck/setup.c
@@ -255,8 +255,10 @@ setup(dev)
 		    fsbtodb(&sblock, sblock.fs_csaddr + j * sblock.fs_frag),
 		    size) != 0 && !asked) {
 			pfatal("BAD SUMMARY INFORMATION");
-			if (reply("CONTINUE") == 0)
+			if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 			asked++;
 		}
 	}
@@ -311,6 +313,10 @@ setup(dev)
 		goto badsb;
 	}
 	bufinit();
+	if (sblock.fs_flags & FS_DOSOFTDEP)
+		usedsoftdep = 1;
+	else
+		usedsoftdep = 0;
 	return (1);
 
 badsb:
diff --git a/sbin/fsck/utilities.c b/sbin/fsck/utilities.c
index 30c31cfeedb1..465fb3be3884 100644
--- a/sbin/fsck/utilities.c
+++ b/sbin/fsck/utilities.c
@@ -87,6 +87,7 @@ reply(question)
 	printf("\n");
 	if (!persevere && (nflag || fswritefd < 0)) {
 		printf("%s? no\n\n", question);
+		resolved = 0;
 		return (0);
 	}
 	if (yflag || (persevere && nflag)) {
@@ -97,13 +98,17 @@ reply(question)
 		printf("%s? [yn] ", question);
 		(void) fflush(stdout);
 		c = getc(stdin);
-		while (c != '\n' && getc(stdin) != '\n')
-			if (feof(stdin))
+		while (c != '\n' && getc(stdin) != '\n') {
+			if (feof(stdin)) {
+				resolved = 0;
 				return (0);
+			}
+		}
 	} while (c != 'y' && c != 'Y' && c != 'n' && c != 'N');
 	printf("\n");
 	if (c == 'y' || c == 'Y')
 		return (1);
+	resolved = 0;
 	return (0);
 }
 
@@ -360,7 +365,8 @@ ufs_daddr_t
 allocblk(frags)
 	long frags;
 {
-	register int i, j, k;
+	int i, j, k, cg, baseblk;
+	struct cg *cgp = &cgrp;
 
 	if (frags <= 0 || frags > sblock.fs_frag)
 		return (0);
@@ -375,9 +381,21 @@ allocblk(frags)
 				j += k;
 				continue;
 			}
-			for (k = 0; k < frags; k++)
+			cg = dtog(&sblock, i + j);
+			getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
+			if (!cg_chkmagic(cgp))
+				pfatal("CG %d: BAD MAGIC NUMBER\n", cg);
+			baseblk = dtogd(&sblock, i + j);
+			for (k = 0; k < frags; k++) {
 				setbmap(i + j + k);
+				clrbit(cg_blksfree(cgp), baseblk + k);
+			}
 			n_blks += frags;
+			if (frags == sblock.fs_frag)
+				cgp->cg_cs.cs_nbfree--;
+			else
+				cgp->cg_cs.cs_nffree -= frags;
+			cgdirty();
 			return (i + j);
 		}
 	}
@@ -545,7 +563,8 @@ dofix(idesc, msg)
 
 /*
  * An unexpected inconsistency occured.
- * Die if preening, otherwise just print message and continue.
+ * Die if preening; otherwise just print the message (noting when the
+ * filesystem was running with soft dependencies) and continue.
  */
 void
 #if __STDC__
@@ -565,19 +584,23 @@ pfatal(fmt, va_alist)
 	if (!preen) {
 		(void)vfprintf(stderr, fmt, ap);
 		va_end(ap);
+		if (usedsoftdep)
+			(void)fprintf(stderr,
+			    "\nUNEXPECTED SOFTDEP INCONSISTENCY\n");
 		return;
 	}
 	(void)fprintf(stderr, "%s: ", cdevname);
 	(void)vfprintf(stderr, fmt, ap);
 	(void)fprintf(stderr,
-	    "\n%s: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.\n",
-	    cdevname);
+	    "\n%s: UNEXPECTED%sINCONSISTENCY; RUN fsck MANUALLY.\n",
+	    cdevname, usedsoftdep ? " SOFTDEP " : " ");
+	ckfini(0);
 	exit(EEXIT);
 }
 
 /*
- * Pwarn just prints a message when not preening,
- * or a warning (preceded by filename) when preening.
+ * Pwarn just prints a message when not preening or running soft dependency
+ * protocol, or a warning (preceded by filename) when preening.
  */
 void
 #if __STDC__
diff --git a/sbin/fsck_ffs/dir.c b/sbin/fsck_ffs/dir.c
index 4b6999b013a9..6ab67d33ca32 100644
--- a/sbin/fsck_ffs/dir.c
+++ b/sbin/fsck_ffs/dir.c
@@ -315,12 +315,13 @@ adjust(idesc, lcnt)
 		pinode(idesc->id_number);
 		printf(" COUNT %d SHOULD BE %d",
 			dp->di_nlink, dp->di_nlink - lcnt);
-		if (preen) {
+		if (preen || usedsoftdep) {
 			if (lcnt < 0) {
 				printf("\n");
 				pfatal("LINK COUNT INCREASING");
 			}
-			printf(" (ADJUSTED)\n");
+			if (preen)
+				printf(" (ADJUSTED)\n");
 		}
 		if (preen || reply("ADJUST") == 1) {
 			dp->di_nlink -= lcnt;
@@ -406,13 +407,15 @@ linkup(orphan, parentdir)
 	lostdir = (dp->di_mode & IFMT) == IFDIR;
 	pwarn("UNREF %s ", lostdir ? "DIR" : "FILE");
 	pinode(orphan);
-	if (preen && dp->di_size == 0)
+	if ((preen || usedsoftdep) && dp->di_size == 0)
 		return (0);
 	if (preen)
 		printf(" (RECONNECTED)\n");
 	else
 		if (reply("RECONNECT") == 0)
 			return (0);
+	if (parentdir != 0)
+		lncntp[parentdir]++;
 	if (lfdir == 0) {
 		dp = ginode(ROOTINO);
 		idesc.id_name = lfname;
diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h
index 1967691e989c..4e0271d815dd 100644
--- a/sbin/fsck_ffs/fsck.h
+++ b/sbin/fsck_ffs/fsck.h
@@ -176,6 +176,8 @@ int	cvtlevel;		/* convert to newer file system format */
 int	doinglevel1;		/* converting to new cylinder group format */
 int	doinglevel2;		/* converting to new inode format */
 int	newinofmt;		/* filesystem has new inode format */
+char	usedsoftdep;		/* just fix soft dependency inconsistencies */
+char	resolved;		/* cleared if unresolved changes => not clean */
 char	preen;			/* just fix normal inconsistencies */
 char	hotroot;		/* checking root device */
 char	havesb;			/* superblock has been read */
diff --git a/sbin/fsck_ffs/inode.c b/sbin/fsck_ffs/inode.c
index 429dd3b2267b..74561c8eb1b4 100644
--- a/sbin/fsck_ffs/inode.c
+++ b/sbin/fsck_ffs/inode.c
@@ -559,6 +559,8 @@ allocino(request, type)
 {
 	register ino_t ino;
 	register struct dinode *dp;
+	struct cg *cgp = &cgrp;
+	int cg;
 
 	if (request == 0)
 		request = ROOTINO;
@@ -569,9 +571,16 @@ allocino(request, type)
 			break;
 	if (ino == maxino)
 		return (0);
+	cg = ino_to_cg(&sblock, ino);
+	getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
+	if (!cg_chkmagic(cgp))
+		pfatal("CG %d: BAD MAGIC NUMBER\n", cg);
+	setbit(cg_inosused(cgp), ino % sblock.fs_ipg);
+	cgp->cg_cs.cs_nifree--;
 	switch (type & IFMT) {
 	case IFDIR:
 		statemap[ino] = DSTATE;
+		cgp->cg_cs.cs_ndir++;
 		break;
 	case IFREG:
 	case IFLNK:
@@ -580,12 +589,14 @@ allocino(request, type)
 	default:
 		return (0);
 	}
+	cgdirty();
 	dp = ginode(ino);
 	dp->di_db[0] = allocblk((long)1);
 	if (dp->di_db[0] == 0) {
 		statemap[ino] = USTATE;
 		return (0);
 	}
+	dp->di_flags = 0;
 	dp->di_mode = type;
 	dp->di_atime = time(NULL);
 	dp->di_mtime = dp->di_ctime = dp->di_atime;
diff --git a/sbin/fsck_ffs/main.c b/sbin/fsck_ffs/main.c
index dcb7006125cb..b4bc2c9caaaf 100644
--- a/sbin/fsck_ffs/main.c
+++ b/sbin/fsck_ffs/main.c
@@ -42,7 +42,7 @@ static const char copyright[] =
 static char sccsid[] = "@(#)main.c	8.6 (Berkeley) 5/14/95";
 #endif
 static const char rcsid[] =
-	"$Id$";
+	"$Id: main.c,v 1.12 1997/12/20 22:24:32 bde Exp $";
 #endif /* not lint */
 
 #include <sys/param.h>
@@ -209,6 +209,11 @@ checkfilesys(filesys, mntpt, auxdata, child)
 		return (0);
 	}
 
+	/*
+	 * Cleared if any questions answered no. Used to decide if
+	 * the superblock should be marked clean.
+	 */
+	resolved = 1;
 	/*
 	 * 1: scan inodes tallying blocks used
 	 */
@@ -224,7 +229,7 @@ checkfilesys(filesys, mntpt, auxdata, child)
 	 * 1b: locate first references to duplicates, if any
 	 */
 	if (duplist) {
-		if (preen)
+		if (preen || usedsoftdep)
 			pfatal("INTERNAL ERROR: dups with -p");
 		printf("** Phase 1b - Rescan For More DUPS\n");
 		pass1b();
@@ -306,19 +311,20 @@ checkfilesys(filesys, mntpt, auxdata, child)
 			bwrite(fswritefd, (char *)&sblock,
 			    fsbtodb(&sblock, cgsblock(&sblock, cylno)), SBSIZE);
 	}
-	if (!hotroot) {
-		ckfini(1);
-	} else {
+	if (rerun)
+		resolved = 0;
+	flags = 0;
+	if (hotroot) {
 		struct statfs stfs_buf;
 		/*
 		 * Check to see if root is mounted read-write.
 		 */
 		if (statfs("/", &stfs_buf) == 0)
 			flags = stfs_buf.f_flags;
-		else
-			flags = 0;
-		ckfini(flags & MNT_RDONLY);
+		if ((flags & MNT_RDONLY) == 0)
+			resolved = 0;
 	}
+	ckfini(resolved);
 	free(blockmap);
 	free(statemap);
 	free((char *)lncntp);
diff --git a/sbin/fsck_ffs/pass1.c b/sbin/fsck_ffs/pass1.c
index 99582777186c..181f858184bc 100644
--- a/sbin/fsck_ffs/pass1.c
+++ b/sbin/fsck_ffs/pass1.c
@@ -200,8 +200,10 @@ checkinode(inumber, idesc)
 		zlnp = (struct zlncnt *)malloc(sizeof *zlnp);
 		if (zlnp == NULL) {
 			pfatal("LINK COUNT TABLE OVERFLOW");
-			if (reply("CONTINUE") == 0)
+			if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 		} else {
 			zlnp->zlncnt = inumber;
 			zlnp->next = zlnhead;
@@ -270,8 +272,10 @@ pass1check(idesc)
 				idesc->id_number);
 			if (preen)
 				printf(" (SKIPPING)\n");
-			else if (reply("CONTINUE") == 0)
+			else if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 			return (STOP);
 		}
 	}
@@ -288,15 +292,19 @@ pass1check(idesc)
 					idesc->id_number);
 				if (preen)
 					printf(" (SKIPPING)\n");
-				else if (reply("CONTINUE") == 0)
+				else if (reply("CONTINUE") == 0) {
+					ckfini(0);
 					exit(EEXIT);
+				}
 				return (STOP);
 			}
 			new = (struct dups *)malloc(sizeof(struct dups));
 			if (new == NULL) {
 				pfatal("DUP TABLE OVERFLOW.");
-				if (reply("CONTINUE") == 0)
+				if (reply("CONTINUE") == 0) {
+					ckfini(0);
 					exit(EEXIT);
+				}
 				return (STOP);
 			}
 			new->dup = blkno;
diff --git a/sbin/fsck_ffs/pass2.c b/sbin/fsck_ffs/pass2.c
index 445f6f1682b9..ebc33b8a650f 100644
--- a/sbin/fsck_ffs/pass2.c
+++ b/sbin/fsck_ffs/pass2.c
@@ -66,8 +66,10 @@ pass2()
 
 	case USTATE:
 		pfatal("ROOT INODE UNALLOCATED");
-		if (reply("ALLOCATE") == 0)
+		if (reply("ALLOCATE") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		if (allocdir(ROOTINO, ROOTINO, 0755) != ROOTINO)
 			errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 		break;
@@ -80,8 +82,10 @@ pass2()
 				errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 			break;
 		}
-		if (reply("CONTINUE") == 0)
+		if (reply("CONTINUE") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		break;
 
 	case FSTATE:
@@ -93,8 +97,10 @@ pass2()
 				errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 			break;
 		}
-		if (reply("FIX") == 0)
+		if (reply("FIX") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		dp = ginode(ROOTINO);
 		dp->di_mode &= ~IFMT;
 		dp->di_mode |= IFDIR;
@@ -139,8 +145,14 @@ pass2()
 			}
 		} else if ((inp->i_isize & (DIRBLKSIZ - 1)) != 0) {
 			getpathname(pathbuf, inp->i_number, inp->i_number);
-			pwarn("DIRECTORY %s: LENGTH %d NOT MULTIPLE OF %d",
-				pathbuf, inp->i_isize, DIRBLKSIZ);
+			if (usedsoftdep)
+				pfatal("%s %s: LENGTH %d NOT MULTIPLE OF %d",
+					"DIRECTORY", pathbuf, inp->i_isize,
+					DIRBLKSIZ);
+			else
+				pwarn("%s %s: LENGTH %d NOT MULTIPLE OF %d",
+					"DIRECTORY", pathbuf, inp->i_isize,
+					DIRBLKSIZ);
 			if (preen)
 				printf(" (ADJUSTED)\n");
 			inp->i_isize = roundup(inp->i_isize, DIRBLKSIZ);
@@ -394,7 +406,7 @@ pass2check(idesc)
 				break;
 			if (statemap[dirp->d_ino] == FCLEAR)
 				errmsg = "DUP/BAD";
-			else if (!preen)
+			else if (!preen && !usedsoftdep)
 				errmsg = "ZERO LENGTH DIRECTORY";
 			else {
 				n = 1;
@@ -423,8 +435,11 @@ pass2check(idesc)
 				pwarn("%s %s %s\n", pathbuf,
 				    "IS AN EXTRANEOUS HARD LINK TO DIRECTORY",
 				    namebuf);
-				if (preen)
-					printf(" (IGNORED)\n");
+				if (preen) {
+					printf(" (REMOVED)\n");
+  					n = 1;
+  					break;
+				}
 				else if ((n = reply("REMOVE")) == 1)
 					break;
 			}
diff --git a/sbin/fsck_ffs/pass5.c b/sbin/fsck_ffs/pass5.c
index 3dd0c1aac237..873f008b8c78 100644
--- a/sbin/fsck_ffs/pass5.c
+++ b/sbin/fsck_ffs/pass5.c
@@ -50,11 +50,12 @@ void
 pass5()
 {
 	int c, blk, frags, basesize, sumsize, mapsize, savednrpos;
+	int inomapsize, blkmapsize;
 	struct fs *fs = &sblock;
 	struct cg *cg = &cgrp;
 	ufs_daddr_t dbase, dmax;
 	ufs_daddr_t d;
-	long i, j;
+	long i, j, k;
 	struct csum *cs;
 	struct csum cstotal;
 	struct inodesc idesc[3];
@@ -112,6 +113,8 @@ pass5()
 		sumsize = &ocg->cg_iused[0] - (u_int8_t *)(&ocg->cg_btot[0]);
 		mapsize = &ocg->cg_free[howmany(fs->fs_fpg, NBBY)] -
 			(u_char *)&ocg->cg_iused[0];
+		blkmapsize = howmany(fs->fs_fpg, NBBY);
+		inomapsize = &ocg->cg_free[0] - (u_char *)&ocg->cg_iused[0];
 		ocg->cg_magic = CG_MAGIC;
 		savednrpos = fs->fs_nrpos;
 		fs->fs_nrpos = 8;
@@ -126,12 +129,12 @@ pass5()
 		    fs->fs_cpg * fs->fs_nrpos * sizeof(short);
 		newcg->cg_freeoff =
 		    newcg->cg_iusedoff + howmany(fs->fs_ipg, NBBY);
-		if (fs->fs_contigsumsize <= 0) {
-			newcg->cg_nextfreeoff = newcg->cg_freeoff +
-			    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY);
-		} else {
-			newcg->cg_clustersumoff = newcg->cg_freeoff +
-			    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY) -
+		inomapsize = newcg->cg_freeoff - newcg->cg_iusedoff;
+		newcg->cg_nextfreeoff = newcg->cg_freeoff +
+		    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY);
+		blkmapsize = newcg->cg_nextfreeoff - newcg->cg_freeoff;
+		if (fs->fs_contigsumsize > 0) {
+			newcg->cg_clustersumoff = newcg->cg_nextfreeoff -
 			    sizeof(long);
 			newcg->cg_clustersumoff =
 			    roundup(newcg->cg_clustersumoff, sizeof(long));
@@ -148,7 +151,7 @@ pass5()
 		break;
 
 	default:
-		sumsize = 0;	/* keep lint happy */
+		inomapsize = blkmapsize = sumsize = 0;	/* keep lint happy */
 		errx(EEXIT, "UNKNOWN ROTATIONAL TABLE FORMAT %d",
 			fs->fs_postblformat);
 	}
@@ -299,13 +302,6 @@ pass5()
 			cgdirty();
 			continue;
 		}
-		if (memcmp(cg_inosused(newcg),
-			 cg_inosused(cg), mapsize) != 0 &&
-		    dofix(&idesc[1], "BLK(S) MISSING IN BIT MAPS")) {
-			memmove(cg_inosused(cg), cg_inosused(newcg),
-			      (size_t)mapsize);
-			cgdirty();
-		}
 		if ((memcmp(newcg, cg, basesize) != 0 ||
 		     memcmp(&cg_blktot(newcg)[0],
 			  &cg_blktot(cg)[0], sumsize) != 0) &&
@@ -315,6 +311,40 @@ pass5()
 			       &cg_blktot(newcg)[0], (size_t)sumsize);
 			cgdirty();
 		}
+		if (usedsoftdep) {
+			/* In use per recomputed newcg, free in cg. */
+			for (i = 0; i < inomapsize; i++) {
+				j = cg_inosused(newcg)[i];
+				if ((cg_inosused(cg)[i] & j) == j)
+					continue;
+				for (k = 0; k < NBBY; k++) {
+					if ((j & (1 << k)) == 0 ||
+					    (cg_inosused(cg)[i] & (1 << k)))
+						continue;
+					pwarn("ALLOCATED INODE %ld MARKED FREE\n",
+					    c * fs->fs_ipg + i * NBBY + k);
+				}
+			}
+			/* Free in cg's map, not free in recomputed newcg. */
+			for (i = 0; i < blkmapsize; i++) {
+				j = cg_blksfree(cg)[i];
+				if ((cg_blksfree(newcg)[i] & j) == j)
+					continue;
+				for (k = 0; k < NBBY; k++) {
+					if ((j & (1 << k)) == 0 ||
+					    (cg_blksfree(newcg)[i] & (1 << k)))
+						continue;
+					pwarn("ALLOCATED FRAG %ld MARKED FREE\n",
+					    c * fs->fs_fpg + i * NBBY + k);
+				}
+			}
+		}
+		if (memcmp(cg_inosused(newcg), cg_inosused(cg), mapsize) != 0 &&
+		    dofix(&idesc[1], "BLK(S) MISSING IN BIT MAPS")) {
+			memmove(cg_inosused(cg), cg_inosused(newcg),
+			      (size_t)mapsize);
+			cgdirty();
+		}
 	}
 	if (fs->fs_postblformat == FS_42POSTBLFMT)
 		fs->fs_nrpos = savednrpos;
diff --git a/sbin/fsck_ffs/setup.c b/sbin/fsck_ffs/setup.c
index 28e7e4b4a46e..f464b63e53ba 100644
--- a/sbin/fsck_ffs/setup.c
+++ b/sbin/fsck_ffs/setup.c
@@ -255,8 +255,10 @@ setup(dev)
 		    fsbtodb(&sblock, sblock.fs_csaddr + j * sblock.fs_frag),
 		    size) != 0 && !asked) {
 			pfatal("BAD SUMMARY INFORMATION");
-			if (reply("CONTINUE") == 0)
+			if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 			asked++;
 		}
 	}
@@ -311,6 +313,10 @@ setup(dev)
 		goto badsb;
 	}
 	bufinit();
+	if (sblock.fs_flags & FS_DOSOFTDEP)
+		usedsoftdep = 1;
+	else
+		usedsoftdep = 0;
 	return (1);
 
 badsb:
diff --git a/sbin/fsck_ffs/utilities.c b/sbin/fsck_ffs/utilities.c
index 30c31cfeedb1..465fb3be3884 100644
--- a/sbin/fsck_ffs/utilities.c
+++ b/sbin/fsck_ffs/utilities.c
@@ -87,6 +87,7 @@ reply(question)
 	printf("\n");
 	if (!persevere && (nflag || fswritefd < 0)) {
 		printf("%s? no\n\n", question);
+		resolved = 0;
 		return (0);
 	}
 	if (yflag || (persevere && nflag)) {
@@ -97,13 +98,17 @@ reply(question)
 		printf("%s? [yn] ", question);
 		(void) fflush(stdout);
 		c = getc(stdin);
-		while (c != '\n' && getc(stdin) != '\n')
-			if (feof(stdin))
+		while (c != '\n' && getc(stdin) != '\n') {
+			if (feof(stdin)) {
+				resolved = 0;
 				return (0);
+			}
+		}
 	} while (c != 'y' && c != 'Y' && c != 'n' && c != 'N');
 	printf("\n");
 	if (c == 'y' || c == 'Y')
 		return (1);
+	resolved = 0;
 	return (0);
 }
 
@@ -360,7 +365,8 @@ ufs_daddr_t
 allocblk(frags)
 	long frags;
 {
-	register int i, j, k;
+	int i, j, k, cg, baseblk;
+	struct cg *cgp = &cgrp;
 
 	if (frags <= 0 || frags > sblock.fs_frag)
 		return (0);
@@ -375,9 +381,21 @@ allocblk(frags)
 				j += k;
 				continue;
 			}
-			for (k = 0; k < frags; k++)
+			cg = dtog(&sblock, i + j);
+			getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
+			if (!cg_chkmagic(cgp))
+				pfatal("CG %d: BAD MAGIC NUMBER\n", cg);
+			baseblk = dtogd(&sblock, i + j);
+			for (k = 0; k < frags; k++) {
 				setbmap(i + j + k);
+				clrbit(cg_blksfree(cgp), baseblk + k);
+			}
 			n_blks += frags;
+			if (frags == sblock.fs_frag)
+				cgp->cg_cs.cs_nbfree--;
+			else
+				cgp->cg_cs.cs_nffree -= frags;
+			cgdirty();
 			return (i + j);
 		}
 	}
@@ -545,7 +563,8 @@ dofix(idesc, msg)
 
 /*
  * An unexpected inconsistency occured.
- * Die if preening, otherwise just print message and continue.
+ * Die if preening or filesystem is running with soft dependency protocol,
+ * otherwise just print message and continue.
  */
 void
 #if __STDC__
@@ -565,19 +584,23 @@ pfatal(fmt, va_alist)
 	if (!preen) {
 		(void)vfprintf(stderr, fmt, ap);
 		va_end(ap);
+		if (usedsoftdep)
+			(void)fprintf(stderr,
+			    "\nUNEXPECTED SOFTDEP INCONSISTENCY\n");
 		return;
 	}
 	(void)fprintf(stderr, "%s: ", cdevname);
 	(void)vfprintf(stderr, fmt, ap);
 	(void)fprintf(stderr,
-	    "\n%s: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.\n",
-	    cdevname);
+	    "\n%s: UNEXPECTED%sINCONSISTENCY; RUN fsck MANUALLY.\n",
+	    cdevname, usedsoftdep ? " SOFTDEP " : " ");
+	ckfini(0);
 	exit(EEXIT);
 }
 
 /*
- * Pwarn just prints a message when not preening,
- * or a warning (preceded by filename) when preening.
+ * Pwarn just prints a message when not preening or running soft dependency
+ * protocol, or a warning (preceded by filename) when preening.
  */
 void
 #if __STDC__
diff --git a/sbin/fsck_ifs/dir.c b/sbin/fsck_ifs/dir.c
index 4b6999b013a9..6ab67d33ca32 100644
--- a/sbin/fsck_ifs/dir.c
+++ b/sbin/fsck_ifs/dir.c
@@ -315,12 +315,13 @@ adjust(idesc, lcnt)
 		pinode(idesc->id_number);
 		printf(" COUNT %d SHOULD BE %d",
 			dp->di_nlink, dp->di_nlink - lcnt);
-		if (preen) {
+		if (preen || usedsoftdep) {
 			if (lcnt < 0) {
 				printf("\n");
 				pfatal("LINK COUNT INCREASING");
 			}
-			printf(" (ADJUSTED)\n");
+			if (preen)
+				printf(" (ADJUSTED)\n");
 		}
 		if (preen || reply("ADJUST") == 1) {
 			dp->di_nlink -= lcnt;
@@ -406,13 +407,15 @@ linkup(orphan, parentdir)
 	lostdir = (dp->di_mode & IFMT) == IFDIR;
 	pwarn("UNREF %s ", lostdir ? "DIR" : "FILE");
 	pinode(orphan);
-	if (preen && dp->di_size == 0)
+	if ((preen || usedsoftdep) && dp->di_size == 0)
 		return (0);
 	if (preen)
 		printf(" (RECONNECTED)\n");
 	else
 		if (reply("RECONNECT") == 0)
 			return (0);
+	if (parentdir != 0)
+		lncntp[parentdir]++;
 	if (lfdir == 0) {
 		dp = ginode(ROOTINO);
 		idesc.id_name = lfname;
diff --git a/sbin/fsck_ifs/fsck.h b/sbin/fsck_ifs/fsck.h
index 1967691e989c..4e0271d815dd 100644
--- a/sbin/fsck_ifs/fsck.h
+++ b/sbin/fsck_ifs/fsck.h
@@ -176,6 +176,8 @@ int	cvtlevel;		/* convert to newer file system format */
 int	doinglevel1;		/* converting to new cylinder group format */
 int	doinglevel2;		/* converting to new inode format */
 int	newinofmt;		/* filesystem has new inode format */
+char	usedsoftdep;		/* just fix soft dependency inconsistencies */
+char	resolved;		/* cleared if unresolved changes => not clean */
 char	preen;			/* just fix normal inconsistencies */
 char	hotroot;		/* checking root device */
 char	havesb;			/* superblock has been read */
diff --git a/sbin/fsck_ifs/inode.c b/sbin/fsck_ifs/inode.c
index 429dd3b2267b..74561c8eb1b4 100644
--- a/sbin/fsck_ifs/inode.c
+++ b/sbin/fsck_ifs/inode.c
@@ -559,6 +559,8 @@ allocino(request, type)
 {
 	register ino_t ino;
 	register struct dinode *dp;
+	struct cg *cgp = &cgrp;
+	int cg;
 
 	if (request == 0)
 		request = ROOTINO;
@@ -569,9 +571,16 @@ allocino(request, type)
 			break;
 	if (ino == maxino)
 		return (0);
+	cg = ino_to_cg(&sblock, ino);
+	getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
+	if (!cg_chkmagic(cgp))
+		pfatal("CG %d: BAD MAGIC NUMBER\n", cg);
+	setbit(cg_inosused(cgp), ino % sblock.fs_ipg);
+	cgp->cg_cs.cs_nifree--;
 	switch (type & IFMT) {
 	case IFDIR:
 		statemap[ino] = DSTATE;
+		cgp->cg_cs.cs_ndir++;
 		break;
 	case IFREG:
 	case IFLNK:
@@ -580,12 +589,14 @@ allocino(request, type)
 	default:
 		return (0);
 	}
+	cgdirty();
 	dp = ginode(ino);
 	dp->di_db[0] = allocblk((long)1);
 	if (dp->di_db[0] == 0) {
 		statemap[ino] = USTATE;
 		return (0);
 	}
+	dp->di_flags = 0;
 	dp->di_mode = type;
 	dp->di_atime = time(NULL);
 	dp->di_mtime = dp->di_ctime = dp->di_atime;
diff --git a/sbin/fsck_ifs/main.c b/sbin/fsck_ifs/main.c
index dcb7006125cb..b4bc2c9caaaf 100644
--- a/sbin/fsck_ifs/main.c
+++ b/sbin/fsck_ifs/main.c
@@ -42,7 +42,7 @@ static const char copyright[] =
 static char sccsid[] = "@(#)main.c	8.6 (Berkeley) 5/14/95";
 #endif
 static const char rcsid[] =
-	"$Id$";
+	"$Id: main.c,v 1.12 1997/12/20 22:24:32 bde Exp $";
 #endif /* not lint */
 
 #include <sys/param.h>
@@ -209,6 +209,11 @@ checkfilesys(filesys, mntpt, auxdata, child)
 		return (0);
 	}
 
+	/*
+	 * Cleared if any questions answered no. Used to decide if
+	 * the superblock should be marked clean.
+	 */
+	resolved = 1;
 	/*
 	 * 1: scan inodes tallying blocks used
 	 */
@@ -224,7 +229,7 @@ checkfilesys(filesys, mntpt, auxdata, child)
 	 * 1b: locate first references to duplicates, if any
 	 */
 	if (duplist) {
-		if (preen)
+		if (preen || usedsoftdep)
 			pfatal("INTERNAL ERROR: dups with -p");
 		printf("** Phase 1b - Rescan For More DUPS\n");
 		pass1b();
@@ -306,19 +311,20 @@ checkfilesys(filesys, mntpt, auxdata, child)
 			bwrite(fswritefd, (char *)&sblock,
 			    fsbtodb(&sblock, cgsblock(&sblock, cylno)), SBSIZE);
 	}
-	if (!hotroot) {
-		ckfini(1);
-	} else {
+	if (rerun)
+		resolved = 0;
+	flags = 0;
+	if (hotroot) {
 		struct statfs stfs_buf;
 		/*
 		 * Check to see if root is mounted read-write.
 		 */
 		if (statfs("/", &stfs_buf) == 0)
 			flags = stfs_buf.f_flags;
-		else
-			flags = 0;
-		ckfini(flags & MNT_RDONLY);
+		if ((flags & MNT_RDONLY) == 0)
+			resolved = 0;
 	}
+	ckfini(resolved);
 	free(blockmap);
 	free(statemap);
 	free((char *)lncntp);
diff --git a/sbin/fsck_ifs/pass1.c b/sbin/fsck_ifs/pass1.c
index 99582777186c..181f858184bc 100644
--- a/sbin/fsck_ifs/pass1.c
+++ b/sbin/fsck_ifs/pass1.c
@@ -200,8 +200,10 @@ checkinode(inumber, idesc)
 		zlnp = (struct zlncnt *)malloc(sizeof *zlnp);
 		if (zlnp == NULL) {
 			pfatal("LINK COUNT TABLE OVERFLOW");
-			if (reply("CONTINUE") == 0)
+			if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 		} else {
 			zlnp->zlncnt = inumber;
 			zlnp->next = zlnhead;
@@ -270,8 +272,10 @@ pass1check(idesc)
 				idesc->id_number);
 			if (preen)
 				printf(" (SKIPPING)\n");
-			else if (reply("CONTINUE") == 0)
+			else if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 			return (STOP);
 		}
 	}
@@ -288,15 +292,19 @@ pass1check(idesc)
 					idesc->id_number);
 				if (preen)
 					printf(" (SKIPPING)\n");
-				else if (reply("CONTINUE") == 0)
+				else if (reply("CONTINUE") == 0) {
+					ckfini(0);
 					exit(EEXIT);
+				}
 				return (STOP);
 			}
 			new = (struct dups *)malloc(sizeof(struct dups));
 			if (new == NULL) {
 				pfatal("DUP TABLE OVERFLOW.");
-				if (reply("CONTINUE") == 0)
+				if (reply("CONTINUE") == 0) {
+					ckfini(0);
 					exit(EEXIT);
+				}
 				return (STOP);
 			}
 			new->dup = blkno;
diff --git a/sbin/fsck_ifs/pass2.c b/sbin/fsck_ifs/pass2.c
index 445f6f1682b9..ebc33b8a650f 100644
--- a/sbin/fsck_ifs/pass2.c
+++ b/sbin/fsck_ifs/pass2.c
@@ -66,8 +66,10 @@ pass2()
 
 	case USTATE:
 		pfatal("ROOT INODE UNALLOCATED");
-		if (reply("ALLOCATE") == 0)
+		if (reply("ALLOCATE") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		if (allocdir(ROOTINO, ROOTINO, 0755) != ROOTINO)
 			errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 		break;
@@ -80,8 +82,10 @@ pass2()
 				errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 			break;
 		}
-		if (reply("CONTINUE") == 0)
+		if (reply("CONTINUE") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		break;
 
 	case FSTATE:
@@ -93,8 +97,10 @@ pass2()
 				errx(EEXIT, "CANNOT ALLOCATE ROOT INODE");
 			break;
 		}
-		if (reply("FIX") == 0)
+		if (reply("FIX") == 0) {
+			ckfini(0);
 			exit(EEXIT);
+		}
 		dp = ginode(ROOTINO);
 		dp->di_mode &= ~IFMT;
 		dp->di_mode |= IFDIR;
@@ -139,8 +145,14 @@ pass2()
 			}
 		} else if ((inp->i_isize & (DIRBLKSIZ - 1)) != 0) {
 			getpathname(pathbuf, inp->i_number, inp->i_number);
-			pwarn("DIRECTORY %s: LENGTH %d NOT MULTIPLE OF %d",
-				pathbuf, inp->i_isize, DIRBLKSIZ);
+			if (usedsoftdep)
+				pfatal("%s %s: LENGTH %d NOT MULTIPLE OF %d",
+					"DIRECTORY", pathbuf, inp->i_isize,
+					DIRBLKSIZ);
+			else
+				pwarn("%s %s: LENGTH %d NOT MULTIPLE OF %d",
+					"DIRECTORY", pathbuf, inp->i_isize,
+					DIRBLKSIZ);
 			if (preen)
 				printf(" (ADJUSTED)\n");
 			inp->i_isize = roundup(inp->i_isize, DIRBLKSIZ);
@@ -394,7 +406,7 @@ pass2check(idesc)
 				break;
 			if (statemap[dirp->d_ino] == FCLEAR)
 				errmsg = "DUP/BAD";
-			else if (!preen)
+			else if (!preen && !usedsoftdep)
 				errmsg = "ZERO LENGTH DIRECTORY";
 			else {
 				n = 1;
@@ -423,8 +435,11 @@ pass2check(idesc)
 				pwarn("%s %s %s\n", pathbuf,
 				    "IS AN EXTRANEOUS HARD LINK TO DIRECTORY",
 				    namebuf);
-				if (preen)
-					printf(" (IGNORED)\n");
+				if (preen) {
+					printf(" (REMOVED)\n");
+  					n = 1;
+  					break;
+				}
 				else if ((n = reply("REMOVE")) == 1)
 					break;
 			}
diff --git a/sbin/fsck_ifs/pass5.c b/sbin/fsck_ifs/pass5.c
index 3dd0c1aac237..873f008b8c78 100644
--- a/sbin/fsck_ifs/pass5.c
+++ b/sbin/fsck_ifs/pass5.c
@@ -50,11 +50,12 @@ void
 pass5()
 {
 	int c, blk, frags, basesize, sumsize, mapsize, savednrpos;
+	int inomapsize, blkmapsize;
 	struct fs *fs = &sblock;
 	struct cg *cg = &cgrp;
 	ufs_daddr_t dbase, dmax;
 	ufs_daddr_t d;
-	long i, j;
+	long i, j, k;
 	struct csum *cs;
 	struct csum cstotal;
 	struct inodesc idesc[3];
@@ -112,6 +113,8 @@ pass5()
 		sumsize = &ocg->cg_iused[0] - (u_int8_t *)(&ocg->cg_btot[0]);
 		mapsize = &ocg->cg_free[howmany(fs->fs_fpg, NBBY)] -
 			(u_char *)&ocg->cg_iused[0];
+		blkmapsize = howmany(fs->fs_fpg, NBBY);
+		inomapsize = &ocg->cg_free[0] - (u_char *)&ocg->cg_iused[0];
 		ocg->cg_magic = CG_MAGIC;
 		savednrpos = fs->fs_nrpos;
 		fs->fs_nrpos = 8;
@@ -126,12 +129,12 @@ pass5()
 		    fs->fs_cpg * fs->fs_nrpos * sizeof(short);
 		newcg->cg_freeoff =
 		    newcg->cg_iusedoff + howmany(fs->fs_ipg, NBBY);
-		if (fs->fs_contigsumsize <= 0) {
-			newcg->cg_nextfreeoff = newcg->cg_freeoff +
-			    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY);
-		} else {
-			newcg->cg_clustersumoff = newcg->cg_freeoff +
-			    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY) -
+		inomapsize = newcg->cg_freeoff - newcg->cg_iusedoff;
+		newcg->cg_nextfreeoff = newcg->cg_freeoff +
+		    howmany(fs->fs_cpg * fs->fs_spc / NSPF(fs), NBBY);
+		blkmapsize = newcg->cg_nextfreeoff - newcg->cg_freeoff;
+		if (fs->fs_contigsumsize > 0) {
+			newcg->cg_clustersumoff = newcg->cg_nextfreeoff -
 			    sizeof(long);
 			newcg->cg_clustersumoff =
 			    roundup(newcg->cg_clustersumoff, sizeof(long));
@@ -148,7 +151,7 @@ pass5()
 		break;
 
 	default:
-		sumsize = 0;	/* keep lint happy */
+		inomapsize = blkmapsize = sumsize = 0;	/* keep lint happy */
 		errx(EEXIT, "UNKNOWN ROTATIONAL TABLE FORMAT %d",
 			fs->fs_postblformat);
 	}
@@ -299,13 +302,6 @@ pass5()
 			cgdirty();
 			continue;
 		}
-		if (memcmp(cg_inosused(newcg),
-			 cg_inosused(cg), mapsize) != 0 &&
-		    dofix(&idesc[1], "BLK(S) MISSING IN BIT MAPS")) {
-			memmove(cg_inosused(cg), cg_inosused(newcg),
-			      (size_t)mapsize);
-			cgdirty();
-		}
 		if ((memcmp(newcg, cg, basesize) != 0 ||
 		     memcmp(&cg_blktot(newcg)[0],
 			  &cg_blktot(cg)[0], sumsize) != 0) &&
@@ -315,6 +311,40 @@ pass5()
 			       &cg_blktot(newcg)[0], (size_t)sumsize);
 			cgdirty();
 		}
+		if (usedsoftdep) {
+			for (i = 0; i < inomapsize; i++) {
+				j = cg_inosused(newcg)[i];
+				if ((cg_inosused(cg)[i] & j) == j)
+					continue;
+				for (k = 0; k < NBBY; k++) {
+					if ((j & (1 << k)) == 0)
+						continue;
+					if (cg_inosused(cg)[i] & (1 << k))
+						continue;
+					pwarn("ALLOCATED INODE %d MARKED FREE",
+					    c * fs->fs_ipg + i * 8 + k);
+				}
+			}
+			for (i = 0; i < blkmapsize; i++) {
+				j = cg_blksfree(cg)[i];
+				if ((cg_blksfree(newcg)[i] & j) == j)
+					continue;
+				for (k = 0; k < NBBY; k++) {
+					if ((j & (1 << k)) == 0)
+						continue;
+					if (cg_blksfree(newcg)[i] & (1 << k))
+						continue;
+					pwarn("ALLOCATED FRAG %d MARKED FREE",
+					    c * fs->fs_fpg + i * 8 + k);
+				}
+			}
+		}
+		if (memcmp(cg_inosused(newcg), cg_inosused(cg), mapsize) != 0 &&
+		    dofix(&idesc[1], "BLK(S) MISSING IN BIT MAPS")) {
+			memmove(cg_inosused(cg), cg_inosused(newcg),
+			      (size_t)mapsize);
+			cgdirty();
+		}
 	}
 	if (fs->fs_postblformat == FS_42POSTBLFMT)
 		fs->fs_nrpos = savednrpos;
diff --git a/sbin/fsck_ifs/setup.c b/sbin/fsck_ifs/setup.c
index 28e7e4b4a46e..f464b63e53ba 100644
--- a/sbin/fsck_ifs/setup.c
+++ b/sbin/fsck_ifs/setup.c
@@ -255,8 +255,10 @@ setup(dev)
 		    fsbtodb(&sblock, sblock.fs_csaddr + j * sblock.fs_frag),
 		    size) != 0 && !asked) {
 			pfatal("BAD SUMMARY INFORMATION");
-			if (reply("CONTINUE") == 0)
+			if (reply("CONTINUE") == 0) {
+				ckfini(0);
 				exit(EEXIT);
+			}
 			asked++;
 		}
 	}
@@ -311,6 +313,10 @@ setup(dev)
 		goto badsb;
 	}
 	bufinit();
+	if (sblock.fs_flags & FS_DOSOFTDEP)
+		usedsoftdep = 1;
+	else
+		usedsoftdep = 0;
 	return (1);
 
 badsb:
diff --git a/sbin/fsck_ifs/utilities.c b/sbin/fsck_ifs/utilities.c
index 30c31cfeedb1..465fb3be3884 100644
--- a/sbin/fsck_ifs/utilities.c
+++ b/sbin/fsck_ifs/utilities.c
@@ -87,6 +87,7 @@ reply(question)
 	printf("\n");
 	if (!persevere && (nflag || fswritefd < 0)) {
 		printf("%s? no\n\n", question);
+		resolved = 0;
 		return (0);
 	}
 	if (yflag || (persevere && nflag)) {
@@ -97,13 +98,17 @@ reply(question)
 		printf("%s? [yn] ", question);
 		(void) fflush(stdout);
 		c = getc(stdin);
-		while (c != '\n' && getc(stdin) != '\n')
-			if (feof(stdin))
+		while (c != '\n' && getc(stdin) != '\n') {
+			if (feof(stdin)) {
+				resolved = 0;
 				return (0);
+			}
+		}
 	} while (c != 'y' && c != 'Y' && c != 'n' && c != 'N');
 	printf("\n");
 	if (c == 'y' || c == 'Y')
 		return (1);
+	resolved = 0;
 	return (0);
 }
 
@@ -360,7 +365,8 @@ ufs_daddr_t
 allocblk(frags)
 	long frags;
 {
-	register int i, j, k;
+	int i, j, k, cg, baseblk;
+	struct cg *cgp = &cgrp;
 
 	if (frags <= 0 || frags > sblock.fs_frag)
 		return (0);
@@ -375,9 +381,21 @@ allocblk(frags)
 				j += k;
 				continue;
 			}
-			for (k = 0; k < frags; k++)
+			cg = dtog(&sblock, i + j);
+			getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
+			if (!cg_chkmagic(cgp))
+				pfatal("CG %d: BAD MAGIC NUMBER\n", cg);
+			baseblk = dtogd(&sblock, i + j);
+			for (k = 0; k < frags; k++) {
 				setbmap(i + j + k);
+				clrbit(cg_blksfree(cgp), baseblk + k);
+			}
 			n_blks += frags;
+			if (frags == sblock.fs_frag)
+				cgp->cg_cs.cs_nbfree--;
+			else
+				cgp->cg_cs.cs_nffree -= frags;
+			cgdirty();
 			return (i + j);
 		}
 	}
@@ -545,7 +563,8 @@ dofix(idesc, msg)
 
 /*
  * An unexpected inconsistency occured.
- * Die if preening, otherwise just print message and continue.
+ * Die if preening or filesystem is running with soft dependency protocol,
+ * otherwise just print message and continue.
  */
 void
 #if __STDC__
@@ -565,19 +584,23 @@ pfatal(fmt, va_alist)
 	if (!preen) {
 		(void)vfprintf(stderr, fmt, ap);
 		va_end(ap);
+		if (usedsoftdep)
+			(void)fprintf(stderr,
+			    "\nUNEXPECTED SOFTDEP INCONSISTENCY\n");
 		return;
 	}
 	(void)fprintf(stderr, "%s: ", cdevname);
 	(void)vfprintf(stderr, fmt, ap);
 	(void)fprintf(stderr,
-	    "\n%s: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.\n",
-	    cdevname);
+	    "\n%s: UNEXPECTED%sINCONSISTENCY; RUN fsck MANUALLY.\n",
+	    cdevname, usedsoftdep ? " SOFTDEP " : " ");
+	ckfini(0);
 	exit(EEXIT);
 }
 
 /*
- * Pwarn just prints a message when not preening,
- * or a warning (preceded by filename) when preening.
+ * Pwarn just prints a message when not preening or running soft dependency
+ * protocol, or a warning (preceded by filename) when preening.
  */
 void
 #if __STDC__
diff --git a/sbin/mount/mount.c b/sbin/mount/mount.c
index 93b7e14edbdb..f4ee239b57df 100644
--- a/sbin/mount/mount.c
+++ b/sbin/mount/mount.c
@@ -42,7 +42,7 @@ static const char copyright[] =
 static char sccsid[] = "@(#)mount.c	8.25 (Berkeley) 5/8/95";
 #else
 static const char rcsid[] =
-	"$Id: mount.c,v 1.21 1997/11/13 00:28:49 julian Exp $";
+	"$Id: mount.c,v 1.22 1998/02/13 04:54:27 bde Exp $";
 #endif
 #endif /* not lint */
 
@@ -98,6 +98,7 @@ static struct opt {
 	{ MNT_NOCLUSTERR,	"noclusterr" },
 	{ MNT_NOCLUSTERW,	"noclusterw" },
 	{ MNT_SUIDDIR,		"suiddir" },
+	{ MNT_SOFTDEP,		"soft-updates" },
 	{ NULL }
 };
 
@@ -495,7 +496,8 @@ prmount(sfp)
 		else
 			(void)printf("%d", sfp->f_owner);
 	}
-	(void)printf(f ? ")\n" : "\n");
+	(void)printf("%swrites: sync %d async %d)\n", !f++ ? " (" : ", ",
+	    sfp->f_syncwrites, sfp->f_asyncwrites);
 }
 
 struct statfs *
@@ -602,6 +604,8 @@ putfsent(ent)
 		printf(",noclusterr");
 	if (ent->f_flags & MNT_NOCLUSTERW)
 		printf(",noclusterw");
+	if (ent->f_flags & MNT_SUIDDIR)
+		printf(",suiddir");
 
 	if ((fst = getfsspec(ent->f_mntfromname)))
 		printf("\t%u %u\n", fst->fs_freq, fst->fs_passno);
diff --git a/sbin/mount_ifs/mount.c b/sbin/mount_ifs/mount.c
index 93b7e14edbdb..f4ee239b57df 100644
--- a/sbin/mount_ifs/mount.c
+++ b/sbin/mount_ifs/mount.c
@@ -42,7 +42,7 @@ static const char copyright[] =
 static char sccsid[] = "@(#)mount.c	8.25 (Berkeley) 5/8/95";
 #else
 static const char rcsid[] =
-	"$Id: mount.c,v 1.21 1997/11/13 00:28:49 julian Exp $";
+	"$Id: mount.c,v 1.22 1998/02/13 04:54:27 bde Exp $";
 #endif
 #endif /* not lint */
 
@@ -98,6 +98,7 @@ static struct opt {
 	{ MNT_NOCLUSTERR,	"noclusterr" },
 	{ MNT_NOCLUSTERW,	"noclusterw" },
 	{ MNT_SUIDDIR,		"suiddir" },
+	{ MNT_SOFTDEP,		"soft-updates" },
 	{ NULL }
 };
 
@@ -495,7 +496,8 @@ prmount(sfp)
 		else
 			(void)printf("%d", sfp->f_owner);
 	}
-	(void)printf(f ? ")\n" : "\n");
+	(void)printf("%swrites: sync %d async %d)\n", !f++ ? " (" : ", ",
+	    sfp->f_syncwrites, sfp->f_asyncwrites);
 }
 
 struct statfs *
@@ -602,6 +604,8 @@ putfsent(ent)
 		printf(",noclusterr");
 	if (ent->f_flags & MNT_NOCLUSTERW)
 		printf(",noclusterw");
+	if (ent->f_flags & MNT_SUIDDIR)
+		printf(",suiddir");
 
 	if ((fst = getfsspec(ent->f_mntfromname)))
 		printf("\t%u %u\n", fst->fs_freq, fst->fs_passno);
diff --git a/sbin/tunefs/tunefs.8 b/sbin/tunefs/tunefs.8
index df161a9de000..3d014475607d 100644
--- a/sbin/tunefs/tunefs.8
+++ b/sbin/tunefs/tunefs.8
@@ -46,6 +46,7 @@
 .Op Fl m Ar minfree
 .Op Fl p
 .Bk -words
+.Op Fl n Ar soft_dependency_enabling
 .Op Fl o Ar optimize_preference
 .Ek
 .Op Ar special | Ar filesys
diff --git a/sbin/tunefs/tunefs.c b/sbin/tunefs/tunefs.c
index 1802f22f5ac3..b1df99b324c9 100644
--- a/sbin/tunefs/tunefs.c
+++ b/sbin/tunefs/tunefs.c
@@ -81,7 +81,7 @@ main(argc, argv)
 	int argc;
 	char *argv[];
 {
-	char *cp, *special, *name;
+	char *cp, *special, *name, *action;
 	struct stat st;
 	int i;
 	int Aflag = 0;
@@ -182,6 +182,24 @@ main(argc, argv)
 					warnx(OPTWARN, "space", "<", MINFREE);
 				continue;
 
+			case 'n':
+ 				name = "soft updates";
+ 				if (argc < 1)
+ 					errx(10, "-n: missing %s", name);
+ 				argc--, argv++;
+ 				if (strcmp(*argv, "enable") == 0) {
+ 					sblock.fs_flags |= FS_DOSOFTDEP;
+ 					action = "set";
+ 				} else if (strcmp(*argv, "disable") == 0) {
+ 					sblock.fs_flags &= ~FS_DOSOFTDEP;
+ 					action = "cleared";
+ 				} else {
+ 					errx(10, "bad %s (options are %s)",
+ 					    name, "`enable' or `disable'");
+ 				}
+ 				warnx("%s %s", name, action);
+ 				continue;
+ 
 			case 'o':
 				name = "optimization preference";
 				if (argc < 1)
@@ -237,6 +255,7 @@ usage()
 	fprintf(stderr, "\t-d rotational delay between contiguous blocks\n");
 	fprintf(stderr, "\t-e maximum blocks per file in a cylinder group\n");
 	fprintf(stderr, "\t-m minimum percentage of free space\n");
+	fprintf(stderr, "\t-n soft updates (`enable' or `disable')\n");
 	fprintf(stderr, "\t-o optimization preference (`space' or `time')\n");
 	fprintf(stderr, "\t-p no change - just prints current tuneable settings\n");
 	exit(2);
@@ -261,6 +280,8 @@ getsb(fs, file)
 void
 printfs()
 {
+	warnx("soft updates:  (-n)                                %s", 
+		(sblock.fs_flags & FS_DOSOFTDEP)? "enabled" : "disabled");
 	warnx("maximum contiguous block count: (-a)               %d",
 	      sblock.fs_maxcontig);
 	warnx("rotational delay between contiguous blocks: (-d)   %d ms",
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 357fdf410d59..cc44c5e04b74 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -2,7 +2,7 @@
 # LINT -- config file for checking all the sources, tries to pull in
 #	as much of the source tree as it can.
 #
-#	$Id: LINT,v 1.412 1998/02/24 22:24:46 phk Exp $
+#	$Id: LINT,v 1.413 1998/02/27 10:02:41 itojun Exp $
 #
 # NB: You probably don't want to try running a kernel built from this
 # file.  Instead, you should start from GENERIC, and add options from
@@ -466,6 +466,13 @@ options		NFS_ROOT		#NFS usable as root device
 # This DEVFS is experimental but seems to work
 options		DEVFS			#devices filesystem
 
+# Allow the FFS to use Softupdates technology.
+# To do this you need to fetch the two files
+# /sys/ufs/ffs/softdep.h and /sys/ufs/ffs/ffs_softdep.c
+# from freebsd.org and understand the licensing restrictions.
+#options		SOFTUPDATES
+# (we can't actually enable it because the files may not be present)
+
 # Make space in the kernel for a MFS root filesystem.  Define to the number
 # of kilobytes to reserve for the filesystem.
 options		MFS_ROOT=10
diff --git a/sys/conf/files b/sys/conf/files
index 36d1b23b59c8..108d5c853390 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -416,6 +416,8 @@ ufs/ffs/ffs_balloc.c	optional ffs
 ufs/ffs/ffs_balloc.c	optional mfs
 ufs/ffs/ffs_inode.c	optional ffs
 ufs/ffs/ffs_inode.c	optional mfs
+ufs/ffs/ffs_softdep_stub.c	optional ffs
+ufs/ffs/ffs_softdep.c	optional softupdates
 ufs/ffs/ffs_subr.c	optional ffs
 ufs/ffs/ffs_subr.c	optional mfs
 ufs/ffs/ffs_tables.c	optional ffs
diff --git a/sys/conf/options b/sys/conf/options
index 48a5c3355a01..161be3aec136 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -1,4 +1,4 @@
-#	$Id: options,v 1.63 1998/02/27 10:02:37 itojun Exp $
+#	$Id: options,v 1.64 1998/03/04 10:24:08 dufault Exp $
 
 # Format:
 # Option name	filename
@@ -59,6 +59,12 @@ CD9660
 FFS
 NFS
 
+# If you are following the conditions in the copyright,
+# you can enable soft-updates, which will speed up a lot of things
+# and make the system safer from crashes at the same time.
+# Otherwise a STUB module will be compiled in.
+SOFTUPDATES	opt_ffs.h
+
 # The above static dependencies are planned removed, with a
 # <filesystem>_ROOT option to control if it usable as root.  This list
 # allows these options to be present in config files already (though
diff --git a/sys/dev/de/if_de.c b/sys/dev/de/if_de.c
index 1cfaeba29de5..e764252d396b 100644
--- a/sys/dev/de/if_de.c
+++ b/sys/dev/de/if_de.c
@@ -1,5 +1,7 @@
+#undef __FreeBSD__
+#define __FreeBSD__ 3
 /*	$NetBSD: if_de.c,v 1.56 1997/10/20 14:32:46 matt Exp $	*/
-/*	$Id: if_de.c,v 1.79 1998/02/06 12:14:08 eivind Exp $ */
+/*	$Id: if_de.c,v 1.80 1998/02/20 13:11:50 bde Exp $ */
 
 /*-
  * Copyright (c) 1994-1997 Matt Thomas (matt@3am-software.com)
diff --git a/sys/fs/cd9660/cd9660_vfsops.c b/sys/fs/cd9660/cd9660_vfsops.c
index a9218c3a2b7d..1431b19f0819 100644
--- a/sys/fs/cd9660/cd9660_vfsops.c
+++ b/sys/fs/cd9660/cd9660_vfsops.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)cd9660_vfsops.c	8.18 (Berkeley) 5/22/95
- * $Id: cd9660_vfsops.c,v 1.33 1997/12/21 21:40:02 joerg Exp $
+ * $Id: cd9660_vfsops.c,v 1.34 1998/03/01 22:46:00 msmith Exp $
  */
 
 #include <sys/param.h>
@@ -392,7 +392,7 @@ iso_mountfs(devvp, mp, p, argp)
 	isomp->im_dev = dev;
 	isomp->im_devvp = devvp;
 
-	devvp->v_specflags |= SI_MOUNTEDON;
+	devvp->v_specmountpoint = mp;
 
 	/* Check the Rock Ridge Extention support */
 	if (!(argp->flags & ISOFSMNT_NORRIP)) {
@@ -438,7 +438,7 @@ iso_mountfs(devvp, mp, p, argp)
 
 	return 0;
 out:
-	devvp->v_specflags &= ~SI_MOUNTEDON;
+	devvp->v_specmountpoint = NULL;
 	if (bp)
 		brelse(bp);
 	if (needclose)
@@ -489,7 +489,7 @@ cd9660_unmount(mp, mntflags, p)
 	isomp = VFSTOISOFS(mp);
 
 
-	isomp->im_devvp->v_specflags &= ~SI_MOUNTEDON;
+	isomp->im_devvp->v_specmountpoint = NULL;
 	error = VOP_CLOSE(isomp->im_devvp, FREAD, NOCRED, p);
 	vrele(isomp->im_devvp);
 	free((caddr_t)isomp, M_ISOFSMNT);
@@ -561,7 +561,8 @@ cd9660_statfs(mp, sbp, p)
 		bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN);
 	}
 	/* Use the first spare for flags: */
-	sbp->f_spare[0] = isomp->im_flags;
+	/* Don't do this!!! XXX */
+	/* sbp->f_spare[0] = isomp->im_flags; */
 	return 0;
 }
 
diff --git a/sys/fs/msdosfs/msdosfs_vfsops.c b/sys/fs/msdosfs/msdosfs_vfsops.c
index 2b1d1d7f352f..0af5438dae63 100644
--- a/sys/fs/msdosfs/msdosfs_vfsops.c
+++ b/sys/fs/msdosfs/msdosfs_vfsops.c
@@ -1,4 +1,4 @@
-/*	$Id: msdosfs_vfsops.c,v 1.28 1998/02/23 16:44:32 ache Exp $ */
+/*	$Id: msdosfs_vfsops.c,v 1.29 1998/03/01 22:46:27 msmith Exp $ */
 /*	$NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $	*/
 
 /*-
@@ -772,7 +772,7 @@ mountmsdosfs(devvp, mp, p, argp)
 	mp->mnt_stat.f_fsid.val[0] = (long)dev;
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	mp->mnt_flag |= MNT_LOCAL;
-	devvp->v_specflags |= SI_MOUNTEDON;
+	devvp->v_specmountpoint = mp;
 
 	return 0;
 
@@ -818,7 +818,7 @@ msdosfs_unmount(mp, mntflags, p)
 	if (error)
 		return error;
 	pmp = VFSTOMSDOSFS(mp);
-	pmp->pm_devvp->v_specflags &= ~SI_MOUNTEDON;
+	pmp->pm_devvp->v_specmountpoint = NULL;
 #ifdef MSDOSFS_DEBUG
 	{
 		struct vnode *vp = pmp->pm_devvp;
@@ -841,8 +841,9 @@ msdosfs_unmount(mp, mntflags, p)
 		    ((u_int *)vp->v_data)[1]);
 	}
 #endif
-	error = VOP_CLOSE(pmp->pm_devvp, (pmp->pm_flags&MSDOSFSMNT_RONLY) ? FREAD : FREAD | FWRITE,
-	    NOCRED, p);
+	error = VOP_CLOSE(pmp->pm_devvp,
+		    (pmp->pm_flags&MSDOSFSMNT_RONLY) ? FREAD : FREAD | FWRITE,
+		    NOCRED, p);
 	vrele(pmp->pm_devvp);
 	free(pmp->pm_inusemap, M_MSDOSFSFAT);
 	free(pmp, M_MSDOSFSMNT);
@@ -946,9 +947,11 @@ msdosfs_sync(mp, waitfor, cred, p)
 		simple_lock(&vp->v_interlock);
 		nvp = vp->v_mntvnodes.le_next;
 		dep = VTODE(vp);
-		if (vp->v_type == VNON || ((dep->de_flag &
-		    (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0)
-		    && vp->v_dirtyblkhd.lh_first == NULL) {
+		if (vp->v_type == VNON
+		|| (waitfor == MNT_LAZY) /* can this happen with msdosfs? */
+		|| (((dep->de_flag &
+		     (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0)
+		  && (vp->v_dirtyblkhd.lh_first == NULL))) {
 			simple_unlock(&vp->v_interlock);
 			continue;
 		}
diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c
index 9c3c8450867d..666322f65609 100644
--- a/sys/fs/specfs/spec_vnops.c
+++ b/sys/fs/specfs/spec_vnops.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
- * $Id: spec_vnops.c,v 1.58 1998/03/07 21:35:52 dyson Exp $
+ * $Id: spec_vnops.c,v 1.59 1998/03/08 08:46:18 dyson Exp $
  */
 
 #include <sys/param.h>
@@ -548,8 +548,12 @@ spec_strategy(ap)
 		struct buf *a_bp;
 	} */ *ap;
 {
+	struct buf *bp;
 
-	(*bdevsw[major(ap->a_bp->b_dev)]->d_strategy)(ap->a_bp);
+	bp = ap->a_bp;
+	if ((LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start)
+		(*bioops.io_start)(bp);
+	(*bdevsw[major(bp->b_dev)]->d_strategy)(bp);
 	return (0);
 }
 
@@ -633,7 +637,9 @@ spec_close(ap)
 		 * we must invalidate any in core blocks, so that
 		 * we can, for instance, change floppy disks.
 		 */
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p);
 		error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0);
+		VOP_UNLOCK(vp, 0, ap->a_p);
 		if (error)
 			return (error);
 
diff --git a/sys/gnu/ext2fs/inode.h b/sys/gnu/ext2fs/inode.h
index f2fd0f25fa5e..4bd1cf5d7de1 100644
--- a/sys/gnu/ext2fs/inode.h
+++ b/sys/gnu/ext2fs/inode.h
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)inode.h	8.9 (Berkeley) 5/14/95
- * $Id: inode.h,v 1.19 1997/12/05 13:43:47 jkh Exp $
+ * $Id: inode.h,v 1.20 1998/01/30 11:34:02 phk Exp $
  */
 
 #ifndef _UFS_UFS_INODE_H_
@@ -45,6 +45,11 @@
 #include <sys/lock.h>
 #include <ufs/ufs/dinode.h>
 
+/*
+ * The size of a logical block number.
+ */
+typedef long ufs_lbn_t;
+
 /*
  * This must agree with the definition in <ufs/ufs/dir.h>.
  */
@@ -67,6 +72,7 @@ struct inode {
 	u_int32_t i_flag;	/* flags, see below */
 	dev_t	  i_dev;	/* Device associated with the inode. */
 	ino_t	  i_number;	/* The identity of the inode. */
+	int	  i_effnlink;	/* i_nlink when I/O completes */
 
 	union {			/* Associated filesystem. */
 		struct	fs *fs;		/* FFS */
@@ -160,6 +166,9 @@ struct indir {
 	}								\
 }
 
+/* Determine if soft dependencies are being done */
+#define DOINGSOFTDEP(vp)	((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
+
 /* This overlays the fid structure (see mount.h). */
 struct ufid {
 	u_int16_t ufid_len;	/* Length of structure. */
diff --git a/sys/gnu/fs/ext2fs/inode.h b/sys/gnu/fs/ext2fs/inode.h
index f2fd0f25fa5e..4bd1cf5d7de1 100644
--- a/sys/gnu/fs/ext2fs/inode.h
+++ b/sys/gnu/fs/ext2fs/inode.h
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)inode.h	8.9 (Berkeley) 5/14/95
- * $Id: inode.h,v 1.19 1997/12/05 13:43:47 jkh Exp $
+ * $Id: inode.h,v 1.20 1998/01/30 11:34:02 phk Exp $
  */
 
 #ifndef _UFS_UFS_INODE_H_
@@ -45,6 +45,11 @@
 #include <sys/lock.h>
 #include <ufs/ufs/dinode.h>
 
+/*
+ * The size of a logical block number.
+ */
+typedef long ufs_lbn_t;
+
 /*
  * This must agree with the definition in <ufs/ufs/dir.h>.
  */
@@ -67,6 +72,7 @@ struct inode {
 	u_int32_t i_flag;	/* flags, see below */
 	dev_t	  i_dev;	/* Device associated with the inode. */
 	ino_t	  i_number;	/* The identity of the inode. */
+	int	  i_effnlink;	/* i_nlink when I/O completes */
 
 	union {			/* Associated filesystem. */
 		struct	fs *fs;		/* FFS */
@@ -160,6 +166,9 @@ struct indir {
 	}								\
 }
 
+/* Determine if soft dependencies are being done */
+#define DOINGSOFTDEP(vp)	((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
+
 /* This overlays the fid structure (see mount.h). */
 struct ufid {
 	u_int16_t ufid_len;	/* Length of structure. */
diff --git a/sys/i386/conf/LINT b/sys/i386/conf/LINT
index 357fdf410d59..cc44c5e04b74 100644
--- a/sys/i386/conf/LINT
+++ b/sys/i386/conf/LINT
@@ -2,7 +2,7 @@
 # LINT -- config file for checking all the sources, tries to pull in
 #	as much of the source tree as it can.
 #
-#	$Id: LINT,v 1.412 1998/02/24 22:24:46 phk Exp $
+#	$Id: LINT,v 1.413 1998/02/27 10:02:41 itojun Exp $
 #
 # NB: You probably don't want to try running a kernel built from this
 # file.  Instead, you should start from GENERIC, and add options from
@@ -466,6 +466,13 @@ options		NFS_ROOT		#NFS usable as root device
 # This DEVFS is experimental but seems to work
 options		DEVFS			#devices filesystem
 
+# Allow the FFS to use Softupdates technology.
+# To do this you need to fetch the two files
+# /sys/ufs/ffs/softdep.h and /sys/ufs/ffs/ffs_softdep.c
+# from freebsd.org and understand the licensing restrictions.
+#options		SOFTUPDATES
+# (we can't actually enable it because the files may not be present)
+
 # Make space in the kernel for a MFS root filesystem.  Define to the number
 # of kilobytes to reserve for the filesystem.
 options		MFS_ROOT=10
diff --git a/sys/i386/conf/NOTES b/sys/i386/conf/NOTES
index 357fdf410d59..cc44c5e04b74 100644
--- a/sys/i386/conf/NOTES
+++ b/sys/i386/conf/NOTES
@@ -2,7 +2,7 @@
 # LINT -- config file for checking all the sources, tries to pull in
 #	as much of the source tree as it can.
 #
-#	$Id: LINT,v 1.412 1998/02/24 22:24:46 phk Exp $
+#	$Id: LINT,v 1.413 1998/02/27 10:02:41 itojun Exp $
 #
 # NB: You probably don't want to try running a kernel built from this
 # file.  Instead, you should start from GENERIC, and add options from
@@ -466,6 +466,13 @@ options		NFS_ROOT		#NFS usable as root device
 # This DEVFS is experimental but seems to work
 options		DEVFS			#devices filesystem
 
+# Allow the FFS to use Softupdates technology.
+# To do this you need to fetch the two files
+# /sys/ufs/ffs/softdep.h and /sys/ufs/ffs/ffs_softdep.c
+# from freebsd.org and understand the licensing restrictions.
+#options		SOFTUPDATES
+# (we can't actually enable it because the files may not be present)
+
 # Make space in the kernel for a MFS root filesystem.  Define to the number
 # of kilobytes to reserve for the filesystem.
 options		MFS_ROOT=10
diff --git a/sys/isofs/cd9660/cd9660_vfsops.c b/sys/isofs/cd9660/cd9660_vfsops.c
index a9218c3a2b7d..1431b19f0819 100644
--- a/sys/isofs/cd9660/cd9660_vfsops.c
+++ b/sys/isofs/cd9660/cd9660_vfsops.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)cd9660_vfsops.c	8.18 (Berkeley) 5/22/95
- * $Id: cd9660_vfsops.c,v 1.33 1997/12/21 21:40:02 joerg Exp $
+ * $Id: cd9660_vfsops.c,v 1.34 1998/03/01 22:46:00 msmith Exp $
  */
 
 #include <sys/param.h>
@@ -392,7 +392,7 @@ iso_mountfs(devvp, mp, p, argp)
 	isomp->im_dev = dev;
 	isomp->im_devvp = devvp;
 
-	devvp->v_specflags |= SI_MOUNTEDON;
+	devvp->v_specmountpoint = mp;
 
 	/* Check the Rock Ridge Extention support */
 	if (!(argp->flags & ISOFSMNT_NORRIP)) {
@@ -438,7 +438,7 @@ iso_mountfs(devvp, mp, p, argp)
 
 	return 0;
 out:
-	devvp->v_specflags &= ~SI_MOUNTEDON;
+	devvp->v_specmountpoint = NULL;
 	if (bp)
 		brelse(bp);
 	if (needclose)
@@ -489,7 +489,7 @@ cd9660_unmount(mp, mntflags, p)
 	isomp = VFSTOISOFS(mp);
 
 
-	isomp->im_devvp->v_specflags &= ~SI_MOUNTEDON;
+	isomp->im_devvp->v_specmountpoint = NULL;
 	error = VOP_CLOSE(isomp->im_devvp, FREAD, NOCRED, p);
 	vrele(isomp->im_devvp);
 	free((caddr_t)isomp, M_ISOFSMNT);
@@ -561,7 +561,8 @@ cd9660_statfs(mp, sbp, p)
 		bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN);
 	}
 	/* Use the first spare for flags: */
-	sbp->f_spare[0] = isomp->im_flags;
+	/* Don't do this!!! XXX */
+	/* sbp->f_spare[0] = isomp->im_flags; */
 	return 0;
 }
 
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index a51177c050b2..7d0746f860d5 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)kern_malloc.c	8.3 (Berkeley) 1/4/94
- * $Id: kern_malloc.c,v 1.43 1998/02/09 06:09:22 eivind Exp $
+ * $Id: kern_malloc.c,v 1.44 1998/02/23 07:41:23 dyson Exp $
  */
 
 #include "opt_vm.h"
@@ -128,7 +128,7 @@ malloc(size, type, flags)
 
 	indx = BUCKETINDX(size);
 	kbp = &bucket[indx];
-	s = splhigh();
+	s = splmem();
 	while (ksp->ks_memuse >= ksp->ks_limit) {
 		if (flags & M_NOWAIT) {
 			splx(s);
@@ -268,7 +268,7 @@ free(addr, type)
 	kup = btokup(addr);
 	size = 1 << kup->ku_indx;
 	kbp = &bucket[kup->ku_indx];
-	s = splhigh();
+	s = splmem();
 #ifdef DIAGNOSTIC
 	/*
 	 * Check for returns of data that do not point to the
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
index c83dd75df127..c6dd9c17bbda 100644
--- a/sys/kern/kern_shutdown.c
+++ b/sys/kern/kern_shutdown.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)kern_shutdown.c	8.3 (Berkeley) 1/21/94
- * $Id: kern_shutdown.c,v 1.27 1997/11/25 07:07:43 julian Exp $
+ * $Id: kern_shutdown.c,v 1.28 1998/02/16 23:57:44 eivind Exp $
  */
 
 #include "opt_ddb.h"
@@ -217,17 +217,27 @@ boot(howto)
 
 		sync(&proc0, NULL);
 
+		/*
+		 * With soft updates, some buffers that are
+		 * written will be re-marked as dirty until other
+		 * buffers are written.
+		 */
 		for (iter = 0; iter < 20; iter++) {
 			nbusy = 0;
 			for (bp = &buf[nbuf]; --bp >= buf; ) {
 				if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) {
 					nbusy++;
+				} else if ((bp->b_flags & (B_DELWRI | B_INVAL))
+						== B_DELWRI) {
+					/* bawrite(bp);*/
+					nbusy++;
 				}
 			}
 			if (nbusy == 0)
 				break;
 			printf("%d ", nbusy);
-			DELAY(40000 * iter);
+			sync(&proc0, NULL);
+			DELAY(50000 * iter);
 		}
 		if (nbusy) {
 			/*
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index bb370ac4fa66..4fdc5bde1d0b 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
- * $Id: kern_synch.c,v 1.47 1998/02/25 06:04:46 bde Exp $
+ * $Id: kern_synch.c,v 1.48 1998/03/04 10:25:55 dufault Exp $
  */
 
 #include "opt_ktrace.h"
@@ -230,7 +230,6 @@ schedcpu(arg)
 	register int s;
 	register unsigned int newcpu;
 
-	wakeup((caddr_t)&lbolt);
 	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
 		/*
 		 * Increment time in/out of memory and sleep time
@@ -282,6 +281,7 @@ schedcpu(arg)
 		splx(s);
 	}
 	vmmeter();
+	wakeup((caddr_t)&lbolt);
 	timeout(schedcpu, (void *)0, hz);
 }
 
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 4c09e1dcb5a5..114e035a0d13 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -11,7 +11,7 @@
  * 2. Absolutely no warranty of function or purpose is made by the author
  *		John S. Dyson.
  *
- * $Id: vfs_bio.c,v 1.153 1998/03/04 03:17:30 dyson Exp $
+ * $Id: vfs_bio.c,v 1.154 1998/03/07 21:35:24 dyson Exp $
  */
 
 /*
@@ -37,6 +37,7 @@
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/lock.h>
+#include <miscfs/specfs/specdev.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_prot.h>
@@ -53,6 +54,9 @@
 
 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
 
+struct	bio_ops bioops;		/* I/O operation notification */
+
+#if 0 	/* replaced by sched_sync */
 static void vfs_update __P((void));
 static struct	proc *updateproc;
 static struct kproc_desc up_kp = {
@@ -61,6 +65,7 @@ static struct kproc_desc up_kp = {
 	&updateproc
 };
 SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+#endif
 
 struct buf *buf;		/* buffer header pool */
 struct swqueue bswlist;
@@ -179,6 +184,7 @@ bufinit()
 		bp->b_qindex = QUEUE_EMPTY;
 		bp->b_vnbufs.le_next = NOLIST;
 		bp->b_generation = 0;
+		LIST_INIT(&bp->b_dep);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 	}
@@ -362,6 +368,9 @@ int
 bwrite(struct buf * bp)
 {
 	int oldflags = bp->b_flags;
+	struct vnode *vp;
+	struct mount *mp;
+
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
@@ -386,6 +395,23 @@ bwrite(struct buf * bp)
 		curproc->p_stats->p_ru.ru_oublock++;
 	VOP_STRATEGY(bp);
 
+	/*
+	 * Collect statistics on synchronous and asynchronous writes.
+	 * Writes to block devices are charged to their associated
+	 * filesystem (if any).
+	 */
+	if ((vp = bp->b_vp) != NULL) {
+		if (vp->v_type == VBLK)
+			mp = vp->v_specmountpoint;
+		else
+			mp = vp->v_mount;
+		if (mp != NULL)
+			if ((oldflags & B_ASYNC) == 0)
+				mp->mnt_stat.f_syncwrites++;
+			else
+				mp->mnt_stat.f_asyncwrites++;
+	}
+
 	if ((oldflags & B_ASYNC) == 0) {
 		int rtval = biowait(bp);
 
@@ -420,6 +446,8 @@ vfs_bio_need_satisfy(void) {
 void
 bdwrite(struct buf * bp)
 {
+	int s;
+	struct vnode *vp;
 
 #if !defined(MAX_PERF)
 	if ((bp->b_flags & B_BUSY) == 0) {
@@ -438,7 +466,9 @@ bdwrite(struct buf * bp)
 	bp->b_flags &= ~(B_READ|B_RELBUF);
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= B_DONE | B_DELWRI;
+		s = splbio();
 		reassignbuf(bp, bp->b_vp);
+		splx(s);
 		++numdirtybuffers;
 	}
 
@@ -470,12 +500,45 @@ bdwrite(struct buf * bp)
 	vfs_clean_pages(bp);
 	bqrelse(bp);
 
+	/*
+	 * XXX The soft dependency code is not prepared to
+	 * have I/O done when a bdwrite is requested. For
+	 * now we just let the write be delayed if it is
+	 * requested by the soft dependency code.
+	 */
+	if ((vp = bp->b_vp) &&
+	    (vp->v_type == VBLK && vp->v_specmountpoint &&
+	    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
+	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))
+		return;
+
 	if (numdirtybuffers >= hidirtybuffers)
 		flushdirtybuffers(0, 0);
 
 	return;
 }
 
+
+/*
+ * Same as first half of bdwrite, mark buffer dirty, but do not release it.
+ * Check how this compares with vfs_setdirty(); XXX [JRE]
+ */
+void
+bdirty(bp)
+      struct buf *bp;
+{
+	int s;
+	
+	bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
+	if ((bp->b_flags & B_DELWRI) == 0) {
+		bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
+		s = splbio();
+		reassignbuf(bp, bp->b_vp);
+		splx(s);
+		++numdirtybuffers;
+	}
+}
+
 /*
  * Asynchronous write.
  * Start output on a buffer, but do not wait for it to complete.
@@ -535,6 +598,8 @@ brelse(struct buf * bp)
 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
 	    (bp->b_bufsize <= 0)) {
 		bp->b_flags |= B_INVAL;
+		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
+			(*bioops.io_deallocate)(bp);
 		if (bp->b_flags & B_DELWRI)
 			--numdirtybuffers;
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
@@ -1065,6 +1130,9 @@ getnewbuf(struct vnode *vp, daddr_t blkno,
 		crfree(bp->b_wcred);
 		bp->b_wcred = NOCRED;
 	}
+	if (LIST_FIRST(&bp->b_dep) != NULL &&
+	    bioops.io_deallocate)
+		(*bioops.io_deallocate)(bp);
 
 	LIST_REMOVE(bp, b_hash);
 	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
@@ -1083,6 +1151,8 @@ getnewbuf(struct vnode *vp, daddr_t blkno,
 	bp->b_dirtyoff = bp->b_dirtyend = 0;
 	bp->b_validoff = bp->b_validend = 0;
 	bp->b_usecount = 5;
+	/* Here, not kern_physio.c, is where this should be done*/
+	LIST_INIT(&bp->b_dep);
 
 	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
 
@@ -1799,6 +1869,9 @@ biodone(register struct buf * bp)
 		splx(s);
 		return;
 	}
+	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
+		(*bioops.io_complete)(bp);
+
 	if (bp->b_flags & B_VMIO) {
 		int i, resid;
 		vm_ooffset_t foff;
@@ -1944,6 +2017,7 @@ count_lock_queue()
 	return (count);
 }
 
+#if 0	/* not with Kirk's code */
 static int vfs_update_interval = 30;
 
 static void
@@ -1970,6 +2044,8 @@ sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
 SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
 	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
 
+#endif
+
 
 /*
  * This routine is called in lieu of iodone in the case of
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 7f477bfd1b82..0022ac906eb4 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
- * $Id: vfs_cluster.c,v 1.55 1998/02/06 12:13:30 eivind Exp $
+ * $Id: vfs_cluster.c,v 1.56 1998/03/07 21:35:28 dyson Exp $
  */
 
 #include "opt_debug_cluster.h"
@@ -399,6 +399,9 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
 				break;
 			}
 		}
+		/* check for latent dependencies to be handled */
+		if ((LIST_FIRST(&tbp->b_dep)) != NULL && bioops.io_start)
+			(*bioops.io_start)(tbp);
 		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
 			tbp, b_cluster.cluster_entry);
 		for (j = 0; j < tbp->b_npages; j += 1) {
@@ -684,7 +687,6 @@ cluster_wbuild(vp, size, start_lbn, len)
 						(tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
 		bp->b_iodone = cluster_callback;
 		pbgetvp(vp, bp);
-
 		for (i = 0; i < len; ++i, ++start_lbn) {
 			if (i != 0) {
 				s = splbio();
@@ -714,7 +716,10 @@ cluster_wbuild(vp, size, start_lbn, len)
 				tbp->b_flags &= ~B_DONE;
 				splx(s);
 			}
-
+			/* check for latent dependencies to be handled */
+			if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
+			    bioops.io_start)
+				(*bioops.io_start)(tbp);
 			if (tbp->b_flags & B_VMIO) {
 				vm_page_t m;
 
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index 5d27cf57355f..972604d24b37 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.136 1998/03/01 23:07:45 dyson Exp $
+ * $Id: vfs_subr.c,v 1.137 1998/03/07 21:35:35 dyson Exp $
  */
 
 /*
@@ -123,6 +123,19 @@ static struct simplelock spechash_slock;
 struct nfs_public nfs_pub;	/* publicly exported FS */
 static vm_zone_t vnode_zone;
 
+/*
+ * The workitem queue.
+ */
+#define SYNCER_MAXDELAY		32
+int syncer_maxdelay =		SYNCER_MAXDELAY;	/* maximum delay time */
+time_t syncdelay =		30;
+int rushjob;				/* number of slots to run ASAP */
+
+static int syncer_delayno = 0;
+static long syncer_mask; 
+LIST_HEAD(synclist, vnode);
+static struct synclist *syncer_workitem_pending;
+
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
 
@@ -147,6 +160,12 @@ vntblinit()
 	simple_lock_init(&vnode_free_list_slock);
 	CIRCLEQ_INIT(&mountlist);
 	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
+	/*
+	 * Initialize the filesystem syncer.
+	 */     
+	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 
+		&syncer_mask);
+	syncer_maxdelay = syncer_mask + 1;
 }
 
 /*
@@ -554,7 +573,7 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 	int s, error;
 	vm_object_t object;
 
-	if (flags & V_SAVE) {
+	if ((flags & V_SAVE) && vp->v_dirtyblkhd.lh_first != NULL) {
 		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
 			return (error);
 		if (vp->v_dirtyblkhd.lh_first != NULL)
@@ -688,16 +707,153 @@ brelvp(bp)
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
+	vp = bp->b_vp;
 	s = splbio();
 	if (bp->b_vnbufs.le_next != NOLIST)
 		bufremvn(bp);
+	if ((vp->v_flag & VONWORKLST) && (LIST_FIRST(&vp->v_dirtyblkhd) == NULL)) {
+		vp->v_flag &= ~VONWORKLST;
+		LIST_REMOVE(vp, v_synclist);
+	}
 	splx(s);
-
-	vp = bp->b_vp;
 	bp->b_vp = (struct vnode *) 0;
 	vdrop(vp);
 }
 
+/*
+ * The workitem queue.
+ * 
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, writes to mounted block
+ * devices are delayed only about half the time that file data is delayed.
+ * Similarly, directory updates are more critical, so are only delayed
+ * about a third the time that file data is delayed. Thus, there are
+ * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
+ * one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ *	syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
+
+/*
+ * Add an item to the syncer work queue.
+ */
+void
+vn_syncer_add_to_worklist(vp, delay)
+	struct vnode *vp;
+	int delay;
+{
+	int s, slot;
+
+	s = splbio();
+
+	if (vp->v_flag & VONWORKLST) {
+		LIST_REMOVE(vp, v_synclist);
+	}
+
+	if (delay > syncer_maxdelay - 2)
+		delay = syncer_maxdelay - 2;
+	slot = (syncer_delayno + delay) & syncer_mask;
+
+	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+	vp->v_flag |= VONWORKLST;
+	splx(s);
+}
+
+static void sched_sync __P((void));
+static struct	proc *updateproc;
+static struct kproc_desc up_kp = {
+	"syncer",
+	sched_sync,
+	&updateproc
+};
+SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void 
+sched_sync(void)
+{
+	struct synclist *slp;
+	struct vnode *vp;
+	long starttime;
+	int s;
+	struct proc *p = updateproc;
+
+	for (;;) {
+		starttime = time.tv_sec;
+
+		/*
+		 * Push files whose dirty time has expired.
+		 */
+		s = splbio();
+		slp = &syncer_workitem_pending[syncer_delayno];
+		syncer_delayno += 1;
+		if (syncer_delayno == syncer_maxdelay)
+			syncer_delayno = 0;
+		splx(s);
+
+		while ((vp = LIST_FIRST(slp)) != NULL) {
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+			VOP_UNLOCK(vp, 0, p);
+			if (LIST_FIRST(slp) == vp) {
+				if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
+				    vp->v_type != VBLK)
+					panic("sched_sync: fsync failed");
+				/*
+				 * Move ourselves to the back of the sync list.
+				 */
+				LIST_REMOVE(vp, v_synclist);
+				vn_syncer_add_to_worklist(vp, syncdelay);
+			}
+		}
+
+		/*
+		 * Do soft update processing.
+		 */
+		if (bioops.io_sync)
+			(*bioops.io_sync)(NULL);
+
+		/*
+		 * The variable rushjob allows the kernel to speed up the
+		 * processing of the filesystem syncer process. A rushjob
+		 * value of N tells the filesystem syncer to process the next
+		 * N seconds worth of work on its queue ASAP. Currently rushjob
+		 * is used by the soft update code to speed up the filesystem
+		 * syncer process when the incore state is getting so far
+		 * ahead of the disk that the kernel memory pool is being
+		 * threatened with exhaustion.
+		 */
+		if (rushjob > 0) {
+			rushjob -= 1;
+			continue;
+		}
+		/*
+		 * If it has taken us less than a second to process the
+		 * current work, then wait. Otherwise start right over
+		 * again. We can still lose time if any single round
+		 * takes more than two seconds, but it does not really
+		 * matter as we are just trying to generally pace the
+		 * filesystem activity.
+		 */
+		if (time.tv_sec == starttime)
+			tsleep(&lbolt, PPAUSE, "syncer", 0);
+	}
+}
+
 /*
  * Associate a p-buffer with a vnode.
  */
@@ -743,6 +899,8 @@ reassignbuf(bp, newvp)
 	register struct buf *bp;
 	register struct vnode *newvp;
 {
+	struct buflists *listheadp;
+	int delay;
 	int s;
 
 	if (newvp == NULL) {
@@ -765,18 +923,40 @@ reassignbuf(bp, newvp)
 	if (bp->b_flags & B_DELWRI) {
 		struct buf *tbp;
 
-		tbp = newvp->v_dirtyblkhd.lh_first;
+		listheadp = &newvp->v_dirtyblkhd;
+		if ((newvp->v_flag & VONWORKLST) == 0) {
+			switch (newvp->v_type) {
+			case VDIR:
+				delay = syncdelay / 3;
+				break;
+			case VBLK:
+				if (newvp->v_specmountpoint != NULL) {
+					delay = syncdelay / 2;
+					break;
+				}
+				/* fall through */
+			default:
+				delay = syncdelay;
+			}
+			vn_syncer_add_to_worklist(newvp, delay);
+		}
+		tbp = listheadp->lh_first;
 		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
-			bufinsvn(bp, &newvp->v_dirtyblkhd);
+			bufinsvn(bp, listheadp);
 		} else {
 			while (tbp->b_vnbufs.le_next &&
-				(tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
+			    (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
 				tbp = tbp->b_vnbufs.le_next;
 			}
 			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
 		}
 	} else {
 		bufinsvn(bp, &newvp->v_cleanblkhd);
+		if ((newvp->v_flag & VONWORKLST) &&
+			LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
+			newvp->v_flag &= ~VONWORKLST;
+			LIST_REMOVE(newvp, v_synclist);
+		}
 	}
 	bp->b_vp = newvp;
 	vhold(bp->b_vp);
@@ -863,7 +1043,7 @@ checkalias(nvp, nvp_rdev, mp)
 		nvp->v_rdev = nvp_rdev;
 		nvp->v_hashchain = vpp;
 		nvp->v_specnext = *vpp;
-		nvp->v_specflags = 0;
+		nvp->v_specmountpoint = NULL;
 		simple_unlock(&spechash_slock);
 		*vpp = nvp;
 		if (vp != NULLVP) {
@@ -920,7 +1100,6 @@ vget(vp, flags, p)
 
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
-
 	if (flags & LK_TYPE_MASK) {
 		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
 			/*
@@ -1066,7 +1245,7 @@ vdrop(vp)
 
 	simple_lock(&vp->v_interlock);
 	if (vp->v_holdcnt <= 0)
-		panic("holdrele: holdcnt");
+		panic("vdrop: holdcnt");
 	vp->v_holdcnt--;
 	if (VSHOULDFREE(vp))
 		vfree(vp);
@@ -1790,7 +1969,7 @@ vfs_mountedon(vp)
 	struct vnode *vq;
 	int error = 0;
 
-	if (vp->v_specflags & SI_MOUNTEDON)
+	if (vp->v_specmountpoint != NULL)
 		return (EBUSY);
 	if (vp->v_flag & VALIASED) {
 		simple_lock(&spechash_slock);
@@ -1798,7 +1977,7 @@ vfs_mountedon(vp)
 			if (vq->v_rdev != vp->v_rdev ||
 			    vq->v_type != vp->v_type)
 				continue;
-			if (vq->v_specflags & SI_MOUNTEDON) {
+			if (vq->v_specmountpoint != NULL) {
 				error = EBUSY;
 				break;
 			}
@@ -2326,3 +2505,170 @@ vn_pollgone(vp)
 	}
 	simple_unlock(&vp->v_pollinfo.vpi_lock);
 }
+
+
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
+ */
+#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
+int	sync_fsync __P((struct  vop_fsync_args *));
+int	sync_inactive __P((struct  vop_inactive_args *));
+int	sync_reclaim  __P((struct  vop_reclaim_args *));
+#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
+#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
+int	sync_print __P((struct vop_print_args *));
+#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
+
+vop_t **sync_vnodeop_p;
+struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
+	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
+	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
+	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
+	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
+	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
+	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
+	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
+	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
+	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
+	{ NULL, NULL }
+};
+struct vnodeopv_desc sync_vnodeop_opv_desc =
+	{ &sync_vnodeop_p, sync_vnodeop_entries };
+
+VNODEOP_SET(sync_vnodeop_opv_desc);
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+int
+vfs_allocate_syncvnode(mp)
+	struct mount *mp;
+{
+	struct vnode *vp;
+	static long start, incr, next;
+	int error;
+
+	/* Allocate a new vnode */
+	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
+		mp->mnt_syncer = NULL;
+		return (error);
+	}
+	vp->v_type = VNON;
+	/*
+	 * Place the vnode onto the syncer worklist. We attempt to
+	 * scatter them about on the list so that they will go off
+	 * at evenly distributed times even if all the filesystems
+	 * are mounted at once.
+	 */
+	next += incr;
+	if (next == 0 || next > syncer_maxdelay) {
+		start /= 2;
+		incr /= 2;
+		if (start == 0) {
+			start = syncer_maxdelay / 2;
+			incr = syncer_maxdelay;
+		}
+		next = start;
+	}
+	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
+	mp->mnt_syncer = vp;
+	return (0);
+}
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+int
+sync_fsync(ap)
+	struct vop_fsync_args /* {
+		struct vnode *a_vp;
+		struct ucred *a_cred;
+		int a_waitfor;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *syncvp = ap->a_vp;
+	struct mount *mp = syncvp->v_mount;
+	struct proc *p = ap->a_p;
+	int asyncflag;
+
+	/*
+	 * We only need to do something if this is a lazy evaluation.
+	 */
+	if (ap->a_waitfor != MNT_LAZY)
+		return (0);
+
+	/*
+	 * Move ourselves to the back of the sync list.
+	 */
+	vn_syncer_add_to_worklist(syncvp, syncdelay);
+
+	/*
+	 * Walk the list of vnodes pushing all that are dirty and
+	 * not already on the sync list.
+	 */
+	simple_lock(&mountlist_slock);
+	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0)
+		return (0);
+	asyncflag = mp->mnt_flag & MNT_ASYNC;
+	mp->mnt_flag &= ~MNT_ASYNC;
+	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
+	if (asyncflag)
+		mp->mnt_flag |= MNT_ASYNC;
+	vfs_unbusy(mp, p);
+	return (0);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+int
+sync_inactive(ap)
+	struct vop_inactive_args /* {
+		struct vnode *a_vp;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	vgone(ap->a_vp);
+	return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ */
+int
+sync_reclaim(ap)
+	struct vop_reclaim_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+
+	vp->v_mount->mnt_syncer = NULL;
+	if (vp->v_flag & VONWORKLST) {
+		LIST_REMOVE(vp, v_synclist);
+		vp->v_flag &= ~VONWORKLST;
+	}
+
+	return (0);
+}
+
+/*
+ * Print out a syncer vnode.
+ */
+int
+sync_print(ap)
+	struct vop_print_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+
+	printf("syncer vnode");
+	if (vp->v_vnlock != NULL)
+		lockmgr_printinfo(vp->v_vnlock);
+	printf("\n");
+	return (0);
+}
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
index 596de9561183..4a818dc42939 100644
--- a/sys/kern/vfs_extattr.c
+++ b/sys/kern/vfs_extattr.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
- * $Id: vfs_syscalls.c,v 1.93 1998/02/15 04:17:09 dyson Exp $
+ * $Id: vfs_syscalls.c,v 1.94 1998/03/07 21:35:39 dyson Exp $
  */
 
 /* For 4.3 integer FS ID compatibility */
@@ -283,6 +283,14 @@ mount(p, uap)
 			mp->mnt_flag = flag;
 			mp->mnt_kern_flag = flag2;
 		}
+		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+			if (mp->mnt_syncer == NULL)
+				error = vfs_allocate_syncvnode(mp);
+		} else {
+			if (mp->mnt_syncer != NULL)
+				vrele(mp->mnt_syncer);
+			mp->mnt_syncer = NULL;
+		}
 		vfs_unbusy(mp, p);
 		return (error);
 	}
@@ -296,6 +304,8 @@ mount(p, uap)
 		simple_unlock(&mountlist_slock);
 		checkdirs(vp);
 		VOP_UNLOCK(vp, 0, p);
+		if ((mp->mnt_flag & MNT_RDONLY) == 0)
+			error = vfs_allocate_syncvnode(mp);
 		vfs_unbusy(mp, p);
 		if (error = VFS_START(mp, 0, p))
 			vrele(vp);
@@ -431,12 +441,16 @@ dounmount(mp, flags, p)
 	vfs_msync(mp, MNT_WAIT);
 	mp->mnt_flag &=~ MNT_ASYNC;
 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
+	if (mp->mnt_syncer != NULL)
+		vrele(mp->mnt_syncer);
 	if (((mp->mnt_flag & MNT_RDONLY) ||
 	     (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) ||
 	    (flags & MNT_FORCE))
 		error = VFS_UNMOUNT(mp, flags, p);
 	simple_lock(&mountlist_slock);
 	if (error) {
+		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
+			(void) vfs_allocate_syncvnode(mp);
 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
 		lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE,
 		    &mountlist_slock, p);
@@ -490,9 +504,9 @@ sync(p, uap)
 			asyncflag = mp->mnt_flag & MNT_ASYNC;
 			mp->mnt_flag &= ~MNT_ASYNC;
 			vfs_msync(mp, MNT_NOWAIT);
-			VFS_SYNC(mp, MNT_NOWAIT, p != NULL ? p->p_ucred : NOCRED, p);
-			if (asyncflag)
-				mp->mnt_flag |= MNT_ASYNC;
+			VFS_SYNC(mp, MNT_NOWAIT,
+				((p != NULL) ? p->p_ucred : NOCRED), p);
+			mp->mnt_flag |= asyncflag;
 		}
 		simple_lock(&mountlist_slock);
 		nmp = mp->mnt_list.cqe_next;
@@ -665,10 +679,11 @@ getfsstat(p, uap)
 		if (sfsp && count < maxcount) {
 			sp = &mp->mnt_stat;
 			/*
-			 * If MNT_NOWAIT is specified, do not refresh the
-			 * fsstat cache. MNT_WAIT overrides MNT_NOWAIT.
+			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
+			 * refresh the fsstat cache. MNT_WAIT overrides
+			 * MNT_NOWAIT and MNT_LAZY.
 			 */
-			if (((SCARG(uap, flags) & MNT_NOWAIT) == 0 ||
+			if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 			    (SCARG(uap, flags) & MNT_WAIT)) &&
 			    (error = VFS_STATFS(mp, sp, p))) {
 				simple_lock(&mountlist_slock);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 5d27cf57355f..972604d24b37 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
- * $Id: vfs_subr.c,v 1.136 1998/03/01 23:07:45 dyson Exp $
+ * $Id: vfs_subr.c,v 1.137 1998/03/07 21:35:35 dyson Exp $
  */
 
 /*
@@ -123,6 +123,19 @@ static struct simplelock spechash_slock;
 struct nfs_public nfs_pub;	/* publicly exported FS */
 static vm_zone_t vnode_zone;
 
+/*
+ * The workitem queue.
+ */
+#define SYNCER_MAXDELAY		32
+int syncer_maxdelay =		SYNCER_MAXDELAY;	/* maximum delay time */
+time_t syncdelay =		30;
+int rushjob;				/* number of slots to run ASAP */
+
+static int syncer_delayno = 0;
+static long syncer_mask; 
+LIST_HEAD(synclist, vnode);
+static struct synclist *syncer_workitem_pending;
+
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
 
@@ -147,6 +160,12 @@ vntblinit()
 	simple_lock_init(&vnode_free_list_slock);
 	CIRCLEQ_INIT(&mountlist);
 	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
+	/*
+	 * Initialize the filesystem syncer.
+	 */     
+	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 
+		&syncer_mask);
+	syncer_maxdelay = syncer_mask + 1;
 }
 
 /*
@@ -554,7 +573,7 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 	int s, error;
 	vm_object_t object;
 
-	if (flags & V_SAVE) {
+	if ((flags & V_SAVE) && vp->v_dirtyblkhd.lh_first != NULL) {
 		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
 			return (error);
 		if (vp->v_dirtyblkhd.lh_first != NULL)
@@ -688,16 +707,153 @@ brelvp(bp)
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
+	vp = bp->b_vp;
 	s = splbio();
 	if (bp->b_vnbufs.le_next != NOLIST)
 		bufremvn(bp);
+	if ((vp->v_flag & VONWORKLST) && (LIST_FIRST(&vp->v_dirtyblkhd) == NULL)) {
+		vp->v_flag &= ~VONWORKLST;
+		LIST_REMOVE(vp, v_synclist);
+	}
 	splx(s);
-
-	vp = bp->b_vp;
 	bp->b_vp = (struct vnode *) 0;
 	vdrop(vp);
 }
 
+/*
+ * The workitem queue.
+ * 
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, writes to mounted block
+ * devices are delayed only about half the time that file data is delayed.
+ * Similarly, directory updates are more critical, so are only delayed
+ * about a third the time that file data is delayed. Thus, there are
+ * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
+ * one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ *	syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
+
+/*
+ * Add an item to the syncer work queue.
+ */
+void
+vn_syncer_add_to_worklist(vp, delay)
+	struct vnode *vp;
+	int delay;
+{
+	int s, slot;
+
+	s = splbio();
+
+	if (vp->v_flag & VONWORKLST) {
+		LIST_REMOVE(vp, v_synclist);
+	}
+
+	if (delay > syncer_maxdelay - 2)
+		delay = syncer_maxdelay - 2;
+	slot = (syncer_delayno + delay) & syncer_mask;
+
+	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+	vp->v_flag |= VONWORKLST;
+	splx(s);
+}
+
+static void sched_sync __P((void));
+static struct	proc *updateproc;
+static struct kproc_desc up_kp = {
+	"syncer",
+	sched_sync,
+	&updateproc
+};
+SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void 
+sched_sync(void)
+{
+	struct synclist *slp;
+	struct vnode *vp;
+	long starttime;
+	int s;
+	struct proc *p = updateproc;
+
+	for (;;) {
+		starttime = time.tv_sec;
+
+		/*
+		 * Push files whose dirty time has expired.
+		 */
+		s = splbio();
+		slp = &syncer_workitem_pending[syncer_delayno];
+		syncer_delayno += 1;
+		if (syncer_delayno == syncer_maxdelay)
+			syncer_delayno = 0;
+		splx(s);
+
+		while ((vp = LIST_FIRST(slp)) != NULL) {
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+			VOP_UNLOCK(vp, 0, p);
+			if (LIST_FIRST(slp) == vp) {
+				if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
+				    vp->v_type != VBLK)
+					panic("sched_sync: fsync failed");
+				/*
+				 * Move ourselves to the back of the sync list.
+				 */
+				LIST_REMOVE(vp, v_synclist);
+				vn_syncer_add_to_worklist(vp, syncdelay);
+			}
+		}
+
+		/*
+		 * Do soft update processing.
+		 */
+		if (bioops.io_sync)
+			(*bioops.io_sync)(NULL);
+
+		/*
+		 * The variable rushjob allows the kernel to speed up the
+		 * processing of the filesystem syncer process. A rushjob
+		 * value of N tells the filesystem syncer to process the next
+		 * N seconds worth of work on its queue ASAP. Currently rushjob
+		 * is used by the soft update code to speed up the filesystem
+		 * syncer process when the incore state is getting so far
+		 * ahead of the disk that the kernel memory pool is being
+		 * threatened with exhaustion.
+		 */
+		if (rushjob > 0) {
+			rushjob -= 1;
+			continue;
+		}
+		/*
+		 * If it has taken us less than a second to process the
+		 * current work, then wait. Otherwise start right over
+		 * again. We can still lose time if any single round
+		 * takes more than two seconds, but it does not really
+		 * matter as we are just trying to generally pace the
+		 * filesystem activity.
+		 */
+		if (time.tv_sec == starttime)
+			tsleep(&lbolt, PPAUSE, "syncer", 0);
+	}
+}
+
 /*
  * Associate a p-buffer with a vnode.
  */
@@ -743,6 +899,8 @@ reassignbuf(bp, newvp)
 	register struct buf *bp;
 	register struct vnode *newvp;
 {
+	struct buflists *listheadp;
+	int delay;
 	int s;
 
 	if (newvp == NULL) {
@@ -765,18 +923,40 @@ reassignbuf(bp, newvp)
 	if (bp->b_flags & B_DELWRI) {
 		struct buf *tbp;
 
-		tbp = newvp->v_dirtyblkhd.lh_first;
+		listheadp = &newvp->v_dirtyblkhd;
+		if ((newvp->v_flag & VONWORKLST) == 0) {
+			switch (newvp->v_type) {
+			case VDIR:
+				delay = syncdelay / 3;
+				break;
+			case VBLK:
+				if (newvp->v_specmountpoint != NULL) {
+					delay = syncdelay / 2;
+					break;
+				}
+				/* fall through */
+			default:
+				delay = syncdelay;
+			}
+			vn_syncer_add_to_worklist(newvp, delay);
+		}
+		tbp = listheadp->lh_first;
 		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
-			bufinsvn(bp, &newvp->v_dirtyblkhd);
+			bufinsvn(bp, listheadp);
 		} else {
 			while (tbp->b_vnbufs.le_next &&
-				(tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
+			    (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
 				tbp = tbp->b_vnbufs.le_next;
 			}
 			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
 		}
 	} else {
 		bufinsvn(bp, &newvp->v_cleanblkhd);
+		if ((newvp->v_flag & VONWORKLST) &&
+			LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
+			newvp->v_flag &= ~VONWORKLST;
+			LIST_REMOVE(newvp, v_synclist);
+		}
 	}
 	bp->b_vp = newvp;
 	vhold(bp->b_vp);
@@ -863,7 +1043,7 @@ checkalias(nvp, nvp_rdev, mp)
 		nvp->v_rdev = nvp_rdev;
 		nvp->v_hashchain = vpp;
 		nvp->v_specnext = *vpp;
-		nvp->v_specflags = 0;
+		nvp->v_specmountpoint = NULL;
 		simple_unlock(&spechash_slock);
 		*vpp = nvp;
 		if (vp != NULLVP) {
@@ -920,7 +1100,6 @@ vget(vp, flags, p)
 
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
-
 	if (flags & LK_TYPE_MASK) {
 		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
 			/*
@@ -1066,7 +1245,7 @@ vdrop(vp)
 
 	simple_lock(&vp->v_interlock);
 	if (vp->v_holdcnt <= 0)
-		panic("holdrele: holdcnt");
+		panic("vdrop: holdcnt");
 	vp->v_holdcnt--;
 	if (VSHOULDFREE(vp))
 		vfree(vp);
@@ -1790,7 +1969,7 @@ vfs_mountedon(vp)
 	struct vnode *vq;
 	int error = 0;
 
-	if (vp->v_specflags & SI_MOUNTEDON)
+	if (vp->v_specmountpoint != NULL)
 		return (EBUSY);
 	if (vp->v_flag & VALIASED) {
 		simple_lock(&spechash_slock);
@@ -1798,7 +1977,7 @@ vfs_mountedon(vp)
 			if (vq->v_rdev != vp->v_rdev ||
 			    vq->v_type != vp->v_type)
 				continue;
-			if (vq->v_specflags & SI_MOUNTEDON) {
+			if (vq->v_specmountpoint != NULL) {
 				error = EBUSY;
 				break;
 			}
@@ -2326,3 +2505,170 @@ vn_pollgone(vp)
 	}
 	simple_unlock(&vp->v_pollinfo.vpi_lock);
 }
+
+
+
+/*
+ * Routines to create and manage a filesystem syncer vnode.
+ */
+#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
+int	sync_fsync __P((struct  vop_fsync_args *));
+int	sync_inactive __P((struct  vop_inactive_args *));
+int	sync_reclaim  __P((struct  vop_reclaim_args *));
+#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
+#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
+int	sync_print __P((struct vop_print_args *));
+#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
+
+vop_t **sync_vnodeop_p;	/* ops vector handed to getnewvnode() for syncer vnodes */
+struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
+	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },	/* default */
+	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
+	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
+	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
+	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
+	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
+	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
+	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
+	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
+	{ NULL, NULL }
+};
+struct vnodeopv_desc sync_vnodeop_opv_desc =
+	{ &sync_vnodeop_p, sync_vnodeop_entries };
+
+VNODEOP_SET(sync_vnodeop_opv_desc);
+
+/*
+ * Allocate the syncer vnode for mount point mp and queue it for syncing.
+ */
+int
+vfs_allocate_syncvnode(mp)
+	struct mount *mp;
+{
+	static long start, incr, next;	/* scatter state kept across calls */
+	struct vnode *syncvp;
+	int delay, error;
+
+	/* On failure the mount is left with no syncer vnode. */
+	error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &syncvp);
+	if (error != 0) {
+		mp->mnt_syncer = NULL;
+		return (error);
+	}
+	syncvp->v_type = VNON;
+	/*
+	 * Spread syncer vnodes over the worklist so that filesystems
+	 * mounted together are not all pushed on the same syncer pass.
+	 */
+	next += incr;
+	if (next == 0 || next > syncer_maxdelay) {
+		start /= 2;
+		incr /= 2;
+		if (start == 0) {
+			start = syncer_maxdelay / 2;
+			incr = syncer_maxdelay;
+		}
+		next = start;
+	}
+	delay = (syncdelay > 0) ? next % syncdelay : 0;
+	vn_syncer_add_to_worklist(syncvp, delay);
+	mp->mnt_syncer = syncvp;
+	return (0);
+}
+
+/*
+ * Do a lazy sync of the filesystem that owns this syncer vnode.
+ */
+int
+sync_fsync(ap)
+	struct vop_fsync_args /* {
+		struct vnode *a_vp;
+		struct ucred *a_cred;
+		int a_waitfor;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *syncvp = ap->a_vp;
+	struct mount *mp = syncvp->v_mount;
+	struct proc *p = ap->a_p;
+	int asyncflag;
+
+	/* We only need to do something if this is a lazy evaluation. */
+	if (ap->a_waitfor != MNT_LAZY)
+		return (0);
+
+	/* Move ourselves to the back of the sync list. */
+	vn_syncer_add_to_worklist(syncvp, syncdelay);
+
+	/*
+	 * Walk the list of vnodes pushing all that are dirty and
+	 * not already on the sync list. If the mount cannot be busied
+	 * without sleeping, skip this pass; vfs_busy() returns with
+	 * mountlist_slock still held on failure, so release it here.
+	 */
+	simple_lock(&mountlist_slock);
+	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
+		simple_unlock(&mountlist_slock);
+		return (0);
+	}
+	asyncflag = mp->mnt_flag & MNT_ASYNC;
+	mp->mnt_flag &= ~MNT_ASYNC;
+	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
+	if (asyncflag)
+		mp->mnt_flag |= MNT_ASYNC;
+	vfs_unbusy(mp, p);
+	return (0);
+}
+
+/* Inactive hook: the syncer vnode is no longer referenced, so discard it. */
+int
+sync_inactive(ap)
+	struct vop_inactive_args /* {
+		struct vnode *a_vp;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *syncvp;
+
+	syncvp = ap->a_vp;
+	vgone(syncvp);
+	return (0);
+}
+
+/*
+ * Reclaim hook: detach a dying syncer vnode from its mount and worklist.
+ */
+int
+sync_reclaim(ap)
+	struct vop_reclaim_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct vnode *syncvp = ap->a_vp;
+
+	syncvp->v_mount->mnt_syncer = NULL;
+	if ((syncvp->v_flag & VONWORKLST) == 0)
+		return (0);
+	/* Still queued for the syncer: unlink it and clear the marker. */
+	LIST_REMOVE(syncvp, v_synclist);
+	syncvp->v_flag &= ~VONWORKLST;
+	return (0);
+}
+
+/*
+ * Print hook: describe a syncer vnode, including its lock if it has one.
+ */
+int
+sync_print(ap)
+	struct vop_print_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+
+	printf("syncer vnode");
+	if (ap->a_vp->v_vnlock != NULL)
+		lockmgr_printinfo(ap->a_vp->v_vnlock);
+	printf("\n");
+
+	return (0);
+}
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 596de9561183..4a818dc42939 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
- * $Id: vfs_syscalls.c,v 1.93 1998/02/15 04:17:09 dyson Exp $
+ * $Id: vfs_syscalls.c,v 1.94 1998/03/07 21:35:39 dyson Exp $
  */
 
 /* For 4.3 integer FS ID compatibility */
@@ -283,6 +283,14 @@ mount(p, uap)
 			mp->mnt_flag = flag;
 			mp->mnt_kern_flag = flag2;
 		}
+		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+			if (mp->mnt_syncer == NULL)
+				error = vfs_allocate_syncvnode(mp);
+		} else {
+			if (mp->mnt_syncer != NULL)
+				vrele(mp->mnt_syncer);
+			mp->mnt_syncer = NULL;
+		}
 		vfs_unbusy(mp, p);
 		return (error);
 	}
@@ -296,6 +304,8 @@ mount(p, uap)
 		simple_unlock(&mountlist_slock);
 		checkdirs(vp);
 		VOP_UNLOCK(vp, 0, p);
+		if ((mp->mnt_flag & MNT_RDONLY) == 0)
+			error = vfs_allocate_syncvnode(mp);
 		vfs_unbusy(mp, p);
 		if (error = VFS_START(mp, 0, p))
 			vrele(vp);
@@ -431,12 +441,16 @@ dounmount(mp, flags, p)
 	vfs_msync(mp, MNT_WAIT);
 	mp->mnt_flag &=~ MNT_ASYNC;
 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
+	if (mp->mnt_syncer != NULL)
+		vrele(mp->mnt_syncer);
 	if (((mp->mnt_flag & MNT_RDONLY) ||
 	     (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) ||
 	    (flags & MNT_FORCE))
 		error = VFS_UNMOUNT(mp, flags, p);
 	simple_lock(&mountlist_slock);
 	if (error) {
+		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
+			(void) vfs_allocate_syncvnode(mp);
 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
 		lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE,
 		    &mountlist_slock, p);
@@ -490,9 +504,9 @@ sync(p, uap)
 			asyncflag = mp->mnt_flag & MNT_ASYNC;
 			mp->mnt_flag &= ~MNT_ASYNC;
 			vfs_msync(mp, MNT_NOWAIT);
-			VFS_SYNC(mp, MNT_NOWAIT, p != NULL ? p->p_ucred : NOCRED, p);
-			if (asyncflag)
-				mp->mnt_flag |= MNT_ASYNC;
+			VFS_SYNC(mp, MNT_NOWAIT,
+				((p != NULL) ? p->p_ucred : NOCRED), p);
+			mp->mnt_flag |= asyncflag;
 		}
 		simple_lock(&mountlist_slock);
 		nmp = mp->mnt_list.cqe_next;
@@ -665,10 +679,11 @@ getfsstat(p, uap)
 		if (sfsp && count < maxcount) {
 			sp = &mp->mnt_stat;
 			/*
-			 * If MNT_NOWAIT is specified, do not refresh the
-			 * fsstat cache. MNT_WAIT overrides MNT_NOWAIT.
+			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
+			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
+			 * overrides MNT_WAIT.
 			 */
-			if (((SCARG(uap, flags) & MNT_NOWAIT) == 0 ||
+			if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 			    (SCARG(uap, flags) & MNT_WAIT)) &&
 			    (error = VFS_STATFS(mp, sp, p))) {
 				simple_lock(&mountlist_slock);
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index bedf27422475..922e0604b542 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -31,7 +31,7 @@
 # SUCH DAMAGE.
 #
 #	@(#)vnode_if.src	8.12 (Berkeley) 5/14/95
-# $Id: vnode_if.src,v 1.14 1997/10/16 10:48:00 phk Exp $
+# $Id: vnode_if.src,v 1.15 1997/10/16 20:32:23 phk Exp $
 #
 
 #
@@ -428,6 +428,18 @@ vop_advlock {
 	IN int flags;
 };
 
+#
+#% balloc	vp	L L L
+#
+vop_balloc {
+	IN struct vnode *vp;
+	IN off_t startoffset;
+	IN int size;
+	IN struct ucred *cred;
+	IN int flags;
+	OUT struct buf **bpp;
+};
+
 #
 #% reallocblks	vp	L L L
 #
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index 9c3c8450867d..666322f65609 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
- * $Id: spec_vnops.c,v 1.58 1998/03/07 21:35:52 dyson Exp $
+ * $Id: spec_vnops.c,v 1.59 1998/03/08 08:46:18 dyson Exp $
  */
 
 #include <sys/param.h>
@@ -548,8 +548,12 @@ spec_strategy(ap)
 		struct buf *a_bp;
 	} */ *ap;
 {
+	struct buf *bp;
 
-	(*bdevsw[major(ap->a_bp->b_dev)]->d_strategy)(ap->a_bp);
+	bp = ap->a_bp;
+	if ((LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start)
+		(*bioops.io_start)(bp);
+	(*bdevsw[major(bp->b_dev)]->d_strategy)(bp);
 	return (0);
 }
 
@@ -633,7 +637,9 @@ spec_close(ap)
 		 * we must invalidate any in core blocks, so that
 		 * we can, for instance, change floppy disks.
 		 */
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p);
 		error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0);
+		VOP_UNLOCK(vp, 0, ap->a_p);
 		if (error)
 			return (error);
 
diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h
index 06a5cdd9bbe5..b4c6f7750458 100644
--- a/sys/miscfs/specfs/specdev.h
+++ b/sys/miscfs/specfs/specdev.h
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)specdev.h	8.6 (Berkeley) 5/21/95
- * $Id: specdev.h,v 1.12 1997/09/14 02:58:03 peter Exp $
+ * $Id: specdev.h,v 1.13 1997/10/15 13:23:21 phk Exp $
  */
 
 /*
@@ -42,7 +42,7 @@
 struct specinfo {
 	struct	vnode **si_hashchain;
 	struct	vnode *si_specnext;
-	long	si_flags;
+	struct	mount *si_mountpoint;
 	dev_t	si_rdev;
 };
 /*
@@ -51,12 +51,7 @@ struct specinfo {
 #define v_rdev v_specinfo->si_rdev
 #define v_hashchain v_specinfo->si_hashchain
 #define v_specnext v_specinfo->si_specnext
-#define v_specflags v_specinfo->si_flags
-
-/*
- * Flags for specinfo
- */
-#define	SI_MOUNTEDON	0x0001	/* block special device is mounted on */
+#define v_specmountpoint v_specinfo->si_mountpoint
 
 /*
  * Special device management
diff --git a/sys/msdosfs/msdosfs_vfsops.c b/sys/msdosfs/msdosfs_vfsops.c
index 2b1d1d7f352f..0af5438dae63 100644
--- a/sys/msdosfs/msdosfs_vfsops.c
+++ b/sys/msdosfs/msdosfs_vfsops.c
@@ -1,4 +1,4 @@
-/*	$Id: msdosfs_vfsops.c,v 1.28 1998/02/23 16:44:32 ache Exp $ */
+/*	$Id: msdosfs_vfsops.c,v 1.29 1998/03/01 22:46:27 msmith Exp $ */
 /*	$NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $	*/
 
 /*-
@@ -772,7 +772,7 @@ mountmsdosfs(devvp, mp, p, argp)
 	mp->mnt_stat.f_fsid.val[0] = (long)dev;
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	mp->mnt_flag |= MNT_LOCAL;
-	devvp->v_specflags |= SI_MOUNTEDON;
+	devvp->v_specmountpoint = mp;
 
 	return 0;
 
@@ -818,7 +818,7 @@ msdosfs_unmount(mp, mntflags, p)
 	if (error)
 		return error;
 	pmp = VFSTOMSDOSFS(mp);
-	pmp->pm_devvp->v_specflags &= ~SI_MOUNTEDON;
+	pmp->pm_devvp->v_specmountpoint = NULL;
 #ifdef MSDOSFS_DEBUG
 	{
 		struct vnode *vp = pmp->pm_devvp;
@@ -841,8 +841,9 @@ msdosfs_unmount(mp, mntflags, p)
 		    ((u_int *)vp->v_data)[1]);
 	}
 #endif
-	error = VOP_CLOSE(pmp->pm_devvp, (pmp->pm_flags&MSDOSFSMNT_RONLY) ? FREAD : FREAD | FWRITE,
-	    NOCRED, p);
+	error = VOP_CLOSE(pmp->pm_devvp,
+		    (pmp->pm_flags&MSDOSFSMNT_RONLY) ? FREAD : FREAD | FWRITE,
+		    NOCRED, p);
 	vrele(pmp->pm_devvp);
 	free(pmp->pm_inusemap, M_MSDOSFSFAT);
 	free(pmp, M_MSDOSFSMNT);
@@ -946,9 +947,11 @@ msdosfs_sync(mp, waitfor, cred, p)
 		simple_lock(&vp->v_interlock);
 		nvp = vp->v_mntvnodes.le_next;
 		dep = VTODE(vp);
-		if (vp->v_type == VNON || ((dep->de_flag &
-		    (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0)
-		    && vp->v_dirtyblkhd.lh_first == NULL) {
+		if (vp->v_type == VNON
+		|| (waitfor == MNT_LAZY) /* can this happen with msdosfs? */
+		|| (((dep->de_flag &
+		     (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0)
+		  && (vp->v_dirtyblkhd.lh_first == NULL))) {
 			simple_unlock(&vp->v_interlock);
 			continue;
 		}
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c
index be2b42380f9e..c1f8bb246cf5 100644
--- a/sys/nfs/nfs_bio.c
+++ b/sys/nfs/nfs_bio.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.51 1998/03/06 09:46:43 msmith Exp $
+ * $Id: nfs_bio.c,v 1.52 1998/03/07 21:36:01 dyson Exp $
  */
 
 
@@ -1206,10 +1206,14 @@ nfs_doio(bp, cr, p)
 		 */
     		if (error == EINTR
 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
+			int s;
+
 			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
 			++numdirtybuffers;
 			bp->b_flags |= B_DELWRI;
+			s = splbio();
 			reassignbuf(bp, vp);
+			splx(s);
 			if ((bp->b_flags & B_ASYNC) == 0)
 			    bp->b_flags |= B_EINTR;
 	    	} else {
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index 1021f4706aa1..9c08967b19bf 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_vnops.c	8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.79 1998/03/06 09:46:48 msmith Exp $
+ * $Id: nfs_vnops.c,v 1.80 1998/03/07 21:36:06 dyson Exp $
  */
 
 
@@ -2846,9 +2846,11 @@ nfs_flush(vp, cred, waitfor, p, commit)
 					vfs_bio_need_satisfy();
 				}
 			    }
+			    s = splbio();	/* XXX check this positioning */
 			    bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
 			    reassignbuf(bp, vp);
+			    splx(s);
 			    biodone(bp);
 			}
 		}
@@ -2994,6 +2996,7 @@ nfs_writebp(bp, force)
 	register struct buf *bp;
 	int force;
 {
+	int s;
 	register int oldflags = bp->b_flags, retv = 1;
 	off_t off;
 
@@ -3008,6 +3011,7 @@ nfs_writebp(bp, force)
 		if (needsbuffer)
 			vfs_bio_need_satisfy();
 	}
+	s = splbio(); /* XXX check if needed */
 	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
 
 	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
@@ -3016,6 +3020,7 @@ nfs_writebp(bp, force)
 
 	bp->b_vp->v_numoutput++;
 	curproc->p_stats->p_ru.ru_oublock++;
+	splx(s);
 
 	/*
 	 * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not
@@ -3046,7 +3051,9 @@ nfs_writebp(bp, force)
 		int rtval = biowait(bp);
 
 		if (oldflags & B_DELWRI) {
+			s = splbio();
 			reassignbuf(bp, bp->b_vp);
+			splx(s);
 		}
 
 		brelse(bp);
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index be2b42380f9e..c1f8bb246cf5 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
- * $Id: nfs_bio.c,v 1.51 1998/03/06 09:46:43 msmith Exp $
+ * $Id: nfs_bio.c,v 1.52 1998/03/07 21:36:01 dyson Exp $
  */
 
 
@@ -1206,10 +1206,14 @@ nfs_doio(bp, cr, p)
 		 */
     		if (error == EINTR
 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
+			int s;
+
 			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
 			++numdirtybuffers;
 			bp->b_flags |= B_DELWRI;
+			s = splbio();
 			reassignbuf(bp, vp);
+			splx(s);
 			if ((bp->b_flags & B_ASYNC) == 0)
 			    bp->b_flags |= B_EINTR;
 	    	} else {
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index 1021f4706aa1..9c08967b19bf 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)nfs_vnops.c	8.16 (Berkeley) 5/27/95
- * $Id: nfs_vnops.c,v 1.79 1998/03/06 09:46:48 msmith Exp $
+ * $Id: nfs_vnops.c,v 1.80 1998/03/07 21:36:06 dyson Exp $
  */
 
 
@@ -2846,9 +2846,11 @@ nfs_flush(vp, cred, waitfor, p, commit)
 					vfs_bio_need_satisfy();
 				}
 			    }
+			    s = splbio();	/* XXX check this positioning */
 			    bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
 			    reassignbuf(bp, vp);
+			    splx(s);
 			    biodone(bp);
 			}
 		}
@@ -2994,6 +2996,7 @@ nfs_writebp(bp, force)
 	register struct buf *bp;
 	int force;
 {
+	int s;
 	register int oldflags = bp->b_flags, retv = 1;
 	off_t off;
 
@@ -3008,6 +3011,7 @@ nfs_writebp(bp, force)
 		if (needsbuffer)
 			vfs_bio_need_satisfy();
 	}
+	s = splbio(); /* XXX check if needed */
 	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
 
 	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
@@ -3016,6 +3020,7 @@ nfs_writebp(bp, force)
 
 	bp->b_vp->v_numoutput++;
 	curproc->p_stats->p_ru.ru_oublock++;
+	splx(s);
 
 	/*
 	 * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not
@@ -3046,7 +3051,9 @@ nfs_writebp(bp, force)
 		int rtval = biowait(bp);
 
 		if (oldflags & B_DELWRI) {
+			s = splbio();
 			reassignbuf(bp, bp->b_vp);
+			splx(s);
 		}
 
 		brelse(bp);
diff --git a/sys/pci/if_de.c b/sys/pci/if_de.c
index 1cfaeba29de5..e764252d396b 100644
--- a/sys/pci/if_de.c
+++ b/sys/pci/if_de.c
@@ -1,5 +1,7 @@
+#undef __FreeBSD__
+#define __FreeBSD__ 3
 /*	$NetBSD: if_de.c,v 1.56 1997/10/20 14:32:46 matt Exp $	*/
-/*	$Id: if_de.c,v 1.79 1998/02/06 12:14:08 eivind Exp $ */
+/*	$Id: if_de.c,v 1.80 1998/02/20 13:11:50 bde Exp $ */
 
 /*-
  * Copyright (c) 1994-1997 Matt Thomas (matt@3am-software.com)
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index 47117edfd37d..f274dd62f91e 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)buf.h	8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.45 1998/01/22 17:30:10 dyson Exp $
+ * $Id: buf.h,v 1.46 1998/03/07 21:36:20 dyson Exp $
  */
 
 #ifndef _SYS_BUF_H_
@@ -47,6 +47,24 @@
 #define NOLIST ((struct buf *)0x87654321)
 
 struct buf;
+struct mount;
+
+/*
+ * To avoid including <ufs/ffs/softdep.h> 
+ */   
+LIST_HEAD(workhead, worklist);
+/*
+ * These are currently used only by the soft dependency code, hence
+ * are stored once in a global variable. If other subsystems wanted
+ * to use these hooks, a pointer to a set of bio_ops could be added
+ * to each buffer.
+ */
+extern struct bio_ops {
+	void	(*io_start) __P((struct buf *));	/* run before d_strategy */
+	void	(*io_complete) __P((struct buf *));
+	void	(*io_deallocate) __P((struct buf *));
+	int	(*io_sync) __P((struct mount *));	/* run by sched_sync() */
+} bioops;
 
 struct iodone_chain {
 	long	ic_prev_flags;
@@ -104,6 +122,7 @@ struct buf {
 	} b_cluster;
 	struct	vm_page *b_pages[btoc(MAXPHYS)];
 	int		b_npages;
+	struct	workhead b_dep;		/* List of filesystem dependencies. */
 };
 
 /*
@@ -264,6 +283,7 @@ int	breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int,
 int	bwrite __P((struct buf *));
 void	bdwrite __P((struct buf *));
 void	bawrite __P((struct buf *));
+void	bdirty __P((struct buf *));
 int	bowrite __P((struct buf *));
 void	brelse __P((struct buf *));
 void	bqrelse __P((struct buf *));
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 47117edfd37d..f274dd62f91e 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)buf.h	8.9 (Berkeley) 3/30/95
- * $Id: buf.h,v 1.45 1998/01/22 17:30:10 dyson Exp $
+ * $Id: buf.h,v 1.46 1998/03/07 21:36:20 dyson Exp $
  */
 
 #ifndef _SYS_BUF_H_
@@ -47,6 +47,24 @@
 #define NOLIST ((struct buf *)0x87654321)
 
 struct buf;
+struct mount;
+
+/*
+ * To avoid including <ufs/ffs/softdep.h> 
+ */   
+LIST_HEAD(workhead, worklist);
+/*
+ * These are currently used only by the soft dependency code, hence
+ * are stored once in a global variable. If other subsystems wanted
+ * to use these hooks, a pointer to a set of bio_ops could be added
+ * to each buffer.
+ */
+extern struct bio_ops {
+	void	(*io_start) __P((struct buf *));	/* run before d_strategy */
+	void	(*io_complete) __P((struct buf *));
+	void	(*io_deallocate) __P((struct buf *));
+	int	(*io_sync) __P((struct mount *));	/* run by sched_sync() */
+} bioops;
 
 struct iodone_chain {
 	long	ic_prev_flags;
@@ -104,6 +122,7 @@ struct buf {
 	} b_cluster;
 	struct	vm_page *b_pages[btoc(MAXPHYS)];
 	int		b_npages;
+	struct	workhead b_dep;		/* List of filesystem dependencies. */
 };
 
 /*
@@ -264,6 +283,7 @@ int	breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int,
 int	bwrite __P((struct buf *));
 void	bdwrite __P((struct buf *));
 void	bawrite __P((struct buf *));
+void	bdirty __P((struct buf *));
 int	bowrite __P((struct buf *));
 void	brelse __P((struct buf *));
 void	bqrelse __P((struct buf *));
diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h
index 69d258c96ba5..a04c6881367f 100644
--- a/sys/sys/malloc.h
+++ b/sys/sys/malloc.h
@@ -31,12 +31,14 @@
  * SUCH DAMAGE.
  *
  *	@(#)malloc.h	8.5 (Berkeley) 5/3/95
- * $Id: malloc.h,v 1.35 1997/12/05 19:14:36 bde Exp $
+ * $Id: malloc.h,v 1.36 1997/12/27 09:42:03 bde Exp $
  */
 
 #ifndef _SYS_MALLOC_H_
 #define	_SYS_MALLOC_H_
 
+#define splmem splhigh
+
 #define KMEMSTATS
 
 /*
@@ -165,7 +167,7 @@ struct kmembuckets {
 #else /* do not collect statistics */
 #define	MALLOC(space, cast, size, type, flags) do { \
 	register struct kmembuckets *kbp = &bucket[BUCKETINDX(size)]; \
-	long s = splimp(); \
+	long s = splmem(); \
 	if (kbp->kb_next == NULL) { \
 		(space) = (cast)malloc((u_long)(size), type, flags); \
 	} else { \
@@ -178,7 +180,7 @@ struct kmembuckets {
 #define	FREE(addr, type) do { \
 	register struct kmembuckets *kbp; \
 	register struct kmemusage *kup = btokup(addr); \
-	long s = splimp(); \
+	long s = splmem(); \
 	if (1 << kup->ku_indx > MAXALLOCSAVE) { \
 		free((addr), type); \
 	} else { \
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index 9230ae0979be..05eae825620d 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)mount.h	8.21 (Berkeley) 5/20/95
- *	$Id: mount.h,v 1.56 1998/02/22 01:17:51 jkh Exp $
+ *	$Id: mount.h,v 1.57 1998/03/01 22:46:36 msmith Exp $
  */
 
 #ifndef _SYS_MOUNT_H_
@@ -79,7 +79,8 @@ struct statfs {
 	uid_t	f_owner;		/* user that mounted the filesystem */
 	int	f_type;			/* type of filesystem (see below) */
 	int	f_flags;		/* copy of mount exported flags */
-	long	f_spare[2];		/* spare for later */
+	long    f_syncwrites;		/* count of sync writes since mount */
+	long    f_asyncwrites;		/* count of async writes since mount */
 	char	f_fstypename[MFSNAMELEN]; /* fs type name */
 	char	f_mntonname[MNAMELEN];	/* directory on which mounted */
 	char	f_mntfromname[MNAMELEN];/* mounted filesystem */
@@ -146,6 +147,7 @@ struct mount {
 	struct vfsops	*mnt_op;		/* operations on fs */
 	struct vfsconf	*mnt_vfc;		/* configuration info */
 	struct vnode	*mnt_vnodecovered;	/* vnode we mounted on */
+	struct vnode	*mnt_syncer;		/* syncer vnode */
 	struct vnodelst	mnt_vnodelist;		/* list of vnodes this mount */
 	struct lock	mnt_lock;		/* mount structure lock */
 	int		mnt_flag;		/* flags shared with user */
@@ -167,6 +169,7 @@ struct mount {
 #define	MNT_UNION	0x00000020	/* union with underlying filesystem */
 #define	MNT_ASYNC	0x00000040	/* file system written asynchronously */
 #define	MNT_SUIDDIR	0x00100000	/* special handling of SUID on dirs */
+#define	MNT_SOFTDEP	0x00200000	/* soft updates being done */
 #define	MNT_NOATIME	0x10000000	/* disable update of file access time */
 #define	MNT_NOCLUSTERR	0x40000000	/* disable cluster read */
 #define	MNT_NOCLUSTERW	0x80000000	/* disable cluster write */
@@ -203,7 +206,8 @@ struct mount {
 			MNT_DEFEXPORTED	| MNT_EXPORTANON| MNT_EXKERB	| \
 			MNT_LOCAL	| MNT_USER	| MNT_QUOTA	| \
 			MNT_ROOTFS	| MNT_NOATIME	| MNT_NOCLUSTERR| \
-			MNT_NOCLUSTERW	| MNT_SUIDDIR/*	| MNT_EXPUBLIC */)
+			MNT_NOCLUSTERW	| MNT_SUIDDIR	| MNT_SOFTDEP	 \
+			/*	| MNT_EXPUBLIC */)
 /*
  * External filesystem command modifier flags.
  * Unmount can use the MNT_FORCE flag.
@@ -248,8 +252,8 @@ struct mount {
  *
  * waitfor flags to vfs_sync() and getfsstat()
  */
-#define MNT_WAIT	1
-#define MNT_NOWAIT	2
+#define MNT_WAIT	1	/* synchronously wait for I/O to complete */
+#define MNT_NOWAIT	2	/* start all I/O, but do not wait for it */
 #define MNT_LAZY	3	/* push data not written by filesystem syncer */
 
 /*
@@ -448,6 +452,7 @@ int	vfs_export			    /* process mount export info */
 int	vfs_vrele __P((struct mount *, struct vnode *));
 struct	netcred *vfs_export_lookup	    /* lookup host in fs export list */
 	  __P((struct mount *, struct netexport *, struct sockaddr *));
+int	vfs_allocate_syncvnode __P((struct mount *));
 void	vfs_getnewfsid __P((struct mount *));
 struct	mount *vfs_getvfs __P((fsid_t *));      /* return vfs given fsid */
 int	vfs_mountedon __P((struct vnode *));    /* is a vfs mounted on vp */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index aed0b4936a13..1e21926b95a2 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
- * $Id: vnode.h,v 1.66 1998/01/24 02:01:31 dyson Exp $
+ * $Id: vnode.h,v 1.67 1998/03/07 21:36:27 dyson Exp $
  */
 
 #ifndef _SYS_VNODE_H_
@@ -61,7 +61,7 @@ enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD };
 enum vtagtype	{
 	VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_PC, VT_LFS, VT_LOFS, VT_FDESC,
 	VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS,
-	VT_UNION, VT_MSDOSFS, VT_DEVFS, VT_TFS
+	VT_UNION, VT_MSDOSFS, VT_DEVFS, VT_TFS, VT_VFS
 };
 
 /*
@@ -94,6 +94,7 @@ struct vnode {
 	LIST_ENTRY(vnode) v_mntvnodes;		/* vnodes for mount point */
 	struct	buflists v_cleanblkhd;		/* clean blocklist head */
 	struct	buflists v_dirtyblkhd;		/* dirty blocklist head */
+	LIST_ENTRY(vnode) v_synclist;		/* vnodes with dirty buffers */
 	long	v_numoutput;			/* num of writes in progress */
 	enum	vtype v_type;			/* vnode type */
 	union {
@@ -154,7 +155,8 @@ struct vnode {
 #define	VOWANT		0x20000	/* a process is waiting for VOLOCK */
 #define	VDOOMED		0x40000	/* This vnode is being recycled */
 #define	VFREE		0x80000	/* This vnode is on the freelist */
-#define	VTBFREE		0x100000	/* This vnode is no the to be freelist */
+#define	VTBFREE		0x100000 /* This vnode is on the to-be-freelist */
+#define	VONWORKLST	0x200000 /* On syncer work-list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
@@ -243,6 +245,7 @@ extern int		vttoif_tab[];
 
 #define	VREF(vp)	vref(vp)
 
+
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
@@ -262,6 +265,8 @@ extern int		vttoif_tab[];
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	int desiredvnodes;		/* number of vnodes desired */
+extern	time_t syncdelay;		/* time to delay syncing vnodes */
+extern	int rushjob;		/* # of slots filesys_syncer should run ASAP */
 extern	struct vm_zone *namei_zone;
 extern	int prtactive;			/* nonzero to call vprint() */
 extern	struct vattr va_null;		/* predefined null vattr structure */
@@ -499,6 +504,7 @@ int 	vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *cred, int *aresid, struct proc *p));
 int	vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p));
+void	vn_syncer_add_to_worklist __P((struct vnode *vp, int delay));
 int	vfs_cache_lookup __P((struct vop_lookup_args *ap));
 int	vfs_object_create __P((struct vnode *vp, struct proc *p,
                 struct ucred *cred, int waslocked));
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index f2e6a74febc5..8eab25a204a2 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_alloc.c	8.18 (Berkeley) 5/26/95
- * $Id: ffs_alloc.c,v 1.46 1998/02/04 22:33:27 eivind Exp $
+ * $Id: ffs_alloc.c,v 1.47 1998/02/06 12:14:13 eivind Exp $
  */
 
 #include "opt_quota.h"
@@ -57,7 +57,8 @@ typedef ufs_daddr_t allocfcn_t __P((struct inode *ip, int cg, ufs_daddr_t bpref,
 				  int size));
 
 static ufs_daddr_t ffs_alloccg __P((struct inode *, int, ufs_daddr_t, int));
-static ufs_daddr_t ffs_alloccgblk __P((struct fs *, struct cg *, ufs_daddr_t));
+static ufs_daddr_t
+	      ffs_alloccgblk __P((struct inode *, struct buf *, ufs_daddr_t));
 #ifdef DIAGNOSTIC
 static int	ffs_checkblk __P((struct inode *, ufs_daddr_t, long));
 #endif
@@ -292,7 +293,8 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
 					 ffs_alloccg);
 	if (bno > 0) {
 		bp->b_blkno = fsbtodb(fs, bno);
-		ffs_blkfree(ip, bprev, (long)osize);
+		if (!DOINGSOFTDEP(ITOV(ip)))
+			ffs_blkfree(ip, bprev, (long)osize);
 		if (nsize < request)
 			ffs_blkfree(ip, bno + numfrags(fs, nsize),
 			    (long)(request - nsize));
@@ -455,8 +457,10 @@ ffs_reallocblks(ap)
 #endif
 	blkno = newblk;
 	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
-		if (i == ssize)
+		if (i == ssize) {
 			bap = ebap;
+			soff = -i;
+		}
 #ifdef DIAGNOSTIC
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
@@ -468,6 +472,16 @@ ffs_reallocblks(ap)
 		if (prtrealloc)
 			printf(" %d,", *bap);
 #endif
+		if (DOINGSOFTDEP(vp)) {
+			if (sbap == &ip->i_db[0] && i < ssize)
+				softdep_setup_allocdirect(ip, start_lbn + i,
+				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
+				    buflist->bs_children[i]);
+			else
+				softdep_setup_allocindir_page(ip, start_lbn + i,
+				    i < ssize ? sbp : ebp, soff + i, blkno,
+				    *bap, buflist->bs_children[i]);
+		}
 		*bap++ = blkno;
 	}
 	/*
@@ -509,8 +523,10 @@ ffs_reallocblks(ap)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
-		ffs_blkfree(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-		    fs->fs_bsize);
+		if (!DOINGSOFTDEP(vp))
+			ffs_blkfree(ip,
+			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
+			    fs->fs_bsize);
 		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
 #ifdef DEBUG
 		if (!ffs_checkblk(ip,
@@ -847,6 +863,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize)
 		fs->fs_cs(fs, cg).cs_nffree--;
 	}
 	fs->fs_fmod = 1;
+	if (DOINGSOFTDEP(ITOV(ip)))
+		softdep_setup_blkmapdep(bp, fs, bprev);
 	bdwrite(bp);
 	return (bprev);
 }
@@ -868,7 +886,8 @@ ffs_alloccg(ip, cg, bpref, size)
 	register struct cg *cgp;
 	struct buf *bp;
 	register int i;
-	int error, bno, frags, allocsiz;
+	ufs_daddr_t bno, blkno;
+	int allocsiz, error, frags;
 
 	fs = ip->i_fs;
 	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
@@ -887,7 +906,7 @@ ffs_alloccg(ip, cg, bpref, size)
 	}
 	cgp->cg_time = time.tv_sec;
 	if (size == fs->fs_bsize) {
-		bno = ffs_alloccgblk(fs, cgp, bpref);
+		bno = ffs_alloccgblk(ip, bp, bpref);
 		bdwrite(bp);
 		return (bno);
 	}
@@ -909,7 +928,7 @@ ffs_alloccg(ip, cg, bpref, size)
 			brelse(bp);
 			return (0);
 		}
-		bno = ffs_alloccgblk(fs, cgp, bpref);
+		bno = ffs_alloccgblk(ip, bp, bpref);
 		bpref = dtogd(fs, bno);
 		for (i = frags; i < fs->fs_frag; i++)
 			setbit(cg_blksfree(cgp), bpref + i);
@@ -936,8 +955,11 @@ ffs_alloccg(ip, cg, bpref, size)
 	cgp->cg_frsum[allocsiz]--;
 	if (frags != allocsiz)
 		cgp->cg_frsum[allocsiz - frags]++;
+	blkno = cg * fs->fs_fpg + bno;
+	if (DOINGSOFTDEP(ITOV(ip)))
+		softdep_setup_blkmapdep(bp, fs, blkno);
 	bdwrite(bp);
-	return (cg * fs->fs_fpg + bno);
+	return ((u_long)blkno);
 }
 
 /*
@@ -952,16 +974,20 @@ ffs_alloccg(ip, cg, bpref, size)
  * blocks may be fragmented by the routine that allocates them.
  */
 static ufs_daddr_t
-ffs_alloccgblk(fs, cgp, bpref)
-	register struct fs *fs;
-	register struct cg *cgp;
+ffs_alloccgblk(ip, bp, bpref)
+	struct inode *ip;
+	struct buf *bp;
 	ufs_daddr_t bpref;
 {
+	struct fs *fs;
+	struct cg *cgp;
 	ufs_daddr_t bno, blkno;
 	int cylno, pos, delta;
 	short *cylbp;
 	register int i;
 
+	fs = ip->i_fs;
+	cgp = (struct cg *)bp->b_data;
 	if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) {
 		bpref = cgp->cg_rotor;
 		goto norot;
@@ -1052,7 +1078,10 @@ ffs_alloccgblk(fs, cgp, bpref)
 	cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--;
 	cg_blktot(cgp)[cylno]--;
 	fs->fs_fmod = 1;
-	return (cgp->cg_cgx * fs->fs_fpg + bno);
+	blkno = cgp->cg_cgx * fs->fs_fpg + bno;
+	if (DOINGSOFTDEP(ITOV(ip)))
+		softdep_setup_blkmapdep(bp, fs, blkno);
+	return (blkno);
 }
 
 #ifdef notyet
@@ -1155,7 +1184,7 @@ ffs_clusteralloc(ip, cg, bpref, len)
 		panic("ffs_clusteralloc: allocated out of group");
 	len = blkstofrags(fs, len);
 	for (i = 0; i < len; i += fs->fs_frag)
-		if ((got = ffs_alloccgblk(fs, cgp, bno + i)) != bno + i)
+		if ((got = ffs_alloccgblk(ip, bp, bno + i)) != bno + i)
 			panic("ffs_clusteralloc: lost block");
 	bdwrite(bp);
 	return (bno);
@@ -1234,6 +1263,8 @@ ffs_nodealloccg(ip, cg, ipref, mode)
 	panic("ffs_nodealloccg: block not in map");
 	/* NOTREACHED */
 gotit:
+	if (DOINGSOFTDEP(ITOV(ip)))
+		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
 	setbit(cg_inosused(cgp), ipref);
 	cgp->cg_cs.cs_nifree--;
 	fs->fs_cstotal.cs_nifree--;
@@ -1268,9 +1299,10 @@ ffs_blkfree(ip, bno, size)
 	int i, error, cg, blk, frags, bbase;
 
 	fs = ip->i_fs;
-	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
-		printf("dev = 0x%lx, bsize = %ld, size = %ld, fs = %s\n",
-		    (u_long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
+	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
+	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
+		printf("dev=0x%lx, bno = %d, bsize = %d, size = %ld, fs = %s\n",
+		    (u_long)ip->i_dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
 		panic("ffs_blkfree: bad size");
 	}
 	cg = dtog(fs, bno);
@@ -1294,7 +1326,7 @@ ffs_blkfree(ip, bno, size)
 	bno = dtogd(fs, bno);
 	if (size == fs->fs_bsize) {
 		blkno = fragstoblks(fs, bno);
-		if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) {
+		if (!ffs_isfreeblock(fs, cg_blksfree(cgp), blkno)) {
 			printf("dev = 0x%lx, block = %ld, fs = %s\n",
 			    (u_long) ip->i_dev, bno, fs->fs_fsmnt);
 			panic("ffs_blkfree: freeing free block");
@@ -1404,11 +1436,26 @@ ffs_checkblk(ip, bno, size)
 
 /*
  * Free an inode.
- *
- * The specified inode is placed back in the free map.
  */
 int
-ffs_vfree(pvp, ino, mode)
+ffs_vfree( pvp, ino, mode)
+	struct vnode *pvp;
+	ino_t ino;
+	int mode;
+{
+	if (DOINGSOFTDEP(pvp)) {
+		softdep_freefile(pvp, ino, mode);
+		return (0);
+	}
+	return (ffs_freefile(pvp, ino, mode));
+}
+
+/*
+ * Do the actual free operation.
+ * The specified inode is placed back in the free map.
+ */
+ int
+ ffs_freefile( pvp, ino, mode)
 	struct vnode *pvp;
 	ino_t ino;
 	int mode;
@@ -1429,7 +1476,7 @@ ffs_vfree(pvp, ino, mode)
 		(int)fs->fs_cgsize, NOCRED, &bp);
 	if (error) {
 		brelse(bp);
-		return (0);
+		return (error);
 	}
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp)) {
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index 60d20376d1b6..cf253b72e6e4 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -31,13 +31,14 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
- * $Id: ffs_balloc.c,v 1.18 1998/02/04 22:33:31 eivind Exp $
+ * $Id: ffs_balloc.c,v 1.19 1998/02/06 12:14:14 eivind Exp $
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/lock.h>
+#include <sys/mount.h>
 #include <sys/vnode.h>
 
 #include <ufs/ufs/quota.h>
@@ -53,16 +54,23 @@
  * the inode and the logical block number in a file.
  */
 int
-ffs_balloc(ip, lbn, size, cred, bpp, flags)
+ffs_balloc(ap)
+	struct vop_balloc_args /* {
+		struct vnode *a_vp;
+		off_t a_startoffset;
+		int a_size;
+		struct ucred *a_cred;
+		int a_flags;
+		struct buf **a_bpp;
+	} */ *ap;
+{
 	register struct inode *ip;
 	register ufs_daddr_t lbn;
 	int size;
 	struct ucred *cred;
-	struct buf **bpp;
 	int flags;
-{
-	register struct fs *fs;
-	register ufs_daddr_t nb;
+	struct fs *fs;
+	ufs_daddr_t nb;
 	struct buf *bp, *nbp;
 	struct vnode *vp = ITOV(ip);
 	struct indir indirs[NIADDR + 2];
@@ -70,10 +78,18 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 	int deallocated, osize, nsize, num, i, error;
 	ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
 
-	*bpp = NULL;
+	vp = ap->a_vp;
+	ip = VTOI(vp);
+	fs = ip->i_fs;
+	lbn = lblkno(fs, ap->a_startoffset);
+	size = blkoff(fs, ap->a_startoffset) + ap->a_size;
+	if (size > fs->fs_bsize)
+		panic("ffs_balloc: blk too big");
+	*ap->a_bpp = NULL;
 	if (lbn < 0)
 		return (EFBIG);
-	fs = ip->i_fs;
+	cred = ap->a_cred;
+	flags = ap->a_flags;
 
 	/*
 	 * If the next write will extend the file into a new block,
@@ -89,6 +105,10 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 				osize, (int)fs->fs_bsize, cred, &bp);
 			if (error)
 				return (error);
+			if (DOINGSOFTDEP(vp))
+				softdep_setup_allocdirect(ip, nb,
+				    dbtofsb(fs, bp->b_blkno), ip->i_db[nb],
+				    fs->fs_bsize, osize, bp);
 			ip->i_size = smalllblktosize(fs, nb + 1);
 			ip->i_db[nb] = dbtofsb(fs, bp->b_blkno);
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
@@ -110,7 +130,7 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 				return (error);
 			}
 			bp->b_blkno = fsbtodb(fs, nb);
-			*bpp = bp;
+			*ap->a_bpp = bp;
 			return (0);
 		}
 		if (nb != 0) {
@@ -132,6 +152,10 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 					&ip->i_db[0]), osize, nsize, cred, &bp);
 				if (error)
 					return (error);
+				if (DOINGSOFTDEP(vp))
+					softdep_setup_allocdirect(ip, lbn,
+					    dbtofsb(fs, bp->b_blkno), nb,
+					    nsize, osize, bp);
 			}
 		} else {
 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
@@ -147,10 +171,13 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 			bp->b_blkno = fsbtodb(fs, newb);
 			if (flags & B_CLRBUF)
 				vfs_bio_clrbuf(bp);
+			if (DOINGSOFTDEP(vp))
+				softdep_setup_allocdirect(ip, lbn, newb, 0,
+				    nsize, 0, bp);
 		}
 		ip->i_db[lbn] = dbtofsb(fs, bp->b_blkno);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
-		*bpp = bp;
+		*ap->a_bpp = bp;
 		return (0);
 	}
 	/*
@@ -180,12 +207,18 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0);
 		bp->b_blkno = fsbtodb(fs, nb);
 		vfs_bio_clrbuf(bp);
-		/*
-		 * Write synchronously so that indirect blocks
-		 * never point at garbage.
-		 */
-		if (error = bwrite(bp))
-			goto fail;
+		if (DOINGSOFTDEP(vp)) {
+			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
+			    newb, 0, fs->fs_bsize, 0, bp);
+			bdwrite(bp);
+		} else {
+			/*
+			 * Write synchronously so that indirect blocks
+			 * never point at garbage.
+			 */
+			if (error = bwrite(bp))
+				goto fail;
+		}
 		allocib = &ip->i_ib[indirs[0].in_off];
 		*allocib = nb;
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
@@ -221,13 +254,19 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0);
 		nbp->b_blkno = fsbtodb(fs, nb);
 		vfs_bio_clrbuf(nbp);
-		/*
-		 * Write synchronously so that indirect blocks
-		 * never point at garbage.
-		 */
-		if (error = bwrite(nbp)) {
-			brelse(bp);
-			goto fail;
+		if (DOINGSOFTDEP(vp)) {
+			softdep_setup_allocindir_meta(nbp, ip, bp,
+			    indirs[i - 1].in_off, nb);
+			bdwrite(nbp);
+		} else {
+			/*
+			 * Write synchronously so that indirect blocks
+			 * never point at garbage.
+			 */
+			if (error = bwrite(nbp)) {
+				brelse(bp);
+				goto fail;
+			}
 		}
 		bap[indirs[i - 1].in_off] = nb;
 		/*
@@ -259,6 +298,9 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 		nbp->b_blkno = fsbtodb(fs, nb);
 		if (flags & B_CLRBUF)
 			vfs_bio_clrbuf(nbp);
+		if (DOINGSOFTDEP(vp))
+			softdep_setup_allocindir_page(ip, lbn, bp,
+			    indirs[i].in_off, nb, 0, nbp);
 		bap[indirs[i].in_off] = nb;
 		/*
 		 * If required, write synchronously, otherwise use
@@ -271,7 +313,7 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 				bp->b_flags |= B_CLUSTEROK;
 			bdwrite(bp);
 		}
-		*bpp = nbp;
+		*ap->a_bpp = nbp;
 		return (0);
 	}
 	brelse(bp);
@@ -285,7 +327,7 @@ ffs_balloc(ip, lbn, size, cred, bpp, flags)
 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
 		nbp->b_blkno = fsbtodb(fs, nb);
 	}
-	*bpp = nbp;
+	*ap->a_bpp = nbp;
 	return (0);
 fail:
 	/*
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index dad97d3d426b..90cb38c02a11 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_extern.h	8.6 (Berkeley) 3/30/95
- * $Id: ffs_extern.h,v 1.21 1997/11/22 08:35:45 bde Exp $
+ * $Id: ffs_extern.h,v 1.22 1998/02/03 21:52:00 bde Exp $
  */
 
 #ifndef _UFS_FFS_EXTERN_H
@@ -68,8 +68,7 @@ struct vop_reallocblks_args;
 
 int	ffs_alloc __P((struct inode *,
 	    ufs_daddr_t, ufs_daddr_t, int, struct ucred *, ufs_daddr_t *));
-int	ffs_balloc __P((struct inode *,
-	    ufs_daddr_t, int, struct ucred *, struct buf **, int));
+int	ffs_balloc __P((struct vop_balloc_args *));
 int	ffs_blkatoff __P((struct vnode *, off_t, char **, struct buf **));
 void	ffs_blkfree __P((struct inode *, ufs_daddr_t, long));
 ufs_daddr_t ffs_blkpref __P((struct inode *, ufs_daddr_t, int, ufs_daddr_t *));
@@ -79,7 +78,9 @@ int	ffs_fhtovp __P((struct mount *, struct fid *, struct sockaddr *,
 	    struct vnode **, int *, struct ucred **));
 int	ffs_flushfiles __P((struct mount *, int, struct proc *));
 void	ffs_fragacct __P((struct fs *, int, int32_t [], int));
+int	ffs_freefile __P(( struct vnode *, ino_t, int ));
 int	ffs_isblock __P((struct fs *, u_char *, ufs_daddr_t));
+int	ffs_isfreeblock __P((struct fs *, unsigned char *, ufs_daddr_t));
 int	ffs_mountfs __P((struct vnode *, struct mount *, struct proc *,
 	     struct malloc_type *));
 int	ffs_mountroot __P((void));
@@ -102,4 +103,31 @@ extern vop_t **ffs_vnodeop_p;
 extern vop_t **ffs_specop_p;
 extern vop_t **ffs_fifoop_p;
 
+/*
+ * Soft update function prototypes.
+ */
+void	softdep_initialize __P((void));
+int	softdep_process_worklist __P((struct mount *));
+int	softdep_mount __P((struct vnode *, struct mount *, struct fs *,
+	    struct ucred *));
+int	softdep_flushfiles __P((struct mount *, int, struct proc *));
+void	softdep_update_inodeblock __P((struct inode *, struct buf *, int));
+void	softdep_load_inodeblock __P((struct inode *));
+int	softdep_fsync __P((struct vnode *));
+void	softdep_freefile __P((struct vnode *, ino_t, int));
+void	softdep_setup_freeblocks __P((struct inode *, off_t));
+void	softdep_deallocate_dependencies __P((struct buf *));
+void	softdep_setup_inomapdep __P((struct buf *, struct inode *, ino_t));
+void	softdep_setup_blkmapdep __P((struct buf *, struct fs *, ufs_daddr_t));
+void	softdep_setup_allocdirect __P((struct inode *, ufs_lbn_t, ufs_daddr_t,
+	    ufs_daddr_t, long, long, struct buf *));
+void	softdep_setup_allocindir_meta __P((struct buf *, struct inode *,
+	    struct buf *, int, ufs_daddr_t));
+void	softdep_setup_allocindir_page __P((struct inode *, ufs_lbn_t,
+	    struct buf *, int, ufs_daddr_t, ufs_daddr_t, struct buf *));
+void	softdep_disk_io_initiation __P((struct buf *));
+void	softdep_disk_write_complete __P((struct buf *));
+int	softdep_sync_metadata __P((struct vop_fsync_args *));
+
 #endif /* !_UFS_FFS_EXTERN_H */
+
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index cf1c043bd9bd..d1364a19b0e4 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_inode.c	8.13 (Berkeley) 4/21/95
- * $Id: ffs_inode.c,v 1.34 1998/02/06 12:14:14 eivind Exp $
+ * $Id: ffs_inode.c,v 1.35 1998/03/07 21:36:33 dyson Exp $
  */
 
 #include "opt_quota.h"
@@ -87,8 +87,9 @@ ffs_update(vp, access, modify, waitfor)
 		    ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE);
 		return (0);
 	}
-	if ((ip->i_flag &
-	    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0)
+	if (((ip->i_flag &
+	      (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) &&
+	    (waitfor != MNT_WAIT))
 		return (0);
 	/*
 	 * Use a copy of the current time to get consistent timestamps
@@ -129,11 +130,15 @@ ffs_update(vp, access, modify, waitfor)
 		brelse(bp);
 		return (error);
 	}
+	if (DOINGSOFTDEP(vp))
+		softdep_update_inodeblock(ip, bp, waitfor);
+	else if (ip->i_effnlink != ip->i_nlink)
+		panic("ffs_update: bad link cnt");
 	*((struct dinode *)bp->b_data +
 	    ino_to_fsbo(fs, ip->i_number)) = ip->i_din;
-	if (waitfor && (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
+	if (waitfor && (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) {
 		return (bwrite(bp));
-	else {
+	} else {
 		if (bp->b_bufsize == fs->fs_bsize)
 			bp->b_flags |= B_CLUSTEROK;
 		bdwrite(bp);
@@ -171,6 +176,8 @@ ffs_truncate(vp, length, flags, cred, p)
 	off_t osize;
 
 	oip = VTOI(ovp);
+	if (oip->i_size == length)
+		return (0);
 	fs = oip->i_fs;
 	if (length < 0)
 		return (EINVAL);
@@ -197,6 +204,31 @@ ffs_truncate(vp, length, flags, cred, p)
 	if (error)
 		return (error);
 #endif
+	ovp->v_lasta = ovp->v_clen = ovp->v_cstart = ovp->v_lastw = 0;
+	if (DOINGSOFTDEP(ovp)) {
+		if (length > 0) {
+			/*
+			 * If a file is only partially truncated, then
+			 * we have to clean up the data structures
+			 * describing the allocation past the truncation
+			 * point. Finding and deallocating those structures
+			 * is a lot of work. Since partial truncation occurs
+			 * rarely, we solve the problem by syncing the file
+			 * so that it will have no data structures left.
+			 */
+			if ((error = VOP_FSYNC(ovp, cred, MNT_WAIT,
+			    p)) != 0)
+				return (error);
+		} else {
+#ifdef QUOTA
+			(void) chkdq(oip, -oip->i_blocks, NOCRED, 0);
+#endif
+			softdep_setup_freeblocks(oip, length);
+			(void) vinvalbuf(ovp, 0, cred, p, 0, 0);
+			oip->i_flag |= IN_CHANGE | IN_UPDATE;
+			return (ffs_update(ovp, &tv, &tv, 0));
+		}
+	}
 	osize = oip->i_size;
 	/*
 	 * Lengthen the size of the file. We must ensure that the
@@ -205,13 +237,15 @@ ffs_truncate(vp, length, flags, cred, p)
 	 */
 	if (osize < length) {
 		vnode_pager_setsize(ovp, length);
+#if 0
 		offset = blkoff(fs, length - 1);
 		lbn = lblkno(fs, length - 1);
+#endif
 		aflags = B_CLRBUF;
 		if (flags & IO_SYNC)
 			aflags |= B_SYNC;
-		error = ffs_balloc(oip, lbn, offset + 1, cred,
-		    &bp, aflags);
+		error = VOP_BALLOC(ovp, length - 1, 1,
+		    cred, aflags, &bp);
 		if (error)
 			return (error);
 		oip->i_size = length;
@@ -241,9 +275,13 @@ ffs_truncate(vp, length, flags, cred, p)
 		aflags = B_CLRBUF;
 		if (flags & IO_SYNC)
 			aflags |= B_SYNC;
-		error = ffs_balloc(oip, lbn, offset, cred, &bp, aflags);
-		if (error)
+		error = VOP_BALLOC(ovp, length - 1, 1, cred, aflags, &bp);
+		if (error) {
+#if 0	/* kirk's version had this */
+			vnode_pager_setsize(ovp, (u_long)osize);
+#endif
 			return (error);
+		}
 		oip->i_size = length;
 		size = blksize(fs, oip, lbn);
 		bzero((char *)bp->b_data + offset, (u_int)(size - offset));
diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c
index 601a4cf7ba19..7d7de141dfbe 100644
--- a/sys/ufs/ffs/ffs_subr.c
+++ b/sys/ufs/ffs/ffs_subr.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_subr.c	8.5 (Berkeley) 3/21/95
- * $Id: ffs_subr.c,v 1.18 1998/02/06 12:14:14 eivind Exp $
+ * $Id: ffs_subr.c,v 1.19 1998/02/13 00:20:36 bde Exp $
  */
 
 #include <sys/param.h>
@@ -190,6 +190,30 @@ ffs_isblock(fs, cp, h)
 	}
 }
 
+/*
+ * check that no fragment of a block is marked free in the map
+ */
+int
+ffs_isfreeblock(fs, cp, h)
+	struct fs *fs;
+	unsigned char *cp;
+	ufs_daddr_t h;
+{
+
+	switch ((int)fs->fs_frag) {
+	case 8:
+		return (cp[h] == 0);
+	case 4:
+		return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+	case 2:
+		return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+	case 1:
+		return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+	default:
+		panic("ffs_isfreeblock");
+	}
+}
+
 /*
  * take a block out of the map
  */
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index edfe0e696f10..71a451dd38ec 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vfsops.c	8.31 (Berkeley) 5/20/95
- * $Id: ffs_vfsops.c,v 1.74 1998/03/07 14:59:44 bde Exp $
+ * $Id: ffs_vfsops.c,v 1.75 1998/03/07 21:36:36 dyson Exp $
  */
 
 #include "opt_quota.h"
@@ -203,7 +203,11 @@ ffs_mount( mp, path, data, ndp, p)
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
-			err = ffs_flushfiles(mp, flags, p);
+			if (mp->mnt_flag & MNT_SOFTDEP) {
+				err = softdep_flushfiles(mp, flags, p);
+			} else {
+				err = ffs_flushfiles(mp, flags, p);
+			}
 		}
 		if (!err && (mp->mnt_flag & MNT_RELOAD))
 			err = ffs_reload(mp, ndp->ni_cnd.cn_cred, p);
@@ -410,7 +414,10 @@ ffs_reload(mp, cred, p)
 	 * Step 1: invalidate all cached meta-data.
 	 */
 	devvp = VFSTOUFS(mp)->um_devvp;
-	if (vinvalbuf(devvp, 0, cred, p, 0, 0))
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
+	error = vinvalbuf(devvp, 0, cred, p, 0, 0);
+	VOP_UNLOCK(devvp, 0, p);
+	if (error)
 		panic("ffs_reload: dirty1");
 
 	dev = devvp->v_rdev;
@@ -516,6 +523,7 @@ ffs_reload(mp, cred, p)
 		}
 		ip->i_din = *((struct dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number));
+		ip->i_effnlink = ip->i_nlink;
 		brelse(bp);
 		vput(vp);
 		simple_lock(&mntvnode_slock);
@@ -537,10 +545,12 @@ ffs_mountfs(devvp, mp, p, malloctype)
 	register struct ufsmount *ump;
 	struct buf *bp;
 	register struct fs *fs;
+	struct cg *cgp;
 	dev_t dev;
 	struct partinfo dpart;
+	struct csum cstotal;
 	caddr_t base, space;
-	int error, i, blks, size, ronly;
+	int error, i, cyl, blks, size, ronly;
 	int32_t *lp;
 	struct ucred *cred;
 	u_int64_t maxfilesize;					/* XXX */
@@ -562,7 +572,10 @@ ffs_mountfs(devvp, mp, p, malloctype)
 
 	if (ncount > 1 && devvp != rootvp)
 		return (EBUSY);
-	if (error = vinvalbuf(devvp, V_SAVE, cred, p, 0, 0))
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
+	error = vinvalbuf(devvp, V_SAVE, cred, p, 0, 0);
+	VOP_UNLOCK(devvp, 0, p);
+	if (error)
 		return (error);
 
 	/*
@@ -674,7 +687,7 @@ ffs_mountfs(devvp, mp, p, malloctype)
 	ump->um_seqinc = fs->fs_frag;
 	for (i = 0; i < MAXQUOTAS; i++)
 		ump->um_quotas[i] = NULLVP;
-	devvp->v_specflags |= SI_MOUNTEDON;
+	devvp->v_specmountpoint = mp;
 	ffs_oldfscompat(fs);
 
 	/*
@@ -700,11 +713,17 @@ ffs_mountfs(devvp, mp, p, malloctype)
 	if (fs->fs_maxfilesize > maxfilesize)			/* XXX */
 		fs->fs_maxfilesize = maxfilesize;		/* XXX */
 	if (ronly == 0) {
+		if ((fs->fs_flags & FS_DOSOFTDEP) &&
+		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
+			free(base, M_UFSMNT);
+			goto out;
+		}
 		fs->fs_clean = 0;
 		(void) ffs_sbupdate(ump, MNT_WAIT);
 	}
 	return (0);
 out:
+	devvp->v_specmountpoint = NULL;
 	if (bp)
 		brelse(bp);
 	(void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
@@ -765,9 +784,13 @@ ffs_unmount(mp, mntflags, p)
 	if (mntflags & MNT_FORCE) {
 		flags |= FORCECLOSE;
 	}
-	error = ffs_flushfiles(mp, flags, p);
-	if (error)
-		return (error);
+	if (mp->mnt_flag & MNT_SOFTDEP) {
+		if ((error = softdep_flushfiles(mp, flags, p)) != 0)
+			return (error);
+	} else {
+		if ((error = ffs_flushfiles(mp, flags, p)) != 0)
+			return (error);
+	}
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	if (fs->fs_ronly == 0) {
@@ -778,7 +801,7 @@ ffs_unmount(mp, mntflags, p)
 			return (error);
 		}
 	}
-	ump->um_devvp->v_specflags &= ~SI_MOUNTEDON;
+	ump->um_devvp->v_specmountpoint = NULL;
 
 	vinvalbuf(ump->um_devvp, V_SAVE, NOCRED, p, 0, 0);
 	error = VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD|FWRITE,
@@ -824,7 +847,17 @@ ffs_flushfiles(mp, flags, p)
 		 */
 	}
 #endif
-	error = vflush(mp, NULLVP, flags);
+        /*
+	 * Flush all the files.
+	 */
+	if ((error = vflush(mp, NULL, flags)) != 0)
+		return (error);
+	/*
+	 * Flush filesystem metadata.
+	 */
+	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p);
+	error = VOP_FSYNC(ump->um_devvp, p->p_ucred, MNT_WAIT, p);
+	VOP_UNLOCK(ump->um_devvp, 0, p);
 	return (error);
 }
 
@@ -903,9 +936,9 @@ ffs_sync(mp, waitfor, cred, p)
 		simple_lock(&vp->v_interlock);
 		nvp = vp->v_mntvnodes.le_next;
 		ip = VTOI(vp);
-		if (((ip->i_flag &
+		if ((vp->v_type == VNON) || ((ip->i_flag &
 		     (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) &&
-		    vp->v_dirtyblkhd.lh_first == NULL) {
+		    ((vp->v_dirtyblkhd.lh_first == NULL) || (waitfor == MNT_LAZY))) {
 			simple_unlock(&vp->v_interlock);
 			continue;
 		}
@@ -937,21 +970,22 @@ ffs_sync(mp, waitfor, cred, p)
 	/*
 	 * Force stale file system control information to be flushed.
 	 */
-	error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p);
-	if (error)
-		allerror = error;
+	if (waitfor != MNT_LAZY) {
+		if (ump->um_mountp->mnt_flag & MNT_SOFTDEP)
+			waitfor = MNT_NOWAIT;
+		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, p);
+		if ((error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) != 0)
+			allerror = error;
+		VOP_UNLOCK(ump->um_devvp, 0, p);
+	}
 #ifdef QUOTA
 	qsync(mp);
 #endif
 	/*
 	 * Write back modified superblock.
 	 */
-	if (fs->fs_fmod != 0) {
-		fs->fs_fmod = 0;
-		fs->fs_time = time.tv_sec;
-		if (error = ffs_sbupdate(ump, waitfor))
-			allerror = error;
-	}
+	if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor)) != 0)
+		allerror = error;
 	return (allerror);
 }
 
@@ -1060,6 +1094,10 @@ ffs_vget(mp, ino, vpp)
 		return (error);
 	}
 	ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino));
+	if (DOINGSOFTDEP(vp))
+		softdep_load_inodeblock(ip);
+	else
+		ip->i_effnlink = ip->i_nlink;
 	bqrelse(bp);
 
 	/*
@@ -1157,6 +1195,7 @@ ffs_init(vfsp)
 	struct vfsconf *vfsp;
 {
 
+	softdep_initialize();
 	return (ufs_init(vfsp));
 }
 
@@ -1200,6 +1239,8 @@ ffs_sbupdate(mp, waitfor)
 	if (allerror)
 		return (allerror);
 	bp = getblk(mp->um_devvp, SBLOCK, (int)fs->fs_sbsize, 0, 0);
+	fs->fs_fmod = 0;
+	fs->fs_time = time.tv_sec;
 	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 	/* Restore compatibility to old file systems.		   XXX */
 	dfs = (struct fs *)bp->b_data;				/* XXX */
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index 0210d61458ee..44db8f4ac970 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
- * $Id: ffs_vnops.c,v 1.42 1998/02/06 12:14:16 eivind Exp $
+ * $Id: ffs_vnops.c,v 1.43 1998/02/26 06:39:38 msmith Exp $
  */
 
 #include <sys/param.h>
@@ -74,6 +74,7 @@ static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
 	{ &vop_getpages_desc,		(vop_t *) ffs_getpages },
 	{ &vop_putpages_desc,		(vop_t *) ffs_putpages },
 	{ &vop_read_desc,		(vop_t *) ffs_read },
+	{ &vop_balloc_desc,		(vop_t *) ffs_balloc },
 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
 	{ &vop_write_desc,		(vop_t *) ffs_write },
 	{ NULL, NULL }
@@ -120,12 +121,11 @@ ffs_fsync(ap)
 		struct proc *a_p;
 	} */ *ap;
 {
-	register struct vnode *vp = ap->a_vp;
-	register struct buf *bp;
+	struct vnode *vp = ap->a_vp;
+	struct buf *bp;
 	struct timeval tv;
 	struct buf *nbp;
-	int pass;
-	int s;
+	int s, error, passes, skipmeta;
 	daddr_t lbn;
 
 
@@ -137,31 +137,45 @@ ffs_fsync(ap)
 		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
 	}
 
-	pass = 0;
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
+	passes = NIADDR;
+	skipmeta = 0;
+	if (ap->a_waitfor == MNT_WAIT)
+		skipmeta = 1;
 loop:
 	s = splbio();
+loop2:
 	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
 		nbp = bp->b_vnbufs.le_next;
-		if ((bp->b_flags & B_BUSY) || (pass == 0 && (bp->b_lblkno < 0)))
+		/*
+		 * On the first pass of a synchronous call skip the
+		 * metadata buffers (negative logical block numbers);
+		 * always skip a buffer that is already busy.
+		 */
+		if ((bp->b_flags & B_BUSY) ||
+		    ((skipmeta == 1) && (bp->b_lblkno < 0)))
 			continue;
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("ffs_fsync: not dirty");
-
-		if (((bp->b_vp != vp) || (ap->a_waitfor != MNT_NOWAIT)) ||
-			((vp->v_type != VREG) && (vp->v_type != VBLK))) {
-
+		/*
+		 * If data is outstanding to another vnode, or we were
+		 * asked to wait for everything, or it's not a file or BDEV,
+		 * start the IO on this buffer immediately.
+		 */
+		if (((bp->b_vp != vp) || (ap->a_waitfor == MNT_WAIT)) ||
+		    ((vp->v_type != VREG) && (vp->v_type != VBLK))) {
 			bremfree(bp);
 			bp->b_flags |= B_BUSY;
 			splx(s);
 
 			/*
-			 * Wait for I/O associated with indirect blocks to complete,
-			 * since there is no way to quickly wait for them below.
+			 * Wait for I/O associated with indirect blocks to
+			 * complete, since there is no way to quickly wait
+			 * for them below.
 			 */
-			if ((bp->b_vp == vp) && (ap->a_waitfor == MNT_NOWAIT)) {
+			if ((bp->b_vp == vp) || (ap->a_waitfor != MNT_WAIT)) {
 				if (bp->b_flags & B_CLUSTEROK) {
 					bdwrite(bp);
 					(void) vfs_bio_awrite(bp);
@@ -171,26 +185,30 @@ ffs_fsync(ap)
 			} else {
 				(void) bwrite(bp);
 			}
-
 		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
-
+			/* 
+			 * If the buffer is for data that has been truncated
+			 * off the file, then throw it away.
+			 */
 			bremfree(bp);
 			bp->b_flags |= B_BUSY | B_INVAL | B_NOCACHE;
 			brelse(bp);
 			splx(s);
-
 		} else {
 			vfs_bio_awrite(bp);
 			splx(s);
 		}
 		goto loop;
 	}
-	splx(s);
-
-	if (pass == 0) {
-		pass = 1;
-		goto loop;
+	/*
+	 * If we were asked to do this synchronously, then go back for
+	 * another pass, this time doing the metadata.
+	 */
+	if (skipmeta) {
+		skipmeta = 0;
+		goto loop2; /* stay within the splbio() */
 	}
+	splx(s);
 
 	if (ap->a_waitfor == MNT_WAIT) {
 		s = splbio();
@@ -198,15 +216,38 @@ ffs_fsync(ap)
 			vp->v_flag |= VBWAIT;
 			(void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "ffsfsn", 0);
 		}
+		/* 
+		 * Ensure that any filesystem metadata associated
+		 * with the vnode has been written.
+		 */
 		splx(s);
-#ifdef DIAGNOSTIC
+		if ((error = softdep_sync_metadata(ap)) != 0)
+			return (error);
+		s = splbio();
 		if (vp->v_dirtyblkhd.lh_first) {
-			vprint("ffs_fsync: dirty", vp);
-			goto loop;
-		}
+			/*
+			 * Block devices associated with filesystems may
+			 * have new I/O requests posted for them even if
+			 * the vnode is locked, so no amount of trying will
+			 * get them clean. Thus we give block devices a
+			 * good effort, then just give up. For all other file
+			 * types, go around and try again until it is clean.
+			 */
+			if (passes > 0) {
+				passes -= 1;
+				goto loop2;
+			}
+#ifdef DIAGNOSTIC
+			if (vp->v_type != VBLK)
+				vprint("ffs_fsync: dirty", vp);
 #endif
+		}
 	}
-
 	gettime(&tv);
-	return (UFS_UPDATE(ap->a_vp, &tv, &tv, ap->a_waitfor == MNT_WAIT));
+	error = UFS_UPDATE(ap->a_vp, &tv, &tv, (ap->a_waitfor == MNT_WAIT));
+	if (error)
+		return (error);
+	if (DOINGSOFTDEP(vp) && ap->a_waitfor == MNT_WAIT)
+		error = softdep_fsync(vp);
+	return (error);
 }
diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h
index 98a9b06698d9..9f997318f997 100644
--- a/sys/ufs/ffs/fs.h
+++ b/sys/ufs/ffs/fs.h
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)fs.h	8.13 (Berkeley) 3/21/95
- * $Id: fs.h,v 1.11 1997/03/23 20:08:22 guido Exp $
+ * $Id: fs.h,v 1.12 1997/03/24 03:19:37 bde Exp $
  */
 
 #ifndef _UFS_FFS_FS_H_
@@ -222,7 +222,7 @@ struct fs {
 	int8_t   fs_fmod;		/* super block modified flag */
 	int8_t   fs_clean;		/* file system is clean flag */
 	int8_t 	 fs_ronly;		/* mounted read-only flag */
-	int8_t   fs_flags;		/* currently unused flag */
+	int8_t   fs_flags;		/* see FS_ flags below */
 	u_char	 fs_fsmnt[MAXMNTLEN];	/* name mounted on */
 /* these fields retain the current block allocation info */
 	int32_t	 fs_cgrotor;		/* last cg searched */
@@ -254,12 +254,19 @@ struct fs {
 #define	FS_OKAY		0x7c269d38	/* superblock checksum */
 #define FS_42INODEFMT	-1		/* 4.2BSD inode format */
 #define FS_44INODEFMT	2		/* 4.4BSD inode format */
+
 /*
  * Preference for optimization.
  */
 #define FS_OPTTIME	0	/* minimize allocation time */
 #define FS_OPTSPACE	1	/* minimize disk fragmentation */
 
+/*
+ * Filesystem flags.
+ */
+#define FS_UNCLEAN    0x01    /* filesystem not clean at mount */
+#define FS_DOSOFTDEP  0x02    /* filesystem using soft dependencies */
+
 /*
  * Rotational layout table format types
  */
@@ -485,6 +492,11 @@ struct ocg {
 	(((lbn) >= NDADDR || (dip)->di_size >= smalllblktosize(fs, (lbn) + 1)) \
 	    ? (fs)->fs_bsize \
 	    : (fragroundup(fs, blkoff(fs, (dip)->di_size))))
+#define sblksize(fs, size, lbn) \
+	(((lbn) >= NDADDR || (size) >= ((lbn) + 1) << (fs)->fs_bshift) \
+	  ? (fs)->fs_bsize \
+	  : (fragroundup(fs, blkoff(fs, (size)))))
+
 
 /*
  * Number of disk sectors per block/fragment; assumes DEV_BSIZE byte
diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h
index f2fd0f25fa5e..4bd1cf5d7de1 100644
--- a/sys/ufs/ufs/inode.h
+++ b/sys/ufs/ufs/inode.h
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)inode.h	8.9 (Berkeley) 5/14/95
- * $Id: inode.h,v 1.19 1997/12/05 13:43:47 jkh Exp $
+ * $Id: inode.h,v 1.20 1998/01/30 11:34:02 phk Exp $
  */
 
 #ifndef _UFS_UFS_INODE_H_
@@ -45,6 +45,11 @@
 #include <sys/lock.h>
 #include <ufs/ufs/dinode.h>
 
+/*
+ * The size of a logical block number.
+ */
+typedef long ufs_lbn_t;
+
 /*
  * This must agree with the definition in <ufs/ufs/dir.h>.
  */
@@ -67,6 +72,7 @@ struct inode {
 	u_int32_t i_flag;	/* flags, see below */
 	dev_t	  i_dev;	/* Device associated with the inode. */
 	ino_t	  i_number;	/* The identity of the inode. */
+	int	  i_effnlink;	/* i_nlink when I/O completes */
 
 	union {			/* Associated filesystem. */
 		struct	fs *fs;		/* FFS */
@@ -160,6 +166,9 @@ struct indir {
 	}								\
 }
 
+/* Determine if soft dependencies are being done */
+#define DOINGSOFTDEP(vp)	((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
+
 /* This overlays the fid structure (see mount.h). */
 struct ufid {
 	u_int16_t ufid_len;	/* Length of structure. */
diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h
index 5d7ec5f07a80..55d068c7e14a 100644
--- a/sys/ufs/ufs/ufs_extern.h
+++ b/sys/ufs/ufs/ufs_extern.h
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_extern.h	8.10 (Berkeley) 5/14/95
- * $Id: ufs_extern.h,v 1.21 1997/10/16 11:59:09 phk Exp $
+ * $Id: ufs_extern.h,v 1.22 1997/10/27 12:50:57 bde Exp $
  */
 
 #ifndef _UFS_UFS_EXTERN_H_
@@ -68,12 +68,12 @@ int	 ufs_checkpath __P((struct inode *, struct inode *, struct ucred *));
 void	 ufs_dirbad __P((struct inode *, doff_t, char *));
 int	 ufs_dirbadentry __P((struct vnode *, struct direct *, int));
 int	 ufs_dirempty __P((struct inode *, ino_t, struct ucred *));
-int	 ufs_direnter __P((struct inode *, struct vnode *,struct componentname *));
-int	 ufs_direnter2 __P((struct vnode *, struct direct *, struct ucred *,
-		struct proc *));
-int	 ufs_dirremove __P((struct vnode *, struct componentname*));
-int	 ufs_dirrewrite
-	    __P((struct inode *, struct inode *, struct componentname *));
+void	 ufs_makedirentry __P((struct inode *, struct componentname *,
+	    struct direct *));
+int	 ufs_direnter __P((struct vnode *, struct vnode *, struct direct *,
+	    struct componentname *, struct buf *));
+int	 ufs_dirremove __P((struct vnode *, struct inode *, int, int));
+int	 ufs_dirrewrite __P((struct inode *, struct inode *, ino_t, int, int));
 int	 ufs_getlbns __P((struct vnode *, ufs_daddr_t, struct indir *, int *));
 struct vnode *
 	 ufs_ihashget __P((dev_t, ino_t));
@@ -90,4 +90,17 @@ int	 ufs_root __P((struct mount *, struct vnode **));
 int	 ufs_start __P((struct mount *, int, struct proc *));
 int	 ufs_vinit __P((struct mount *, vop_t **, vop_t **, struct vnode **));
 
+/*
+ * Soft update function prototypes.
+ */
+void	softdep_setup_directory_add __P((struct buf *, struct inode *, off_t,
+	    long, struct buf *));
+void	softdep_change_directoryentry_offset __P((struct inode *, caddr_t,
+	    caddr_t, caddr_t, int));
+void	softdep_setup_remove __P((struct buf *,struct inode *, struct inode *,
+	    int));
+void	softdep_setup_directory_change __P((struct buf *, struct inode *,
+	    struct inode *, long, int));
+void	softdep_increase_linkcnt __P((struct inode *));
+
 #endif /* !_UFS_UFS_EXTERN_H_ */
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c
index 108880783524..2e7d9d9cd67b 100644
--- a/sys/ufs/ufs/ufs_lookup.c
+++ b/sys/ufs/ufs/ufs_lookup.c
@@ -36,16 +36,22 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_lookup.c	8.15 (Berkeley) 6/16/95
- * $Id: ufs_lookup.c,v 1.20 1998/02/04 22:33:36 eivind Exp $
+ * $Id: ufs_lookup.c,v 1.21 1998/02/06 12:14:18 eivind Exp $
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/kernel.h>
 #include <sys/namei.h>
 #include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/dir.h>
@@ -143,7 +149,12 @@ ufs_lookup(ap)
 
 	bp = NULL;
 	slotoffset = -1;
+/*
+ * XXX: There was a soft-update diff about this that I couldn't merge.
+ * I think this is the equivalent.
+ */
 	*vpp = NULL;
+
 	vdp = ap->a_dvp;
 	dp = VTOI(vdp);
 	lockparent = flags & LOCKPARENT;
@@ -331,7 +342,7 @@ ufs_lookup(ap)
 	     (nameiop == DELETE &&
 	      (ap->a_cnp->cn_flags & DOWHITEOUT) &&
 	      (ap->a_cnp->cn_flags & ISWHITEOUT))) &&
-	    (flags & ISLASTCN) && dp->i_nlink != 0) {
+	    (flags & ISLASTCN) && dp->i_effnlink != 0) {
 		/*
 		 * Access for write is interpreted as allowing
 		 * creation of files in the directory.
@@ -603,64 +614,66 @@ ufs_dirbadentry(dp, ep, entryoffsetinblock)
 }
 
 /*
- * Write a directory entry after a call to namei, using the parameters
- * that it left in nameidata.  The argument ip is the inode which the new
- * directory entry will refer to.  Dvp is a pointer to the directory to
- * be written, which was left locked by namei. Remaining parameters
- * (dp->i_offset, dp->i_count) indicate how the space for the new
- * entry is to be obtained.
+ * Construct a new directory entry after a call to namei, using the
+ * parameters that it left in the componentname argument cnp. The
+ * argument ip is the inode to which the new directory entry will refer.
  */
-int
-ufs_direnter(ip, dvp, cnp)
+void
+ufs_makedirentry(ip, cnp, newdirp)
 	struct inode *ip;
-	struct vnode *dvp;
-	register struct componentname *cnp;
+	struct componentname *cnp;
+	struct direct *newdirp;
 {
-	register struct inode *dp;
-	struct direct newdir;
 
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & SAVENAME) == 0)
-		panic("ufs_direnter: missing name");
+		panic("ufs_makedirentry: missing name");
 #endif
-	dp = VTOI(dvp);
-	newdir.d_ino = ip->i_number;
-	newdir.d_namlen = cnp->cn_namelen;
-	bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
-	if (!OFSFMT(dvp))
-		newdir.d_type = IFTODT(ip->i_mode);
+	newdirp->d_ino = ip->i_number;
+	newdirp->d_namlen = cnp->cn_namelen;
+	bcopy(cnp->cn_nameptr, newdirp->d_name, (unsigned)cnp->cn_namelen + 1);
+	if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0)
+		newdirp->d_type = IFTODT(ip->i_mode);
 	else {
-		newdir.d_type = 0;
+		newdirp->d_type = 0;
 #		if (BYTE_ORDER == LITTLE_ENDIAN)
-			{ u_char tmp = newdir.d_namlen;
-			newdir.d_namlen = newdir.d_type;
-			newdir.d_type = tmp; }
+			{ u_char tmp = newdirp->d_namlen;
+			newdirp->d_namlen = newdirp->d_type;
+			newdirp->d_type = tmp; }
 #		endif
 	}
-	return (ufs_direnter2(dvp, &newdir, cnp->cn_cred, cnp->cn_proc));
 }
 
 /*
- * Common entry point for directory entry removal used by ufs_direnter
- * and ufs_whiteout
+ * Write a directory entry after a call to namei, using the parameters
+ * that it left in nameidata. The argument dirp is the new directory
+ * entry contents. Dvp is a pointer to the directory to be written,
+ * which was left locked by namei. Remaining parameters (dp->i_offset, 
+ * dp->i_count) indicate how the space for the new entry is to be obtained.
+ * Non-null bp indicates that a directory is being created (for the
+ * soft dependency code).
  */
 int
-ufs_direnter2(dvp, dirp, cr, p)
+ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
 	struct vnode *dvp;
+	struct vnode *tvp;
 	struct direct *dirp;
+	struct componentname *cnp;
+	struct buf *newdirbp;
+{
 	struct ucred *cr;
 	struct proc *p;
-{
 	int newentrysize;
 	struct inode *dp;
 	struct buf *bp;
-	struct iovec aiov;
-	struct uio auio;
 	u_int dsize;
 	struct direct *ep, *nep;
-	int error, loc, spacefree;
+	int error, ret, blkoff, loc, spacefree, flags;
 	char *dirbuf;
 
+	p = curproc;	/* XXX */
+	cr = p->p_ucred;
+
 	dp = VTOI(dvp);
 	newentrysize = DIRSIZ(OFSFMT(dvp), dirp);
 
@@ -672,36 +685,55 @@ ufs_direnter2(dvp, dirp, cr, p)
 		 * new entry into a fresh block.
 		 */
 		if (dp->i_offset & (DIRBLKSIZ - 1))
-			panic("ufs_direnter2: newblk");
-		auio.uio_offset = dp->i_offset;
-		dirp->d_reclen = DIRBLKSIZ;
-		auio.uio_resid = newentrysize;
-		aiov.iov_len = newentrysize;
-		aiov.iov_base = (caddr_t)dirp;
-		auio.uio_iov = &aiov;
-		auio.uio_iovcnt = 1;
-		auio.uio_rw = UIO_WRITE;
-		auio.uio_segflg = UIO_SYSSPACE;
-		auio.uio_procp = (struct proc *)0;
-		error = VOP_WRITE(dvp, &auio, IO_SYNC, cr);
-		if (DIRBLKSIZ >
-		    VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
-			/* XXX should grow with balloc() */
-			panic("ufs_direnter2: frag size");
-		else if (!error) {
-			dp->i_size = roundup2(dp->i_size, DIRBLKSIZ);
-			dp->i_flag |= IN_CHANGE;
+			panic("ufs_direnter: newblk");
+		flags = B_CLRBUF;
+		if (!DOINGSOFTDEP(dvp))
+			flags |= B_SYNC;
+		if ((error = VOP_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ,
+		    cr, flags, &bp)) != 0) {
+			if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
+				bdwrite(newdirbp);
+			return (error);
 		}
+		dp->i_size = dp->i_offset + DIRBLKSIZ;
+		dp->i_flag |= IN_CHANGE | IN_UPDATE;
+		vnode_pager_setsize(dvp, (u_long)dp->i_size);
+		dirp->d_reclen = DIRBLKSIZ;
+		blkoff = dp->i_offset &
+		    (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1);
+		bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize);
+		if (DOINGSOFTDEP(dvp)) {
+			/*
+			 * Ensure that the entire newly allocated block is a
+			 * valid directory so that future growth within the
+			 * block does not have to ensure that the block is
+			 * written before the inode.
+			 */
+			blkoff += DIRBLKSIZ;
+			while (blkoff < bp->b_bcount) {
+				((struct direct *)
+				   (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
+				blkoff += DIRBLKSIZ;
+			}
+			softdep_setup_directory_add(bp, dp, dp->i_offset,
+			    dirp->d_ino, newdirbp);
+			bdwrite(bp);
+		} else {
+			error = VOP_BWRITE(bp);
+		}
+		ret = UFS_UPDATE(dvp, &time, &time, !DOINGSOFTDEP(dvp));
+		if (error == 0)
+			return (ret);
 		return (error);
 	}
 
 	/*
-	 * If dp->i_count is non-zero, then namei found space
-	 * for the new entry in the range dp->i_offset to
-	 * dp->i_offset + dp->i_count in the directory.
-	 * To use this space, we may have to compact the entries located
-	 * there, by copying them together towards the beginning of the
-	 * block, leaving the free space in one usable chunk at the end.
+	 * If dp->i_count is non-zero, then namei found space for the new
+	 * entry in the range dp->i_offset to dp->i_offset + dp->i_count
+	 * in the directory. To use this space, we may have to compact
+	 * the entries located there, by copying them together towards the
+	 * beginning of the block, leaving the free space in one usable
+	 * chunk at the end.
 	 */
 
 	/*
@@ -717,14 +749,16 @@ ufs_direnter2(dvp, dirp, cr, p)
 	 * Get the block containing the space for the new directory entry.
 	 */
 	error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp);
-	if (error)
+	if (error) {
+		if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
+			bdwrite(newdirbp);
 		return (error);
+	}
 	/*
 	 * Find space for the new entry. In the simple case, the entry at
 	 * offset base will have the space. If it does not, then namei
 	 * arranged that compacting the region dp->i_offset to
-	 * dp->i_offset + dp->i_count would yield the
-	 * space.
+	 * dp->i_offset + dp->i_count would yield the space.
 	 */
 	ep = (struct direct *)dirbuf;
 	dsize = DIRSIZ(OFSFMT(dvp), ep);
@@ -742,7 +776,11 @@ ufs_direnter2(dvp, dirp, cr, p)
 		dsize = DIRSIZ(OFSFMT(dvp), nep);
 		spacefree += nep->d_reclen - dsize;
 		loc += nep->d_reclen;
-		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
+		if (DOINGSOFTDEP(dvp))
+			softdep_change_directoryentry_offset(dp, dirbuf,
+			    (caddr_t)nep, (caddr_t)ep, dsize); 
+		else
+			bcopy((caddr_t)nep, (caddr_t)ep, dsize);
 	}
 	/*
 	 * Update the pointer fields in the previous entry (if any),
@@ -752,26 +790,44 @@ ufs_direnter2(dvp, dirp, cr, p)
 	    (ep->d_ino == WINO &&
 	     bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
 		if (spacefree + dsize < newentrysize)
-			panic("ufs_direnter2: compact1");
+			panic("ufs_direnter: compact1");
 		dirp->d_reclen = spacefree + dsize;
 	} else {
 		if (spacefree < newentrysize)
-			panic("ufs_direnter2: compact2");
+			panic("ufs_direnter: compact2");
 		dirp->d_reclen = spacefree;
 		ep->d_reclen = dsize;
 		ep = (struct direct *)((char *)ep + dsize);
 	}
 	bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize);
 
-	if (dvp->v_mount->mnt_flag & MNT_ASYNC) {
+	if (DOINGSOFTDEP(dvp)) {
+		softdep_setup_directory_add(bp, dp,
+		    dp->i_offset + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp);
 		bdwrite(bp);
-		error = 0;
 	} else {
-		error = bowrite(bp);
+		if (dvp->v_mount->mnt_flag & MNT_ASYNC) {
+			bdwrite(bp);
+			error = 0;
+		} else {
+			error = bowrite(bp);
+		}
 	}
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
-	if (!error && dp->i_endoff && dp->i_endoff < dp->i_size)
-		error = UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_SYNC, cr, p);
+	/*
+	 * If all went well, and the directory can be shortened, proceed
+	 * with the truncation. Note that we have to unlock the inode for
+	 * the entry that we just entered, as the truncation may need to
+	 * lock other inodes which can lead to deadlock if we also hold a
+	 * lock on the newly entered node.
+	 */
+	if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) {
+		if (tvp != NULL)
+			VOP_UNLOCK(tvp, 0, p);
+		(void) UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_SYNC, cr, p);
+		if (tvp != NULL)
+			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p);
+	}
 	return (error);
 }
 
@@ -788,18 +844,20 @@ ufs_direnter2(dvp, dirp, cr, p)
  * to the size of the previous entry.
  */
 int
-ufs_dirremove(dvp, cnp)
+ufs_dirremove(dvp, ip, flags, isrmdir)
 	struct vnode *dvp;
-	struct componentname *cnp;
+	struct inode *ip;
+	int flags;
+	int isrmdir;
 {
-	register struct inode *dp;
+	struct inode *dp;
 	struct direct *ep;
 	struct buf *bp;
 	int error;
 
 	dp = VTOI(dvp);
 
-	if (cnp->cn_flags & DOWHITEOUT) {
+	if (flags & DOWHITEOUT) {
 		/*
 		 * Whiteout entry: set d_ino to WINO.
 		 */
@@ -808,24 +866,44 @@ ufs_dirremove(dvp, cnp)
 			return (error);
 		ep->d_ino = WINO;
 		ep->d_type = DT_WHT;
-		error = VOP_BWRITE(bp);
-		dp->i_flag |= IN_CHANGE | IN_UPDATE;
-		return (error);
+		goto out;
 	}
 
+	if ((error = UFS_BLKATOFF(dvp,
+	    (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0)
+		return (error);
 	if (dp->i_count == 0) {
 		/*
 		 * First entry in block: set d_ino to zero.
 		 */
+#if 0
 		error =
 		    UFS_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp);
 		if (error)
 			return (error);
+#endif
 		ep->d_ino = 0;
-		error = bowrite(bp);
-		dp->i_flag |= IN_CHANGE | IN_UPDATE;
-		return (error);
+	} else {
+		/*
+		 * Collapse new free space into previous entry.
+		 */
+		ep->d_reclen += dp->i_reclen;
 	}
+out:
+	if (ip) {
+		ip->i_effnlink--;
+		ip->i_flag |= IN_CHANGE;
+	}
+	if (DOINGSOFTDEP(dvp)) {
+		if (ip)
+			softdep_setup_remove(bp, dp, ip, isrmdir);
+		bdwrite(bp);
+	} else {
+		if (ip)
+			ip->i_nlink--;
+		error = bowrite(bp); /* maybe this should be as below? */
+	}
+#if 0
 	/*
 	 * Collapse new free space into previous entry.
 	 */
@@ -840,6 +918,7 @@ ufs_dirremove(dvp, cnp)
 	} else {
 		error = bowrite(bp);
 	}
+#endif
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	return (error);
 }
@@ -850,9 +929,11 @@ ufs_dirremove(dvp, cnp)
  * set up by a call to namei.
  */
 int
-ufs_dirrewrite(dp, ip, cnp)
-	struct inode *dp, *ip;
-	struct componentname *cnp;
+ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
+	struct inode *dp, *oip;
+	ino_t newinum;
+	int newtype;
+	int isrmdir;
 {
 	struct buf *bp;
 	struct direct *ep;
@@ -862,14 +943,22 @@ ufs_dirrewrite(dp, ip, cnp)
 	error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp);
 	if (error)
 		return (error);
-	ep->d_ino = ip->i_number;
+	ep->d_ino = newinum;
 	if (!OFSFMT(vdp))
-		ep->d_type = IFTODT(ip->i_mode);
-	if (vdp->v_mount->mnt_flag & MNT_ASYNC) {
+		ep->d_type = newtype;
+	oip->i_effnlink--;
+	oip->i_flag |= IN_CHANGE;
+	if (DOINGSOFTDEP(vdp)) {
+		softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
 		bdwrite(bp);
-		error = 0;
 	} else {
-		error = bowrite(bp);
+		oip->i_nlink--;
+		if (vdp->v_mount->mnt_flag & MNT_ASYNC) {
+			bdwrite(bp);
+			error = 0;
+		} else {
+			error = bowrite(bp);
+		}
 	}
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
 	return (error);
@@ -929,7 +1018,7 @@ ufs_dirempty(ip, parentino, cred)
 		 * 1 implies ".", 2 implies ".." if second
 		 * char is also "."
 		 */
-		if (namlen == 1)
+		if (namlen == 1 && dp->d_ino == ip->i_number)
 			continue;
 		if (dp->d_name[1] == '.' && dp->d_ino == parentino)
 			continue;
diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c
index da4641de9cf8..2eae865431c2 100644
--- a/sys/ufs/ufs/ufs_quota.c
+++ b/sys/ufs/ufs/ufs_quota.c
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_quota.c	8.5 (Berkeley) 5/20/95
- * $Id: ufs_quota.c,v 1.18 1998/02/06 12:14:18 eivind Exp $
+ * $Id: ufs_quota.c,v 1.19 1998/02/09 06:11:12 eivind Exp $
  */
 
 #include <sys/param.h>
@@ -425,7 +425,7 @@ quotaon(p, mp, type, fname)
 again:
 	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) {
 		nextvp = vp->v_mntvnodes.le_next;
-		if (vp->v_writecount == 0)
+		if (vp->v_type == VNON || vp->v_writecount == 0)
 			continue;
 		if (vget(vp, LK_EXCLUSIVE, p))
 			goto again;
@@ -470,6 +470,8 @@ quotaoff(p, mp, type)
 again:
 	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) {
 		nextvp = vp->v_mntvnodes.le_next;
+		if (vp->v_type == VNON)
+			continue;
 		if (vget(vp, LK_EXCLUSIVE, p))
 			goto again;
 		ip = VTOI(vp);
@@ -657,6 +659,8 @@ qsync(mp)
 		if (vp->v_mount != mp)
 			goto again;
 		nextvp = vp->v_mntvnodes.le_next;
+		if (vp->v_type == VNON)
+			continue;
 		simple_lock(&vp->v_interlock);
 		simple_unlock(&mntvnode_slock);
 		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, p);
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index 6abb130c3527..e0a3488c42fa 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
- * $Id: ufs_readwrite.c,v 1.43 1998/02/26 06:39:50 msmith Exp $
+ * $Id: ufs_readwrite.c,v 1.44 1998/03/07 21:36:42 dyson Exp $
  */
 
 #define	BLKSIZE(a, b, c)	blksize(a, b, c)
@@ -338,10 +338,10 @@ WRITE(ap)
 			flags |= B_CLRBUF;
 		else
 			flags &= ~B_CLRBUF;
-
-		error = ffs_balloc(ip,
-		    lbn, blkoffset + xfersize, ap->a_cred, &bp, flags);
-		if (error)
+/* XXX is uio->uio_offset the right thing here? */
+		error = VOP_BALLOC(vp, uio->uio_offset, xfersize,
+		    ap->a_cred, flags, &bp);
+		if (error != 0)
 			break;
 
 		if (uio->uio_offset + xfersize > ip->i_size) {
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 28eae1419315..82a7cc34d95d 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_vnops.c	8.27 (Berkeley) 5/27/95
- * $Id: ufs_vnops.c,v 1.77 1998/02/06 12:14:19 eivind Exp $
+ * $Id: ufs_vnops.c,v 1.78 1998/02/09 06:11:14 eivind Exp $
  */
 
 #include "opt_quota.h"
@@ -59,6 +59,8 @@
 #include <sys/poll.h>
 
 #include <vm/vm_zone.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
 
 #include <miscfs/specfs/specdev.h>
 #include <miscfs/fifofs/fifo.h>
@@ -120,6 +122,18 @@ union _qcvt {
 	(q) = tmp.qcvt; \
 }
 
+/*
+ * A virgin directory (no blushing please).
+ */
+static struct dirtemplate mastertemplate = {
+	0, 12, DT_DIR, 1, ".",
+	0, DIRBLKSIZ - 12, DT_DIR, 2, ".."
+};
+static struct odirtemplate omastertemplate = {
+	0, 12, 1, ".",
+	0, DIRBLKSIZ - 12, 2, ".."
+};
+
 /*
  * Create a regular file
  */
@@ -273,6 +287,8 @@ ufs_access(ap)
 				return (error);
 #endif
 			break;
+		default:
+			break;
 		}
 	}
 
@@ -340,7 +356,7 @@ ufs_getattr(ap)
 	vap->va_fsid = ip->i_dev;
 	vap->va_fileid = ip->i_number;
 	vap->va_mode = ip->i_mode & ~IFMT;
-	vap->va_nlink = ip->i_nlink;
+	vap->va_nlink = ip->i_effnlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	vap->va_rdev = (dev_t)ip->i_rdev;
@@ -444,6 +460,8 @@ ufs_setattr(ap)
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
+		default:
+			break;
 		}
 		if (error = UFS_TRUNCATE(vp, vap->va_size, 0, cred, p))
 			return (error);
@@ -465,7 +483,7 @@ ufs_setattr(ap)
 		atimeval.tv_usec = vap->va_atime.tv_nsec / 1000;
 		mtimeval.tv_sec = vap->va_mtime.tv_sec;
 		mtimeval.tv_usec = vap->va_mtime.tv_nsec / 1000;
-		error = UFS_UPDATE(vp, &atimeval, &mtimeval, 1);
+		error = UFS_UPDATE(vp, &atimeval, &mtimeval, 0);
 		if (error)
 			return (error);
 	}
@@ -652,11 +670,7 @@ ufs_remove(ap)
 		error = EPERM;
 		goto out;
 	}
-	error = ufs_dirremove(dvp, ap->a_cnp);
-	if (error == 0) {
-		ip->i_nlink--;
-		ip->i_flag |= IN_CHANGE;
-	}
+	error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
 	VN_POLLEVENT(vp, POLLNLINK);
 	VN_POLLEVENT(dvp, POLLWRITE);
 out:
@@ -685,6 +699,7 @@ ufs_link(ap)
 	struct proc *p = cnp->cn_proc;
 	struct inode *ip;
 	struct timeval tv;
+	struct direct newdir;
 	int error;
 
 #ifdef DIAGNOSTIC
@@ -711,15 +726,20 @@ ufs_link(ap)
 		error = EPERM;
 		goto out1;
 	}
+	ip->i_effnlink++;
 	ip->i_nlink++;
 	ip->i_flag |= IN_CHANGE;
+	if (DOINGSOFTDEP(vp))
+		softdep_increase_linkcnt(ip);
 	gettime(&tv);
-	error = UFS_UPDATE(vp, &tv, &tv, 1);
+	error = UFS_UPDATE(vp, &tv, &tv, !DOINGSOFTDEP(vp));
 	if (!error) {
-		error = ufs_direnter(ip, tdvp, cnp);
+		ufs_makedirentry(ip, cnp, &newdir);
+		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL);
 	}
 
 	if (error) {
+		ip->i_effnlink--;
 		ip->i_nlink--;
 		ip->i_flag |= IN_CHANGE;
 	}
@@ -770,7 +790,7 @@ ufs_whiteout(ap)
 		newdir.d_namlen = cnp->cn_namelen;
 		bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
 		newdir.d_type = DT_WHT;
-		error = ufs_direnter2(dvp, &newdir, cnp->cn_cred, cnp->cn_proc);
+		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL);
 		break;
 
 	case DELETE:
@@ -781,8 +801,10 @@ ufs_whiteout(ap)
 #endif
 
 		cnp->cn_flags &= ~DOWHITEOUT;
-		error = ufs_dirremove(dvp, cnp);
+		error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0);
 		break;
+	default:
+		panic("ufs_whiteout: unknown op");
 	}
 	if (cnp->cn_flags & HASBUF) {
 		zfree(namei_zone, cnp->cn_pnbuf);
@@ -834,11 +856,10 @@ ufs_rename(ap)
 	struct componentname *fcnp = ap->a_fcnp;
 	struct proc *p = fcnp->cn_proc;
 	struct inode *ip, *xp, *dp;
-	struct dirtemplate dirbuf;
+	struct direct newdir;
 	struct timeval tv;
 	int doingdirectory = 0, oldparent = 0, newparent = 0;
 	int error = 0;
-	u_char namlen;
 
 #ifdef DIAGNOSTIC
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
@@ -965,10 +986,13 @@ ufs_rename(ap)
 	 *    completing our work, the link count
 	 *    may be wrong, but correctable.
 	 */
+	ip->i_effnlink++;
 	ip->i_nlink++;
 	ip->i_flag |= IN_CHANGE;
+	if (DOINGSOFTDEP(fvp))
+		softdep_increase_linkcnt(ip);
 	gettime(&tv);
-	if (error = UFS_UPDATE(fvp, &tv, &tv, 1)) {
+	if (error = UFS_UPDATE(fvp, &tv, &tv, !DOINGSOFTDEP(fvp))) {
 		VOP_UNLOCK(fvp, 0, p);
 		goto bad;
 	}
@@ -1027,15 +1051,20 @@ ufs_rename(ap)
 				error = EMLINK;
 				goto bad;
 			}
+			dp->i_effnlink++;
 			dp->i_nlink++;
 			dp->i_flag |= IN_CHANGE;
-			error = UFS_UPDATE(tdvp, &tv, &tv, 1);
+			if (DOINGSOFTDEP(tdvp))
+				softdep_increase_linkcnt(dp);
+			error = UFS_UPDATE(tdvp, &tv, &tv, !DOINGSOFTDEP(tdvp));
 			if (error)
 				goto bad;
 		}
-		error = ufs_direnter(ip, tdvp, tcnp);
+		ufs_makedirentry(ip, tcnp, &newdir);
+		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL);
 		if (error) {
 			if (doingdirectory && newparent) {
+				dp->i_effnlink--;
 				dp->i_nlink--;
 				dp->i_flag |= IN_CHANGE;
 				(void)UFS_UPDATE(tdvp, &tv, &tv, 1);
@@ -1070,9 +1099,8 @@ ufs_rename(ap)
 		 * (both directories, or both not directories).
 		 */
 		if ((xp->i_mode&IFMT) == IFDIR) {
-			if (! ufs_dirempty
-					 (xp, dp->i_number, tcnp->cn_cred) || 
-			    xp->i_nlink > 2) {
+			if ((xp->i_effnlink > 2) ||
+			    !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
@@ -1085,40 +1113,37 @@ ufs_rename(ap)
 			error = EISDIR;
 			goto bad;
 		}
-		error = ufs_dirrewrite(dp, ip, tcnp);
+		error = ufs_dirrewrite(dp, xp, ip->i_number,
+		    IFTODT(ip->i_mode), doingdirectory);
 		if (error)
 			goto bad;
-		/*
-		 * If the target directory is in the same
-		 * directory as the source directory,
-		 * decrement the link count on the parent
-		 * of the target directory.
-		 */
-		 if (doingdirectory && !newparent) {
-			dp->i_nlink--;
+		if (doingdirectory) {
+			dp->i_effnlink--;
 			dp->i_flag |= IN_CHANGE;
+			xp->i_effnlink--;
+			xp->i_flag |= IN_CHANGE;
 		}
 		VN_POLLEVENT(tdvp, POLLWRITE);
-		vput(tdvp);
-		/*
-		 * Adjust the link count of the target to
-		 * reflect the dirrewrite above.  If this is
-		 * a directory it is empty and there are
-		 * no links to it, so we can squash the inode and
-		 * any space associated with it.  We disallowed
-		 * renaming over top of a directory with links to
-		 * it above, as the remaining link would point to
-		 * a directory without "." or ".." entries.
-		 */
-		xp->i_nlink--;
-		if (doingdirectory) {
-			if (--xp->i_nlink != 0)
-				panic("ufs_rename: linked directory");
-			error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC,
-			    tcnp->cn_cred, tcnp->cn_proc);
+		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
+			/*
+			 * Truncate inode. The only stuff left in the directory
+			 * is "." and "..". The "." reference is inconsequential
+			 * since we are quashing it. We have removed the "."
+			 * reference and the reference in the parent directory,
+			 * but there may be other hard links. The soft
+			 * dependency code will arrange to do these operations
+			 * after the parent directory entry has been deleted on
+			 * disk, so when running with that code we avoid doing
+			 * them now.
+			 */
+			dp->i_nlink--;
+			xp->i_nlink--;
+			if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC,
+			    tcnp->cn_cred, tcnp->cn_proc)) != 0)
+				goto bad;
 		}
-		xp->i_flag |= IN_CHANGE;
-		VN_POLLEVENT(tvp, POLLNLINK);
+		vput(tdvp);
+		VN_POLLEVENT(tvp, POLLNLINK); /* XXX this right? */
 		vput(tvp);
 		xp = NULL;
 	}
@@ -1151,10 +1176,9 @@ ufs_rename(ap)
 	 * changed while the new name has been entered. If the source is
 	 * a file then the entry may have been unlinked or renamed. In
 	 * either case there is no further work to be done. If the source
-	 * is a directory then it cannot have been rmdir'ed; its link
-	 * count of three would cause a rmdir to fail with ENOTEMPTY.
-	 * The IN_RENAME flag ensures that it cannot be moved by another
-	 * rename.
+	 * is a directory then it cannot have been rmdir'ed; the IN_RENAME
+	 * flag ensures that it cannot be moved by another rename or removed
+	 * by a rmdir.
 	 */
 	if (xp != ip) {
 		if (doingdirectory)
@@ -1167,44 +1191,11 @@ ufs_rename(ap)
 		 * and ".." set to point to the new parent.
 		 */
 		if (doingdirectory && newparent) {
-			dp->i_nlink--;
-			dp->i_flag |= IN_CHANGE;
-			error = vn_rdwr(UIO_READ, fvp, (caddr_t)&dirbuf,
-				sizeof (struct dirtemplate), (off_t)0,
-				UIO_SYSSPACE, IO_NODELOCKED,
-				tcnp->cn_cred, (int *)0, (struct proc *)0);
-			if (error == 0) {
-#				if (BYTE_ORDER == LITTLE_ENDIAN)
-					if (fvp->v_mount->mnt_maxsymlinklen <= 0)
-						namlen = dirbuf.dotdot_type;
-					else
-						namlen = dirbuf.dotdot_namlen;
-#				else
-					namlen = dirbuf.dotdot_namlen;
-#				endif
-				if (namlen != 2 ||
-				    dirbuf.dotdot_name[0] != '.' ||
-				    dirbuf.dotdot_name[1] != '.') {
-					ufs_dirbad(xp, (doff_t)12,
-					    "rename: mangled dir");
-				} else {
-					dirbuf.dotdot_ino = newparent;
-					(void) vn_rdwr(UIO_WRITE, fvp,
-					    (caddr_t)&dirbuf,
-					    sizeof (struct dirtemplate),
-					    (off_t)0, UIO_SYSSPACE,
-					    IO_NODELOCKED|IO_SYNC,
-					    tcnp->cn_cred, (int *)0,
-					    (struct proc *)0);
-					cache_purge(fdvp);
-				}
-			}
-		}
-		error = ufs_dirremove(fdvp, fcnp);
-		if (!error) {
-			xp->i_nlink--;
-			xp->i_flag |= IN_CHANGE;
+			xp->i_offset = mastertemplate.dot_reclen;
+			ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0);
+			cache_purge(fdvp);
 		}
+		error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
 		xp->i_flag &= ~IN_RENAME;
 	}
 	if (dp)
@@ -1222,6 +1213,7 @@ ufs_rename(ap)
 	if (doingdirectory)
 		ip->i_flag &= ~IN_RENAME;
 	if (vn_lock(fvp, LK_EXCLUSIVE, p) == 0) {
+		ip->i_effnlink--;
 		ip->i_nlink--;
 		ip->i_flag |= IN_CHANGE;
 		ip->i_flag &= ~IN_RENAME;
@@ -1231,18 +1223,6 @@ ufs_rename(ap)
 	return (error);
 }
 
-/*
- * A virgin directory (no blushing please).
- */
-static struct dirtemplate mastertemplate = {
-	0, 12, DT_DIR, 1, { '.', 0 },
-	0, DIRBLKSIZ - 12, DT_DIR, 2, { '.', '.', 0 }
-};
-static struct odirtemplate omastertemplate = {
-	0, 12, 1, { '.', 0 },
-	0, DIRBLKSIZ - 12, 2, { '.', '.', 0 }
-};
-
 /*
  * Mkdir system call
  */
@@ -1260,7 +1240,9 @@ ufs_mkdir(ap)
 	register struct componentname *cnp = ap->a_cnp;
 	register struct inode *ip, *dp;
 	struct vnode *tvp;
+	struct buf *bp;
 	struct dirtemplate dirtemplate, *dtp;
+	struct direct newdir;
 	struct timeval tv;
 	int error, dmode;
 
@@ -1348,25 +1330,31 @@ ufs_mkdir(ap)
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = dmode;
 	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
+	ip->i_effnlink = 2;
 	ip->i_nlink = 2;
+	if (DOINGSOFTDEP(tvp))
+		softdep_increase_linkcnt(ip);
 	if (cnp->cn_flags & ISWHITEOUT)
 		ip->i_flags |= UF_OPAQUE;
-	gettime(&tv);
-	error = UFS_UPDATE(tvp, &tv, &tv, 1);
 
 	/*
-	 * Bump link count in parent directory
-	 * to reflect work done below.  Should
-	 * be done before reference is created
-	 * so reparation is possible if we crash.
+	 * Bump link count in parent directory to reflect work done below.
+	 * Should be done before reference is created so cleanup is
+	 * possible if we crash.
 	 */
+	dp->i_effnlink++;
 	dp->i_nlink++;
 	dp->i_flag |= IN_CHANGE;
-	error = UFS_UPDATE(dvp, &tv, &tv, 1);
+	if (DOINGSOFTDEP(dvp))
+		softdep_increase_linkcnt(dp);
+	gettime(&tv);
+	error = UFS_UPDATE(dvp, &tv, &tv, !DOINGSOFTDEP(dvp));
 	if (error)
 		goto bad;
 
-	/* Initialize directory with "." and ".." from static template. */
+	/*
+	 * Initialize directory with "." and ".." from static template.
+	 */
 	if (dvp->v_mount->mnt_maxsymlinklen > 0
 	)
 		dtp = &mastertemplate;
@@ -1375,39 +1363,50 @@ ufs_mkdir(ap)
 	dirtemplate = *dtp;
 	dirtemplate.dot_ino = ip->i_number;
 	dirtemplate.dotdot_ino = dp->i_number;
-	error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)&dirtemplate,
-	    sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE,
-	    IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (int *)0, (struct proc *)0);
-	if (error) {
-		dp->i_nlink--;
-		dp->i_flag |= IN_CHANGE;
+	if ((error = VOP_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred,
+	    B_CLRBUF, &bp)) != 0)
+		goto bad;
+	ip->i_size = DIRBLKSIZ;
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	vnode_pager_setsize(tvp, (u_long)ip->i_size);
+	bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate);
+	if ((error = UFS_UPDATE(tvp, &tv, &tv, !DOINGSOFTDEP(tvp))) != 0) {
+		(void)VOP_BWRITE(bp);
 		goto bad;
 	}
-	if (DIRBLKSIZ > VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
-		panic("ufs_mkdir: blksize"); /* XXX should grow with balloc() */
-	else {
-		ip->i_size = DIRBLKSIZ;
-		ip->i_flag |= IN_CHANGE;
-	}
-
-	/* Directory set up, now install it's entry in the parent directory. */
-	error = ufs_direnter(ip, dvp, cnp);
-	if (error) {
+	VN_POLLEVENT(dvp, POLLWRITE); /* XXX right place? */
+	/*
+	 * Directory set up, now install its entry in the parent directory.
+	 *
+	 * If we are not doing soft dependencies, then we must write out the
+	 * buffer containing the new directory body before entering the new 
+	 * name in the parent. If we are doing soft dependencies, then the
+	 * buffer containing the new directory body will be passed to and
+	 * released in the soft dependency code after the code has attached
+	 * an appropriate ordering dependency to the buffer which ensures that
+	 * the buffer is written before the new name is written in the parent.
+	 */
+	if (!DOINGSOFTDEP(dvp) && ((error = VOP_BWRITE(bp)) != 0))
+		goto bad;
+	ufs_makedirentry(ip, cnp, &newdir);
+	error = ufs_direnter(dvp, tvp, &newdir, cnp, bp);
+	
+bad:
+	if (error == 0) {
+		*ap->a_vpp = tvp;
+	} else {
+		dp->i_effnlink--;
 		dp->i_nlink--;
 		dp->i_flag |= IN_CHANGE;
-	}
-	VN_POLLEVENT(dvp, POLLWRITE);
-bad:
-	/*
-	 * No need to do an explicit VOP_TRUNCATE here, vrele will do this
-	 * for us because we set the link count to 0.
-	 */
-	if (error) {
+		/*
+		 * No need to do an explicit VOP_TRUNCATE here, vrele will
+		 * do this for us because we set the link count to 0.
+		 */
+		ip->i_effnlink = 0;
 		ip->i_nlink = 0;
 		ip->i_flag |= IN_CHANGE;
 		vput(tvp);
-	} else
-		*ap->a_vpp = tvp;
+	}
 out:
 	zfree(namei_zone, cnp->cn_pnbuf);
 	vput(dvp);
@@ -1435,14 +1434,17 @@ ufs_rmdir(ap)
 	dp = VTOI(dvp);
 
 	/*
-	 * Verify the directory is empty (and valid).
-	 * (Rmdir ".." won't be valid since
-	 *  ".." will contain a reference to
-	 *  the current directory and thus be
-	 *  non-empty.)
+	 * Do not remove a directory that is in the process of being renamed.
+	 * Verify the directory is empty (and valid). Rmdir ".." will not be
+	 * valid since ".." will contain a reference to the current directory
+	 * and thus be non-empty.
 	 */
 	error = 0;
-	if (ip->i_nlink != 2 ||
+	if (ip->i_flag & IN_RENAME) {
+		error = EINVAL;
+		goto out;
+	}
+	if (ip->i_effnlink != 2 ||
 	    !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
@@ -1457,34 +1459,36 @@ ufs_rmdir(ap)
 	 * inode.  If we crash in between, the directory
 	 * will be reattached to lost+found,
 	 */
-	error = ufs_dirremove(dvp, cnp);
+	error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
 	if (error)
 		goto out;
 	VN_POLLEVENT(dvp, POLLWRITE|POLLNLINK);
-	dp->i_nlink--;
-	dp->i_flag |= IN_CHANGE;
 	cache_purge(dvp);
-	vput(dvp);
-	dvp = NULL;
 	/*
-	 * Truncate inode.  The only stuff left
-	 * in the directory is "." and "..".  The
-	 * "." reference is inconsequential since
-	 * we're quashing it.  The ".." reference
-	 * has already been adjusted above.  We've
-	 * removed the "." reference and the reference
-	 * in the parent directory, but there may be
-	 * other hard links so decrement by 2 and
-	 * worry about them later.
+	 * Truncate inode. The only stuff left in the directory is "." and
+	 * "..". The "." reference is inconsequential since we are quashing
+	 * it. We have removed the "." reference and the reference in the
+	 * parent directory, but there may be other hard links. So,
+	 * ufs_dirremove will set the UF_IMMUTABLE flag to ensure that no
+	 * new entries are made. The soft dependency code will arrange to
+	 * do the i_nlink decrements and the truncation after the parent
+	 * directory entry has been deleted on disk, so when running with
+	 * that code we avoid doing them now.
 	 */
-	ip->i_nlink -= 2;
-	error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
-	    cnp->cn_proc);
-	cache_purge(ITOV(ip));
-	VN_POLLEVENT(vp, POLLNLINK);
+	dp->i_effnlink--;
+	dp->i_flag |= IN_CHANGE;
+	ip->i_effnlink--;
+	ip->i_flag |= IN_CHANGE;
+	if (!DOINGSOFTDEP(vp)) {
+		dp->i_nlink--;
+		ip->i_nlink--;
+		error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
+		    cnp->cn_proc);
+	}
+	cache_purge(vp);
 out:
-	if (dvp)
-		vput(dvp);
+	vput(dvp);
+	VN_POLLEVENT(vp, POLLNLINK);
 	vput(vp);
 	return (error);
 }
@@ -1974,7 +1978,7 @@ ufs_vinit(mntp, specops, fifoops, vpp)
 
 	}
 	if (ip->i_number == ROOTINO)
-                vp->v_flag |= VROOT;
+		vp->v_flag |= VROOT;
 	/*
 	 * Initialize modrev times
 	 */
@@ -1995,6 +1999,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
 	struct componentname *cnp;
 {
 	register struct inode *ip, *pdir;
+	struct direct newdir;
 	struct timeval tv;
 	struct vnode *tvp;
 	int error;
@@ -2078,7 +2083,10 @@ ufs_makeinode(mode, dvp, vpp, cnp)
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = mode;
 	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
+	ip->i_effnlink = 1;
 	ip->i_nlink = 1;
+	if (DOINGSOFTDEP(tvp))
+		softdep_increase_linkcnt(ip);
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
 	    suser(cnp->cn_cred, NULL))
 		ip->i_mode &= ~ISGID;
@@ -2090,10 +2098,11 @@ ufs_makeinode(mode, dvp, vpp, cnp)
 	 * Make sure inode goes to disk before directory entry.
 	 */
 	gettime(&tv);
-	error = UFS_UPDATE(tvp, &tv, &tv, 1);
+	error = UFS_UPDATE(tvp, &tv, &tv, !DOINGSOFTDEP(tvp));
 	if (error)
 		goto bad;
-	error = ufs_direnter(ip, dvp, cnp);
+	ufs_makedirentry(ip, cnp, &newdir);
+	error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL);
 	if (error)
 		goto bad;
 
@@ -2110,6 +2119,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
 	 */
 	zfree(namei_zone, cnp->cn_pnbuf);
 	vput(dvp);
+	ip->i_effnlink = 0;
 	ip->i_nlink = 0;
 	ip->i_flag |= IN_CHANGE;
 	vput(tvp);