Remove explicit locking of struct file.

 - Introduce finit(), which initializes the fields of struct file in
   such a way that the ops vector is only valid after the data, type,
   and flags are valid.
 - Protect f_flag and f_count with atomic operations.
 - Remove the global list of all files and associated accounting.
 - Rewrite the unp garbage collection such that it no longer requires
   the global list of all files and instead uses a list of all unp sockets.
 - Mark sockets in the accept queue so we don't incorrectly gc them.

Tested by:	kris, pho
Commit:	397c19d175 (parent 2a79fd39b4)
Author:	Jeff Roberson
Date:	2007-12-30 01:42:15 +0000
Notes:	svn2git 2020-12-20 02:59:44 +0000
svn path=/head/; revision=174988
17 changed files with 344 additions and 536 deletions


@ -1481,8 +1481,6 @@ svr4_do_putmsg(td, uap, fp)
uap->dat, uap->flags);
#endif /* DEBUG_SVR4 */
FILE_LOCK_ASSERT(fp, MA_NOTOWNED);
if (uap->ctl != NULL) {
if ((error = copyin(uap->ctl, &ctl, sizeof(ctl))) != 0) {
#ifdef DEBUG_SVR4
@ -1656,8 +1654,6 @@ svr4_do_getmsg(td, uap, fp)
error = 0;
afp = NULL;
FILE_LOCK_ASSERT(fp, MA_NOTOWNED);
memset(&sc, 0, sizeof(sc));
#ifdef DEBUG_SVR4


@ -251,12 +251,7 @@ streamsopen(struct cdev *dev, int oflags, int devtype, struct thread *td)
return error;
}
FILE_LOCK(fp);
fp->f_data = so;
fp->f_flag = FREAD|FWRITE;
fp->f_ops = &svr4_netops;
fp->f_type = DTYPE_SOCKET;
FILE_UNLOCK(fp);
finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &svr4_netops);
/*
* Allocate a stream structure and attach it to this socket.


@ -800,12 +800,9 @@ devfs_open(struct vop_open_args *ap)
if(fp == NULL)
return (error);
#endif
FILE_LOCK(fp);
KASSERT(fp->f_ops == &badfileops,
("Could not vnode bypass device on fdops %p", fp->f_ops));
fp->f_data = dev;
fp->f_ops = &devfs_ops_f;
FILE_UNLOCK(fp);
finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f);
return (error);
}


@ -294,11 +294,8 @@ fifo_open(ap)
}
mtx_unlock(&fifo_mtx);
KASSERT(fp != NULL, ("can't fifo/vnode bypass"));
FILE_LOCK(fp);
KASSERT(fp->f_ops == &badfileops, ("not badfileops in fifo_open"));
fp->f_data = fip;
fp->f_ops = &fifo_ops_f;
FILE_UNLOCK(fp);
finit(fp, fp->f_flag, DTYPE_FIFO, fip, &fifo_ops_f);
return (0);
}


@ -95,7 +95,6 @@ static int do_dup(struct thread *td, enum dup_type type, int old, int new,
static int fd_first_free(struct filedesc *, int, int);
static int fd_last_used(struct filedesc *, int, int);
static void fdgrowtable(struct filedesc *, int);
static int fdrop_locked(struct file *fp, struct thread *td);
static void fdunused(struct filedesc *fdp, int fd);
static void fdused(struct filedesc *fdp, int fd);
@ -137,9 +136,7 @@ struct filedesc0 {
/*
* Descriptor management.
*/
struct filelist filehead; /* head of list of open files */
int openfiles; /* actual number of open files */
struct sx filelist_lock; /* sx to protect filelist */
volatile int openfiles; /* actual number of open files */
struct mtx sigio_lock; /* mtx to protect pointers to sigio */
void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
@ -428,9 +425,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
error = EBADF;
break;
}
FILE_LOCK(fp);
td->td_retval[0] = OFLAGS(fp->f_flag);
FILE_UNLOCK(fp);
FILEDESC_SUNLOCK(fdp);
break;
@ -441,12 +436,13 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
error = EBADF;
break;
}
FILE_LOCK(fp);
fhold_locked(fp);
fp->f_flag &= ~FCNTLFLAGS;
fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
FILE_UNLOCK(fp);
fhold(fp);
FILEDESC_SUNLOCK(fdp);
do {
tmp = flg = fp->f_flag;
tmp &= ~FCNTLFLAGS;
tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
tmp = fp->f_flag & FNONBLOCK;
error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
if (error) {
@ -459,9 +455,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
fdrop(fp, td);
break;
}
FILE_LOCK(fp);
fp->f_flag &= ~FNONBLOCK;
FILE_UNLOCK(fp);
atomic_clear_int(&fp->f_flag, FNONBLOCK);
tmp = 0;
(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
fdrop(fp, td);
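
The F_SETFL path above shows the lockless read-modify-write idiom used
throughout this change: snapshot f_flag, compute the new value, and
retry the compare-and-swap until no other thread raced in between.  A
minimal sketch of the same idiom as a helper; the name fflags_update()
is hypothetical and not part of this commit:

static void
fflags_update(struct file *fp, u_int newbits)
{
	u_int old, new;

	do {
		old = fp->f_flag;	/* unlocked snapshot */
		new = (old & ~FCNTLFLAGS) | (newbits & FCNTLFLAGS);
	} while (atomic_cmpset_int(&fp->f_flag, old, new) == 0);
}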
@ -1359,15 +1353,13 @@ int
falloc(struct thread *td, struct file **resultfp, int *resultfd)
{
struct proc *p = td->td_proc;
struct file *fp, *fq;
struct file *fp;
int error, i;
int maxuserfiles = maxfiles - (maxfiles / 20);
static struct timeval lastfail;
static int curfail;
fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
sx_xlock(&filelist_lock);
if ((openfiles >= maxuserfiles &&
priv_check(td, PRIV_MAXFILES) != 0) ||
openfiles >= maxfiles) {
@ -1375,18 +1367,16 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd)
printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
td->td_ucred->cr_ruid);
}
sx_xunlock(&filelist_lock);
uma_zfree(file_zone, fp);
return (ENFILE);
}
openfiles++;
atomic_add_int(&openfiles, 1);
/*
* If the process has file descriptor zero open, add the new file
* descriptor to the list of open files at that point, otherwise
* put it at the front of the list of open files.
*/
fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep);
fp->f_count = 1;
if (resultfp)
fp->f_count++;
@ -1395,12 +1385,6 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd)
fp->f_data = NULL;
fp->f_vnode = NULL;
FILEDESC_XLOCK(p->p_fd);
if ((fq = p->p_fd->fd_ofiles[0])) {
LIST_INSERT_AFTER(fq, fp, f_list);
} else {
LIST_INSERT_HEAD(&filehead, fp, f_list);
}
sx_xunlock(&filelist_lock);
if ((error = fdalloc(td, 0, &i))) {
FILEDESC_XUNLOCK(p->p_fd);
fdrop(fp, td);
@ -1961,6 +1945,23 @@ closef(struct file *fp, struct thread *td)
return (fdrop(fp, td));
}
/*
* Initialize the file pointer with the specified properties.
*
* The ops are set with release semantics to be certain that the flags, type,
* and data are visible when ops is. This is to prevent ops methods from being
* called with bad data.
*/
void
finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
{
fp->f_data = data;
fp->f_flag = flag;
fp->f_type = type;
atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
}
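
finit() publishes the ops vector with a release store, so a reader must
observe f_ops with acquire semantics (or through a dependent load)
before it may trust f_data, f_flag, and f_type.  A sketch of that
consumer side; the helper name fops_get() is hypothetical and not part
of this commit:

static struct fileops *
fops_get(struct file *fp)
{

	/* The acquire here pairs with the release store in finit(). */
	return ((struct fileops *)atomic_load_acq_ptr(
	    (volatile uintptr_t *)&fp->f_ops));
}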
/*
* Extract the file pointer associated with the specified descriptor for the
* current user process.
@ -2135,54 +2136,20 @@ fputsock(struct socket *so)
sorele(so);
}
int
fdrop(struct file *fp, struct thread *td)
{
FILE_LOCK(fp);
return (fdrop_locked(fp, td));
}
/*
* Drop reference on struct file passed in, may call closef if the
* reference hits zero.
* Expects struct file locked, and will unlock it.
* Handle the last reference to a file being closed.
*/
static int
fdrop_locked(struct file *fp, struct thread *td)
int
_fdrop(struct file *fp, struct thread *td)
{
int error;
FILE_LOCK_ASSERT(fp, MA_OWNED);
if (--fp->f_count > 0) {
FILE_UNLOCK(fp);
return (0);
}
/*
* We might have just dropped the last reference to a file
* object that is for a UNIX domain socket whose message
* buffers are being examined in unp_gc(). If that is the
* case, FWAIT will be set in f_gcflag and we need to wait for
* unp_gc() to finish its scan.
*/
while (fp->f_gcflag & FWAIT)
msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0);
/* We have the last ref so we can proceed without the file lock. */
FILE_UNLOCK(fp);
if (fp->f_count < 0)
panic("fdrop: count < 0");
error = 0;
if (fp->f_count != 0)
panic("fdrop: count %d", fp->f_count);
if (fp->f_ops != &badfileops)
error = fo_close(fp, td);
else
error = 0;
sx_xlock(&filelist_lock);
LIST_REMOVE(fp, f_list);
openfiles--;
sx_xunlock(&filelist_lock);
atomic_subtract_int(&openfiles, 1);
crfree(fp->f_cred);
uma_zfree(file_zone, fp);
@ -2225,9 +2192,7 @@ flock(struct thread *td, struct flock_args *uap)
lf.l_len = 0;
if (uap->how & LOCK_UN) {
lf.l_type = F_UNLCK;
FILE_LOCK(fp);
fp->f_flag &= ~FHASLOCK;
FILE_UNLOCK(fp);
atomic_clear_int(&fp->f_flag, FHASLOCK);
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
goto done2;
}
@ -2239,9 +2204,7 @@ flock(struct thread *td, struct flock_args *uap)
error = EBADF;
goto done2;
}
FILE_LOCK(fp);
fp->f_flag |= FHASLOCK;
FILE_UNLOCK(fp);
atomic_set_int(&fp->f_flag, FHASLOCK);
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
(uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
@ -2286,9 +2249,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
*/
FILE_LOCK(wfp);
if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
FILE_UNLOCK(wfp);
FILEDESC_XUNLOCK(fdp);
return (EACCES);
}
@ -2297,8 +2258,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,
fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
if (fp == NULL)
fdused(fdp, indx);
fhold_locked(wfp);
FILE_UNLOCK(wfp);
fhold(wfp);
FILEDESC_XUNLOCK(fdp);
if (fp != NULL)
/*
@ -2419,29 +2379,23 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)
struct proc *p;
int error, n;
/*
* Note: because the number of file descriptors is calculated
* in different ways for sizing vs returning the data,
* there is information leakage from the first loop. However,
* it is of a similar order of magnitude to the leakage from
* global system statistics such as kern.openfiles.
*/
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
if (req->oldptr == NULL) {
n = 16; /* A slight overestimate. */
sx_slock(&filelist_lock);
LIST_FOREACH(fp, &filehead, f_list) {
/*
* We should grab the lock, but this is an
* estimate, so does it really matter?
*/
/* mtx_lock(fp->f_mtxp); */
n += fp->f_count;
/* mtx_unlock(f->f_mtxp); */
n = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
fdp = fdhold(p);
if (fdp == NULL)
continue;
/* overestimates sparse tables. */
n += fdp->fd_lastfile;
fddrop(fdp);
}
sx_sunlock(&filelist_lock);
sx_sunlock(&allproc_lock);
return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
}
error = 0;
@ -2472,7 +2426,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)
xf.xf_vnode = fp->f_vnode;
xf.xf_type = fp->f_type;
xf.xf_count = fp->f_count;
xf.xf_msgcount = fp->f_msgcount;
xf.xf_msgcount = 0;
xf.xf_offset = fp->f_offset;
xf.xf_flag = fp->f_flag;
error = SYSCTL_OUT(req, &xf, sizeof(xf));
@ -2523,7 +2477,6 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
continue;
bzero(kif, sizeof(*kif));
kif->kf_structsize = sizeof(*kif);
FILE_LOCK(fp);
vp = NULL;
so = NULL;
kif->kf_fd = i;
@ -2531,7 +2484,6 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
case DTYPE_VNODE:
kif->kf_type = KF_TYPE_VNODE;
vp = fp->f_vnode;
vref(vp);
break;
case DTYPE_SOCKET:
@ -2583,8 +2535,8 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
if (fp->f_flag & FHASLOCK)
kif->kf_flags |= KF_FLAG_HASLOCK;
kif->kf_offset = fp->f_offset;
FILE_UNLOCK(fp);
if (vp != NULL) {
vref(vp);
switch (vp->v_type) {
case VNON:
kif->kf_vnode_type = KF_VTYPE_VNON;
@ -2736,7 +2688,7 @@ db_print_file(struct file *fp, int header)
p = file_to_first_proc(fp);
db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode,
0, fp->f_count, 0, fp->f_vnode,
p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
}
@ -2754,13 +2706,24 @@ DB_SHOW_COMMAND(file, db_show_file)
DB_SHOW_COMMAND(files, db_show_files)
{
struct filedesc *fdp;
struct file *fp;
struct proc *p;
int header;
int n;
header = 1;
LIST_FOREACH(fp, &filehead, f_list) {
db_print_file(fp, header);
header = 0;
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
if ((fdp = p->p_fd) == NULL)
continue;
for (n = 0; n < fdp->fd_nfiles; ++n) {
if ((fp = fdp->fd_ofiles[n]) == NULL)
continue;
db_print_file(fp, header);
header = 0;
}
}
}
#endif
@ -2772,7 +2735,7 @@ SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
&maxfiles, 0, "Maximum number of files");
SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
&openfiles, 0, "System-wide number of open files");
__DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
/* ARGSUSED*/
static void
@ -2781,7 +2744,6 @@ filelistinit(void *dummy)
file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
sx_init(&filelist_lock, "filelist lock");
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
}


@ -531,12 +531,7 @@ kqueue(struct thread *td, struct kqueue_args *uap)
SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
FILEDESC_XUNLOCK(fdp);
FILE_LOCK(fp);
fp->f_flag = FREAD | FWRITE;
fp->f_type = DTYPE_KQUEUE;
fp->f_data = kq;
fp->f_ops = &kqueueops;
FILE_UNLOCK(fp);
finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
fdrop(fp, td);
td->td_retval[0] = fd;
@ -990,24 +985,17 @@ kqueue_acquire(struct file *fp, struct kqueue **kqp)
error = 0;
FILE_LOCK(fp);
do {
kq = fp->f_data;
if (fp->f_type != DTYPE_KQUEUE || kq == NULL) {
error = EBADF;
break;
}
*kqp = kq;
KQ_LOCK(kq);
if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
KQ_UNLOCK(kq);
error = EBADF;
break;
}
kq->kq_refcnt++;
kq = fp->f_data;
if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
return (EBADF);
*kqp = kq;
KQ_LOCK(kq);
if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
KQ_UNLOCK(kq);
} while (0);
FILE_UNLOCK(fp);
return (EBADF);
}
kq->kq_refcnt++;
KQ_UNLOCK(kq);
return error;
}


@ -646,21 +646,17 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
FILEDESC_XUNLOCK(fdp);
goto out;
case FIONBIO:
FILE_LOCK(fp);
if ((tmp = *(int *)data))
fp->f_flag |= FNONBLOCK;
atomic_set_int(&fp->f_flag, FNONBLOCK);
else
fp->f_flag &= ~FNONBLOCK;
FILE_UNLOCK(fp);
atomic_clear_int(&fp->f_flag, FNONBLOCK);
data = (void *)&tmp;
break;
case FIOASYNC:
FILE_LOCK(fp);
if ((tmp = *(int *)data))
fp->f_flag |= FASYNC;
atomic_set_int(&fp->f_flag, FASYNC);
else
fp->f_flag &= ~FASYNC;
FILE_UNLOCK(fp);
atomic_clear_int(&fp->f_flag, FASYNC);
data = (void *)&tmp;
break;
}


@ -363,12 +363,7 @@ pipe(td, uap)
* to avoid races against processes which manage to dup() the read
* side while we are blocked trying to allocate the write side.
*/
FILE_LOCK(rf);
rf->f_flag = FREAD | FWRITE;
rf->f_type = DTYPE_PIPE;
rf->f_data = rpipe;
rf->f_ops = &pipeops;
FILE_UNLOCK(rf);
finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops);
error = falloc(td, &wf, &fd);
if (error) {
fdclose(fdp, rf, td->td_retval[0], td);
@ -378,12 +373,7 @@ pipe(td, uap)
return (error);
}
/* An extra reference on `wf' has been held for us by falloc(). */
FILE_LOCK(wf);
wf->f_flag = FREAD | FWRITE;
wf->f_type = DTYPE_PIPE;
wf->f_data = wpipe;
wf->f_ops = &pipeops;
FILE_UNLOCK(wf);
finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops);
fdrop(wf, td);
td->td_retval[1] = fd;
fdrop(rf, td);


@ -1999,12 +1999,8 @@ kmq_open(struct thread *td, struct kmq_open_args *uap)
mqnode_addref(pn);
sx_xunlock(&mqfs_data.mi_lock);
FILE_LOCK(fp);
fp->f_flag = (flags & (FREAD | FWRITE | O_NONBLOCK));
fp->f_type = DTYPE_MQUEUE;
fp->f_data = pn;
fp->f_ops = &mqueueops;
FILE_UNLOCK(fp);
finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
&mqueueops);
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[fd] == fp)
@ -2097,6 +2093,7 @@ kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
struct mqueue *mq;
struct file *fp;
struct mq_attr attr, oattr;
u_int oflag, flag;
int error;
if (uap->attr) {
@ -2112,13 +2109,15 @@ kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
oattr.mq_maxmsg = mq->mq_maxmsg;
oattr.mq_msgsize = mq->mq_msgsize;
oattr.mq_curmsgs = mq->mq_curmsgs;
FILE_LOCK(fp);
oattr.mq_flags = (O_NONBLOCK & fp->f_flag);
if (uap->attr) {
fp->f_flag &= ~O_NONBLOCK;
fp->f_flag |= (attr.mq_flags & O_NONBLOCK);
}
FILE_UNLOCK(fp);
do {
oflag = flag = fp->f_flag;
flag &= ~O_NONBLOCK;
flag |= (attr.mq_flags & O_NONBLOCK);
} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
} else
oflag = fp->f_flag;
oattr.mq_flags = (O_NONBLOCK & oflag);
fdrop(fp, td);
if (uap->oattr)
error = copyout(&oattr, uap->oattr, sizeof(oattr));


@ -180,12 +180,7 @@ socket(td, uap)
if (error) {
fdclose(fdp, fp, fd, td);
} else {
FILE_LOCK(fp);
fp->f_data = so; /* already has ref count */
fp->f_flag = FREAD|FWRITE;
fp->f_type = DTYPE_SOCKET;
fp->f_ops = &socketops;
FILE_UNLOCK(fp);
finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
td->td_retval[0] = fd;
}
fdrop(fp, td);
@ -423,12 +418,7 @@ kern_accept(struct thread *td, int s, struct sockaddr **name,
if (pgid != 0)
fsetown(pgid, &so->so_sigio);
FILE_LOCK(nfp);
nfp->f_data = so; /* nfp has ref count from falloc */
nfp->f_flag = fflag;
nfp->f_type = DTYPE_SOCKET;
nfp->f_ops = &socketops;
FILE_UNLOCK(nfp);
finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
/* Sync socket nonblocking/async state with file flags */
tmp = fflag & FNONBLOCK;
(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
@ -640,16 +630,8 @@ socketpair(td, uap)
if (error)
goto free4;
}
FILE_LOCK(fp1);
fp1->f_flag = FREAD|FWRITE;
fp1->f_type = DTYPE_SOCKET;
fp1->f_ops = &socketops;
FILE_UNLOCK(fp1);
FILE_LOCK(fp2);
fp2->f_flag = FREAD|FWRITE;
fp2->f_type = DTYPE_SOCKET;
fp2->f_ops = &socketops;
FILE_UNLOCK(fp2);
finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
so1 = so2 = NULL;
error = copyout(sv, uap->rsv, 2 * sizeof (int));
if (error)
@ -2270,12 +2252,7 @@ sctp_peeloff(td, uap)
so->so_qstate &= ~SQ_COMP;
so->so_head = NULL;
ACCEPT_UNLOCK();
FILE_LOCK(nfp);
nfp->f_data = so;
nfp->f_flag = fflag;
nfp->f_type = DTYPE_SOCKET;
nfp->f_ops = &socketops;
FILE_UNLOCK(nfp);
finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
if (error)
goto noconnection;


@ -233,10 +233,11 @@ static void unp_shutdown(struct unpcb *);
static void unp_drop(struct unpcb *, int);
static void unp_gc(__unused void *, int);
static void unp_scan(struct mbuf *, void (*)(struct file *));
static void unp_mark(struct file *);
static void unp_discard(struct file *);
static void unp_freerights(struct file **, int);
static int unp_internalize(struct mbuf **, struct thread *);
static void unp_internalize_fp(struct file *);
static void unp_externalize_fp(struct file *);
static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
/*
@ -586,9 +587,9 @@ uipc_detach(struct socket *so)
unp_drop(ref, ECONNRESET);
UNP_PCB_UNLOCK(ref);
}
local_unp_rights = unp_rights;
UNP_GLOBAL_WUNLOCK();
unp->unp_socket->so_pcb = NULL;
local_unp_rights = unp_rights;
saved_unp_addr = unp->unp_addr;
unp->unp_addr = NULL;
unp->unp_refcount--;
@ -1600,10 +1601,7 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp)
panic("unp_externalize fdalloc failed");
fp = *rp++;
td->td_proc->p_fd->fd_ofiles[f] = fp;
FILE_LOCK(fp);
fp->f_msgcount--;
FILE_UNLOCK(fp);
unp_rights--;
unp_externalize_fp(fp);
*fdp++ = f;
}
FILEDESC_XUNLOCK(td->td_proc->p_fd);
@ -1765,11 +1763,8 @@ unp_internalize(struct mbuf **controlp, struct thread *td)
for (i = 0; i < oldfds; i++) {
fp = fdescp->fd_ofiles[*fdp++];
*rp++ = fp;
FILE_LOCK(fp);
fp->f_count++;
fp->f_msgcount++;
FILE_UNLOCK(fp);
unp_rights++;
fhold(fp);
unp_internalize_fp(fp);
}
FILEDESC_SUNLOCK(fdescp);
break;
@ -1860,230 +1855,198 @@ unp_addsockcred(struct thread *td, struct mbuf *control)
return (m);
}
static struct unpcb *
fptounp(struct file *fp)
{
struct socket *so;
if (fp->f_type != DTYPE_SOCKET)
return (NULL);
if ((so = fp->f_data) == NULL)
return (NULL);
if (so->so_proto->pr_domain != &localdomain)
return (NULL);
return sotounpcb(so);
}
static void
unp_discard(struct file *fp)
{
unp_externalize_fp(fp);
(void) closef(fp, (struct thread *)NULL);
}
static void
unp_internalize_fp(struct file *fp)
{
struct unpcb *unp;
UNP_GLOBAL_WLOCK();
if ((unp = fptounp(fp)) != NULL) {
unp->unp_file = fp;
unp->unp_msgcount++;
}
unp_rights++;
UNP_GLOBAL_WUNLOCK();
}
static void
unp_externalize_fp(struct file *fp)
{
struct unpcb *unp;
UNP_GLOBAL_WLOCK();
if ((unp = fptounp(fp)) != NULL)
unp->unp_msgcount--;
unp_rights--;
UNP_GLOBAL_WUNLOCK();
}
/*
* unp_defer indicates whether additional work has been deferred for a future
* pass through unp_gc(). It is thread local and does not require explicit
* synchronization.
*/
static int unp_defer;
static int unp_marked;
static int unp_unreachable;
static int unp_taskcount;
SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "");
static void
unp_accessable(struct file *fp)
{
struct unpcb *unp;
if ((unp = fptounp(fp)) == NULL)
return;
if (unp->unp_gcflag & UNPGC_REF)
return;
unp->unp_gcflag &= ~UNPGC_DEAD;
unp->unp_gcflag |= UNPGC_REF;
unp_marked++;
}
static void
unp_gc_process(struct unpcb *unp)
{
struct socket *soa;
struct socket *so;
struct file *fp;
/* Already processed. */
if (unp->unp_gcflag & UNPGC_SCANNED)
return;
fp = unp->unp_file;
/*
* Check for a socket potentially in a cycle. It must be in a
* queue as indicated by msgcount, and this must equal the file
* reference count. Note that when msgcount is 0 the file is NULL.
*/
if (unp->unp_msgcount != 0 && fp->f_count != 0 &&
fp->f_count == unp->unp_msgcount) {
unp->unp_gcflag |= UNPGC_DEAD;
unp_unreachable++;
return;
}
/*
* Mark all sockets we reference with RIGHTS.
*/
so = unp->unp_socket;
SOCKBUF_LOCK(&so->so_rcv);
unp_scan(so->so_rcv.sb_mb, unp_accessable);
SOCKBUF_UNLOCK(&so->so_rcv);
/*
* Mark all sockets in our accept queue.
*/
ACCEPT_LOCK();
TAILQ_FOREACH(soa, &so->so_comp, so_list) {
SOCKBUF_LOCK(&soa->so_rcv);
unp_scan(soa->so_rcv.sb_mb, unp_accessable);
SOCKBUF_UNLOCK(&soa->so_rcv);
}
ACCEPT_UNLOCK();
unp->unp_gcflag |= UNPGC_SCANNED;
}
static int unp_recycled;
SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, "");
static int unp_taskcount;
SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "");
static void
unp_gc(__unused void *arg, int pending)
{
struct file *fp, *nextfp;
struct socket *so;
struct file **extra_ref, **fpp;
int nunref, i;
int nfiles_snap;
int nfiles_slack = 20;
struct unp_head *heads[] = { &unp_dhead, &unp_shead, NULL };
struct unp_head **head;
struct file **unref;
struct unpcb *unp;
int i;
unp_taskcount++;
unp_defer = 0;
UNP_GLOBAL_RLOCK();
/*
* Before going through all this, set all FDs to be NOT deferred and
* NOT externally accessible.
* First clear all gc flags from previous runs.
*/
for (head = heads; *head != NULL; head++)
LIST_FOREACH(unp, *head, unp_link)
unp->unp_gcflag &= ~(UNPGC_REF|UNPGC_DEAD);
/*
* Scan marking all reachable sockets with UNPGC_REF. Once a socket
* is reachable all of the sockets it references are reachable.
* Stop the scan once we do a complete loop without discovering
* a new reachable socket.
*/
sx_slock(&filelist_lock);
LIST_FOREACH(fp, &filehead, f_list)
fp->f_gcflag &= ~(FMARK|FDEFER);
do {
KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer));
LIST_FOREACH(fp, &filehead, f_list) {
FILE_LOCK(fp);
/*
* If the file is not open, skip it -- could be a
* file in the process of being opened, or in the
* process of being closed. If the file is
* "closing", it may have been marked for deferred
* consideration. Clear the flag now if so.
*/
if (fp->f_count == 0) {
if (fp->f_gcflag & FDEFER)
unp_defer--;
fp->f_gcflag &= ~(FMARK|FDEFER);
FILE_UNLOCK(fp);
continue;
}
/*
* If we already marked it as 'defer' in a
* previous pass, then try to process it this
* time and un-mark it.
*/
if (fp->f_gcflag & FDEFER) {
fp->f_gcflag &= ~FDEFER;
unp_defer--;
} else {
/*
* If it's not deferred, then check if it's
* already marked.. if so skip it
*/
if (fp->f_gcflag & FMARK) {
FILE_UNLOCK(fp);
continue;
}
/*
* If all references are from messages in
* transit, then skip it. it's not externally
* accessible.
*/
if (fp->f_count == fp->f_msgcount) {
FILE_UNLOCK(fp);
continue;
}
/*
* If it got this far then it must be
* externally accessible.
*/
fp->f_gcflag |= FMARK;
}
/*
* Either it was deferred, or it is externally
* accessible and not already marked so. Now check
* if it is possibly one of OUR sockets.
*/
if (fp->f_type != DTYPE_SOCKET ||
(so = fp->f_data) == NULL) {
FILE_UNLOCK(fp);
continue;
}
if (so->so_proto->pr_domain != &localdomain ||
(so->so_proto->pr_flags & PR_RIGHTS) == 0) {
FILE_UNLOCK(fp);
continue;
}
/*
* Tell any other threads that do a subsequent
* fdrop() that we are scanning the message
* buffers.
*/
fp->f_gcflag |= FWAIT;
FILE_UNLOCK(fp);
/*
* So, Ok, it's one of our sockets and it IS
* externally accessible (or was deferred). Now we
* look to see if we hold any file descriptors in its
* message buffers. Follow those links and mark them
* as accessible too.
*/
SOCKBUF_LOCK(&so->so_rcv);
unp_scan(so->so_rcv.sb_mb, unp_mark);
SOCKBUF_UNLOCK(&so->so_rcv);
/*
* Wake up any threads waiting in fdrop().
*/
FILE_LOCK(fp);
fp->f_gcflag &= ~FWAIT;
wakeup(&fp->f_gcflag);
FILE_UNLOCK(fp);
}
} while (unp_defer);
sx_sunlock(&filelist_lock);
unp_unreachable = 0;
unp_marked = 0;
for (head = heads; *head != NULL; head++)
LIST_FOREACH(unp, *head, unp_link)
unp_gc_process(unp);
} while (unp_marked);
UNP_GLOBAL_RUNLOCK();
if (unp_unreachable == 0)
return;
/*
* XXXRW: The following comments need updating for a post-SMPng and
* deferred unp_gc() world, but are still generally accurate.
*
* We grab an extra reference to each of the file table entries that
* are not otherwise accessible and then free the rights that are
* stored in messages on them.
*
* The bug in the orginal code is a little tricky, so I'll describe
* what's wrong with it here.
*
* It is incorrect to simply unp_discard each entry for f_msgcount
* times -- consider the case of sockets A and B that contain
* references to each other. On a last close of some other socket,
* we trigger a gc since the number of outstanding rights (unp_rights)
* is non-zero. If during the sweep phase the gc code unp_discards,
* we end up doing a (full) closef on the descriptor. A closef on A
* results in the following chain. Closef calls soo_close, which
* calls soclose. Soclose calls first (through the switch
* uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
* returns because the previous instance had set unp_gcing, and we
* return all the way back to soclose, which marks the socket with
* SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free
* up the rights that are queued in messages on the socket A, i.e.,
* the reference on B. The sorflush calls via the dom_dispose switch
* unp_dispose, which unp_scans with unp_discard. This second
* instance of unp_discard just calls closef on B.
*
* Well, a similar chain occurs on B, resulting in a sorflush on B,
* which results in another closef on A. Unfortunately, A is already
* being closed, and the descriptor has already been marked with
* SS_NOFDREF, and soclose panics at this point.
*
* Here, we first take an extra reference to each inaccessible
* descriptor. Then, we call sorflush ourself, since we know it is a
* Unix domain socket anyhow. After we destroy all the rights
* carried in messages, we do a last closef to get rid of our extra
* reference. This is the last close, and the unp_detach etc will
* shut down the socket.
*
* 91/09/19, bsy@cs.cmu.edu
* Allocate space for a local list of dead unpcbs.
*/
again:
nfiles_snap = openfiles + nfiles_slack; /* some slack */
extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP,
M_WAITOK);
sx_slock(&filelist_lock);
if (nfiles_snap < openfiles) {
sx_sunlock(&filelist_lock);
free(extra_ref, M_TEMP);
nfiles_slack += 20;
goto again;
}
for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
fp != NULL; fp = nextfp) {
nextfp = LIST_NEXT(fp, f_list);
FILE_LOCK(fp);
/*
* If it's not open, skip it
*/
if (fp->f_count == 0) {
FILE_UNLOCK(fp);
continue;
}
/*
* If all refs are from msgs, and it's not marked accessible
* then it must be referenced from some unreachable cycle of
* (shut-down) FDs, so include it in our list of FDs to
* remove.
*/
if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
*fpp++ = fp;
nunref++;
fp->f_count++;
}
FILE_UNLOCK(fp);
}
sx_sunlock(&filelist_lock);
unref = malloc(unp_unreachable * sizeof(struct file *),
M_TEMP, M_WAITOK);
/*
* For each FD on our hit list, do the following two things:
* Iterate looking for sockets which have been specifically marked
* as unreachable and store them locally.
*/
for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
struct file *tfp = *fpp;
FILE_LOCK(tfp);
if (tfp->f_type == DTYPE_SOCKET &&
tfp->f_data != NULL) {
FILE_UNLOCK(tfp);
sorflush(tfp->f_data);
} else {
FILE_UNLOCK(tfp);
}
}
for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
closef(*fpp, (struct thread *) NULL);
unp_recycled++;
}
free(extra_ref, M_TEMP);
UNP_GLOBAL_RLOCK();
for (i = 0, head = heads; *head != NULL; head++)
LIST_FOREACH(unp, *head, unp_link)
if (unp->unp_gcflag & UNPGC_DEAD) {
unref[i++] = unp->unp_file;
KASSERT(unp->unp_file != NULL,
("unp_gc: Invalid unpcb."));
KASSERT(i <= unp_unreachable,
("unp_gc: incorrect unreachable count."));
}
UNP_GLOBAL_RUNLOCK();
/*
* All further operation is now done on a local list. We first ref
* all sockets to avoid closing them until all are flushed.
*/
for (i = 0; i < unp_unreachable; i++)
fhold(unref[i]);
/*
* Now flush all sockets, freeing rights. This will free the
* struct files associated with these sockets but leave each socket
* with one remaining ref.
*/
for (i = 0; i < unp_unreachable; i++)
sorflush(unref[i]->f_data);
/*
* And finally release the sockets so they can be reclaimed.
*/
for (i = 0; i < unp_unreachable; i++)
fdrop(unref[i], NULL);
unp_recycled += unp_unreachable;
free(unref, M_TEMP);
}
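
The cycle this collector reclaims is easy to construct from userland:
two sockets, each carried only in the other's receive buffer via
SCM_RIGHTS, with no descriptors left.  A standalone test-program sketch
(ordinary sockets API; this is an illustration, not code from this
commit):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <unistd.h>

/* Pass descriptor 'fd' across 'sock' as SCM_RIGHTS control data. */
static void
send_fd(int sock, int fd)
{
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct msghdr msg;
	struct cmsghdr *cm;
	struct iovec iov;
	char c = 0;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &c;
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cm), &fd, sizeof(int));
	(void)sendmsg(sock, &msg, 0);
}

int
main(void)
{
	int sv[2];

	(void)socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
	send_fd(sv[0], sv[1]);	/* each endpoint now rides in the */
	send_fd(sv[1], sv[0]);	/* other's receive buffer */
	/*
	 * Closing the last descriptors leaves both files with
	 * f_count == unp_msgcount, exactly the condition that
	 * unp_gc_process() marks UNPGC_DEAD, so unp_gc() frees them.
	 */
	close(sv[0]);
	close(sv[1]);
	return (0);
}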
void
@ -2143,31 +2106,6 @@ unp_scan(struct mbuf *m0, void (*op)(struct file *))
}
}
static void
unp_mark(struct file *fp)
{
/* XXXRW: Should probably assert file list lock here. */
if (fp->f_gcflag & FMARK)
return;
unp_defer++;
fp->f_gcflag |= (FMARK|FDEFER);
}
static void
unp_discard(struct file *fp)
{
UNP_GLOBAL_WLOCK();
FILE_LOCK(fp);
fp->f_msgcount--;
unp_rights--;
FILE_UNLOCK(fp);
UNP_GLOBAL_WUNLOCK();
(void) closef(fp, (struct thread *)NULL);
}
#ifdef DDB
static void
db_print_indent(int indent)


@ -1022,6 +1022,8 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
return (error);
/* An extra reference on `nfp' has been held for us by falloc(). */
fp = nfp;
/* Set the flags early so the finit in devfs can pick them up. */
fp->f_flag = flags & FMASK;
cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td);
td->td_dupfd = -1; /* XXX check for fdopen */
@ -1067,16 +1069,16 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
FILE_LOCK(fp);
fp->f_vnode = vp;
if (fp->f_data == NULL)
fp->f_data = vp;
fp->f_flag = flags & FMASK;
fp->f_seqcount = 1;
fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
if (fp->f_ops == &badfileops)
fp->f_ops = &vnops;
FILE_UNLOCK(fp);
fp->f_vnode = vp; /* XXX Does devfs need this? */
/*
* If the file wasn't claimed by devfs bind it to the normal
* vnode operations here.
*/
if (fp->f_ops == &badfileops) {
KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
fp->f_seqcount = 1;
finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
}
VOP_UNLOCK(vp, 0, td);
if (flags & (O_EXLOCK | O_SHLOCK)) {
@ -1093,7 +1095,7 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
type)) != 0)
goto bad;
fp->f_flag |= FHASLOCK;
atomic_set_int(&fp->f_flag, FHASLOCK);
}
if (flags & O_TRUNC) {
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
@ -4179,14 +4181,8 @@ fhopen(td, uap)
}
/* An extra reference on `nfp' has been held for us by falloc(). */
fp = nfp;
FILE_LOCK(nfp);
nfp->f_vnode = vp;
nfp->f_data = vp;
nfp->f_flag = fmode & FMASK;
nfp->f_type = DTYPE_VNODE;
nfp->f_ops = &vnops;
FILE_UNLOCK(nfp);
finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
if (fmode & (O_EXLOCK | O_SHLOCK)) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
@ -4215,7 +4211,7 @@ fhopen(td, uap)
goto out;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
fp->f_flag |= FHASLOCK;
atomic_set_int(&fp->f_flag, FHASLOCK);
}
VOP_UNLOCK(vp, 0, td);


@ -488,10 +488,12 @@ vn_read(fp, uio, active_cred, flags, td)
{
struct vnode *vp;
int error, ioflag;
struct mtx *mtxp;
int vfslocked;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
mtxp = NULL;
vp = fp->f_vnode;
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
@ -505,13 +507,15 @@ vn_read(fp, uio, active_cred, flags, td)
* It is now protected by the FOFFSET_LOCKED flag.
*/
if ((flags & FOF_OFFSET) == 0) {
FILE_LOCK(fp);
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
while(fp->f_vnread_flags & FOFFSET_LOCKED) {
fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
msleep(&fp->f_vnread_flags,fp->f_mtxp,PUSER -1,"vnread offlock",0);
msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
"vnread offlock", 0);
}
fp->f_vnread_flags |= FOFFSET_LOCKED;
FILE_UNLOCK(fp);
mtx_unlock(mtxp);
vn_lock(vp, LK_SHARED | LK_RETRY, td);
uio->uio_offset = fp->f_offset;
} else
@ -526,11 +530,11 @@ vn_read(fp, uio, active_cred, flags, td)
error = VOP_READ(vp, uio, ioflag, fp->f_cred);
if ((flags & FOF_OFFSET) == 0) {
fp->f_offset = uio->uio_offset;
FILE_LOCK(fp);
mtx_lock(mtxp);
if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
wakeup(&fp->f_vnread_flags);
fp->f_vnread_flags = 0;
FILE_UNLOCK(fp);
mtx_unlock(mtxp);
}
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0, td);
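
Condensed, the interlock above is a one-flag sleep lock built on a pool
mutex in place of the removed f_mtxp.  A sketch of the two halves with
hypothetical helper names foffset_lock() and foffset_unlock(), factored
directly from the code above:

static void
foffset_lock(struct file *fp)
{
	struct mtx *mtxp;

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	while (fp->f_vnread_flags & FOFFSET_LOCKED) {
		fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
		msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
		    "vnread offlock", 0);
	}
	fp->f_vnread_flags |= FOFFSET_LOCKED;
	mtx_unlock(mtxp);
}

static void
foffset_unlock(struct file *fp)
{
	struct mtx *mtxp;

	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
		wakeup(&fp->f_vnread_flags);
	fp->f_vnread_flags = 0;
	mtx_unlock(mtxp);
}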


@ -689,7 +689,7 @@ ng_internalize(struct mbuf *control, struct thread *td)
vn = fp->f_data;
if (vn && (vn->v_type == VCHR)) {
/* for a VCHR, actually reference the FILE */
fp->f_count++;
fhold(fp);
/* XXX then what :) */
/* how to pass on to other modules? */
} else {


@ -840,12 +840,7 @@ cryptoioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread
return (error);
}
/* falloc automatically provides an extra reference to 'f'. */
FILE_LOCK(f);
f->f_flag = FREAD | FWRITE;
f->f_type = DTYPE_CRYPTO;
f->f_data = fcr;
f->f_ops = &cryptofops;
FILE_UNLOCK(f);
finit(f, FREAD | FWRITE, DTYPE_CRYPTO, fcr, &cryptofops);
*(u_int32_t *)data = fd;
fdrop(f, td);
break;


@ -99,49 +99,37 @@ struct fileops {
*
* Below is the list of locks that protects members in struct file.
*
* (fl) filelist_lock
* (f) f_mtx in struct file
* (f) protected with mtx_lock(mtx_pool_find(fp))
* none not locked
*/
struct file {
LIST_ENTRY(file) f_list;/* (fl) list of active files */
short f_type; /* descriptor type */
void *f_data; /* file descriptor specific data */
u_int f_flag; /* see fcntl.h */
struct mtx *f_mtxp; /* mutex to protect data */
struct fileops *f_ops; /* File operations */
struct ucred *f_cred; /* credentials associated with descriptor */
int f_count; /* (f) reference count */
struct vnode *f_vnode; /* NULL or applicable vnode */
/* DFLAG_SEEKABLE specific fields */
off_t f_offset;
short f_vnread_flags; /*
* (f) home grown sleep lock for f_offset
* Used only for shared vnode locking in
* vnread()
*/
#define FOFFSET_LOCKED 0x1
#define FOFFSET_LOCK_WAITING 0x2
/* DTYPE_SOCKET specific fields */
short f_gcflag; /* used by thread doing fd garbage collection */
#define FMARK 0x1 /* mark during gc() */
#define FDEFER 0x2 /* defer for next gc pass */
#define FWAIT 0x4 /* gc is scanning message buffers */
int f_msgcount; /* (f) references from message queue */
/* DTYPE_VNODE specific fields */
int f_seqcount; /*
* count of sequential accesses -- cleared
* by most seek operations.
*/
off_t f_nextoff; /*
* offset of next expected read or write
*/
void *f_label; /* Place-holder for struct label pointer. */
void *f_data; /* file descriptor specific data */
struct fileops *f_ops; /* File operations */
struct ucred *f_cred; /* associated credentials. */
struct vnode *f_vnode; /* NULL or applicable vnode */
short f_type; /* descriptor type */
short f_vnread_flags; /* (f) Sleep lock for f_offset */
volatile u_int f_flag; /* see fcntl.h */
volatile int f_count; /* reference count */
/*
* DTYPE_VNODE specific fields.
*/
int f_seqcount; /* Count of sequential accesses. */
off_t f_nextoff; /* next expected read/write offset. */
/*
* DFLAG_SEEKABLE specific fields
*/
off_t f_offset;
/*
* Mandatory Access control information.
*/
void *f_label; /* Place-holder for MAC label. */
};
#define FOFFSET_LOCKED 0x1
#define FOFFSET_LOCK_WAITING 0x2
#endif /* _KERNEL */
/*
@ -168,20 +156,17 @@ struct xfile {
MALLOC_DECLARE(M_FILE);
#endif
LIST_HEAD(filelist, file);
extern struct filelist filehead; /* (fl) head of list of open files */
extern struct fileops vnops;
extern struct fileops badfileops;
extern struct fileops socketops;
extern int maxfiles; /* kernel limit on number of open files */
extern int maxfilesperproc; /* per process limit on number of open files */
extern int openfiles; /* (fl) actual number of open files */
extern struct sx filelist_lock; /* sx to protect filelist and openfiles */
extern volatile int openfiles; /* actual number of open files */
int fget(struct thread *td, int fd, struct file **fpp);
int fget_read(struct thread *td, int fd, struct file **fpp);
int fget_write(struct thread *td, int fd, struct file **fpp);
int fdrop(struct file *fp, struct thread *td);
int _fdrop(struct file *fp, struct thread *td);
/*
* The socket operations are used a couple of places.
@ -196,12 +181,7 @@ fo_kqfilter_t soo_kqfilter;
fo_stat_t soo_stat;
fo_close_t soo_close;
/* Lock a file. */
#define FILE_LOCK(f) mtx_lock((f)->f_mtxp)
#define FILE_UNLOCK(f) mtx_unlock((f)->f_mtxp)
#define FILE_LOCKED(f) mtx_owned((f)->f_mtxp)
#define FILE_LOCK_ASSERT(f, type) mtx_assert((f)->f_mtxp, (type))
void finit(struct file *, u_int, short, void *, struct fileops *);
int fgetvp(struct thread *td, int fd, struct vnode **vpp);
int fgetvp_read(struct thread *td, int fd, struct vnode **vpp);
int fgetvp_write(struct thread *td, int fd, struct vnode **vpp);
@ -209,18 +189,9 @@ int fgetvp_write(struct thread *td, int fd, struct vnode **vpp);
int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp);
void fputsock(struct socket *sp);
#define fhold_locked(fp) \
do { \
FILE_LOCK_ASSERT(fp, MA_OWNED); \
(fp)->f_count++; \
} while (0)
#define fhold(fp) \
do { \
FILE_LOCK(fp); \
(fp)->f_count++; \
FILE_UNLOCK(fp); \
} while (0)
#define fhold(fp) atomic_add_int(&(fp)->f_count, 1)
#define fdrop(fp, td) \
(atomic_fetchadd_int(&(fp)->f_count, -1) <= 1 ? _fdrop((fp), (td)) : 0)
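
The new macros turn reference counting into single atomic instructions.
atomic_fetchadd_int() returns the value f_count held before the
decrement, so a result of 1 (or less) means the caller just dropped the
final reference and _fdrop() runs the close path.  A usage sketch; the
function below is hypothetical:

void
file_ref_example(struct file *fp, struct thread *td)
{

	fhold(fp);		/* f_count: n -> n + 1, no lock taken */
	/* ... use fp; there is no FILE_LOCK any more ... */
	(void)fdrop(fp, td);	/* f_count: n + 1 -> n; close on 1 -> 0 */
}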
static __inline fo_rdwr_t fo_read;
static __inline fo_rdwr_t fo_write;


@ -67,6 +67,7 @@ LIST_HEAD(unp_head, unpcb);
struct unpcb {
LIST_ENTRY(unpcb) unp_link; /* glue on list of all PCBs */
struct socket *unp_socket; /* pointer back to socket */
struct file *unp_file; /* back-pointer to file for gc. */
struct vnode *unp_vnode; /* if associated with file */
ino_t unp_ino; /* fake inode number */
struct unpcb *unp_conn; /* control block of connected socket */
@ -76,9 +77,11 @@ struct unpcb {
int unp_cc; /* copy of rcv.sb_cc */
int unp_mbcnt; /* copy of rcv.sb_mbcnt */
unp_gen_t unp_gencnt; /* generation count of this instance */
int unp_flags; /* flags */
short unp_flags; /* flags */
short unp_gcflag; /* Garbage collector flags. */
struct xucred unp_peercred; /* peer credentials, if applicable */
u_int unp_refcount;
u_int unp_msgcount; /* references from message queue */
struct mtx unp_mtx; /* mutex */
};
@ -100,6 +103,10 @@ struct unpcb {
#define UNP_WANTCRED 0x004 /* credentials wanted */
#define UNP_CONNWAIT 0x008 /* connect blocks until accepted */
#define UNPGC_REF 0x1 /* unpcb has external ref. */
#define UNPGC_DEAD 0x2 /* unpcb might be dead. */
#define UNPGC_SCANNED 0x4 /* Has been scanned. */
/*
* These flags are used to handle non-atomicity in connect() and bind()
* operations on a socket: in particular, to avoid races between multiple