From 92da00bb245b0398f04fdd966ce001599060f40c Mon Sep 17 00:00:00 2001
From: Matthew Dillon <dillon@FreeBSD.org>
Date: Sun, 15 Dec 2002 19:17:57 +0000
Subject: [PATCH] This is David Schultz's swapoff code which I am finally able
 to commit. This should be considered highly experimental for the moment.

Submitted by:	David Schultz <dschultz@uclink.Berkeley.EDU>
MFC after:	3 weeks
---
 include/unistd.h                     |   1 +
 lib/libc/sys/Makefile.inc            |   1 +
 lib/libc/sys/swapon.2                |  63 +++++---
 sbin/swapon/Makefile                 |   2 +
 sbin/swapon/swapon.8                 |  37 +++--
 sbin/swapon/swapon.c                 |  43 ++++--
 sys/amd64/ia32/syscalls.master       |   1 +
 sys/compat/freebsd32/syscalls.master |   1 +
 sys/ia64/ia32/syscalls.master        |   1 +
 sys/kern/subr_blist.c                | 215 +++++++++++++++++++++++----
 sys/kern/syscalls.master             |   1 +
 sys/sys/blist.h                      |   2 +
 sys/sys/conf.h                       |   1 +
 sys/sys/linedisc.h                   |   1 +
 sys/vm/swap_pager.c                  | 159 +++++++++++++++++++-
 sys/vm/swap_pager.h                  |   2 +
 sys/vm/vm_glue.c                     |  40 +++++
 sys/vm/vm_pageout.h                  |   6 +
 sys/vm/vm_swap.c                     | 140 ++++++++++++++++-
 19 files changed, 633 insertions(+), 84 deletions(-)

diff --git a/include/unistd.h b/include/unistd.h
index 160dada0a1c7..68e456f9561b 100644
--- a/include/unistd.h
+++ b/include/unistd.h
@@ -522,6 +522,7 @@ int	 setruid(uid_t);
 void	 setusershell(void);
 int	 strtofflags(char **, u_long *, u_long *);
 int	 swapon(const char *);
+int	 swapoff(const char *);
 int	 syscall(int, ...);
 off_t	 __syscall(quad_t, ...);
 int	 ttyslot(void);
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index 4fe3d7e086fd..257ef7a32f81 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -131,6 +131,7 @@ MLINKS+=shmat.2 shmdt.2
 MLINKS+=stat.2 fstat.2 stat.2 lstat.2
 MLINKS+=statfs.2 fstatfs.2
 MLINKS+=syscall.2 __syscall.2
+MLINKS+=swapon.2 swapoff.2
 MLINKS+=truncate.2 ftruncate.2
 MLINKS+=utimes.2 futimes.2 utimes.2 lutimes.2
 MLINKS+=wait.2 wait3.2 wait.2 wait4.2 wait.2 waitpid.2
diff --git a/lib/libc/sys/swapon.2 b/lib/libc/sys/swapon.2
index a852ba012c4b..f4f90926352d 100644
--- a/lib/libc/sys/swapon.2
+++ b/lib/libc/sys/swapon.2
@@ -36,14 +36,16 @@
 .Dt SWAPON 2
 .Os
 .Sh NAME
-.Nm swapon
-.Nd add a swap device for interleaved paging/swapping
+.Nm swapon , swapoff
+.Nd control devices for interleaved paging/swapping
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In unistd.h
 .Ft int
 .Fn swapon "const char *special"
+.Ft int
+.Fn swapoff "const char *special"
 .Sh DESCRIPTION
 .Fn Swapon
 makes the block device
@@ -55,13 +57,22 @@ configuration time.  The size of the swap area on
 .Fa special
 is calculated at the time the device is first made available
 for swapping.
+.Pp
+The
+.Fn swapoff
+system call disables paging and swapping on the given device.
+All associated swap metadata are deallocated, and the device
+is made available for other purposes.
 .Sh RETURN VALUES
 If an error has occurred, a value of -1 is returned and
 .Va errno
 is set to indicate the error.
 .Sh ERRORS
-.Fn Swapon
-succeeds unless:
+Both
+.Fn swapon
+and
+.Fn swapoff
+can fail if:
 .Bl -tag -width Er
 .It Bq Er ENOTDIR
 A component of the path prefix is not a directory.
@@ -76,6 +87,19 @@ Search permission is denied for a component of the path prefix.
 Too many symbolic links were encountered in translating the pathname.
 .It Bq Er EPERM
 The caller is not the super-user.
+.It Bq Er EFAULT
+.Fa Special
+points outside the process's allocated address space.
+.El
+.Pp
+Additionally,
+.Fn swapon
+can fail for the following reasons:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+The system has reached the boot-time limit on the number of
+swap devices,
+.Va vm.nswapdev .
 .It Bq Er ENOTBLK
 .Fa Special
 is not a block device.
@@ -84,11 +108,6 @@ The device specified by
 .Fa special
 has already
 been made available for swapping
-.It Bq Er EINVAL
-The device configured by
-.Fa special
-was not
-configured into the system as a swap device.
 .It Bq Er ENXIO
 The major device number of
 .Fa special
@@ -96,20 +115,28 @@ is out of range (this indicates no device driver exists
 for the associated hardware).
 .It Bq Er EIO
 An I/O error occurred while opening the swap device.
-.It Bq Er EFAULT
-.Fa Special
-points outside the process's allocated address space.
+.El
+.Pp
+Lastly,
+.Fn swapoff
+can fail if:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+The system is not currently swapping to
+.Fa special .
+.It Bq Er ENOMEM
+Not enough virtual memory is available to safely disable
+paging and swapping to the given device.
 .El
 .Sh SEE ALSO
 .Xr config 8 ,
-.Xr swapon 8
-.Sh BUGS
-There is no way to stop swapping on a disk so that the pack may be
-dismounted.
-.Pp
-This call will be upgraded in future versions of the system.
+.Xr swapon 8 ,
+.Xr sysctl 8
 .Sh HISTORY
 The
 .Fn swapon
 function call appeared in
 .Bx 4.0 .
+.Fn Swapoff
+appeared in
+.Fx 5.0 .
diff --git a/sbin/swapon/Makefile b/sbin/swapon/Makefile
index be803b298a37..f052567e76cc 100644
--- a/sbin/swapon/Makefile
+++ b/sbin/swapon/Makefile
@@ -3,5 +3,7 @@
 
 PROG=	swapon
 MAN=	swapon.8
+LINKS=	${BINDIR}/swapon ${BINDIR}/swapoff
+MLINKS=	swapon.8 swapoff.8
 
 .include <bsd.prog.mk>
diff --git a/sbin/swapon/swapon.8 b/sbin/swapon/swapon.8
index ce23b38e95ca..edda998a38cc 100644
--- a/sbin/swapon/swapon.8
+++ b/sbin/swapon/swapon.8
@@ -36,39 +36,46 @@
 .Dt SWAPON 8
 .Os
 .Sh NAME
-.Nm swapon
-.Nd "specify additional device for paging and swapping"
+.Nm swapon , swapoff
+.Nd "specify devices for paging and swapping"
 .Sh SYNOPSIS
-.Nm
+.Nm swap[on|off]
 .Fl a
-.Nm
+.Nm swap[on|off]
 .Ar special_file ...
 .Sh DESCRIPTION
 The
-.Nm
+.Nm swapon
 utility is used to specify additional devices on which paging and swapping
 are to take place.
 The system begins by swapping and paging on only a single device
 so that only one disk is required at bootstrap time.
 Calls to
-.Nm
+.Nm swapon
 normally occur in the system multi-user initialization file
 .Pa /etc/rc
 making all swap devices available, so that the paging and swapping
 activity is interleaved across several devices.
 .Pp
+The
+.Nm swapoff
+utility disables paging and swapping on a device.
+Calls to
+.Nm swapoff
+succeed only if disabling the device would leave enough
+remaining virtual memory to accommodate all running programs.
+.Pp
 Normally, the first form is used:
 .Bl -tag -width indent
 .It Fl a
 All devices marked as ``sw''
 swap devices in
 .Pa /etc/fstab
-are made available unless their ``noauto'' option is also set.
+are added to or removed from the pool of available swap
+unless their ``noauto'' option is also set.
 .El
 .Pp
-The second form gives individual block devices as given
-in the system swap configuration table.  The call makes only this space
-available to the system for swap allocation.
+The second form is used to configure or disable individual devices.
 .Sh SEE ALSO
 .Xr swapon 2 ,
 .Xr fstab 5 ,
@@ -85,12 +92,12 @@ memory disk devices
 .It Pa /etc/fstab
 ASCII file system description table
 .El
-.Sh BUGS
-There is no way to stop paging and swapping on a device.
-It is therefore not possible to dismount swap devices which are
-mounted during system operation.
 .Sh HISTORY
 The
-.Nm
+.Nm swapon
 utility appeared in
 .Bx 4.0 .
+The
+.Nm swapoff
+utility appeared in
+.Fx 5.0 .
diff --git a/sbin/swapon/swapon.c b/sbin/swapon/swapon.c
index 69f4e73c7070..51042bc5bec7 100644
--- a/sbin/swapon/swapon.c
+++ b/sbin/swapon/swapon.c
@@ -53,8 +53,9 @@ static const char rcsid[] =
 #include <string.h>
 #include <unistd.h>
 
-static void usage(void);
-int	add(char *name, int ignoreebusy);
+static void usage(const char *);
+static int is_swapoff(const char *);
+int	swap_on_off(char *name, int ignoreebusy, int do_swapoff);
 
 int
 main(int argc, char **argv)
@@ -62,6 +63,10 @@ main(int argc, char **argv)
 	struct fstab *fsp;
 	int stat;
 	int ch, doall;
+	int do_swapoff;
+	char *pname = argv[0];
+
+	do_swapoff = is_swapoff(pname);
 
 	doall = 0;
 	while ((ch = getopt(argc, argv, "a")) != -1)
@@ -71,7 +76,7 @@ main(int argc, char **argv)
 			break;
 		case '?':
 		default:
-			usage();
+			usage(pname);
 		}
 	argv += optind;
 
@@ -82,23 +87,24 @@ main(int argc, char **argv)
 				continue;
 			if (strstr(fsp->fs_mntops, "noauto"))
 				continue;
-			if (add(fsp->fs_spec, 1))
+			if (swap_on_off(fsp->fs_spec, 1, do_swapoff))
 				stat = 1;
 			else
-				printf("swapon: adding %s as swap device\n",
+				printf("%s: %sing %s as swap device\n",
+				    pname, do_swapoff ? "remov" : "add",
 				    fsp->fs_spec);
 		}
 	else if (!*argv)
-		usage();
+		usage(pname);
 	for (; *argv; ++argv)
-		stat |= add(*argv, 0);
+		stat |= swap_on_off(*argv, 0, do_swapoff);
 	exit(stat);
 }
 
 int
-add(char *name, int ignoreebusy)
+swap_on_off(char *name, int ignoreebusy, int do_swapoff)
 {
-	if (swapon(name) == -1) {
+	if ((do_swapoff ? swapoff(name) : swapon(name)) == -1) {
 		switch (errno) {
 		case EBUSY:
 			if (!ignoreebusy)
@@ -114,8 +120,23 @@ add(char *name, int ignoreebusy)
 }
 
 static void
-usage()
+usage(const char *pname)
 {
-	fprintf(stderr, "usage: swapon [-a] [special_file ...]\n");
+	fprintf(stderr, "usage: %s [-a] [special_file ...]\n", pname);
 	exit(1);
 }
+
+static int
+is_swapoff(const char *s)
+{
+	const char *base;
+
+	/* Reduce the invocation name to its last path component. */
+	base = strrchr(s, '/');
+	base = (base != NULL) ? base + 1 : s;
+
+	/* Only the exact name "swapoff" selects swapoff behaviour. */
+	if (strcmp(base, "swapoff") != 0)
+		return 0;
+	return 1;
+}
diff --git a/sys/amd64/ia32/syscalls.master b/sys/amd64/ia32/syscalls.master
index 66cc75fe3f59..d30b5919ed5d 100644
--- a/sys/amd64/ia32/syscalls.master
+++ b/sys/amd64/ia32/syscalls.master
@@ -594,3 +594,4 @@
 421	UNIMPL	BSD	getcontext
 422	UNIMPL	BSD	setcontext
 423	UNIMPL	BSD	swapcontext
+424	MNOPROTO BSD	swapoff
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index 66cc75fe3f59..d30b5919ed5d 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -594,3 +594,4 @@
 421	UNIMPL	BSD	getcontext
 422	UNIMPL	BSD	setcontext
 423	UNIMPL	BSD	swapcontext
+424	MNOPROTO BSD	swapoff
diff --git a/sys/ia64/ia32/syscalls.master b/sys/ia64/ia32/syscalls.master
index 66cc75fe3f59..d30b5919ed5d 100644
--- a/sys/ia64/ia32/syscalls.master
+++ b/sys/ia64/ia32/syscalls.master
@@ -594,3 +594,4 @@
 421	UNIMPL	BSD	getcontext
 422	UNIMPL	BSD	setcontext
 423	UNIMPL	BSD	swapcontext
+424	MNOPROTO BSD	swapoff
diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c
index eeeb7d963c30..1ae2ee274339 100644
--- a/sys/kern/subr_blist.c
+++ b/sys/kern/subr_blist.c
@@ -93,7 +93,7 @@
 #include <stdlib.h>
 #include <stdarg.h>
 
-#define malloc(a,b,c)	malloc(a)
+#define malloc(a,b,c)	calloc(a, 1)
 #define free(a,b)	free(a)
 
 typedef unsigned int u_daddr_t;
@@ -116,6 +116,9 @@ static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
 					daddr_t radix, int skip, daddr_t blk);
 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 
 				daddr_t skip, blist_t dest, daddr_t count);
+static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
+static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
+				daddr_t radix, int skip, daddr_t blk);
 static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, 
 						int skip, daddr_t count);
 #ifndef _KERNEL
@@ -165,13 +168,14 @@ blist_create(daddr_t blocks)
 
 #if defined(BLIST_DEBUG)
 	printf(
-		"BLIST representing %d blocks (%d MB of swap)"
-		", requiring %dK of ram\n",
-		bl->bl_blocks,
-		bl->bl_blocks * 4 / 1024,
-		(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+		"BLIST representing %lld blocks (%lld MB of swap)"
+		", requiring %lldK of ram\n",
+		(long long)bl->bl_blocks,
+		(long long)bl->bl_blocks * 4 / 1024,
+		(long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
 	);
-	printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks);
+	printf("BLIST raw radix tree contains %lld records\n",
+	    (long long)bl->bl_rootblks);
 #endif
 	blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
 
@@ -225,6 +229,30 @@ blist_free(blist_t bl, daddr_t blkno, daddr_t count)
 	}
 }
 
+/*
+ * blist_fill() -	force the blocks [blkno, blkno + count) into the
+ *			allocated state so the allocator cannot hand them
+ *			out; blocks already allocated are left as they are.
+ *			Returns how many of them were free before the call.
+ */
+
+int
+blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
+{
+	int nfilled;
+
+	/* No blist, nothing to fill. */
+	if (bl == NULL)
+		return 0;
+	if (bl->bl_radix != BLIST_BMAP_RADIX)
+		nfilled = blst_meta_fill(bl->bl_root, blkno, count,
+		    bl->bl_radix, bl->bl_skip, 0);
+	else
+		nfilled = blst_leaf_fill(bl->bl_root, blkno, count);
+	bl->bl_free -= nfilled;
+	return nfilled;
+}
+
 /*
  * blist_resize() -	resize an existing radix tree to handle the
  *			specified number of blocks.  This will reallocate
@@ -507,9 +535,9 @@ blst_meta_free(
 	int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
 
 #if 0
-	printf("FREE (%x,%d) FROM (%x,%d)\n",
-	    freeBlk, count,
-	    blk, radix
+	printf("FREE (%llx,%lld) FROM (%llx,%lld)\n",
+	    (long long)freeBlk, (long long)count,
+	    (long long)blk, (long long)radix
 	);
 #endif
 
@@ -678,6 +706,117 @@ static void blst_copy(
 	}
 }
 
+/*
+ * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
+ *
+ *	This routine allocates all blocks in the specified range
+ *	regardless of any existing allocations in that range.  Returns
+ *	the number of blocks allocated by the call (0 when count <= 0).
+ */
+static int
+blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
+{
+	int n = blk & (BLIST_BMAP_RADIX - 1);
+	int nblks;
+	u_daddr_t mask, bitmap;
+
+	if (count <= 0)		/* nothing to fill; keeps the shift in range */
+		return 0;
+	mask = ((u_daddr_t)-1 << n) &
+	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+	/* Count the number of blocks we're about to allocate */
+	bitmap = scan->u.bmu_bitmap & mask;
+	for (nblks = 0; bitmap != 0; nblks++)
+		bitmap &= bitmap - 1;
+	scan->u.bmu_bitmap &= ~mask;
+	return nblks;
+}
+
+/*
+ * BLST_META_FILL() -	allocate specific blocks at a meta node
+ *
+ *	This routine allocates the specified range of blocks,
+ *	regardless of any existing allocations in the range.  The
+ *	range must be within the extent of this node.  Returns the
+ *	number of blocks allocated by the call.
+ */
+static int
+blst_meta_fill(
+	blmeta_t *scan,		/* meta node covering the range */
+	daddr_t allocBlk,	/* first block to allocate */
+	daddr_t count,		/* number of blocks to allocate */
+	daddr_t radix,		/* number of blocks spanned by this node */
+	int skip,
+	daddr_t blk		/* first block spanned by this node */
+) {
+	int i;
+	int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+	int nblks = 0;
+
+	if (count > radix)	/* checked against this node's own span */
+		panic("blst_meta_fill: allocation too large");
+
+	if (count == radix || scan->u.bmu_avail == 0)  {
+		/*
+		 * ALL-ALLOCATED special case
+		 */
+		nblks = scan->u.bmu_avail;
+		scan->u.bmu_avail = 0;
+		scan->bm_bighint = count;
+		return nblks;
+	}
+
+	if (scan->u.bmu_avail == radix) {
+		radix >>= BLIST_META_RADIX_SHIFT;
+
+		/*
+		 * ALL-FREE special case, initialize sublevel
+		 */
+		for (i = 1; i <= skip; i += next_skip) {
+			if (scan[i].bm_bighint == (daddr_t)-1)
+				break;
+			if (next_skip == 1) {
+				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+				scan[i].bm_bighint = BLIST_BMAP_RADIX;
+			} else {
+				scan[i].bm_bighint = radix;
+				scan[i].u.bmu_avail = radix;
+			}
+		}
+	} else {
+		radix >>= BLIST_META_RADIX_SHIFT;
+	}
+
+	i = (allocBlk - blk) / radix;	/* index of the first child touched */
+	blk += i * radix;
+	i = i * next_skip + 1;
+
+	while (i <= skip && blk < allocBlk + count) {
+		daddr_t v;
+
+		v = blk + radix - allocBlk;	/* blocks left in this child */
+		if (v > count)
+			v = count;
+
+		if (scan[i].bm_bighint == (daddr_t)-1)	/* child is a terminator */
+			panic("blst_meta_fill: filling unexpected range");
+
+		if (next_skip == 1) {
+			nblks += blst_leaf_fill(&scan[i], allocBlk, v);
+		} else {
+			nblks += blst_meta_fill(&scan[i], allocBlk, v,
+			    radix, next_skip - 1, blk);
+		}
+		count -= v;
+		allocBlk += v;
+		blk += radix;
+		i += next_skip;
+	}
+	scan->u.bmu_avail -= nblks;
+	return nblks;
+}
+
 /*
  * BLST_RADIX_INIT() - initialize radix tree
  *
@@ -768,41 +907,41 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
 
 	if (radix == BLIST_BMAP_RADIX) {
 		printf(
-		    "%*.*s(%04x,%d): bitmap %08x big=%d\n", 
+		    "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n", 
 		    tab, tab, "",
-		    blk, radix,
-		    scan->u.bmu_bitmap,
-		    scan->bm_bighint
+		    (long long)blk, (long long)radix,
+		    (long long)scan->u.bmu_bitmap,
+		    (long long)scan->bm_bighint
 		);
 		return;
 	}
 
 	if (scan->u.bmu_avail == 0) {
 		printf(
-		    "%*.*s(%04x,%d) ALL ALLOCATED\n",
+		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
 		    tab, tab, "",
-		    blk,
-		    radix
+		    (long long)blk,
+		    (long long)radix
 		);
 		return;
 	}
 	if (scan->u.bmu_avail == radix) {
 		printf(
-		    "%*.*s(%04x,%d) ALL FREE\n",
+		    "%*.*s(%08llx,%lld) ALL FREE\n",
 		    tab, tab, "",
-		    blk,
-		    radix
+		    (long long)blk,
+		    (long long)radix
 		);
 		return;
 	}
 
 	printf(
-	    "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n",
+	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
 	    tab, tab, "",
-	    blk, radix,
-	    scan->u.bmu_avail,
-	    radix,
-	    scan->bm_bighint
+	    (long long)blk, (long long)radix,
+	    (long long)scan->u.bmu_avail,
+	    (long long)radix,
+	    (long long)scan->bm_bighint
 	);
 
 	radix >>= BLIST_META_RADIX_SHIFT;
@@ -812,9 +951,9 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
 	for (i = 1; i <= skip; i += next_skip) {
 		if (scan[i].bm_bighint == (daddr_t)-1) {
 			printf(
-			    "%*.*s(%04x,%d): Terminator\n",
+			    "%*.*s(%08llx,%lld): Terminator\n",
 			    tab, tab, "",
-			    blk, radix
+			    (long long)blk, (long long)radix
 			);
 			lastState = 0;
 			break;
@@ -866,13 +1005,14 @@ main(int ac, char **av)
 		daddr_t count = 0;
 
 
-		printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix);
+		printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
+		    (long long)size, (long long)bl->bl_radix);
 		fflush(stdout);
 		if (fgets(buf, sizeof(buf), stdin) == NULL)
 			break;
 		switch(buf[0]) {
 		case 'r':
-			if (sscanf(buf + 1, "%d", &count) == 1) {
+			if (sscanf(buf + 1, "%lld", &count) == 1) {
 				blist_resize(&bl, count, 1);
 			} else {
 				printf("?\n");
@@ -881,26 +1021,37 @@ main(int ac, char **av)
 			blist_print(bl);
 			break;
 		case 'a':
-			if (sscanf(buf + 1, "%d", &count) == 1) {
+			if (sscanf(buf + 1, "%lld", &count) == 1) {
 				daddr_t blk = blist_alloc(bl, count);
-				printf("    R=%04x\n", blk);
+				printf("    R=%08llx\n", (long long)blk);
 			} else {
 				printf("?\n");
 			}
 			break;
 		case 'f':
-			if (sscanf(buf + 1, "%x %d", &da, &count) == 2) {
+			if (sscanf(buf + 1, "%llx %lld",
+			    (long long *)&da, (long long *)&count) == 2) {
 				blist_free(bl, da, count);
 			} else {
 				printf("?\n");
 			}
 			break;
+		case 'l':
+			if (sscanf(buf + 1, "%llx %lld",
+			    (long long *)&da, (long long *)&count) == 2) {
+				printf("    n=%d\n",
+				    blist_fill(bl, da, count));
+			} else {
+				printf("?\n");
+			}
+			break;
 		case '?':
 		case 'h':
 			puts(
 			    "p          -print\n"
 			    "a %d       -allocate\n"
 			    "f %x %d    -free\n"
+			    "l %x %d    -fill\n"
 			    "r %d       -resize\n"
 			    "h/?        -help"
 			);
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index a41eb122bcf7..014427491ed6 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -612,6 +612,7 @@
 422	MSTD	BSD	{ int setcontext(const struct __ucontext *ucp); }
 423	MSTD	BSD	{ int swapcontext(struct __ucontext *oucp, \
 			    const struct __ucontext *ucp); }
+424	MSTD	BSD	{ int swapoff(const char *name); }
 
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/ia64/ia32/syscalls.master  (take a best guess)
diff --git a/sys/sys/blist.h b/sys/sys/blist.h
index fa4be7e276b4..d426e483e06d 100644
--- a/sys/sys/blist.h
+++ b/sys/sys/blist.h
@@ -9,6 +9,7 @@
  *		(void)  blist_destroy(blist)
  *		blkno = blist_alloc(blist, count)
  *		(void)  blist_free(blist, blkno, count)
+ *		nblks = blist_fill(blist, blkno, count)
  *		(void)  blist_resize(&blist, count, freeextra)
  *		
  *
@@ -78,6 +79,7 @@ extern blist_t blist_create(daddr_t blocks);
 extern void blist_destroy(blist_t blist);
 extern daddr_t blist_alloc(blist_t blist, daddr_t count);
 extern void blist_free(blist_t blist, daddr_t blkno, daddr_t count);
+extern int blist_fill(blist_t bl, daddr_t blkno, daddr_t count);
 extern void blist_print(blist_t blist);
 extern void blist_resize(blist_t *pblist, daddr_t count, int freenew);
 
diff --git a/sys/sys/conf.h b/sys/sys/conf.h
index 1be8506c0296..15108c22c815 100644
--- a/sys/sys/conf.h
+++ b/sys/sys/conf.h
@@ -274,6 +274,7 @@ struct swdevt {
 };
 #define	SW_FREED	0x01
 #define	SW_SEQUENTIAL	0x02
+#define SW_CLOSING	0x04
 #define	sw_freed	sw_flags	/* XXX compat */
 
 #ifdef _KERNEL
diff --git a/sys/sys/linedisc.h b/sys/sys/linedisc.h
index 1be8506c0296..15108c22c815 100644
--- a/sys/sys/linedisc.h
+++ b/sys/sys/linedisc.h
@@ -274,6 +274,7 @@ struct swdevt {
 };
 #define	SW_FREED	0x01
 #define	SW_SEQUENTIAL	0x02
+#define SW_CLOSING	0x04
 #define	sw_freed	sw_flags	/* XXX compat */
 
 #ifdef _KERNEL
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index de203e23a635..2f43bc42c877 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -206,6 +206,8 @@ static __inline daddr_t	swp_pager_getswapspace(int npages);
 /*
  * Metadata functions
  */
+static __inline struct swblock **
+    swp_pager_hash(vm_object_t object, vm_pindex_t index);
 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free_all(vm_object_t);
@@ -512,12 +514,22 @@ swp_pager_freeswapspace(blk, npages)
 	daddr_t blk;
 	int npages;
 {
+	struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];
+
 	GIANT_REQUIRED;
 
+	/* per-swap area stats */
+	sp->sw_used -= npages;
+
+	/*
+	 * If we are attempting to stop swapping on this device, we
+	 * don't want to mark any blocks free lest they be reused.
+	 */
+	if (sp->sw_flags & SW_CLOSING)
+		return;
+
 	blist_free(swapblist, blk, npages);
 	vm_swap_size += npages;
-	/* per-swap area stats */
-	swdevt[BLK2DEVIDX(blk)].sw_used -= npages;
 	swp_sizecheck();
 }
 
@@ -1624,6 +1636,149 @@ swp_pager_async_iodone(bp)
 	splx(s);
 }
 
+/*
+ *	swap_pager_isswapped:
+ *
+ *	Report whether any page of the object has its swap copy on
+ *	the swap device numbered devidx: 1 if so, 0 otherwise.
+ *
+ *	This routine may not block.
+ */
+int
+swap_pager_isswapped(vm_object_t object, int devidx)
+{
+	struct swblock *sb;
+	daddr_t pidx, blk;
+	int nseen, slot;
+
+	/* Probe swp_bcount consecutive swblock index ranges, from 0 up. */
+	pidx = 0;
+	for (nseen = 0; nseen < object->un_pager.swp.swp_bcount; nseen++) {
+		sb = *swp_pager_hash(object, pidx);
+		for (slot = 0; sb != NULL && slot < SWAP_META_PAGES; slot++) {
+			blk = sb->swb_pages[slot];
+			if (blk != SWAPBLK_NONE && BLK2DEVIDX(blk) == devidx)
+				return 1;
+		}
+
+		pidx += SWAP_META_PAGES;
+		if (pidx > 0x20000000)
+			panic("swap_pager_isswapped: failed to locate all swap meta blocks");
+	}
+	return 0;
+}
+
+/*
+ * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
+ *
+ *	This routine dissociates the page at the given index within a
+ *	swap block from its backing store, paging it in if necessary.
+ *	A page that is already fully valid is activated.  A page that
+ *	has to be read from swap is passed to vm_page_dontneed() (the
+ *	intent being the inactive queue), since it had its backing
+ *	store ripped out from under it.  Both kinds are marked dirty.
+ *	A failed read from swap panics.
+ *
+ *	XXX - The code to page the whole block in doesn't work, so we
+ *	      page in only the one page at the given index.  Sigh.
+ */
+static __inline void
+swp_pager_force_pagein(struct swblock *swap, int idx)
+{
+	vm_object_t object;
+	vm_page_t m;
+	vm_pindex_t pindex;
+
+	object = swap->swb_object;
+	pindex = swap->swb_index;
+
+	vm_object_pip_add(object, 1);	/* undone by a pip_subtract on each path */
+	m = vm_page_grab(object, pindex + idx, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
+	if (m->valid == VM_PAGE_BITS_ALL) {	/* fully valid: no read from swap */
+		vm_object_pip_subtract(object, 1);
+		vm_page_lock_queues();
+		vm_page_activate(m);
+		vm_page_dirty(m);
+		vm_page_wakeup(m);
+		vm_page_unlock_queues();
+		vm_pager_page_unswapped(m);	/* presumably drops the swap copy -- confirm */
+		return;
+	}
+
+	if (swap_pager_getpages(object, &m, 1, 0) !=
+	    VM_PAGER_OK)
+		panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
+	vm_object_pip_subtract(object, 1);
+
+	vm_page_lock_queues();
+	vm_page_dirty(m);
+	vm_page_dontneed(m);
+	vm_page_wakeup(m);
+	vm_page_unlock_queues();
+	vm_pager_page_unswapped(m);	/* presumably drops the swap copy -- confirm */
+}
+
+
+/*
+ *	swap_pager_swapoff:
+ *
+ *	Page in all of the pages that have been paged out to the
+ *	given device.  The corresponding blocks in the bitmap must be
+ *	marked as allocated and the device must be flagged SW_CLOSING.
+ *	There may be no processes swapped out to the device.
+ *
+ *	The sw_used parameter points to the field in the swdevt structure
+ *	that contains a count of the number of blocks still allocated
+ *	on the device.  If we encounter objects with a nonzero pip count
+ *	in our scan, we use this number to determine if we're really done.
+ *
+ *	This routine may block.
+ */
+void
+swap_pager_swapoff(int devidx, int *sw_used)
+{
+	struct swblock **pswap;
+	struct swblock *swap;
+	vm_object_t waitobj;
+	daddr_t v;
+	int i, j;
+
+	GIANT_REQUIRED;
+
+full_rescan:
+	waitobj = NULL;
+	for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
+restart:
+		pswap = &swhash[i];	/* (re)start at the head of bucket i */
+		while ((swap = *pswap) != NULL) {
+                        for (j = 0; j < SWAP_META_PAGES; ++j) {
+                                v = swap->swb_pages[j];
+                                if (v != SWAPBLK_NONE &&
+				    BLK2DEVIDX(v) == devidx)
+                                        break;
+                        }
+			if (j < SWAP_META_PAGES) {	/* slot j lives on devidx */
+				swp_pager_force_pagein(swap, j);
+				goto restart;	/* rescan this bucket from its head */
+			} else if (swap->swb_object->paging_in_progress) {
+				if (!waitobj)	/* remember the first busy object seen */
+					waitobj = swap->swb_object;
+			}
+			pswap = &swap->swb_hnext;
+		}
+	}
+	if (waitobj && *sw_used) {
+	    /*
+	     * We wait on an arbitrary object to clock our rescans
+	     * to the rate of paging completion.
+	     */
+	    vm_object_pip_wait(waitobj, "swpoff");
+	    goto full_rescan;
+	}
+	if (*sw_used)	/* blocks still counted but nothing left to wait on */
+	    panic("swapoff: failed to locate %d swap blocks", *sw_used);
+}
+
 /************************************************************************
  *				SWAP META DATA 				*
  ************************************************************************
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 97d50d388d65..44022849c115 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -83,9 +83,11 @@ extern struct pagerlst swap_pager_un_object_list;
 extern int swap_pager_full;
 extern struct blist *swapblist;
 extern struct uma_zone *swap_zone;
+extern int nswap_lowat, nswap_hiwat;
 
 void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
+void swap_pager_swapoff(int devidx, int *sw_used);
 
 int swap_pager_swp_alloc(vm_object_t, int);
 void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int);
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 6ac6a96688aa..e38b3d32fee8 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -91,6 +91,7 @@
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
 
 #include <sys/user.h>
 
@@ -324,6 +325,45 @@ vm_proc_swapin(struct proc *p)
 	up = (vm_offset_t)p->p_uarea;
 	pmap_qenter(up, ma, UAREA_PAGES);
 }
+
+/*
+ * Swap in the UAREAs of all processes swapped out to the given device.
+ * The pages in the UAREA are marked dirty and their swap metadata is freed.
+ */
+void
+vm_proc_swapin_all(int devidx)
+{
+	struct proc *p;
+	vm_object_t object;
+	vm_page_t m;
+
+retry:
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		mtx_lock_spin(&sched_lock);
+
+		object = p->p_upages_obj;
+		if (object != NULL &&
+		    swap_pager_isswapped(p->p_upages_obj, devidx)) {
+			sx_sunlock(&allproc_lock);	/* list lock dropped; walk restarts below */
+			faultin(p);	/* NOTE(review): called with sched_lock held -- confirm */
+			mtx_unlock_spin(&sched_lock);
+			PROC_UNLOCK(p);
+			vm_page_lock_queues();
+			TAILQ_FOREACH(m, &object->memq, listq)
+				vm_page_dirty(m);
+			vm_page_unlock_queues();
+			swap_pager_freespace(object, 0,	/* NOTE(review): confirm units of the length */
+			    object->un_pager.swp.swp_bcount);
+			goto retry;	/* allproc_lock was released, so rescan from the top */
+		}
+
+		mtx_unlock_spin(&sched_lock);
+		PROC_UNLOCK(p);
+	}
+	sx_sunlock(&allproc_lock);
+}
 #endif
 
 /*
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index c909c689a3c7..d68ec795d62e 100644
--- a/sys/vm/vm_pageout.h
+++ b/sys/vm/vm_pageout.h
@@ -104,6 +104,12 @@ extern void pagedaemon_wakeup(void);
 extern void vm_wait(void);
 extern void vm_waitpfault(void);
 
+/* XXX This is probably misplaced. */
+#ifndef NO_SWAPPING
+void vm_proc_swapin_all(int);
+int swap_pager_isswapped(vm_object_t, int);
+#endif	/* !NO_SWAPPING */
+
 #ifdef _KERNEL
 void vm_pageout_page(vm_page_t, vm_object_t);
 void vm_pageout_cluster(vm_page_t, vm_object_t);
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c
index 1781182d22ec..0ec522034832 100644
--- a/sys/vm/vm_swap.c
+++ b/sys/vm/vm_swap.c
@@ -36,6 +36,7 @@
 
 #include "opt_mac.h"
 #include "opt_swap.h"
+#include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -58,6 +59,7 @@
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
@@ -73,6 +75,8 @@ struct swdevt *swdevt = should_be_malloced;
 static int nswap;		/* first block after the interleaved devs */
 int nswdev = NSWAPDEV;
 int vm_swap_size;
+static int swdev_syscall_active = 0; /* serialize swap(on|off) */
+
 
 static int swapdev_strategy(struct vop_strategy_args *ap);
 struct vnode *swapdev_vp;
@@ -165,11 +169,12 @@ swapdev_strategy(ap)
 
 /*
  * Create a special vnode op vector for swapdev_vp - we only use
- * VOP_STRATEGY(), everything else returns an error.
+ * VOP_STRATEGY() and reclaim; everything else returns an error.
  */
 vop_t **swapdev_vnodeop_p;
 static struct vnodeopv_entry_desc swapdev_vnodeop_entries[] = {  
 	{ &vop_default_desc,		(vop_t *) vop_defaultop },
+	{ &vop_reclaim_desc,		(vop_t *) vop_null },
 	{ &vop_strategy_desc,		(vop_t *) swapdev_strategy },
 	{ NULL, NULL }
 };
@@ -208,19 +213,23 @@ swapon(td, uap)
 	if (error)
 		goto done2;
 
+	while (swdev_syscall_active)
+	    tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
+	swdev_syscall_active = 1;
+
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
 	if (swap_zone == NULL) {
 		error = ENOMEM;
-		goto done2;
+		goto done;
 	}
 
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
 	error = namei(&nd);
 	if (error)
-		goto done2;
+		goto done;
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
@@ -239,6 +248,9 @@ swapon(td, uap)
 
 	if (error)
 		vrele(vp);
+done:
+	swdev_syscall_active = 0;
+	wakeup_one(&swdev_syscall_active);
 done2:
 	mtx_unlock(&Giant);
 	return (error);
@@ -252,8 +264,6 @@ done2:
  *
  * The new swap code uses page-sized blocks.  The old swap code used
  * DEV_BSIZE'd chunks.
- *
- * XXX locking when multiple swapon's run in parallel
  */
 int
 swaponvp(td, vp, dev, nblks)
@@ -330,7 +340,7 @@ swaponvp(td, vp, dev, nblks)
 	sp->sw_vp = vp;
 	sp->sw_dev = dev2udev(dev);
 	sp->sw_device = dev;
-	sp->sw_flags |= SW_FREED;
+	sp->sw_flags = SW_FREED;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 
@@ -356,9 +366,127 @@ swaponvp(td, vp, dev, nblks)
 		vm_swap_size += blk;
 	}
 
+	swap_pager_full = 0;
+
 	return (0);
 }
 
+/*
+ * SYSCALL: swapoff(devname)
+ *
+ * Disable swapping on the given device.  Returns 0 or an errno value.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct swapoff_args {
+	char *name;
+};
+#endif
+
+/*
+ * MPSAFE - takes Giant; serialized with swapon() via swdev_syscall_active
+ */
+/* ARGSUSED */
+int
+swapoff(td, uap)
+	struct thread *td;
+	struct swapoff_args *uap;
+{
+	struct vnode *vp;
+	struct nameidata nd;
+	struct swdevt *sp;
+	swblk_t dvbase, vsbase;
+	u_long nblks, aligned_nblks, blk;
+	int error, index;
+
+	mtx_lock(&Giant);
+
+	error = suser(td);	/* super-user only */
+	if (error)
+		goto done2;
+
+	while (swdev_syscall_active)	/* one swapon/swapoff at a time */
+	    tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
+	swdev_syscall_active = 1;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
+	error = namei(&nd);
+	if (error)
+		goto done;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	vp = nd.ni_vp;
+
+	for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {
+		if (sp->sw_vp == vp)	/* this slot is swapping to vp */
+			goto found;
+	}
+	error = EINVAL;		/* not currently swapping to this vnode */
+	goto done;	/* NOTE(review): namei() ref on vp looks unreleased here -- confirm */
+found:
+	nblks = sp->sw_nblks;
+
+	/*
+	 * We can turn off this swap device safely only if the
+	 * available virtual memory in the system will fit the amount
+	 * of data we will have to page back in, plus an epsilon so
+	 * the system doesn't become critically low on swap space.
+	 */
+	if (cnt.v_free_count + cnt.v_cache_count + vm_swap_size <
+	    nblks + nswap_lowat) {
+		error = ENOMEM;
+		goto done;	/* NOTE(review): same vp reference question as above */
+	}
+
+	/*
+	 * Prevent further allocations on this device.
+	 */
+	sp->sw_flags |= SW_CLOSING;
+	for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {	/* starts at dmmax, not 0 */
+		blk = min(nblks - dvbase, dmmax);
+		vsbase = index * dmmax + dvbase * nswdev;
+		vm_swap_size -= blist_fill(swapblist, vsbase, blk);	/* only blocks that were free */
+	}
+
+	/*
+	 * Page in the contents of the device and close it.
+	 */
+#ifndef NO_SWAPPING
+       	vm_proc_swapin_all(index);
+#endif /* !NO_SWAPPING */
+	swap_pager_swapoff(index, &sp->sw_used);
+
+	VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+	vrele(vp);	/* NOTE(review): one vrele; confirm it covers both swapon's and namei's refs */
+	sp->sw_vp = NULL;	/* slot is free again */
+
+	/*
+	 * Resize the bitmap based on the new largest swap device,
+	 * or free the bitmap if there are no more devices.
+	 */
+	for (sp = swdevt, nblks = 0; sp < swdevt + nswdev; sp++) {
+		if (sp->sw_vp == NULL)
+			continue;
+		nblks = max(nblks, sp->sw_nblks);
+	}
+
+	aligned_nblks = (nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1);	/* round up to dmmax */
+	nswap = aligned_nblks * nswdev;
+
+	if (nswap == 0) {	/* no swap devices remain */
+		blist_destroy(swapblist);
+		swapblist = NULL;
+		vrele(swapdev_vp);
+		swapdev_vp = NULL;
+	} else
+		blist_resize(&swapblist, nswap, 0);
+
+done:
+	swdev_syscall_active = 0;
+	wakeup_one(&swdev_syscall_active);	/* let one waiting swapon/swapoff proceed */
+done2:
+	mtx_unlock(&Giant);
+	return (error);
+}
+
 static int
 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 {