Correct an error of omission in the implementation of the truncation
operation on POSIX shared memory objects and tmpfs.  Previously, neither of
these modules correctly handled the case in which the new size of the object
or file was not a multiple of the page size.  Specifically, they did not
handle partial page truncation of data stored on swap.  As a result, stale
data might later be returned to an application.
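To make the failure mode concrete, here is a minimal userland sketch.  It
is not part of this commit; the object name "/truncdemo" and the assumption
that the last page actually gets paged out to swap are purely illustrative:

#include <sys/mman.h>

#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	const size_t len = 3 * 4096;	/* three pages, assuming 4K pages */
	const off_t cut = 10000;	/* deliberately not page-aligned */
	char *p;
	size_t i;
	int fd;

	fd = shm_open("/truncdemo", O_RDWR | O_CREAT, 0600);
	assert(fd != -1);
	assert(ftruncate(fd, len) == 0);

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	assert(p != MAP_FAILED);
	memset(p, 0xaa, len);		/* dirty every page */
	assert(munmap(p, len) == 0);

	/*
	 * Assume the last page is paged out to swap at this point, e.g.,
	 * under memory pressure.  Shrink the object to a non-page-multiple
	 * size, then grow it back.
	 */
	assert(ftruncate(fd, cut) == 0);
	assert(ftruncate(fd, len) == 0);

	p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	assert(p != MAP_FAILED);
	/*
	 * Bytes beyond "cut" must read back as zero.  Before this fix,
	 * they could be the stale 0xaa bytes preserved on swap.
	 */
	for (i = cut; i < len; i++)
		assert(p[i] == 0);

	assert(munmap(p, len) == 0);
	assert(close(fd) == 0);
	assert(shm_unlink("/truncdemo") == 0);
	return (0);
}

Whether stale data was actually observed depended on the last page having
been written out to swap, which is why the omission could go unnoticed.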

Interestingly, a data inconsistency was less likely to occur under tmpfs
than under POSIX shared memory objects, because a different mistake in the
tmpfs truncation operation happened to mask the problem.  If the
data was still resident in memory in a PG_CACHED page, then the tmpfs
truncation operation would reactivate that page, zero the truncated portion,
and leave the page pinned in memory.  More precisely, the benevolent error
was that the truncation operation didn't add the reactivated page to any of
the paging queues, effectively pinning the page.  This page would remain
pinned until the file was destroyed or the page was read or written.  With
this change, the page is now added to the inactive queue.
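Both of the new code paths below derive the same two quantities from the
new length: the index of the last remaining page and the offset within that
page at which zeroing must begin.  The following standalone sketch of that
arithmetic assumes 4 KB pages; PAGE_SIZE, PAGE_MASK, and OFF_TO_IDX are
redefined here only for illustration:

#include <stdio.h>

#define PAGE_SIZE	4096		/* assumed page size */
#define PAGE_MASK	(PAGE_SIZE - 1)
#define OFF_TO_IDX(off)	((off) / PAGE_SIZE)

int
main(void)
{
	long newsize = 10000;			/* new length, not page-aligned */
	long base = newsize & PAGE_MASK;	/* first byte to zero */
	long idx = OFF_TO_IDX(newsize);		/* last remaining page */

	/*
	 * Bytes base..PAGE_SIZE-1 of page idx must be zeroed, which is
	 * what pmap_zero_page_area(m, base, PAGE_SIZE - base) does.
	 */
	printf("page %ld: zero bytes %ld..%d (%ld bytes)\n",
	    idx, base, PAGE_SIZE - 1, PAGE_SIZE - base);
	/* Output: page 2: zero bytes 1808..4095 (2288 bytes) */
	return (0);
}

When base is zero the new size is page-aligned, there is no partial page,
and both hunks below skip this work entirely (the "if (base != 0)" guard).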

Discussed with:	jhb
Reviewed by:	kib (an earlier version)
MFC after:	3 weeks
Author:	Alan Cox
Date:	2012-01-08 20:09:26 +00:00
Commit:	2971897d51 (parent: ca54e1aee3)

2 changed files with 113 additions and 66 deletions

--- a/sys/fs/tmpfs/tmpfs_subr.c
+++ b/sys/fs/tmpfs/tmpfs_subr.c

@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
@@ -886,10 +887,10 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize)
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 	vm_object_t uobj;
-	vm_page_t m;
-	vm_pindex_t newpages, oldpages;
+	vm_page_t m, ma[1];
+	vm_pindex_t idx, newpages, oldpages;
 	off_t oldsize;
-	size_t zerolen;
+	int base, rv;
 
 	MPASS(vp->v_type == VREG);
 	MPASS(newsize >= 0);
@@ -912,14 +913,56 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize)
 	    newpages - oldpages > TMPFS_PAGES_AVAIL(tmp))
 		return (ENOSPC);
 
-	TMPFS_LOCK(tmp);
-	tmp->tm_pages_used += (newpages - oldpages);
-	TMPFS_UNLOCK(tmp);
-
-	node->tn_size = newsize;
-	vnode_pager_setsize(vp, newsize);
 	VM_OBJECT_LOCK(uobj);
 	if (newsize < oldsize) {
+		/*
+		 * Zero the truncated part of the last page.
+		 */
+		base = newsize & PAGE_MASK;
+		if (base != 0) {
+			idx = OFF_TO_IDX(newsize);
+retry:
+			m = vm_page_lookup(uobj, idx);
+			if (m != NULL) {
+				if ((m->oflags & VPO_BUSY) != 0 ||
+				    m->busy != 0) {
+					vm_page_sleep(m, "tmfssz");
+					goto retry;
+				}
+			} else if (vm_pager_has_page(uobj, idx, NULL, NULL)) {
+				m = vm_page_alloc(uobj, idx, VM_ALLOC_NORMAL);
+				if (m == NULL) {
+					VM_OBJECT_UNLOCK(uobj);
+					VM_WAIT;
+					VM_OBJECT_LOCK(uobj);
+					goto retry;
+				} else if (m->valid != VM_PAGE_BITS_ALL) {
+					ma[0] = m;
+					rv = vm_pager_get_pages(uobj, ma, 1, 0);
+					m = vm_page_lookup(uobj, idx);
+				} else
+					/* A cached page was reactivated. */
+					rv = VM_PAGER_OK;
+				vm_page_lock(m);
+				if (rv == VM_PAGER_OK) {
+					vm_page_deactivate(m);
+					vm_page_unlock(m);
+					vm_page_wakeup(m);
+				} else {
+					vm_page_free(m);
+					vm_page_unlock(m);
+					VM_OBJECT_UNLOCK(uobj);
+					return (EIO);
+				}
+			}
+			if (m != NULL) {
+				pmap_zero_page_area(m, base, PAGE_SIZE - base);
+				MPASS(m->valid == VM_PAGE_BITS_ALL);
+				vm_page_dirty(m);
+				vm_pager_page_unswapped(m);
+			}
+		}
+
 		/*
 		 * Release any swap space and free any whole pages.
 		 */
@@ -928,19 +971,16 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize)
 			    newpages);
 			vm_object_page_remove(uobj, newpages, 0, 0);
 		}
-
-		/*
-		 * Zero the truncated part of the last page.
-		 */
-		zerolen = round_page(newsize) - newsize;
-		if (zerolen > 0) {
-			m = vm_page_grab(uobj, OFF_TO_IDX(newsize),
-			    VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
-			pmap_zero_page_area(m, PAGE_SIZE - zerolen, zerolen);
-		}
 	}
 	uobj->size = newpages;
 	VM_OBJECT_UNLOCK(uobj);
 
+	TMPFS_LOCK(tmp);
+	tmp->tm_pages_used += (newpages - oldpages);
+	TMPFS_UNLOCK(tmp);
+
+	node->tn_size = newsize;
+	vnode_pager_setsize(vp, newsize);
+
 	return (0);
 }

--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c

@@ -39,13 +39,6 @@
  *
  * (3) Resource limits?  Does this need its own resource limits or are the
  *     existing limits in mmap(2) sufficient?
- *
- * (4) Partial page truncation.  vnode_pager_setsize() will zero any parts
- *     of a partially mapped page as a result of ftruncate(2)/truncate(2).
- *     We can do the same (with the same pmap evil), but do we need to
- *     worry about the bits on disk if the page is swapped out or will the
- *     swapper zero the parts of a page that are invalid if the page is
- *     swapped back in for us?
  */
 
 #include <sys/cdefs.h>
@@ -86,6 +79,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
@@ -253,9 +247,10 @@ static int
 shm_dotruncate(struct shmfd *shmfd, off_t length)
 {
 	vm_object_t object;
-	vm_page_t m;
-	vm_pindex_t nobjsize;
+	vm_page_t m, ma[1];
+	vm_pindex_t idx, nobjsize;
 	vm_ooffset_t delta;
+	int base, rv;
 
 	object = shmfd->shm_object;
 	VM_OBJECT_LOCK(object);
@@ -275,6 +270,56 @@ shm_dotruncate(struct shmfd *shmfd, off_t length)
 			VM_OBJECT_UNLOCK(object);
 			return (EBUSY);
 		}
+
+		/*
+		 * Zero the truncated part of the last page.
+		 */
+		base = length & PAGE_MASK;
+		if (base != 0) {
+			idx = OFF_TO_IDX(length);
+retry:
+			m = vm_page_lookup(object, idx);
+			if (m != NULL) {
+				if ((m->oflags & VPO_BUSY) != 0 ||
+				    m->busy != 0) {
+					vm_page_sleep(m, "shmtrc");
+					goto retry;
+				}
+			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
+				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
+				if (m == NULL) {
+					VM_OBJECT_UNLOCK(object);
+					VM_WAIT;
+					VM_OBJECT_LOCK(object);
+					goto retry;
+				} else if (m->valid != VM_PAGE_BITS_ALL) {
+					ma[0] = m;
+					rv = vm_pager_get_pages(object, ma, 1,
+					    0);
+					m = vm_page_lookup(object, idx);
+				} else
+					/* A cached page was reactivated. */
+					rv = VM_PAGER_OK;
+				vm_page_lock(m);
+				if (rv == VM_PAGER_OK) {
+					vm_page_deactivate(m);
+					vm_page_unlock(m);
+					vm_page_wakeup(m);
+				} else {
+					vm_page_free(m);
+					vm_page_unlock(m);
+					VM_OBJECT_UNLOCK(object);
+					return (EIO);
+				}
+			}
+			if (m != NULL) {
+				pmap_zero_page_area(m, base, PAGE_SIZE - base);
+				KASSERT(m->valid == VM_PAGE_BITS_ALL,
+				    ("shm_dotruncate: page %p is invalid", m));
+				vm_page_dirty(m);
+				vm_pager_page_unswapped(m);
+			}
+		}
 		delta = ptoa(object->size - nobjsize);
 
 		/* Toss in memory pages. */
@@ -289,45 +334,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length)
 		/* Free the swap accounted for shm */
 		swap_release_by_cred(delta, object->cred);
 		object->charge -= delta;
-
-		/*
-		 * If the last page is partially mapped, then zero out
-		 * the garbage at the end of the page.  See comments
-		 * in vnode_pager_setsize() for more details.
-		 *
-		 * XXXJHB: This handles in memory pages, but what about
-		 * a page swapped out to disk?
-		 */
-		if ((length & PAGE_MASK) &&
-		    (m = vm_page_lookup(object, OFF_TO_IDX(length))) != NULL &&
-		    m->valid != 0) {
-			int base = (int)length & PAGE_MASK;
-			int size = PAGE_SIZE - base;
-
-			pmap_zero_page_area(m, base, size);
-
-			/*
-			 * Update the valid bits to reflect the blocks that
-			 * have been zeroed.  Some of these valid bits may
-			 * have already been set.
-			 */
-			vm_page_set_valid_range(m, base, size);
-
-			/*
-			 * Round "base" to the next block boundary so that the
-			 * dirty bit for a partially zeroed block is not
-			 * cleared.
-			 */
-			base = roundup2(base, DEV_BSIZE);
-
-			vm_page_clear_dirty(m, base, PAGE_SIZE - base);
-		} else if ((length & PAGE_MASK) &&
-		    __predict_false(object->cache != NULL)) {
-			vm_page_cache_free(object, OFF_TO_IDX(length),
-			    nobjsize);
-		}
 	} else {
 		/* Attempt to reserve the swap */
 		delta = ptoa(nobjsize - object->size);
 		if (!swap_reserve_by_cred(delta, object->cred)) {