diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 80d06073a6ab..0fab0009bc58 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1785,6 +1785,8 @@ brelse(struct buf *bp)
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
+	    ("brelse: non-VMIO buffer marked NOREUSE"));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/*
@@ -1873,8 +1875,10 @@ brelse(struct buf *bp)
 		allocbuf(bp, 0);
 	}
 
-	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
+	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
+	    (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
 		allocbuf(bp, 0);
+		bp->b_flags &= ~B_NOREUSE;
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
@@ -1969,6 +1973,10 @@ bqrelse(struct buf *bp)
 		if ((bp->b_flags & B_DELWRI) == 0 &&
 		    (bp->b_xflags & BX_VNDIRTY))
 			panic("bqrelse: not dirty");
+		if ((bp->b_flags & B_NOREUSE) != 0) {
+			brelse(bp);
+			return;
+		}
 		qindex = QUEUE_CLEAN;
 	}
 	binsfree(bp, qindex);
@@ -2079,10 +2087,15 @@ vfs_vmio_unwire(struct buf *bp, vm_page_t m)
 			freed = false;
 		if (!freed) {
 			/*
-			 * In order to maintain LRU page ordering, put
-			 * the page at the tail of the inactive queue.
+			 * If the page is unlikely to be reused, let the
+			 * VM know.  Otherwise, maintain LRU page
+			 * ordering and put the page at the tail of the
+			 * inactive queue.
 			 */
-			vm_page_deactivate(m);
+			if ((bp->b_flags & B_NOREUSE) != 0)
+				vm_page_deactivate_noreuse(m);
+			else
+				vm_page_deactivate(m);
 		}
 	}
 	vm_page_unlock(m);
@@ -2456,8 +2469,9 @@ getnewbuf_reuse_bp(struct buf *bp, int qindex)
 	 * Note: we no longer distinguish between VMIO and non-VMIO
 	 * buffers.
 	 */
-	KASSERT((bp->b_flags & B_DELWRI) == 0,
-	    ("delwri buffer %p found in queue %d", bp, qindex));
+	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
+	    ("invalid buffer %p flags %#x found in queue %d", bp, bp->b_flags,
+	    qindex));
 
 	/*
 	 * When recycling a clean buffer we have to truncate it and
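
The test added to brelse() reads as "the NOREUSE hint is set and the buffer
is not dirty": only in that case are the pages stripped and the vnode
association dropped immediately, while a dirty buffer keeps the flag until
its delayed write completes and it passes through brelse() again.  A
standalone illustration of the predicate, assuming the flag values from the
sys/buf.h hunk below (B_DELWRI is 0x00000080 in this tree):

    #include <assert.h>

    #define B_DELWRI  0x00000080
    #define B_NOREUSE 0x00000800

    int
    main(void)
    {
        int flags;

        flags = B_NOREUSE;              /* clean buffer with the hint */
        assert((flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE);

        flags = B_DELWRI | B_NOREUSE;   /* dirty: release is deferred */
        assert((flags & (B_DELWRI | B_NOREUSE)) != B_NOREUSE);
        return (0);
    }
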
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index d70b68567dad..d0074d3e34ed 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -1034,9 +1034,12 @@ vop_stdallocate(struct vop_allocate_args *ap)
 int
 vop_stdadvise(struct vop_advise_args *ap)
 {
+	struct buf *bp;
+	struct buflists *bl;
 	struct vnode *vp;
+	daddr_t bn, startn, endn;
 	off_t start, end;
-	int error;
+	int bsize, error;
 
 	vp = ap->a_vp;
 	switch (ap->a_advice) {
@@ -1049,28 +1052,59 @@ vop_stdadvise(struct vop_advise_args *ap)
 		error = 0;
 		break;
 	case POSIX_FADV_DONTNEED:
-		/*
-		 * Flush any open FS buffers and then remove pages
-		 * from the backing VM object.  Using vinvalbuf() here
-		 * is a bit heavy-handed as it flushes all buffers for
-		 * the given vnode, not just the buffers covering the
-		 * requested range.
-		 */
 		error = 0;
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (vp->v_iflag & VI_DOOMED) {
 			VOP_UNLOCK(vp, 0);
 			break;
 		}
-		vinvalbuf(vp, V_CLEANONLY, 0, 0);
+
+		/*
+		 * Deactivate pages in the specified range from the backing VM
+		 * object.  Pages that are resident in the buffer cache will
+		 * remain wired until their corresponding buffers are released
+		 * below.
+		 */
 		if (vp->v_object != NULL) {
 			start = trunc_page(ap->a_start);
 			end = round_page(ap->a_end);
 			VM_OBJECT_WLOCK(vp->v_object);
-			vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
+			vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
 			    OFF_TO_IDX(end));
 			VM_OBJECT_WUNLOCK(vp->v_object);
 		}
+
+		BO_RLOCK(&vp->v_bufobj);
+		bsize = vp->v_bufobj.bo_bsize;
+		startn = ap->a_start / bsize;
+		if (ap->a_end == OFF_MAX) {
+			endn = -1;
+			bl = &vp->v_bufobj.bo_clean.bv_hd;
+			if (!TAILQ_EMPTY(bl))
+				endn = TAILQ_LAST(bl, buflists)->b_lblkno;
+			bl = &vp->v_bufobj.bo_dirty.bv_hd;
+			if (!TAILQ_EMPTY(bl) &&
+			    endn < TAILQ_LAST(bl, buflists)->b_lblkno)
+				endn = TAILQ_LAST(bl, buflists)->b_lblkno;
+		} else
+			endn = ap->a_end / bsize;
+		BO_RUNLOCK(&vp->v_bufobj);
+		/*
+		 * In the VMIO case, use the B_NOREUSE flag to hint that the
+		 * pages backing each buffer in the range are unlikely to be
+		 * reused.  Dirty buffers will have the hint applied once
+		 * they've been written.
+		 */
+		for (bn = startn; bn <= endn; bn++) {
+			bp = getblk(vp, bn, bsize, 0, 0, GB_NOCREAT |
+			    GB_UNMAPPED);
+			if (bp == NULL)
+				continue;
+			bp->b_flags |= B_RELBUF;
+			if (vp->v_object != NULL)
+				bp->b_flags |= B_NOREUSE;
+			brelse(bp);
+		}
 		VOP_UNLOCK(vp, 0);
 		break;
 	default:
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 70a302ba6e51..ce4436ad731c 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -4610,8 +4610,6 @@ kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
 		new->fa_advice = advice;
 		new->fa_start = offset;
 		new->fa_end = end;
-		new->fa_prevstart = 0;
-		new->fa_prevend = 0;
 		fp->f_advice = new;
 		new = fa;
 	}
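
vop_stdadvise() now walks only the buffers covering the advised range,
rather than flushing every clean buffer on the vnode as vinvalbuf() did.  A
worked sketch of the block-range arithmetic above, with an assumed bo_bsize
of 32768 (the actual value varies by filesystem): byte range [65536, 131071]
maps to logical blocks 2 through 3, and getblk() with GB_NOCREAT returns
NULL for any block with no resident buffer, which the loop simply skips.

    #include <assert.h>

    int
    main(void)
    {
        int bsize = 32768;  /* assumed bo_bsize */

        assert(65536 / bsize == 2);     /* startn for a_start = 65536 */
        assert(131071 / bsize == 3);    /* endn for a_end = 131071 */
        return (0);
    }
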
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 18a9ac311020..f07df31bbb2f 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -770,10 +770,9 @@ vn_read(fp, uio, active_cred, flags, td)
 	struct thread *td;
 {
 	struct vnode *vp;
-	struct mtx *mtxp;
+	off_t orig_offset;
 	int error, ioflag;
 	int advice;
-	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -797,7 +796,7 @@ vn_read(fp, uio, active_cred, flags, td)
 		/* Disable read-ahead for random I/O. */
 		break;
 	}
-	offset = uio->uio_offset;
+	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -807,39 +806,14 @@ vn_read(fp, uio, active_cred, flags, td)
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-	    offset != uio->uio_offset) {
+	    orig_offset != uio->uio_offset)
 		/*
-		 * Use POSIX_FADV_DONTNEED to flush clean pages and
-		 * buffers for the backing file after a
-		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
-		 * case of using POSIX_FADV_NOREUSE with sequential
-		 * access, track the previous implicit DONTNEED
-		 * request and grow this request to include the
-		 * current read(2) in addition to the previous
-		 * DONTNEED.  With purely sequential access this will
-		 * cause the DONTNEED requests to continously grow to
-		 * cover all of the previously read regions of the
-		 * file.  This allows filesystem blocks that are
-		 * accessed by multiple calls to read(2) to be flushed
-		 * once the last read(2) finishes.
+		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
+		 * for the backing file after a POSIX_FADV_NOREUSE
+		 * read(2).
 		 */
-		start = offset;
-		end = uio->uio_offset - 1;
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if (fp->f_advice != NULL &&
-		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
-			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
-				start = fp->f_advice->fa_prevstart;
-			else if (fp->f_advice->fa_prevstart != 0 &&
-			    fp->f_advice->fa_prevstart == end + 1)
-				end = fp->f_advice->fa_prevend;
-			fp->f_advice->fa_prevstart = start;
-			fp->f_advice->fa_prevend = end;
-		}
-		mtx_unlock(mtxp);
-		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
-	}
+		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
+		    POSIX_FADV_DONTNEED);
 	return (error);
 }
 
@@ -856,10 +830,9 @@
 {
 	struct vnode *vp;
 	struct mount *mp;
-	struct mtx *mtxp;
+	off_t orig_offset;
 	int error, ioflag, lock_flags;
 	int advice;
-	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -902,7 +875,7 @@
 		/* XXX: Is this correct? */
 		break;
 	}
-	offset = uio->uio_offset;
+	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
@@ -914,55 +887,14 @@
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-	    offset != uio->uio_offset) {
+	    orig_offset != uio->uio_offset)
 		/*
-		 * Use POSIX_FADV_DONTNEED to flush clean pages and
-		 * buffers for the backing file after a
-		 * POSIX_FADV_NOREUSE write(2).  To optimize the
-		 * common case of using POSIX_FADV_NOREUSE with
-		 * sequential access, track the previous implicit
-		 * DONTNEED request and grow this request to include
-		 * the current write(2) in addition to the previous
-		 * DONTNEED.  With purely sequential access this will
-		 * cause the DONTNEED requests to continously grow to
-		 * cover all of the previously written regions of the
-		 * file.
-		 *
-		 * Note that the blocks just written are almost
-		 * certainly still dirty, so this only works when
-		 * VOP_ADVISE() calls from subsequent writes push out
-		 * the data written by this write(2) once the backing
-		 * buffers are clean.  However, as compared to forcing
-		 * IO_DIRECT, this gives much saner behavior.  Write
-		 * clustering is still allowed, and clean pages are
-		 * merely moved to the cache page queue rather than
-		 * outright thrown away.  This means a subsequent
-		 * read(2) can still avoid hitting the disk if the
-		 * pages have not been reclaimed.
-		 *
-		 * This does make POSIX_FADV_NOREUSE largely useless
-		 * with non-sequential access.  However, sequential
-		 * access is the more common use case and the flag is
-		 * merely advisory.
+		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
+		 * for the backing file after a POSIX_FADV_NOREUSE
+		 * write(2).
 		 */
-		start = offset;
-		end = uio->uio_offset - 1;
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if (fp->f_advice != NULL &&
-		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
-			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
-				start = fp->f_advice->fa_prevstart;
-			else if (fp->f_advice->fa_prevstart != 0 &&
-			    fp->f_advice->fa_prevstart == end + 1)
-				end = fp->f_advice->fa_prevend;
-			fp->f_advice->fa_prevstart = start;
-			fp->f_advice->fa_prevend = end;
-		}
-		mtx_unlock(mtxp);
-		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
-	}
-
+		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
+		    POSIX_FADV_DONTNEED);
 unlock:
 	return (error);
 }
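
With the fa_prevstart/fa_prevend range-coalescing heuristic removed, each
NOREUSE read(2) or write(2) issues an implicit DONTNEED for exactly the
range it just consumed; this is now affordable because vop_stdadvise() no
longer flushes the whole vnode per call.  A hypothetical userspace pattern
exercising this path (function and buffer size are illustrative only):

    #include <fcntl.h>
    #include <unistd.h>

    /*
     * Sequentially scan a file without polluting the page cache.
     * POSIX_FADV_NOREUSE with len == 0 applies to the whole file, so
     * each read(2) releases its pages and buffers behind itself.
     */
    static void
    scan_file(int fd)
    {
        char buf[128 * 1024];

        (void)posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
        while (read(fd, buf, sizeof(buf)) > 0)
            continue;
    }
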
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index d5ce0e51d87a..bdc457eef35f 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -204,7 +204,7 @@ struct buf {
 #define	B_PERSISTENT	0x00000100	/* Perm. ref'ed while EXT2FS mounted. */
 #define	B_DONE		0x00000200	/* I/O completed. */
 #define	B_EINTR		0x00000400	/* I/O was interrupted */
-#define	B_00000800	0x00000800	/* Available flag. */
+#define	B_NOREUSE	0x00000800	/* Contents not reused once released. */
 #define	B_00001000	0x00001000	/* Available flag. */
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
 #define	B_BARRIER	0x00004000	/* Write this and all preceeding first. */
@@ -229,7 +229,7 @@ struct buf {
 #define	PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
 	"\33paging\32infreecnt\31nocopy\30b23\27relbuf\26dirty\25b20" \
 	"\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \
-	"\15b12\14b11\13eintr\12done\11persist\10delwri" \
+	"\15b12\14noreuse\13eintr\12done\11persist\10delwri" \
 	"\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"
 
 /*
diff --git a/sys/sys/file.h b/sys/sys/file.h
index cb51c27f76b1..68d33e0dfd25 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -160,8 +160,6 @@ struct fadvise_info {
 	int		fa_advice;	/* (f) FADV_* type. */
 	off_t		fa_start;	/* (f) Region start. */
 	off_t		fa_end;		/* (f) Region end. */
-	off_t		fa_prevstart;	/* (f) Previous NOREUSE start. */
-	off_t		fa_prevend;	/* (f) Previous NOREUSE end. */
 };
 
 struct file {
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index a4aac95c7adb..0a3c2efdf8cf 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -1963,15 +1963,15 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
 }
 
 /*
- *	vm_object_page_cache:
+ *	vm_object_page_noreuse:
 *
- *	For the given object, attempt to move the specified clean
- *	pages to the cache queue.  If a page is wired for any reason,
- *	then it will not be changed.  Pages are specified by the given
- *	range ["start", "end").  As a special case, if "end" is zero,
- *	then the range extends from "start" to the end of the object.
- *	Any mappings to the specified pages are removed before the
- *	pages are moved to the cache queue.
+ *	For the given object, attempt to move the specified pages to
+ *	the head of the inactive queue.  This bypasses regular LRU
+ *	operation and allows the pages to be reused quickly under memory
+ *	pressure.  If a page is wired for any reason, then it will not
+ *	be queued.  Pages are specified by the range ["start", "end").
+ *	As a special case, if "end" is zero, then the range extends from
+ *	"start" to the end of the object.
 *
 *	This operation should only be performed on objects that
 *	contain non-fictitious, managed pages.
@@ -1979,14 +1979,14 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
 *	The object must be locked.
 */
 void
-vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	struct mtx *mtx, *new_mtx;
 	vm_page_t p, next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
-	    ("vm_object_page_cache: illegal object %p", object));
+	    ("vm_object_page_noreuse: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	p = vm_page_find_least(object, start);
@@ -2009,7 +2009,7 @@
 			mtx = new_mtx;
 			mtx_lock(mtx);
 		}
-		vm_page_try_to_cache(p);
+		vm_page_deactivate_noreuse(p);
 	}
 	if (mtx != NULL)
 		mtx_unlock(mtx);
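
PRINT_BUF_FLAGS is consumed by the kernel's %b printf format: the leading
"\20" selects hexadecimal output, and each name is prefixed by its bit
position in octal, counted from 1.  "\14noreuse" therefore names bit 014
(decimal 12), which is exactly the 0x00000800 value assigned to B_NOREUSE
above.  A standalone check of that arithmetic:

    #include <assert.h>

    int
    main(void)
    {
        /* \14 is octal 14 == decimal 12; %b numbers bits from 1. */
        assert(1 << (014 - 1) == 0x00000800);
        return (0);
    }
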
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 7e433aee0228..894a8d5616ba 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -304,10 +304,10 @@ void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
-void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
-	    vm_pindex_t end);
 boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
	    vm_ooffset_t end, int flags);
+void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
+	    vm_pindex_t end);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
	    vm_pindex_t end, int options);
 boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 2aaddfb01826..a3a9a10417a4 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2588,6 +2588,19 @@ vm_page_deactivate(vm_page_t m)
 	_vm_page_deactivate(m, 0);
 }
 
+/*
+ * Move the specified page to the inactive queue with the expectation
+ * that it is unlikely to be reused.
+ *
+ * The page must be locked.
+ */
+void
+vm_page_deactivate_noreuse(vm_page_t m)
+{
+
+	_vm_page_deactivate(m, 1);
+}
+
 /*
 *	vm_page_try_to_cache:
 *
@@ -2740,8 +2753,7 @@ vm_page_cache(vm_page_t m)
 
 /*
 *	vm_page_advise
 *
- *	Deactivate or do nothing, as appropriate.  This routine is used
- *	by madvise() and vop_stdadvise().
+ *	Deactivate or do nothing, as appropriate.
 *
 *	The object and page must be locked.
 */
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index fd7d3f443c17..dedd6ac9eb0c 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -451,6 +451,7 @@ void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
 int vm_page_try_to_cache (vm_page_t);
 int vm_page_try_to_free (vm_page_t);
 void vm_page_deactivate (vm_page_t);
+void vm_page_deactivate_noreuse(vm_page_t);
 void vm_page_dequeue(vm_page_t m);
 void vm_page_dequeue_locked(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
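
vm_page_deactivate_noreuse() reuses the existing _vm_page_deactivate()
helper, passing a nonzero "noreuse" argument.  The helper's body is not part
of this diff; a hedged sketch of the queue placement it implies, with the
pagequeue and linkage names below assumed rather than taken from this patch:

    /*
     * Sketch only: noreuse pages go to the head of the inactive queue
     * so the page daemon reclaims them first, while ordinary
     * deactivation preserves LRU order by inserting at the tail.
     */
    if (noreuse)
        TAILQ_INSERT_HEAD(&pq->pq_pl, m, plinks.q);
    else
        TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
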