As a step towards the elimination of PG_CACHED pages, rework the handling
of POSIX_FADV_DONTNEED so that it causes the backing pages to be moved to
the head of the inactive queue instead of being cached.

This affects the implementation of POSIX_FADV_NOREUSE as well, since it
works by applying POSIX_FADV_DONTNEED to file ranges after they have been
read or written.  At that point the corresponding buffers may still be
dirty, so the previous implementation would coalesce successive ranges and
apply POSIX_FADV_DONTNEED to the result, ensuring that pages backing the
dirty buffers would eventually be cached.  To preserve this behaviour in an
efficient manner, this change adds a new buf flag, B_NOREUSE, which causes
the pages backing a VMIO buf to be placed at the head of the inactive queue
when the buf is released.  POSIX_FADV_NOREUSE then works by setting this
flag in bufs that underlie the specified range.

Reviewed by:	alc, kib
Sponsored by:	EMC / Isilon Storage Division
Differential Revision:	https://reviews.freebsd.org/D3726
This commit is contained in:
Mark Johnston 2015-09-30 23:06:29 +00:00
parent d358fa780b
commit 3138cd3670
10 changed files with 110 additions and 121 deletions

View File

@ -1785,6 +1785,8 @@ brelse(struct buf *bp)
bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
("brelse: non-VMIO buffer marked NOREUSE"));
if (BUF_LOCKRECURSED(bp)) {
/*
@ -1873,8 +1875,10 @@ brelse(struct buf *bp)
allocbuf(bp, 0);
}
if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
(bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
allocbuf(bp, 0);
bp->b_flags &= ~B_NOREUSE;
if (bp->b_vp != NULL)
brelvp(bp);
}
@ -1969,6 +1973,10 @@ bqrelse(struct buf *bp)
if ((bp->b_flags & B_DELWRI) == 0 &&
(bp->b_xflags & BX_VNDIRTY))
panic("bqrelse: not dirty");
if ((bp->b_flags & B_NOREUSE) != 0) {
brelse(bp);
return;
}
qindex = QUEUE_CLEAN;
}
binsfree(bp, qindex);
@ -2079,10 +2087,15 @@ vfs_vmio_unwire(struct buf *bp, vm_page_t m)
freed = false;
if (!freed) {
/*
* In order to maintain LRU page ordering, put
* the page at the tail of the inactive queue.
* If the page is unlikely to be reused, let the
* VM know. Otherwise, maintain LRU page
* ordering and put the page at the tail of the
* inactive queue.
*/
vm_page_deactivate(m);
if ((bp->b_flags & B_NOREUSE) != 0)
vm_page_deactivate_noreuse(m);
else
vm_page_deactivate(m);
}
}
vm_page_unlock(m);
@ -2456,8 +2469,9 @@ getnewbuf_reuse_bp(struct buf *bp, int qindex)
* Note: we no longer distinguish between VMIO and non-VMIO
* buffers.
*/
KASSERT((bp->b_flags & B_DELWRI) == 0,
("delwri buffer %p found in queue %d", bp, qindex));
KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
("invalid buffer %p flags %#x found in queue %d", bp, bp->b_flags,
qindex));
/*
* When recycling a clean buffer we have to truncate it and

View File

@ -1034,9 +1034,12 @@ vop_stdallocate(struct vop_allocate_args *ap)
int
vop_stdadvise(struct vop_advise_args *ap)
{
struct buf *bp;
struct buflists *bl;
struct vnode *vp;
daddr_t bn, startn, endn;
off_t start, end;
int error;
int bsize, error;
vp = ap->a_vp;
switch (ap->a_advice) {
@ -1049,28 +1052,59 @@ vop_stdadvise(struct vop_advise_args *ap)
error = 0;
break;
case POSIX_FADV_DONTNEED:
/*
* Flush any open FS buffers and then remove pages
* from the backing VM object. Using vinvalbuf() here
* is a bit heavy-handed as it flushes all buffers for
* the given vnode, not just the buffers covering the
* requested range.
*/
error = 0;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_iflag & VI_DOOMED) {
VOP_UNLOCK(vp, 0);
break;
}
vinvalbuf(vp, V_CLEANONLY, 0, 0);
/*
* Deactivate pages in the specified range from the backing VM
* object. Pages that are resident in the buffer cache will
* remain wired until their corresponding buffers are released
* below.
*/
if (vp->v_object != NULL) {
start = trunc_page(ap->a_start);
end = round_page(ap->a_end);
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
OFF_TO_IDX(end));
VM_OBJECT_WUNLOCK(vp->v_object);
}
BO_RLOCK(&vp->v_bufobj);
bsize = vp->v_bufobj.bo_bsize;
startn = ap->a_start / bsize;
if (ap->a_end == OFF_MAX) {
endn = -1;
bl = &vp->v_bufobj.bo_clean.bv_hd;
if (!TAILQ_EMPTY(bl))
endn = TAILQ_LAST(bl, buflists)->b_lblkno;
bl = &vp->v_bufobj.bo_dirty.bv_hd;
if (!TAILQ_EMPTY(bl) &&
endn < TAILQ_LAST(bl, buflists)->b_lblkno)
endn = TAILQ_LAST(bl, buflists)->b_lblkno;
} else
endn = ap->a_end / bsize;
BO_RUNLOCK(&vp->v_bufobj);
/*
* In the VMIO case, use the B_NOREUSE flag to hint that the
* pages backing each buffer in the range are unlikely to be
* reused. Dirty buffers will have the hint applied once
* they've been written.
*/
for (bn = startn; bn <= endn; bn++) {
bp = getblk(vp, bn, bsize, 0, 0, GB_NOCREAT |
GB_UNMAPPED);
if (bp == NULL)
continue;
bp->b_flags |= B_RELBUF;
if (vp->v_object != NULL)
bp->b_flags |= B_NOREUSE;
brelse(bp);
}
VOP_UNLOCK(vp, 0);
break;
default:

View File

@ -4610,8 +4610,6 @@ kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
new->fa_advice = advice;
new->fa_start = offset;
new->fa_end = end;
new->fa_prevstart = 0;
new->fa_prevend = 0;
fp->f_advice = new;
new = fa;
}

View File

@ -770,10 +770,9 @@ vn_read(fp, uio, active_cred, flags, td)
struct thread *td;
{
struct vnode *vp;
struct mtx *mtxp;
off_t orig_offset;
int error, ioflag;
int advice;
off_t offset, start, end;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
@ -797,7 +796,7 @@ vn_read(fp, uio, active_cred, flags, td)
/* Disable read-ahead for random I/O. */
break;
}
offset = uio->uio_offset;
orig_offset = uio->uio_offset;
#ifdef MAC
error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@ -807,39 +806,14 @@ vn_read(fp, uio, active_cred, flags, td)
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
if (error == 0 && advice == POSIX_FADV_NOREUSE &&
offset != uio->uio_offset) {
orig_offset != uio->uio_offset)
/*
* Use POSIX_FADV_DONTNEED to flush clean pages and
* buffers for the backing file after a
* POSIX_FADV_NOREUSE read(2). To optimize the common
* case of using POSIX_FADV_NOREUSE with sequential
* access, track the previous implicit DONTNEED
* request and grow this request to include the
* current read(2) in addition to the previous
* DONTNEED. With purely sequential access this will
* cause the DONTNEED requests to continuously grow to
* cover all of the previously read regions of the
* file. This allows filesystem blocks that are
* accessed by multiple calls to read(2) to be flushed
* once the last read(2) finishes.
* Use POSIX_FADV_DONTNEED to flush pages and buffers
* for the backing file after a POSIX_FADV_NOREUSE
* read(2).
*/
start = offset;
end = uio->uio_offset - 1;
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
if (fp->f_advice != NULL &&
fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
start = fp->f_advice->fa_prevstart;
else if (fp->f_advice->fa_prevstart != 0 &&
fp->f_advice->fa_prevstart == end + 1)
end = fp->f_advice->fa_prevend;
fp->f_advice->fa_prevstart = start;
fp->f_advice->fa_prevend = end;
}
mtx_unlock(mtxp);
error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
}
error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
POSIX_FADV_DONTNEED);
return (error);
}
@ -856,10 +830,9 @@ vn_write(fp, uio, active_cred, flags, td)
{
struct vnode *vp;
struct mount *mp;
struct mtx *mtxp;
off_t orig_offset;
int error, ioflag, lock_flags;
int advice;
off_t offset, start, end;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
@ -902,7 +875,7 @@ vn_write(fp, uio, active_cred, flags, td)
/* XXX: Is this correct? */
break;
}
offset = uio->uio_offset;
orig_offset = uio->uio_offset;
#ifdef MAC
error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
@ -914,55 +887,14 @@ vn_write(fp, uio, active_cred, flags, td)
if (vp->v_type != VCHR)
vn_finished_write(mp);
if (error == 0 && advice == POSIX_FADV_NOREUSE &&
offset != uio->uio_offset) {
orig_offset != uio->uio_offset)
/*
* Use POSIX_FADV_DONTNEED to flush clean pages and
* buffers for the backing file after a
* POSIX_FADV_NOREUSE write(2). To optimize the
* common case of using POSIX_FADV_NOREUSE with
* sequential access, track the previous implicit
* DONTNEED request and grow this request to include
* the current write(2) in addition to the previous
* DONTNEED. With purely sequential access this will
* cause the DONTNEED requests to continuously grow to
* cover all of the previously written regions of the
* file.
*
* Note that the blocks just written are almost
* certainly still dirty, so this only works when
* VOP_ADVISE() calls from subsequent writes push out
* the data written by this write(2) once the backing
* buffers are clean. However, as compared to forcing
* IO_DIRECT, this gives much saner behavior. Write
* clustering is still allowed, and clean pages are
* merely moved to the cache page queue rather than
* outright thrown away. This means a subsequent
* read(2) can still avoid hitting the disk if the
* pages have not been reclaimed.
*
* This does make POSIX_FADV_NOREUSE largely useless
* with non-sequential access. However, sequential
* access is the more common use case and the flag is
* merely advisory.
* Use POSIX_FADV_DONTNEED to flush pages and buffers
* for the backing file after a POSIX_FADV_NOREUSE
* write(2).
*/
start = offset;
end = uio->uio_offset - 1;
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
if (fp->f_advice != NULL &&
fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
start = fp->f_advice->fa_prevstart;
else if (fp->f_advice->fa_prevstart != 0 &&
fp->f_advice->fa_prevstart == end + 1)
end = fp->f_advice->fa_prevend;
fp->f_advice->fa_prevstart = start;
fp->f_advice->fa_prevend = end;
}
mtx_unlock(mtxp);
error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
}
error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
POSIX_FADV_DONTNEED);
unlock:
return (error);
}

View File

@ -204,7 +204,7 @@ struct buf {
#define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */
#define B_DONE 0x00000200 /* I/O completed. */
#define B_EINTR 0x00000400 /* I/O was interrupted */
#define B_00000800 0x00000800 /* Available flag. */
#define B_NOREUSE 0x00000800 /* Contents not reused once released. */
#define B_00001000 0x00001000 /* Available flag. */
#define B_INVAL 0x00002000 /* Does not contain valid info. */
#define B_BARRIER 0x00004000 /* Write this and all preceding first. */
@ -229,7 +229,7 @@ struct buf {
#define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
"\33paging\32infreecnt\31nocopy\30b23\27relbuf\26dirty\25b20" \
"\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \
"\15b12\14b11\13eintr\12done\11persist\10delwri" \
"\15b12\14noreuse\13eintr\12done\11persist\10delwri" \
"\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"
/*

View File

@ -160,8 +160,6 @@ struct fadvise_info {
int fa_advice; /* (f) FADV_* type. */
off_t fa_start; /* (f) Region start. */
off_t fa_end; /* (f) Region end. */
off_t fa_prevstart; /* (f) Previous NOREUSE start. */
off_t fa_prevend; /* (f) Previous NOREUSE end. */
};
struct file {

View File

@ -1963,15 +1963,15 @@ skipmemq:
}
/*
* vm_object_page_cache:
* vm_object_page_noreuse:
*
* For the given object, attempt to move the specified clean
* pages to the cache queue. If a page is wired for any reason,
* then it will not be changed. Pages are specified by the given
* range ["start", "end"). As a special case, if "end" is zero,
* then the range extends from "start" to the end of the object.
* Any mappings to the specified pages are removed before the
* pages are moved to the cache queue.
* For the given object, attempt to move the specified pages to
* the head of the inactive queue. This bypasses regular LRU
* operation and allows the pages to be reused quickly under memory
* pressure. If a page is wired for any reason, then it will not
* be queued. Pages are specified by the range ["start", "end").
* As a special case, if "end" is zero, then the range extends from
* "start" to the end of the object.
*
* This operation should only be performed on objects that
* contain non-fictitious, managed pages.
@ -1979,14 +1979,14 @@ skipmemq:
* The object must be locked.
*/
void
vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
struct mtx *mtx, *new_mtx;
vm_page_t p, next;
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
("vm_object_page_cache: illegal object %p", object));
("vm_object_page_noreuse: illegal object %p", object));
if (object->resident_page_count == 0)
return;
p = vm_page_find_least(object, start);
@ -2009,7 +2009,7 @@ vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
mtx = new_mtx;
mtx_lock(mtx);
}
vm_page_try_to_cache(p);
vm_page_deactivate_noreuse(p);
}
if (mtx != NULL)
mtx_unlock(mtx);

View File

@ -304,10 +304,10 @@ void vm_object_terminate (vm_object_t);
void vm_object_set_writeable_dirty (vm_object_t);
void vm_object_init (void);
void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
vm_pindex_t end);
boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
vm_ooffset_t end, int flags);
void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
vm_pindex_t end);
void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
vm_pindex_t end, int options);
boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);

View File

@ -2588,6 +2588,19 @@ vm_page_deactivate(vm_page_t m)
_vm_page_deactivate(m, 0);
}
/*
* vm_page_deactivate_noreuse:
*
* Move the specified page to the inactive queue with the expectation
* that it is unlikely to be reused.
*
* This is a thin wrapper around _vm_page_deactivate(); it differs from
* vm_page_deactivate() only in passing 1 rather than 0 as the second
* argument.
* NOTE(review): that argument presumably requests insertion at the head
* of the inactive queue, so that the page is reclaimed ahead of regular
* LRU ordering -- confirm against _vm_page_deactivate().
*
* The page must be locked.
*/
void
vm_page_deactivate_noreuse(vm_page_t m)
{
_vm_page_deactivate(m, 1);
}
/*
* vm_page_try_to_cache:
*
@ -2740,8 +2753,7 @@ vm_page_cache(vm_page_t m)
/*
* vm_page_advise
*
* Deactivate or do nothing, as appropriate. This routine is used
* by madvise() and vop_stdadvise().
* Deactivate or do nothing, as appropriate.
*
* The object and page must be locked.
*/

View File

@ -451,6 +451,7 @@ void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
int vm_page_try_to_cache (vm_page_t);
int vm_page_try_to_free (vm_page_t);
void vm_page_deactivate (vm_page_t);
void vm_page_deactivate_noreuse(vm_page_t);
void vm_page_dequeue(vm_page_t m);
void vm_page_dequeue_locked(vm_page_t m);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);