Add MAP_NOSYNC feature to mmap(), and MADV_NOSYNC and MADV_AUTOSYNC to
madvise().

This feature prevents the update daemon from gratuitously flushing
dirty pages associated with a mapped file-backed region of memory.
The system pager will still page the memory as necessary and the VM
system will still be fully coherent with the filesystem.  Modifications
made by other means to the same area of memory, for example by write(),
are unaffected.  The feature works on a page-granularity basis.

MAP_NOSYNC allows one to use mmap() to share memory between processes
without incurring any significant filesystem overhead, putting it in
the same performance category as SysV shared memory and anonymous
memory.

Reviewed by:	julian, alc, dg
commit 4f79d873c1
parent 8828590563
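
Illustrative example (not part of the commit): the intended use case is a
file-backed shared memory segment for IPC, mapped with the new flag so the
syncer leaves its dirty pages alone.  A minimal sketch; the file path and
segment size here are arbitrary:

    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <err.h>

    int
    main(void)
    {
        char *p;
        int fd;

        fd = open("/tmp/ipc.shm", O_RDWR | O_CREAT, 0600);
        if (fd == -1)
            err(1, "open");
        if (ftruncate(fd, 65536) == -1)     /* size the backing file */
            err(1, "ftruncate");

        /*
         * MAP_SHARED | MAP_NOSYNC: pages dirtied through this mapping
         * are skipped by the update daemon and written out only when
         * the pager needs to, or on an explicit fsync().
         */
        p = mmap(NULL, 65536, PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_NOSYNC, fd, 0);
        if (p == MAP_FAILED)
            err(1, "mmap");

        p[0] = 1;       /* dirties a page without waking the syncer */

        munmap(p, 65536);
        close(fd);
        return (0);
    }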
@@ -58,6 +58,8 @@ The known behaviors are given in
 #define	MADV_WILLNEED	3	/* will need these pages */
 #define	MADV_DONTNEED	4	/* don't need these pages */
 #define	MADV_FREE	5	/* data is now unimportant */
+#define	MADV_NOSYNC	6	/* no explicit commit to physical backing store */
+#define	MADV_AUTOSYNC	7	/* default commit method to physical backing store */
 .Ed
 .Pp
 .Bl -tag -width MADV_SEQUENTIAL
@@ -96,6 +98,31 @@ call.
 References made to that address space range will not make the VM system
 page the information back in from backing store until the page is
 modified again.
+.It Dv MADV_NOSYNC
+Request that the system not flush the data associated with this map to
+physical backing store unless it needs to.  Typically this prevents the
+filesystem update daemon from gratuitously writing pages dirtied
+by the VM system to physical disk.  Note that VM/filesystem coherency is
+always maintained; this feature simply ensures that the mapped data is
+only flushed when it needs to be, usually by the system pager.
+.Pp
+This feature is typically used when you want to use a file-backed shared
+memory area to communicate between processes (IPC) and do not particularly
+need the data being stored in that area to be physically written to disk.
+With this feature you get performance with mmap equivalent to what you
+would expect from SysV shared memory calls, but in a more controllable
+and less restrictive manner.  However, note that this feature is not portable
+across UNIX platforms (though some may do the right thing by default).
+For more information see the MAP_NOSYNC section of
+.Xr mmap 2
+.It Dv MADV_AUTOSYNC
+Undoes the effects of MADV_NOSYNC for any future pages dirtied within the
+address range.  The effect on pages already dirtied is indeterminate: they
+may or may not be reverted.  You can guarantee reversion by using the
+.Xr msync 2
+or
+.Xr fsync 2
+system calls.
 .El
 .Sh RETURN VALUES
 Upon successful completion,
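
A sketch of the new advices in action (illustrative, not from the commit);
addr and len describe an existing page-aligned file-backed mapping, and, per
the man page text above, MADV_AUTOSYNC does not reliably revert pages that
are already dirty:

    #include <sys/types.h>
    #include <sys/mman.h>

    int
    set_sync_policy(void *addr, size_t len, int nosync)
    {
        /*
         * MADV_NOSYNC marks the map entry so that pages dirtied from
         * now on are skipped by the update daemon; MADV_AUTOSYNC
         * restores the default flushing strategy for future dirtying.
         */
        return (madvise(addr, len, nosync ? MADV_NOSYNC : MADV_AUTOSYNC));
    }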
@@ -150,6 +150,35 @@ stack top is the starting address returned by the call, plus
 .Fa len
 bytes.  The bottom of the stack at maximum growth is the starting
 address returned by the call.
+.It Dv MAP_NOSYNC
+Causes data dirtied via this VM map to be flushed to physical media
+only when necessary (usually by the pager) rather than gratuitously.
+Typically this prevents the update daemon from flushing pages dirtied
+through such maps and thus allows efficient sharing of memory across
+unassociated processes using a file-backed shared memory map.  Without
+this option any VM pages you dirty may be flushed to disk every so often
+(every 30-60 seconds usually), which can create performance problems if you
+do not need that to occur (such as when you are using shared file-backed
+mmap regions for IPC purposes).  Note that VM/filesystem coherency is
+maintained whether you use MAP_NOSYNC or not.  This option is not portable
+across UNIX platforms (yet), though some may implement the same behavior
+by default.
+.Pp
+The
+.Xr fsync 2
+function will flush all dirty data and metadata associated with a file,
+including dirty NOSYNC VM data, to physical media.  The
+.Xr sync 1
+command and
+.Xr sync 2
+system call generally do not flush dirty NOSYNC VM data.
+The
+.Xr msync 2
+system call is obsolete since
+.Os BSD
+implements a coherent filesystem buffer cache.  However, it may be
+used to associate dirty VM pages with filesystem buffers and thus cause
+them to be flushed to physical media sooner rather than later.
 .El
 .Pp
 The
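
To make the flushing rules above concrete, a small hypothetical helper that
forces NOSYNC data out, following the fsync()/msync() behavior just
described:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int
    flush_nosync_region(int fd, void *addr, size_t len)
    {
        /* Associate the dirty VM pages with filesystem buffers... */
        if (msync(addr, len, MS_SYNC) == -1)
            return (-1);
        /* ...and commit all dirty data and metadata to physical media. */
        return (fsync(fd));
    }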
@@ -2460,7 +2460,7 @@ vfs_msync(struct mount *mp, int flags) {
 		if (!vget(vp,
 		    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
 			if (vp->v_object) {
-				vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
+				vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
 				anyio = 1;
 			}
 			vput(vp);
@@ -1307,7 +1307,8 @@ symlink(p, uap)
 	vput(nd.ni_vp);
 	vput(nd.ni_dvp);
 	ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink");
-	ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
+	if (error == 0)
+		ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
 out:
 	zfree(namei_zone, path);
 	return (error);
@@ -65,6 +65,7 @@
 #define	MAP_NOEXTEND	 0x0100	/* for MAP_FILE, don't change file size */
 #define	MAP_HASSEMAPHORE 0x0200	/* region may contain semaphores */
 #define	MAP_STACK	 0x0400	/* region grows down, like a stack */
+#define	MAP_NOSYNC	 0x0800	/* page to but do not sync underlying file */
 
 #ifdef _P1003_1B_VISIBLE
 /*
@@ -103,6 +104,8 @@
 #define	MADV_WILLNEED	3	/* will need these pages */
 #define	MADV_DONTNEED	4	/* dont need these pages */
 #define	MADV_FREE	5	/* dont need these pages, and junk contents */
+#define	MADV_NOSYNC	6	/* try to avoid flushes to physical media */
+#define	MADV_AUTOSYNC	7	/* revert to default flushing strategy */
 
 /*
  * Return bits from mincore
@@ -779,15 +779,29 @@ RetryFault:;
 			vm_page_flag_set(fs.m, PG_WRITEABLE);
 			vm_object_set_flag(fs.m->object,
 					   OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
 
 		/*
 		 * If the fault is a write, we know that this page is being
-		 * written NOW. This will save on the pmap_is_modified() calls
-		 * later.
+		 * written NOW so dirty it explicitly to save on
+		 * pmap_is_modified() calls later.
+		 *
+		 * If this is a NOSYNC mmap we do not want to set PG_NOSYNC
+		 * if the page is already dirty to prevent data written with
+		 * the expectation of being synced from not being synced.
+		 * Likewise if this entry does not request NOSYNC then make
+		 * sure the page isn't marked NOSYNC.  Applications sharing
+		 * data should use the same flags to avoid ping ponging.
+		 *
+		 * Also tell the backing pager, if any, that it should remove
+		 * any swap backing since the page is now dirty.
 		 */
 		if (fault_flags & VM_FAULT_DIRTY) {
+			if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
+				if (fs.m->dirty == 0)
+					vm_page_flag_set(fs.m, PG_NOSYNC);
+			} else {
+				vm_page_flag_clear(fs.m, PG_NOSYNC);
+			}
 			vm_page_dirty(fs.m);
 			vm_pager_page_unswapped(fs.m);
 		}
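
The "ping ponging" caveat in the comment above suggests that cooperating
processes should agree on their mapping flags; a hypothetical shared helper
both sides might call (a sketch, assuming the conventions from the examples
earlier):

    #include <sys/types.h>
    #include <sys/mman.h>

    void *
    map_shared_region(int fd, size_t len)
    {
        /*
         * Every process mapping the region uses the same flags; mixing
         * MAP_NOSYNC and plain MAP_SHARED views of the same pages would
         * flip PG_NOSYNC back and forth on each write fault.
         */
        return (mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_NOSYNC, fd, 0));
    }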
@@ -460,6 +460,9 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 		KASSERT(object == NULL,
 		    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
 	}
+	if (cow & MAP_DISABLE_SYNCER)
+		protoeflags |= MAP_ENTRY_NOSYNC;
+
 	if (object) {
 		/*
 		 * When object is non-NULL, it could be shared with another
@@ -539,13 +542,15 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	 * Update the free space hint
 	 */
 	if ((map->first_free == prev_entry) &&
-	    (prev_entry->end >= new_entry->start))
+	    (prev_entry->end >= new_entry->start)) {
 		map->first_free = new_entry;
+	}
 
-	if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL))
+	if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
 		pmap_object_init_pt(map->pmap, start,
 		    object, OFF_TO_IDX(offset), end - start,
 		    cow & MAP_PREFAULT_PARTIAL);
+	}
 
 	return (KERN_SUCCESS);
 }
@@ -1026,6 +1031,8 @@ vm_map_madvise(map, start, end, behav)
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
+	case MADV_NOSYNC:
+	case MADV_AUTOSYNC:
 		modify_map = 1;
 		vm_map_lock(map);
 		break;
@@ -1077,6 +1084,12 @@ vm_map_madvise(map, start, end, behav)
 	case MADV_RANDOM:
 		vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
 		break;
+	case MADV_NOSYNC:
+		current->eflags |= MAP_ENTRY_NOSYNC;
+		break;
+	case MADV_AUTOSYNC:
+		current->eflags &= ~MAP_ENTRY_NOSYNC;
+		break;
 	default:
 		break;
 	}
@@ -112,7 +112,7 @@ struct vm_map_entry {
 	vm_pindex_t lastr;			/* last read */
 };
 
-#define MAP_ENTRY_UNUSED_01	0x1
+#define MAP_ENTRY_NOSYNC	0x1
 #define MAP_ENTRY_IS_SUB_MAP	0x2
 #define MAP_ENTRY_COW		0x4
 #define MAP_ENTRY_NEEDS_COPY	0x8
@@ -329,6 +329,7 @@ vmspace_resident_count(struct vmspace *vmspace)
 #define	MAP_NOFAULT		0x4
 #define	MAP_PREFAULT		0x8
 #define	MAP_PREFAULT_PARTIAL	0x10
+#define	MAP_DISABLE_SYNCER	0x20
 
 /*
  * vm_fault option flags
@@ -626,7 +626,7 @@ madvise(p, uap)
 	/*
 	 * Check for illegal behavior
 	 */
-	if (uap->behav < 0 || uap->behav > MADV_FREE)
+	if (uap->behav < 0 || uap->behav > MADV_AUTOSYNC)
 		return (EINVAL);
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
@@ -1046,9 +1046,10 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		flags |= MAP_SHARED;
 	}
 
-	if ((flags & (MAP_ANON|MAP_SHARED)) == 0) {
+	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 		docow |= MAP_COPY_ON_WRITE;
-	}
+	if (flags & MAP_NOSYNC)
+		docow |= MAP_DISABLE_SYNCER;
 
 #if defined(VM_PROT_READ_IS_EXEC)
 	if (prot & VM_PROT_READ)
@@ -478,8 +478,10 @@ vm_object_terminate(object)
 /*
  *	vm_object_page_clean
  *
- *	Clean all dirty pages in the specified range of object.
- *	Leaves page on whatever queue it is currently on.
+ *	Clean all dirty pages in the specified range of object.  Leaves page
+ *	on whatever queue it is currently on.  If NOSYNC is set then do not
+ *	write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
+ *	leaving the object dirty.
  *
  *	Odd semantics: if start == end, we clean everything.
  *
@@ -503,6 +505,7 @@ vm_object_page_clean(object, start, end, flags)
 	int chkb;
 	int maxb;
 	int i;
+	int clearobjflags;
 	int pagerflags;
 	vm_page_t maf[vm_pageout_page_count];
 	vm_page_t mab[vm_pageout_page_count];
@@ -527,12 +530,26 @@ vm_object_page_clean(object, start, end, flags)
 		tend = end;
 	}
 
+	/*
+	 * Generally set CLEANCHK interlock and make the page read-only so
+	 * we can then clear the object flags.
+	 *
+	 * However, if this is a nosync mmap then the object is likely to
+	 * stay dirty so do not mess with the page and do not clear the
+	 * object flags.
+	 */
+
+	clearobjflags = 1;
+
 	for(p = TAILQ_FIRST(&object->memq); p; p = TAILQ_NEXT(p, listq)) {
 		vm_page_flag_set(p, PG_CLEANCHK);
-		vm_page_protect(p, VM_PROT_READ);
+		if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
+			clearobjflags = 0;
+		else
+			vm_page_protect(p, VM_PROT_READ);
 	}
 
-	if ((tstart == 0) && (tend == object->size)) {
+	if (clearobjflags && (tstart == 0) && (tend == object->size)) {
 		vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
 	}
 
@@ -557,6 +574,16 @@ vm_object_page_clean(object, start, end, flags)
 			continue;
 		}
 
+		/*
+		 * If we have been asked to skip nosync pages and this is a
+		 * nosync page, skip it.  Note that the object flags were
+		 * not cleared in this case so we do not have to set them.
+		 */
+		if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
+			vm_page_flag_clear(p, PG_CLEANCHK);
+			continue;
+		}
+
 		s = splvm();
 		while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
 			if (object->generation != curgeneration) {
@@ -153,8 +153,9 @@ struct vm_object {
 
 #ifdef KERNEL
 
-#define OBJPC_SYNC	0x1	/* sync I/O */
-#define OBJPC_INVAL	0x2	/* invalidate */
+#define OBJPC_SYNC	0x1			/* sync I/O */
+#define OBJPC_INVAL	0x2			/* invalidate */
+#define OBJPC_NOSYNC	0x4			/* skip if PG_NOSYNC */
 
 TAILQ_HEAD(object_q, vm_object);
@@ -1522,15 +1522,19 @@ vm_page_set_validclean(m, base, size)
 
 	/*
 	 * Set valid, clear dirty bits.  If validating the entire
-	 * page we can safely clear the pmap modify bit.
+	 * page we can safely clear the pmap modify bit.  We also
+	 * use this opportunity to clear the PG_NOSYNC flag.  If a process
+	 * takes a write fault on a MAP_NOSYNC memory area the flag will
+	 * be set again.
 	 */
 
 	pagebits = vm_page_bits(base, size);
 	m->valid |= pagebits;
 	m->dirty &= ~pagebits;
 
-	if (base == 0 && size == PAGE_SIZE)
+	if (base == 0 && size == PAGE_SIZE) {
 		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+		vm_page_flag_clear(m, PG_NOSYNC);
+	}
 }
 
 #if 0
@@ -234,6 +234,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define PG_REFERENCED	0x0080		/* page has been referenced */
 #define PG_CLEANCHK	0x0100		/* page will be checked for cleaning */
 #define PG_SWAPINPROG	0x0200		/* swap I/O in progress on page */
+#define PG_NOSYNC	0x0400		/* do not collect for syncer */
 
 /*
  * Misc constants.
@@ -437,10 +438,9 @@ vm_page_unhold(vm_page_t mem)
 /*
  *	vm_page_protect:
  *
- *	Reduce the protection of a page.  This routine never
- *	raises the protection and therefore can be safely
- *	called if the page is already at VM_PROT_NONE ( it
- *	will be a NOP effectively ).
+ *	Reduce the protection of a page.  This routine never raises the
+ *	protection and therefore can be safely called if the page is already
+ *	at VM_PROT_NONE (it will be a NOP effectively).
 */
 
 static __inline void