Add MAP_NOSYNC feature to mmap(), and MADV_NOSYNC and MADV_AUTOSYNC to madvise().

    This feature prevents the update daemon from gratuitously flushing
    dirty pages associated with a mapped file-backed region of memory.  The
    system pager will still page the memory as necessary and the VM system
    will still be fully coherent with the filesystem.  Modifications made
    by other means to the same area of memory, for example by write(), are
    unaffected.  The feature works on a page-granularity basis.

    MAP_NOSYNC allows one to use mmap() to share memory between processes
    without incurring any significant filesystem overhead, putting it in
    the same performance category as SysV shared memory and anonymous memory.
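
    As an editorial illustration (not part of this commit), a minimal
    userland sketch of that IPC pattern might look like the following;
    the file name and region size are arbitrary assumptions:

        #include <sys/types.h>
        #include <sys/mman.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <err.h>

        int
        main(void)
        {
            size_t len = 65536;     /* arbitrary region size */
            int fd;
            char *p;

            /* Any file all cooperating processes can open will do. */
            fd = open("/tmp/ipcfile", O_RDWR | O_CREAT, 0600);
            if (fd == -1)
                err(1, "open");
            if (ftruncate(fd, len) == -1)
                err(1, "ftruncate");

            /*
             * MAP_NOSYNC keeps the update daemon from periodically
             * flushing pages dirtied through this map; the pager may
             * still write them under memory pressure, and access via
             * read()/write() remains coherent.
             */
            p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                MAP_SHARED | MAP_NOSYNC, fd, 0);
            if (p == MAP_FAILED)
                err(1, "mmap");

            p[0] = 1;   /* dirty a page without triggering periodic syncs */

            munmap(p, len);
            close(fd);
            return (0);
        }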

Reviewed by: julian, alc, dg
Matthew Dillon, 1999-12-12 03:19:33 +00:00
commit 4f79d873c1 (parent 8828590563)
15 changed files with 147 additions and 25 deletions


@ -58,6 +58,8 @@ The known behaviors are given in
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
#define MADV_FREE 5 /* data is now unimportant */
#define MADV_NOSYNC 6 /* no explicit commit to physical backing store */
#define MADV_AUTOSYNC 7 /* default commit method to physical backing store */
.Ed
.Pp
.Bl -tag -width MADV_SEQUENTIAL
@ -96,6 +98,31 @@ call.
References made to that address space range will not make the VM system
page the information back in from backing store until the page is
modified again.
.It Dv MADV_NOSYNC
Request that the system not flush the data associated with this map to
physical backing store unless it needs to. Typically this prevents the
filesystem update daemon from gratuitously writing pages dirtied
by the VM system to physical disk. Note that VM/filesystem coherency is
always maintained; this feature simply ensures that the mapped data is
only flushed when it needs to be, usually by the system pager.
.Pp
This feature is typically used when you want to use a file-backed shared
memory area to communicate between processes (IPC) and do not particularly
need the data being stored in that area to be physically written to disk.
With this feature you get performance from mmap equivalent to what you
would expect from SysV shared memory calls, but in a more controllable
and less restrictive manner. However, note that this feature is not portable
across UNIX platforms (though some may do the right thing by default).
For more information see the MAP_NOSYNC section of
.Xr mmap 2 .
.It Dv MADV_AUTOSYNC
Undoes the effects of MADV_NOSYNC for any future pages dirtied within the
address range. The effect on pages already dirtied is indeterminate; they
may or may not be reverted. You can guarantee reversion by using the
.Xr msync 2
or
.Xr fsync 2
system calls.
.El
.Sh RETURN VALUES
Upon successful completion,
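
As an editorial aside on the madvise.2 text above (not part of the diff),
toggling the policy on an existing file-backed MAP_SHARED region and then
forcing reversion might look like this sketch; p and len are assumed to
describe such a mapping:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <err.h>

    static void
    set_nosync(void *p, size_t len)
    {
        /* Stop the update daemon from flushing pages dirtied via p. */
        if (madvise(p, len, MADV_NOSYNC) == -1)
            err(1, "madvise(MADV_NOSYNC)");
    }

    static void
    set_autosync(void *p, size_t len)
    {
        /* Future dirtying reverts to the default flush strategy. */
        if (madvise(p, len, MADV_AUTOSYNC) == -1)
            err(1, "madvise(MADV_AUTOSYNC)");
        /* Guarantee reversion of already-dirty pages, per the text. */
        if (msync(p, len, MS_SYNC) == -1)
            err(1, "msync");
    }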


@ -150,6 +150,35 @@ stack top is the starting address returned by the call, plus
.Fa len
bytes. The bottom of the stack at maximum growth is the starting
address returned by the call.
.It Dv MAP_NOSYNC
Causes data dirtied via this VM map to be flushed to physical media
only when necessary (usually by the pager) rather than gratuitously.
Typically this prevents the update daemon from flushing pages dirtied
through such maps and thus allows efficient sharing of memory across
unassociated processes using a file-backed shared memory map. Without
this option any VM pages you dirty may be flushed to disk every so often
(usually every 30-60 seconds), which can create performance problems if you
do not need that to occur (such as when you are using shared file-backed
mmap regions for IPC purposes). Note that VM/filesystem coherency is
maintained whether you use MAP_NOSYNC or not. This option is not portable
across UNIX platforms (yet), though some may implement the same behavior
by default.
.Pp
The
.Xr fsync 2
function will flush all dirty data and metadata associated with a file,
including dirty NOSYNC VM data, to physical media. The
.Xr sync 1
command and
.Xr sync 2
system call generally do not flush dirty NOSYNC VM data.
The
.Xr msync 2
system call is obsolete since
.Bx
implements a coherent filesystem buffer cache. However, it may be
used to associate dirty VM pages with filesystem buffers and thus cause
them to be flushed to physical media sooner rather than later.
.El
.Pp
The
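
To make the mmap.2 flushing rules above concrete (an editorial sketch, not
part of the diff): a writer that dirties a MAP_NOSYNC region but wants the
data on disk before exiting can fsync the descriptor, since sync(2) alone
would generally skip the dirty NOSYNC pages. fd is assumed to reference a
file already grown to len bytes:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <err.h>

    static void
    update_and_commit(int fd, size_t len)
    {
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_NOSYNC, fd, 0);
        if (p == MAP_FAILED)
            err(1, "mmap");

        p[0]++;     /* dirty a page; it stays in memory for now */

        /* fsync(2) flushes everything, including NOSYNC VM data. */
        if (fsync(fd) == -1)
            err(1, "fsync");
        munmap(p, len);
    }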


@ -2460,7 +2460,7 @@ vfs_msync(struct mount *mp, int flags) {
if (!vget(vp,
LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
if (vp->v_object) {
-vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
anyio = 1;
}
vput(vp);


@ -1307,7 +1307,8 @@ symlink(p, uap)
vput(nd.ni_vp);
vput(nd.ni_dvp);
ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink");
-ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
if (error == 0)
ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink");
out:
zfree(namei_zone, path);
return (error);


@ -65,6 +65,7 @@
#define MAP_NOEXTEND 0x0100 /* for MAP_FILE, don't change file size */
#define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */
#define MAP_STACK 0x0400 /* region grows down, like a stack */
#define MAP_NOSYNC 0x0800 /* page to but do not sync underlying file */
#ifdef _P1003_1B_VISIBLE
/*
@ -103,6 +104,8 @@
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* dont need these pages */
#define MADV_FREE 5 /* dont need these pages, and junk contents */
#define MADV_NOSYNC 6 /* try to avoid flushes to physical media */
#define MADV_AUTOSYNC 7 /* revert to default flushing strategy */
/*
* Return bits from mincore
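
Since both man pages stress that MAP_NOSYNC is not portable, code meant to
build on other platforms might guard its use; a hypothetical compile-time
fallback (not part of this commit):

    #include <sys/mman.h>

    /*
     * Hypothetical shim: where MAP_NOSYNC does not exist the flag
     * degrades to zero and mmap() behaves as plain MAP_SHARED.
     */
    #ifndef MAP_NOSYNC
    #define MAP_NOSYNC 0
    #endif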


@ -779,15 +779,29 @@ RetryFault:;
vm_page_flag_set(fs.m, PG_WRITEABLE);
vm_object_set_flag(fs.m->object,
OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
/*
* If the fault is a write, we know that this page is being
-* written NOW. This will save on the pmap_is_modified() calls
-* later.
* written NOW so dirty it explicitly to save on
* pmap_is_modified() calls later.
*
* If this is a NOSYNC mmap we do not want to set PG_NOSYNC
* if the page is already dirty to prevent data written with
* the expectation of being synced from not being synced.
* Likewise if this entry does not request NOSYNC then make
* sure the page isn't marked NOSYNC. Applications sharing
* data should use the same flags to avoid ping ponging.
*
* Also tell the backing pager, if any, that it should remove
* any swap backing since the page is now dirty.
*/
if (fault_flags & VM_FAULT_DIRTY) {
if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
if (fs.m->dirty == 0)
vm_page_flag_set(fs.m, PG_NOSYNC);
} else {
vm_page_flag_clear(fs.m, PG_NOSYNC);
}
vm_page_dirty(fs.m);
vm_pager_page_unswapped(fs.m);
}
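
The ping-ponging warned about in the comment above has a userland corollary
(editorial note, not in the commit): every process mapping the same file
should choose the same policy. A sketch, with p and len assumed to describe
each process's own mapping of the shared file:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <err.h>

    /*
     * Run by every cooperating process on its own mapping; if one
     * side omits it, write faults from the two sides keep setting
     * and clearing PG_NOSYNC on the shared pages.
     */
    static void
    agree_on_nosync(void *p, size_t len)
    {
        if (madvise(p, len, MADV_NOSYNC) == -1)
            err(1, "madvise");
    }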


@ -460,6 +460,9 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
KASSERT(object == NULL,
("vm_map_insert: paradoxical MAP_NOFAULT request"));
}
if (cow & MAP_DISABLE_SYNCER)
protoeflags |= MAP_ENTRY_NOSYNC;
if (object) {
/*
* When object is non-NULL, it could be shared with another
@ -539,13 +542,15 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
* Update the free space hint
*/
if ((map->first_free == prev_entry) &&
-(prev_entry->end >= new_entry->start))
(prev_entry->end >= new_entry->start)) {
map->first_free = new_entry;
}
-if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL))
if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
pmap_object_init_pt(map->pmap, start,
object, OFF_TO_IDX(offset), end - start,
cow & MAP_PREFAULT_PARTIAL);
}
return (KERN_SUCCESS);
}
@ -1026,6 +1031,8 @@ vm_map_madvise(map, start, end, behav)
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
case MADV_NOSYNC:
case MADV_AUTOSYNC:
modify_map = 1;
vm_map_lock(map);
break;
@ -1077,6 +1084,12 @@ vm_map_madvise(map, start, end, behav)
case MADV_RANDOM:
vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
break;
case MADV_NOSYNC:
current->eflags |= MAP_ENTRY_NOSYNC;
break;
case MADV_AUTOSYNC:
current->eflags &= ~MAP_ENTRY_NOSYNC;
break;
default:
break;
}


@ -112,7 +112,7 @@ struct vm_map_entry {
vm_pindex_t lastr; /* last read */
};
-#define MAP_ENTRY_UNUSED_01 0x1
#define MAP_ENTRY_NOSYNC 0x1
#define MAP_ENTRY_IS_SUB_MAP 0x2
#define MAP_ENTRY_COW 0x4
#define MAP_ENTRY_NEEDS_COPY 0x8
@ -329,6 +329,7 @@ vmspace_resident_count(struct vmspace *vmspace)
#define MAP_NOFAULT 0x4
#define MAP_PREFAULT 0x8
#define MAP_PREFAULT_PARTIAL 0x10
#define MAP_DISABLE_SYNCER 0x20
/*
* vm_fault option flags


@ -626,7 +626,7 @@ madvise(p, uap)
/*
* Check for illegal behavior
*/
-if (uap->behav < 0 || uap->behav > MADV_FREE)
if (uap->behav < 0 || uap->behav > MADV_AUTOSYNC)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap... Note
@ -1046,9 +1046,10 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
flags |= MAP_SHARED;
}
-if ((flags & (MAP_ANON|MAP_SHARED)) == 0) {
if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
docow |= MAP_COPY_ON_WRITE;
-}
if (flags & MAP_NOSYNC)
docow |= MAP_DISABLE_SYNCER;
#if defined(VM_PROT_READ_IS_EXEC)
if (prot & VM_PROT_READ)


@ -478,8 +478,10 @@ vm_object_terminate(object)
/*
* vm_object_page_clean
*
-* Clean all dirty pages in the specified range of object.
-* Leaves page on whatever queue it is currently on.
* Clean all dirty pages in the specified range of object. Leaves page
* on whatever queue it is currently on. If NOSYNC is set then do not
* write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
* leaving the object dirty.
*
* Odd semantics: if start == end, we clean everything.
*
@ -503,6 +505,7 @@ vm_object_page_clean(object, start, end, flags)
int chkb;
int maxb;
int i;
int clearobjflags;
int pagerflags;
vm_page_t maf[vm_pageout_page_count];
vm_page_t mab[vm_pageout_page_count];
@ -527,12 +530,26 @@ vm_object_page_clean(object, start, end, flags)
tend = end;
}
/*
* Generally set CLEANCHK interlock and make the page read-only so
* we can then clear the object flags.
*
* However, if this is a nosync mmap then the object is likely to
* stay dirty so do not mess with the page and do not clear the
* object flags.
*/
clearobjflags = 1;
for(p = TAILQ_FIRST(&object->memq); p; p = TAILQ_NEXT(p, listq)) {
vm_page_flag_set(p, PG_CLEANCHK);
-vm_page_protect(p, VM_PROT_READ);
if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
clearobjflags = 0;
else
vm_page_protect(p, VM_PROT_READ);
}
-if ((tstart == 0) && (tend == object->size)) {
if (clearobjflags && (tstart == 0) && (tend == object->size)) {
vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
}
@ -557,6 +574,16 @@ vm_object_page_clean(object, start, end, flags)
continue;
}
/*
* If we have been asked to skip nosync pages and this is a
* nosync page, skip it. Note that the object flags were
* not cleared in this case so we do not have to set them.
*/
if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
vm_page_flag_clear(p, PG_CLEANCHK);
continue;
}
s = splvm();
while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
if (object->generation != curgeneration) {


@ -153,8 +153,9 @@ struct vm_object {
#ifdef KERNEL
-#define OBJPC_SYNC 0x1 /* sync I/O */
-#define OBJPC_INVAL 0x2 /* invalidate */
#define OBJPC_SYNC 0x1 /* sync I/O */
#define OBJPC_INVAL 0x2 /* invalidate */
#define OBJPC_NOSYNC 0x4 /* skip if PG_NOSYNC */
TAILQ_HEAD(object_q, vm_object);


@ -1522,15 +1522,19 @@ vm_page_set_validclean(m, base, size)
/*
* Set valid, clear dirty bits. If validating the entire
-* page we can safely clear the pmap modify bit.
* page we can safely clear the pmap modify bit. We also
* use this opportunity to clear the PG_NOSYNC flag. If a process
* takes a write fault on a MAP_NOSYNC memory area the flag will
* be set again.
*/
pagebits = vm_page_bits(base, size);
m->valid |= pagebits;
m->dirty &= ~pagebits;
-if (base == 0 && size == PAGE_SIZE)
if (base == 0 && size == PAGE_SIZE) {
pmap_clear_modify(VM_PAGE_TO_PHYS(m));
vm_page_flag_clear(m, PG_NOSYNC);
}
}
#if 0


@ -234,6 +234,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
#define PG_REFERENCED 0x0080 /* page has been referenced */
#define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */
#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */
#define PG_NOSYNC 0x0400 /* do not collect for syncer */
/*
* Misc constants.
@ -437,10 +438,9 @@ vm_page_unhold(vm_page_t mem)
/*
* vm_page_protect:
*
-* Reduce the protection of a page. This routine never
-* raises the protection and therefore can be safely
-* called if the page is already at VM_PROT_NONE ( it
-* will be a NOP effectively ).
* Reduce the protection of a page. This routine never raises the
* protection and therefore can be safely called if the page is already
* at VM_PROT_NONE (it will be a NOP effectively ).
*/
static __inline void