Introduce and use a new VM interface for temporarily pinning pages. This
new interface replaces the combined use of vm_fault_quick() and
pmap_extract_and_hold() throughout the kernel.

In collaboration with:	kib@
This commit is contained in:
Alan Cox 2010-12-25 21:26:56 +00:00
parent 50ca181e5d
commit 82de724fe1
11 changed files with 116 additions and 291 deletions

View File

@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$");
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_vm.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
@ -218,8 +217,9 @@ cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, vm_prot_t prot)
count = min(count, npages);
err = vm_fault_hold_user_pages(map,
(vm_offset_t)iov->iov_base, mp, count, prot);
/* The following return value is not used. XXX */
err = vm_fault_quick_hold_pages(map,
(vm_offset_t)iov->iov_base, iov->iov_len, prot, mp, count);
mp += count;
totcount += count;
curbytes = iov->iov_len;
@ -503,7 +503,7 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
* - the number of bytes to be transferred exceeds the threshold
* - the number of bytes currently in flight won't exceed the in-flight
* threshold XXX TODO
* - vm_fault_hold_user_pages succeeds
* - vm_fault_quick_hold_pages succeeds
* - blocking socket XXX for now
*
*/
@ -970,7 +970,7 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
* - the number of bytes to be transferred exceeds the threshold
* - the number of bytes currently in flight won't exceed the in-flight
* threshold XXX TODO
* - vm_fault_hold_user_pages succeeds
* - vm_fault_quick_hold_pages succeeds
* - blocking socket XXX for now
* - iovcnt is 1
*

View File

@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$");
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_vm.h>
#define MAX_SCHEDULE_TIMEOUT 300
@ -130,14 +129,6 @@ t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t dmamap, vm_offset_t addr,
struct ddp_gather_list *p;
vm_map_t map;
/*
* XXX need x86 agnostic check
*/
if (addr + len > VM_MAXUSER_ADDRESS)
return (EFAULT);
pg_off = addr & PAGE_MASK;
npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *),
@ -146,10 +137,11 @@ t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t dmamap, vm_offset_t addr,
return (ENOMEM);
map = &curthread->td_proc->p_vmspace->vm_map;
err = vm_fault_hold_user_pages(map, addr, p->dgl_pages, npages,
VM_PROT_READ | VM_PROT_WRITE);
if (err)
if (vm_fault_quick_hold_pages(map, addr, len, VM_PROT_READ |
VM_PROT_WRITE, p->dgl_pages, npages) < 0) {
err = EFAULT;
goto free_gl;
}
if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages &&
gl->dgl_length >= len) {

View File

@ -1,143 +0,0 @@
/**************************************************************************
Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Neither the name of the Chelsio Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <ulp/tom/cxgb_vm.h>
/*
* Hold "count" consecutive user pages, starting with the page that contains
* "addr" in "map", and store a pointer to each held page in the array "mp".
*
* - "addr" is truncated to a page boundary; the range then covers
*   count * PAGE_SIZE bytes and must end at or below VM_MAXUSER_ADDRESS,
*   otherwise a warning is logged and EFAULT is returned.
* - First pass: pmap_extract_and_hold() is tried on every page with access
*   "prot".  Each page obtained this way is marked dirty when "prot"
*   includes VM_PROT_WRITE.
* - Second pass (only when the first pass missed at least one page):
*   vm_fault_hold() is called for every slot that is still NULL.  Pages
*   obtained in this pass are not explicitly dirtied by this routine.
* - If any vm_fault_hold() call fails, every page held so far is unheld, its
*   slot in "mp" is reset to NULL, and EFAULT is returned.
*
* Returns 0 on success, EFAULT on failure.
*/
int
vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr, vm_page_t *mp,
int count, vm_prot_t prot)
{
vm_offset_t end, va;
int faults, rv;
pmap_t pmap;
vm_page_t m, *pages;
pmap = vm_map_pmap(map);
pages = mp;
addr &= ~PAGE_MASK;
/*
* Check that virtual address range is legal
* This check is somewhat bogus as on some architectures kernel
* and user do not share VA - however, it appears that all FreeBSD
* architectures define it
*/
end = addr + (count * PAGE_SIZE);
if (end > VM_MAXUSER_ADDRESS) {
log(LOG_WARNING, "bad address passed to vm_fault_hold_user_pages");
return (EFAULT);
}
/*
* First optimistically assume that all pages are resident
* (and R/W if for write) if so just mark pages as held (and
* dirty if for write) and return
*/
for (pages = mp, faults = 0, va = addr; va < end;
va += PAGE_SIZE, pages++) {
/*
* it would be really nice if we had an unlocked
* version of this so we were only acquiring the
* pmap lock 1 time as opposed to potentially
* many dozens of times
*/
*pages = m = pmap_extract_and_hold(pmap, va, prot);
if (m == NULL) {
/* Leave the slot NULL; the second pass below faults it in. */
faults++;
continue;
}
/*
* Preemptively mark dirty - the pages
* will never have the modified bit set if
* they are only changed via DMA
*/
if (prot & VM_PROT_WRITE) {
vm_page_lock_queues();
vm_page_dirty(m);
vm_page_unlock_queues();
}
}
/* Every page was obtained in the first pass; nothing left to fault in. */
if (faults == 0)
return (0);
/*
* Pages either have insufficient permissions or are not present
* trigger a fault where necessary
*/
for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) {
if (*pages == NULL && (rv = vm_fault_hold(map, va, prot,
VM_FAULT_NORMAL, pages)) != KERN_SUCCESS)
goto error;
}
return (0);
error:
/*
* A fault failed: log it, then release every page that was held by
* either pass and clear its slot so the caller sees no stale pointers.
*/
log(LOG_WARNING,
"vm_fault bad return rv=%d va=0x%zx\n", rv, va);
for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++)
if (*pages) {
vm_page_lock(*pages);
vm_page_unhold(*pages);
vm_page_unlock(*pages);
*pages = NULL;
}
return (EFAULT);
}

View File

@ -1,38 +0,0 @@
/**************************************************************************
Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Neither the name of the Chelsio Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
$FreeBSD$
***************************************************************************/
#ifndef CXGB_VM_H_
#define CXGB_VM_H_
int vm_fault_hold_user_pages(vm_map_t map, vm_offset_t addr,
vm_page_t *mp, int count, vm_prot_t prot);
#endif

View File

@ -177,11 +177,10 @@ via_free_sg_info(drm_via_sg_info_t *vsg)
free(vsg->desc_pages, DRM_MEM_DRIVER);
case dr_via_pages_locked:
for (i=0; i < vsg->num_pages; ++i) {
if ( NULL != (page = vsg->pages[i])) {
vm_page_lock(page);
vm_page_unwire(page, 0);
vm_page_unlock(page);
}
page = vsg->pages[i];
vm_page_lock(page);
vm_page_unwire(page, 0);
vm_page_unlock(page);
}
case dr_via_pages_alloc:
free(vsg->pages, DRM_MEM_DRIVER);
@ -224,41 +223,31 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer)
{
unsigned long first_pfn = VIA_PFN(xfer->mem_addr);
vm_page_t m;
vm_map_t map;
int i;
map = &curproc->p_vmspace->vm_map;
vsg->num_pages = VIA_PFN(xfer->mem_addr +
(xfer->num_lines * xfer->mem_stride -1)) - first_pfn + 1;
/* Make sure that the user has access to these pages */
for(i = 0; i < vsg->num_pages; i++) {
if (vm_fault_quick((caddr_t)xfer->mem_addr + IDX_TO_OFF(i),
VM_PROT_RW) < 0)
return (-EACCES);
}
if (NULL == (vsg->pages = malloc(sizeof(vm_page_t) * vsg->num_pages,
DRM_MEM_DRIVER, M_NOWAIT | M_ZERO)))
DRM_MEM_DRIVER, M_NOWAIT)))
return -ENOMEM;
for(i = 0; i < vsg->num_pages; i++) {
m = pmap_extract_and_hold(map->pmap,
(vm_offset_t)xfer->mem_addr + IDX_TO_OFF(i), VM_PROT_RW);
if (m == NULL)
break;
vsg->state = dr_via_pages_alloc;
if (vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)xfer->mem_addr, vsg->num_pages * PAGE_SIZE,
VM_PROT_READ | VM_PROT_WRITE, vsg->pages, vsg->num_pages) < 0)
return -EACCES;
for (i = 0; i < vsg->num_pages; i++) {
m = vsg->pages[i];
vm_page_lock(m);
vm_page_wire(m);
vm_page_unhold(m);
vm_page_unlock(m);
vsg->pages[i] = m;
}
vsg->state = dr_via_pages_locked;
if (i != vsg->num_pages)
return -EINVAL;
DRM_DEBUG("DMA pages locked\n");
return 0;

View File

@ -747,10 +747,8 @@ pipe_build_write_buffer(wpipe, uio)
struct pipe *wpipe;
struct uio *uio;
{
pmap_t pmap;
u_int size;
int i;
vm_offset_t addr, endaddr;
PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
@ -760,25 +758,10 @@ pipe_build_write_buffer(wpipe, uio)
if (size > wpipe->pipe_buffer.size)
size = wpipe->pipe_buffer.size;
pmap = vmspace_pmap(curproc->p_vmspace);
endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
if (endaddr < addr)
if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
wpipe->pipe_map.ms, PIPENPAGES)) < 0)
return (EFAULT);
for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
/*
* vm_fault_quick() can sleep.
*/
race:
if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
vm_page_unhold_pages(wpipe->pipe_map.ms, i);
return (EFAULT);
}
wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
VM_PROT_READ);
if (wpipe->pipe_map.ms[i] == NULL)
goto race;
}
/*
* set up the control block

View File

@ -103,24 +103,20 @@ socow_setup(struct mbuf *m0, struct uio *uio)
struct vmspace *vmspace;
struct vm_map *map;
vm_offset_t offset, uva;
vm_size_t len;
socow_stats.attempted++;
vmspace = curproc->p_vmspace;
map = &vmspace->vm_map;
uva = (vm_offset_t) uio->uio_iov->iov_base;
offset = uva & PAGE_MASK;
len = PAGE_SIZE - offset;
/*
* Verify that access to the given address is allowed from user-space.
*/
if (vm_fault_quick((caddr_t)uva, VM_PROT_READ) < 0)
return (0);
/*
* verify page is mapped & not already wired for i/o
*/
pp = pmap_extract_and_hold(map->pmap, uva, VM_PROT_READ);
if (pp == NULL) {
/* Argument order is (map, addr, len, prot, page array, max_count). */
if (vm_fault_quick_hold_pages(map, uva, len, VM_PROT_READ, &pp, 1) <
0) {
socow_stats.fail_not_mapped++;
return(0);
}
@ -165,7 +161,7 @@ socow_setup(struct mbuf *m0, struct uio *uio)
*/
MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, socow_iodone,
(void*)sf_buf_kva(sf), sf, M_RDONLY, EXT_SFBUF);
m0->m_len = PAGE_SIZE - offset;
m0->m_len = len;
m0->m_data = (caddr_t)sf_buf_kva(sf) + offset;
socow_stats.success++;

View File

@ -3855,46 +3855,19 @@ vm_hold_free_pages(struct buf *bp, int newbsize)
int
vmapbuf(struct buf *bp)
{
caddr_t addr, kva;
caddr_t kva;
vm_prot_t prot;
int pidx, i;
struct vm_page *m;
struct pmap *pmap = &curproc->p_vmspace->vm_pmap;
int pidx;
if (bp->b_bufsize < 0)
return (-1);
prot = VM_PROT_READ;
if (bp->b_iocmd == BIO_READ)
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0;
addr < bp->b_data + bp->b_bufsize;
addr += PAGE_SIZE, pidx++) {
/*
* Do the vm_fault if needed; do the copy-on-write thing
* when reading stuff off device into memory.
*
* NOTE! Must use pmap_extract() because addr may be in
* the userland address space, and kextract is only guarenteed
* to work for the kernland address space (see: sparc64 port).
*/
retry:
if (vm_fault_quick(addr >= bp->b_data ? addr : bp->b_data,
prot) < 0) {
for (i = 0; i < pidx; ++i) {
vm_page_lock(bp->b_pages[i]);
vm_page_unhold(bp->b_pages[i]);
vm_page_unlock(bp->b_pages[i]);
bp->b_pages[i] = NULL;
}
return(-1);
}
m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot);
if (m == NULL)
goto retry;
bp->b_pages[pidx] = m;
}
if (pidx > btoc(MAXPHYS))
panic("vmapbuf: mapped more than MAXPHYS");
if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
btoc(MAXPHYS))) < 0)
return (-1);
pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
kva = bp->b_saveaddr;

View File

@ -161,12 +161,8 @@ zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
struct sf_buf *sf;
vm_page_t pp;
if (vm_fault_quick((caddr_t) uaddr, VM_PROT_READ | VM_PROT_WRITE) <
0)
return (NULL);
pp = pmap_extract_and_hold(map->pmap, uaddr, VM_PROT_READ |
VM_PROT_WRITE);
if (pp == NULL)
if (vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE, VM_PROT_READ |
VM_PROT_WRITE, &pp, 1) < 0)
return (NULL);
vm_page_lock(pp);
vm_page_wire(pp);

View File

@ -63,6 +63,8 @@ void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
vm_ooffset_t *);
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold);
int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
vm_prot_t prot, vm_page_t *ma, int max_count);
void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);

View File

@ -1044,6 +1044,81 @@ vm_fault_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
}
}
/*
 * Hold each of the physical pages that are mapped by the specified range of
 * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
 * and allow the specified types of access, "prot".  If all of the implied
 * pages are successfully held, then the number of held pages is returned
 * together with pointers to those pages in the array "ma".  However, if any
 * of the pages cannot be held, -1 is returned.
 *
 * "max_count" is the capacity of "ma" in pages.  The function panics if the
 * page-aligned range spans more than "max_count" pages.
 */
int
vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
    vm_prot_t prot, vm_page_t *ma, int max_count)
{
	vm_offset_t end, va;
	vm_page_t *mp;
	int count;
	boolean_t pmap_failed;

	end = round_page(addr + len);
	addr = trunc_page(addr);

	/*
	 * Check for illegal addresses.  The "addr > end" test also rejects a
	 * range whose computed end lies below its start.
	 */
	if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map))
		return (-1);

	count = howmany(end - addr, PAGE_SIZE);
	if (count > max_count)
		panic("vm_fault_quick_hold_pages: count > max_count");

	/*
	 * Most likely, the physical pages are resident in the pmap, so it is
	 * faster to try pmap_extract_and_hold() first.
	 */
	pmap_failed = FALSE;
	for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) {
		*mp = pmap_extract_and_hold(map->pmap, va, prot);
		if (*mp == NULL)
			pmap_failed = TRUE;
		else if ((prot & VM_PROT_WRITE) != 0 &&
		    (*mp)->dirty != VM_PAGE_BITS_ALL) {
			/*
			 * Explicitly dirty the physical page.  Otherwise, the
			 * caller's changes may go unnoticed because they are
			 * performed through an unmanaged mapping or by a DMA
			 * operation.
			 *
			 * The test above must examine the page just held
			 * (*mp), not the first array entry (*ma): the first
			 * entry may be NULL or may already be fully dirty
			 * while this page is not.
			 */
			vm_page_lock_queues();
			vm_page_dirty(*mp);
			vm_page_unlock_queues();
		}
	}
	if (pmap_failed) {
		/*
		 * One or more pages could not be held by the pmap.  Either no
		 * page was mapped at the specified virtual address or that
		 * mapping had insufficient permissions.  Attempt to fault in
		 * and hold these pages.
		 */
		for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
			if (*mp == NULL && vm_fault_hold(map, va, prot,
			    VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
				goto error;
	}
	return (count);
error:
	/* Release every page that was held before the failing fault. */
	for (mp = ma; mp < ma + count; mp++)
		if (*mp != NULL) {
			vm_page_lock(*mp);
			vm_page_unhold(*mp);
			vm_page_unlock(*mp);
		}
	return (-1);
}
/*
* vm_fault_quick:
*