From 756a5412798b7de1709bb1de2db5ba2a5908cba3 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Tue, 15 Jan 2019 01:02:16 +0000 Subject: [PATCH] Allocate pager bufs from UMA instead of 80-ish mutex protected linked list. o In vm_pager_bufferinit() create pbuf_zone and start accounting for how many pbufs we are going to have. In the various subsystems that are going to utilize pbufs, create private zones via a call to pbuf_zsecond_create(). The latter calls uma_zsecond_create() and sets a limit on the created zone. After startup, preallocate pbufs according to the requirements of all pbuf zones. Subsystems that used to have a private limit with the old allocator now have private pbuf zones: md(4), fusefs, NFS client, smbfs, VFS cluster, FFS, swap, vnode pager. The following subsystems use the shared pbuf zone: cam(4), nvme(4), physio(9), aio(4). They should have their own private limits, but changing that is out of scope of this commit. o Fetch the tunable value of kern.nswbuf from init_param2() and, while here, move NSWBUF_MIN to opt_param.h and eliminate opt_swap.h, which was holding only this option. Default values aren't touched by this commit, but they probably should be reviewed with respect to modern hardware. This change removes a tight bottleneck from the sendfile(2) operation, which uses pbufs in the vnode pager. Other pagers would also benefit from faster allocation. 
Together with: gallatin Tested by: pho --- sys/cam/cam_periph.c | 8 +- sys/conf/options | 2 +- sys/dev/md/md.c | 9 +- sys/dev/nvme/nvme_ctrlr.c | 4 +- sys/fs/fuse/fuse_main.c | 5 +- sys/fs/fuse/fuse_vnops.c | 10 +- sys/fs/nfsclient/nfs_clbio.c | 12 +- sys/fs/nfsclient/nfs_clport.c | 5 +- sys/fs/smbfs/smbfs_io.c | 10 +- sys/fs/smbfs/smbfs_vfsops.c | 6 +- sys/kern/kern_physio.c | 4 +- sys/kern/subr_param.c | 10 ++ sys/kern/vfs_aio.c | 6 +- sys/kern/vfs_bio.c | 16 --- sys/kern/vfs_cluster.c | 20 +++- sys/sys/buf.h | 18 +-- sys/ufs/ffs/ffs_rawread.c | 25 ++--- sys/vm/swap_pager.c | 57 +++++----- sys/vm/vm_pager.c | 206 +++++++++++++--------------------- sys/vm/vnode_pager.c | 45 ++++---- 20 files changed, 218 insertions(+), 260 deletions(-) diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c index cc069325748c..88fcc915bbfe 100644 --- a/sys/cam/cam_periph.c +++ b/sys/cam/cam_periph.c @@ -936,7 +936,7 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo, /* * Get the buffer. 
*/ - mapinfo->bp[i] = getpbuf(NULL); + mapinfo->bp[i] = uma_zalloc(pbuf_zone, M_WAITOK); /* put our pointer in the data slot */ mapinfo->bp[i]->b_data = *data_ptrs[i]; @@ -962,9 +962,9 @@ cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo, for (j = 0; j < i; ++j) { *data_ptrs[j] = mapinfo->bp[j]->b_caller1; vunmapbuf(mapinfo->bp[j]); - relpbuf(mapinfo->bp[j], NULL); + uma_zfree(pbuf_zone, mapinfo->bp[j]); } - relpbuf(mapinfo->bp[i], NULL); + uma_zfree(pbuf_zone, mapinfo->bp[i]); PRELE(curproc); return(EACCES); } @@ -1052,7 +1052,7 @@ cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) vunmapbuf(mapinfo->bp[i]); /* release the buffer */ - relpbuf(mapinfo->bp[i], NULL); + uma_zfree(pbuf_zone, mapinfo->bp[i]); } /* allow ourselves to be swapped once again */ diff --git a/sys/conf/options b/sys/conf/options index bf59d9937aaa..4724a1a601c1 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -187,7 +187,7 @@ NO_ADAPTIVE_SX NO_EVENTTIMERS opt_timer.h NO_OBSOLETE_CODE opt_global.h NO_SYSCTL_DESCR opt_global.h -NSWBUF_MIN opt_swap.h +NSWBUF_MIN opt_param.h MBUF_PACKET_ZONE_DISABLE opt_global.h PANIC_REBOOT_WAIT_TIME opt_panic.h PCI_HP opt_pci.h diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index f138bad5cc5d..244682ba292b 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -231,7 +231,7 @@ static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list); #define NMASK (NINDIR-1) static int nshift; -static int md_vnode_pbuf_freecnt; +static uma_zone_t md_pbuf_zone; struct indir { uintptr_t *array; @@ -962,7 +962,7 @@ mdstart_vnode(struct md_s *sc, struct bio *bp) auio.uio_iovcnt = piov - auio.uio_iov; piov = auio.uio_iov; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { - pb = getpbuf(&md_vnode_pbuf_freecnt); + pb = uma_zalloc(md_pbuf_zone, M_WAITOK); bp->bio_resid = len; unmapped_step: npages = atop(min(MAXPHYS, round_page(len + (ma_offs & @@ -1013,7 +1013,7 @@ mdstart_vnode(struct md_s *sc, struct bio *bp) if 
(len > 0) goto unmapped_step; } - relpbuf(pb, &md_vnode_pbuf_freecnt); + uma_zfree(md_pbuf_zone, pb); } free(piov, M_MD); @@ -2118,7 +2118,7 @@ g_md_init(struct g_class *mp __unused) sx_xunlock(&md_sx); } } - md_vnode_pbuf_freecnt = nswbuf / 10; + md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10); status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL, 0600, MDCTL_NAME); g_topology_lock(); @@ -2214,5 +2214,6 @@ g_md_fini(struct g_class *mp __unused) sx_destroy(&md_sx); if (status_dev != NULL) destroy_dev(status_dev); + uma_zdestroy(md_pbuf_zone); delete_unrhdr(md_uh); } diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index e35530d60152..1bbafa8ea444 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -1052,7 +1052,7 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, * this passthrough command. */ PHOLD(curproc); - buf = getpbuf(NULL); + buf = uma_zalloc(pbuf_zone, M_WAITOK); buf->b_data = pt->buf; buf->b_bufsize = pt->len; buf->b_iocmd = pt->is_read ? 
BIO_READ : BIO_WRITE; @@ -1101,7 +1101,7 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, err: if (buf != NULL) { - relpbuf(buf, NULL); + uma_zfree(pbuf_zone, buf); PRELE(curproc); } diff --git a/sys/fs/fuse/fuse_main.c b/sys/fs/fuse/fuse_main.c index ca712709a720..c43cf2bb9b10 100644 --- a/sys/fs/fuse/fuse_main.c +++ b/sys/fs/fuse/fuse_main.c @@ -84,7 +84,7 @@ struct mtx fuse_mtx; extern struct vfsops fuse_vfsops; extern struct cdevsw fuse_cdevsw; extern struct vop_vector fuse_vnops; -extern int fuse_pbuf_freecnt; +extern uma_zone_t fuse_pbuf_zone; static struct vfsconf fuse_vfsconf = { .vfc_version = VFS_VERSION, @@ -122,7 +122,6 @@ fuse_loader(struct module *m, int what, void *arg) switch (what) { case MOD_LOAD: /* kldload */ - fuse_pbuf_freecnt = nswbuf / 2 + 1; mtx_init(&fuse_mtx, "fuse_mtx", NULL, MTX_DEF); err = fuse_device_init(); if (err) { @@ -130,6 +129,7 @@ fuse_loader(struct module *m, int what, void *arg) return (err); } fuse_ipc_init(); + fuse_pbuf_zone = pbuf_zsecond_create("fusepbuf", nswbuf / 2); /* vfs_modevent ignores its first arg */ if ((err = vfs_modevent(NULL, what, &fuse_vfsconf))) @@ -144,6 +144,7 @@ fuse_loader(struct module *m, int what, void *arg) if ((err = vfs_modevent(NULL, what, &fuse_vfsconf))) return (err); fuse_bringdown(eh_tag); + uma_zdestroy(fuse_pbuf_zone); break; default: return (EINVAL); diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 1c405197c1e9..75de97bc226d 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -201,7 +201,7 @@ static int fuse_reclaim_revoked = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, reclaim_revoked, CTLFLAG_RW, &fuse_reclaim_revoked, 0, ""); -int fuse_pbuf_freecnt = -1; +uma_zone_t fuse_pbuf_zone; #define fuse_vm_page_lock(m) vm_page_lock((m)); #define fuse_vm_page_unlock(m) vm_page_unlock((m)); @@ -1824,7 +1824,7 @@ fuse_vnop_getpages(struct vop_getpages_args *ap) * We use only the kva address for the buffer, but this is extremely * convenient and fast. 
*/ - bp = getpbuf(&fuse_pbuf_freecnt); + bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); @@ -1845,7 +1845,7 @@ fuse_vnop_getpages(struct vop_getpages_args *ap) error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); - relpbuf(bp, &fuse_pbuf_freecnt); + uma_zfree(fuse_pbuf_zone, bp); if (error && (uio.uio_resid == count)) { FS_DEBUG("error %d\n", error); @@ -1958,7 +1958,7 @@ fuse_vnop_putpages(struct vop_putpages_args *ap) * We use only the kva address for the buffer, but this is extremely * convenient and fast. */ - bp = getpbuf(&fuse_pbuf_freecnt); + bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); @@ -1978,7 +1978,7 @@ fuse_vnop_putpages(struct vop_putpages_args *ap) error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); - relpbuf(bp, &fuse_pbuf_freecnt); + uma_zfree(fuse_pbuf_zone, bp); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; diff --git a/sys/fs/nfsclient/nfs_clbio.c b/sys/fs/nfsclient/nfs_clbio.c index b802b2ebd964..db6b851374b2 100644 --- a/sys/fs/nfsclient/nfs_clbio.c +++ b/sys/fs/nfsclient/nfs_clbio.c @@ -70,7 +70,7 @@ extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; extern int newnfs_directio_enable; extern int nfs_keep_dirty_on_error; -int ncl_pbuf_freecnt = -1; /* start out unlimited */ +uma_zone_t ncl_pbuf_zone; static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td); @@ -182,7 +182,7 @@ ncl_getpages(struct vop_getpages_args *ap) * We use only the kva address for the buffer, but this is extremely * convenient and fast. 
*/ - bp = getpbuf(&ncl_pbuf_freecnt); + bp = uma_zalloc(ncl_pbuf_zone, M_WAITOK); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -203,7 +203,7 @@ ncl_getpages(struct vop_getpages_args *ap) error = ncl_readrpc(vp, &uio, cred); pmap_qremove(kva, npages); - relpbuf(bp, &ncl_pbuf_freecnt); + uma_zfree(ncl_pbuf_zone, bp); if (error && (uio.uio_resid == count)) { printf("ncl_getpages: error %d\n", error); @@ -793,7 +793,7 @@ nfs_directio_write(vp, uiop, cred, ioflag) while (uiop->uio_resid > 0) { size = MIN(uiop->uio_resid, wsize); size = MIN(uiop->uio_iov->iov_len, size); - bp = getpbuf(&ncl_pbuf_freecnt); + bp = uma_zalloc(ncl_pbuf_zone, M_WAITOK); t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK); t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK); t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK); @@ -836,7 +836,7 @@ nfs_directio_write(vp, uiop, cred, ioflag) free(t_iov, M_NFSDIRECTIO); free(t_uio, M_NFSDIRECTIO); bp->b_vp = NULL; - relpbuf(bp, &ncl_pbuf_freecnt); + uma_zfree(ncl_pbuf_zone, bp); if (error == EINTR) return (error); goto do_sync; @@ -1571,7 +1571,7 @@ ncl_doio_directwrite(struct buf *bp) mtx_unlock(&np->n_mtx); } bp->b_vp = NULL; - relpbuf(bp, &ncl_pbuf_freecnt); + uma_zfree(ncl_pbuf_zone, bp); } /* diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c index 662adfc482fd..d2747e904b16 100644 --- a/sys/fs/nfsclient/nfs_clport.c +++ b/sys/fs/nfsclient/nfs_clport.c @@ -79,7 +79,7 @@ extern struct vop_vector newnfs_vnodeops; extern struct vop_vector newnfs_fifoops; extern uma_zone_t newnfsnode_zone; extern struct buf_ops buf_ops_newnfs; -extern int ncl_pbuf_freecnt; +extern uma_zone_t ncl_pbuf_zone; extern short nfsv4_cbport; extern int nfscl_enablecallb; extern int nfs_numnfscbd; @@ -1023,7 +1023,7 @@ nfscl_init(void) return; inited = 1; nfscl_inited = 1; - ncl_pbuf_freecnt = nswbuf / 2 + 1; + ncl_pbuf_zone = pbuf_zsecond_create("nfspbuf", nswbuf / 2); } /* @@ -1357,6 +1357,7 @@ 
nfscl_modevent(module_t mod, int type, void *data) #if 0 ncl_call_invalcaches = NULL; nfsd_call_nfscl = NULL; + uma_zdestroy(ncl_pbuf_zone); /* and get rid of the mutexes */ mtx_destroy(&ncl_iod_mutex); loaded = 0; diff --git a/sys/fs/smbfs/smbfs_io.c b/sys/fs/smbfs/smbfs_io.c index 82f73ceb4594..4edba5c761e7 100644 --- a/sys/fs/smbfs/smbfs_io.c +++ b/sys/fs/smbfs/smbfs_io.c @@ -63,7 +63,7 @@ /*#define SMBFS_RWGENERIC*/ -extern int smbfs_pbuf_freecnt; +extern uma_zone_t smbfs_pbuf_zone; static int smbfs_fastlookup = 1; @@ -468,7 +468,7 @@ smbfs_getpages(ap) scred = smbfs_malloc_scred(); smb_makescred(scred, td, cred); - bp = getpbuf(&smbfs_pbuf_freecnt); + bp = uma_zalloc(smbfs_pbuf_zone, M_WAITOK); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -490,7 +490,7 @@ smbfs_getpages(ap) smbfs_free_scred(scred); pmap_qremove(kva, npages); - relpbuf(bp, &smbfs_pbuf_freecnt); + uma_zfree(smbfs_pbuf_zone, bp); if (error && (uio.uio_resid == count)) { printf("smbfs_getpages: error %d\n",error); @@ -593,7 +593,7 @@ smbfs_putpages(ap) rtvals[i] = VM_PAGER_ERROR; } - bp = getpbuf(&smbfs_pbuf_freecnt); + bp = uma_zalloc(smbfs_pbuf_zone, M_WAITOK); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); @@ -621,7 +621,7 @@ smbfs_putpages(ap) pmap_qremove(kva, npages); - relpbuf(bp, &smbfs_pbuf_freecnt); + uma_zfree(smbfs_pbuf_zone, bp); if (error == 0) { vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid, diff --git a/sys/fs/smbfs/smbfs_vfsops.c b/sys/fs/smbfs/smbfs_vfsops.c index 20efbb75e55f..186c004e309a 100644 --- a/sys/fs/smbfs/smbfs_vfsops.c +++ b/sys/fs/smbfs/smbfs_vfsops.c @@ -88,7 +88,7 @@ MODULE_DEPEND(smbfs, netsmb, NSMB_VERSION, NSMB_VERSION, NSMB_VERSION); MODULE_DEPEND(smbfs, libiconv, 1, 1, 2); MODULE_DEPEND(smbfs, libmchain, 1, 1, 1); -int smbfs_pbuf_freecnt = -1; /* start out unlimited */ +uma_zone_t smbfs_pbuf_zone; static int smbfs_cmount(struct mntarg *ma, void * data, uint64_t flags) @@ -367,7 +367,8 @@ 
smbfs_quotactl(mp, cmd, uid, arg) int smbfs_init(struct vfsconf *vfsp) { - smbfs_pbuf_freecnt = nswbuf / 2 + 1; + + smbfs_pbuf_zone = pbuf_zsecond_create("smbpbuf", nswbuf / 2); SMBVDEBUG("done.\n"); return 0; } @@ -377,6 +378,7 @@ int smbfs_uninit(struct vfsconf *vfsp) { + uma_zdestroy(smbfs_pbuf_zone); SMBVDEBUG("done.\n"); return 0; } diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index b049b8d17fac..dccbb33e681a 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -104,7 +104,7 @@ physio(struct cdev *dev, struct uio *uio, int ioflag) maxpages = btoc(MIN(uio->uio_resid, MAXPHYS)) + 1; pages = malloc(sizeof(*pages) * maxpages, M_DEVBUF, M_WAITOK); } else { - pbuf = getpbuf(NULL); + pbuf = uma_zalloc(pbuf_zone, M_WAITOK); sa = pbuf->b_data; maxpages = btoc(MAXPHYS); pages = pbuf->b_pages; @@ -220,7 +220,7 @@ physio(struct cdev *dev, struct uio *uio, int ioflag) } doerror: if (pbuf) - relpbuf(pbuf, NULL); + uma_zfree(pbuf_zone, pbuf); else if (pages) free(pages, M_DEVBUF); g_destroy_bio(bp); diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index 7c6f6eba3d91..517f8233778b 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -286,6 +287,15 @@ init_param2(long physpages) TUNABLE_INT_FETCH("kern.nbuf", &nbuf); TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt); + /* + * Physical buffers are pre-allocated buffers (struct buf) that + * are used as temporary holders for I/O, such as paging I/O. + */ + nswbuf = min(nbuf / 4, 256); + TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); + if (nswbuf < NSWBUF_MIN) + nswbuf = NSWBUF_MIN; + /* * The default for maxpipekva is min(1/64 of the kernel address space, * max(1/64 of main memory, 512KB)). See sys_pipe.c for more details. 
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 350c51a01265..995cfef8344f 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -1267,7 +1267,7 @@ aio_qbio(struct proc *p, struct kaiocb *job) goto unref; } - job->pbuf = pbuf = (struct buf *)getpbuf(NULL); + job->pbuf = pbuf = uma_zalloc(pbuf_zone, M_WAITOK); BUF_KERNPROC(pbuf); AIO_LOCK(ki); ki->kaio_buffer_count++; @@ -1318,7 +1318,7 @@ aio_qbio(struct proc *p, struct kaiocb *job) AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); - relpbuf(pbuf, NULL); + uma_zfree(pbuf_zone, pbuf); job->pbuf = NULL; } g_destroy_bio(bp); @@ -2344,7 +2344,7 @@ aio_biowakeup(struct bio *bp) ki = userp->p_aioinfo; if (job->pbuf) { pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); - relpbuf(job->pbuf, NULL); + uma_zfree(pbuf_zone, job->pbuf); job->pbuf = NULL; atomic_subtract_int(&num_buf_aio, 1); AIO_LOCK(ki); diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 2f0f3a637f7e..3766c7d8d55f 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -86,7 +86,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); @@ -1017,10 +1016,6 @@ bd_speedup(void) mtx_unlock(&bdlock); } -#ifndef NSWBUF_MIN -#define NSWBUF_MIN 16 -#endif - #ifdef __i386__ #define TRANSIENT_DENOM 5 #else @@ -1129,20 +1124,9 @@ kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) nbuf = buf_sz / BKVASIZE; } - /* - * swbufs are used as temporary holders for I/O, such as paging I/O. - * We have no less then 16 and no more then 256. 
- */ - nswbuf = min(nbuf / 4, 256); - TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); - if (nswbuf < NSWBUF_MIN) - nswbuf = NSWBUF_MIN; - /* * Reserve space for the buffer cache buffers */ - swbuf = (void *)v; - v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 1e1e1c5708d4..d35c396089e6 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -63,7 +63,9 @@ SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, #endif static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); +static uma_zone_t cluster_pbuf_zone; +static void cluster_init(void *); static struct cluster_save *cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags); static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, @@ -83,6 +85,15 @@ static int read_min = 1; SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0, "Cluster read min block count"); +SYSINIT(cluster, SI_SUB_CPU, SI_ORDER_ANY, cluster_init, NULL); + +static void +cluster_init(void *dummy) +{ + + cluster_pbuf_zone = pbuf_zsecond_create("clpbuf", nswbuf / 2); +} + /* * Read data to a buf, including read-ahead if we find this to be beneficial. * cluster_read replaces bread. @@ -372,7 +383,7 @@ cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; - bp = trypbuf(&cluster_pbuf_freecnt); + bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT); if (bp == NULL) return tbp; @@ -603,7 +614,7 @@ cluster_callback(struct buf *bp) bufdone(tbp); } pbrelvp(bp); - relpbuf(bp, &cluster_pbuf_freecnt); + uma_zfree(cluster_pbuf_zone, bp); } /* @@ -856,9 +867,8 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || - ((bp = (vp->v_vflag & VV_MD) != 0 ? 
- trypbuf(&cluster_pbuf_freecnt) : - getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + ((bp = uma_zalloc(cluster_pbuf_zone, + (vp->v_vflag & VV_MD) != 0 ? M_NOWAIT : M_WAITOK)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index a099a972efc0..fe49224417d9 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -44,6 +44,7 @@ #include #include #include +#include struct bio; struct buf; @@ -275,6 +276,11 @@ struct buf { #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL + +#ifndef NSWBUF_MIN +#define NSWBUF_MIN 16 +#endif + /* * Buffer locking */ @@ -287,7 +293,7 @@ extern const char *buf_wmesg; /* Default buffer lock message */ * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ - lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) + lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, LK_NEW) /* * * Get a lock sleeping non-interruptably until it becomes available. @@ -493,10 +499,6 @@ extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int nswbuf; /* Number of swap I/O buffer headers. */ -extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ -extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ -extern int vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager, - asynchronous reads */ extern caddr_t unmapped_buf; /* Data address for unmapped buffers. 
*/ static inline int @@ -537,7 +539,6 @@ void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_drain_busy_pages(struct buf *bp); -struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); @@ -549,6 +550,9 @@ int bufwrite(struct buf *); void bufdone(struct buf *); void bd_speedup(void); +extern uma_zone_t pbuf_zone; +uma_zone_t pbuf_zsecond_create(char *name, int max); + int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); @@ -562,7 +566,6 @@ void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); -void relpbuf(struct buf *, int *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); @@ -571,7 +574,6 @@ void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); -struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); diff --git a/sys/ufs/ffs/ffs_rawread.c b/sys/ufs/ffs/ffs_rawread.c index 65dbea9dbd53..ab065f2de37d 100644 --- a/sys/ufs/ffs/ffs_rawread.c +++ b/sys/ufs/ffs/ffs_rawread.c @@ -74,9 +74,7 @@ int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); SYSCTL_DECL(_vfs_ffs); -static int ffsrawbufcnt = 4; -SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, - "Buffers available for raw reads"); +static uma_zone_t ffsraw_pbuf_zone; static int allowrawread = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, @@ -90,7 +88,8 @@ static void ffs_rawread_setup(void *arg __unused) { - ffsrawbufcnt = (nswbuf > 100 ) ? 
(nswbuf - (nswbuf >> 4)) : nswbuf - 8; + ffsraw_pbuf_zone = pbuf_zsecond_create("ffsrawpbuf", + (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8); } SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); @@ -296,8 +295,7 @@ ffs_rawread_main(struct vnode *vp, while (resid > 0) { if (bp == NULL) { /* Setup first read */ - /* XXX: Leave some bufs for swap */ - bp = getpbuf(&ffsrawbufcnt); + bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK); pbgetvp(vp, bp); error = ffs_rawread_readahead(vp, udata, offset, resid, td, bp); @@ -305,9 +303,9 @@ ffs_rawread_main(struct vnode *vp, break; if (resid > bp->b_bufsize) { /* Setup fist readahead */ - /* XXX: Leave bufs for swap */ if (rawreadahead != 0) - nbp = trypbuf(&ffsrawbufcnt); + nbp = uma_zalloc(ffsraw_pbuf_zone, + M_NOWAIT); else nbp = NULL; if (nbp != NULL) { @@ -324,7 +322,8 @@ ffs_rawread_main(struct vnode *vp, nbp); if (nerror) { pbrelvp(nbp); - relpbuf(nbp, &ffsrawbufcnt); + uma_zfree(ffsraw_pbuf_zone, + nbp); nbp = NULL; } } @@ -365,7 +364,7 @@ ffs_rawread_main(struct vnode *vp, if (resid <= bp->b_bufsize) { /* No more readaheads */ pbrelvp(nbp); - relpbuf(nbp, &ffsrawbufcnt); + uma_zfree(ffsraw_pbuf_zone, nbp); nbp = NULL; } else { /* Setup next readahead */ nerror = ffs_rawread_readahead(vp, @@ -379,7 +378,7 @@ ffs_rawread_main(struct vnode *vp, nbp); if (nerror != 0) { pbrelvp(nbp); - relpbuf(nbp, &ffsrawbufcnt); + uma_zfree(ffsraw_pbuf_zone, nbp); nbp = NULL; } } @@ -395,13 +394,13 @@ ffs_rawread_main(struct vnode *vp, if (bp != NULL) { pbrelvp(bp); - relpbuf(bp, &ffsrawbufcnt); + uma_zfree(ffsraw_pbuf_zone, bp); } if (nbp != NULL) { /* Run down readahead buffer */ bwait(nbp, PRIBIO, "rawrd"); vunmapbuf(nbp); pbrelvp(nbp); - relpbuf(nbp, &ffsrawbufcnt); + uma_zfree(ffsraw_pbuf_zone, nbp); } if (error == 0) diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index f99da24e6cc0..ea0a61ab276b 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -71,7 +71,6 @@ #include 
__FBSDID("$FreeBSD$"); -#include "opt_swap.h" #include "opt_vm.h" #include @@ -324,9 +323,8 @@ swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ -static int nsw_rcount; /* free read buffers */ -static int nsw_wcount_sync; /* limit write buffers / synchronous */ -static int nsw_wcount_async; /* limit write buffers / asynchronous */ +static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ +static int nsw_wcount_async; /* limit async write buffers */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ @@ -352,6 +350,8 @@ static struct sx sw_alloc_sx; (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct pagerlst swap_pager_object_list[NOBJLISTS]; +static uma_zone_t swwbuf_zone; +static uma_zone_t swrbuf_zone; static uma_zone_t swblk_zone; static uma_zone_t swpctrie_zone; @@ -539,12 +539,12 @@ swap_pager_swap_init(void) */ nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); - mtx_lock(&pbuf_mtx); - nsw_rcount = (nswbuf + 1) / 2; - nsw_wcount_sync = (nswbuf + 3) / 4; nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; - mtx_unlock(&pbuf_mtx); + mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF); + + swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4); + swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2); /* * Initialize our zone, taking the user's requested size or @@ -1205,7 +1205,7 @@ swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex)); VM_OBJECT_WUNLOCK(object); - bp = getpbuf(&nsw_rcount); + bp = uma_zalloc(swrbuf_zone, M_WAITOK); /* Pages cannot leave the object while busy. 
*/ for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) { MPASS(p->pindex == bm->pindex + i); @@ -1406,12 +1406,17 @@ swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, * All I/O parameters have been satisfied, build the I/O * request and assign the swap space. */ - if (sync == TRUE) { - bp = getpbuf(&nsw_wcount_sync); - } else { - bp = getpbuf(&nsw_wcount_async); - bp->b_flags = B_ASYNC; + if (sync != TRUE) { + mtx_lock(&swbuf_mtx); + while (nsw_wcount_async == 0) + msleep(&nsw_wcount_async, &swbuf_mtx, PVM, + "swbufa", 0); + nsw_wcount_async--; + mtx_unlock(&swbuf_mtx); } + bp = uma_zalloc(swwbuf_zone, M_WAITOK); + if (sync != TRUE) + bp->b_flags = B_ASYNC; bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; @@ -1634,15 +1639,13 @@ swp_pager_async_iodone(struct buf *bp) /* * release the physical I/O buffer */ - relpbuf( - bp, - ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : - ((bp->b_flags & B_ASYNC) ? - &nsw_wcount_async : - &nsw_wcount_sync - ) - ) - ); + if (bp->b_flags & B_ASYNC) { + mtx_lock(&swbuf_mtx); + if (++nsw_wcount_async == 1) + wakeup(&nsw_wcount_async); + mtx_unlock(&swbuf_mtx); + } + uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp); } int @@ -2627,6 +2630,7 @@ swapgeom_done(struct bio *bp2) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; + bp->b_caller1 = NULL; bufdone(bp); sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); @@ -2666,6 +2670,7 @@ swapgeom_strategy(struct buf *bp, struct swdevt *sp) return; } + bp->b_caller1 = bio; bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; @@ -2880,7 +2885,7 @@ sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) if (new > nswbuf / 2 || new < 1) return (EINVAL); - mtx_lock(&pbuf_mtx); + mtx_lock(&swbuf_mtx); while (nsw_wcount_async_max != new) { /* * Adjust difference. 
If the current async count is too low, @@ -2895,11 +2900,11 @@ sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) } else { nsw_wcount_async_max -= nsw_wcount_async; nsw_wcount_async = 0; - msleep(&nsw_wcount_async, &pbuf_mtx, PSWP, + msleep(&nsw_wcount_async, &swbuf_mtx, PSWP, "swpsysctl", 0); } } - mtx_unlock(&pbuf_mtx); + mtx_unlock(&swbuf_mtx); return (0); } diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index 3864c9dfc1c4..3a302fa1ce56 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -68,6 +68,8 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_param.h" + #include #include #include @@ -85,10 +87,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include -int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ - -struct buf *swbuf; +uma_zone_t pbuf_zone; +static int pbuf_init(void *, int, int); +static int pbuf_ctor(void *, int, void *, int); +static void pbuf_dtor(void *, int, void *); static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t, @@ -167,9 +171,6 @@ struct pagerops *pagertab[] = { * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. */ -struct mtx_padalign __exclusive_cache_line pbuf_mtx; -static TAILQ_HEAD(swqueue, buf) bswlist; -static int bswneeded; vm_offset_t swapbkva; /* swap buffers kva */ void @@ -177,7 +178,6 @@ vm_pager_init(void) { struct pagerops **pgops; - TAILQ_INIT(&bswlist); /* * Initialize known pagers */ @@ -186,30 +186,51 @@ vm_pager_init(void) (*(*pgops)->pgo_init)(); } +static int nswbuf_max; + void vm_pager_bufferinit(void) { - struct buf *bp; - int i; - mtx_init(&pbuf_mtx, "pbuf mutex", NULL, MTX_DEF); - bp = swbuf; - /* - * Now set up swap and physical I/O buffer headers. 
- */ - for (i = 0; i < nswbuf; i++, bp++) { - TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); - BUF_LOCKINIT(bp); - LIST_INIT(&bp->b_dep); - bp->b_rcred = bp->b_wcred = NOCRED; - bp->b_xflags = 0; - } - - cluster_pbuf_freecnt = nswbuf / 2; - vnode_pbuf_freecnt = nswbuf / 2 + 1; - vnode_async_pbuf_freecnt = nswbuf / 2; + /* Main zone for paging bufs. */ + pbuf_zone = uma_zcreate("pbuf", sizeof(struct buf), + pbuf_ctor, pbuf_dtor, pbuf_init, NULL, UMA_ALIGN_CACHE, + UMA_ZONE_VM | UMA_ZONE_NOFREE); + /* Few systems may still use this zone directly, so it needs a limit. */ + nswbuf_max += uma_zone_set_max(pbuf_zone, NSWBUF_MIN); } +uma_zone_t +pbuf_zsecond_create(char *name, int max) +{ + uma_zone_t zone; + + zone = uma_zsecond_create(name, pbuf_ctor, pbuf_dtor, NULL, NULL, + pbuf_zone); + /* + * uma_prealloc() rounds up to items per slab. If we would prealloc + * immediately on every pbuf_zsecond_create(), we may accumulate too + * much of difference between hard limit and prealloced items, which + * means wasted memory. + */ + if (nswbuf_max > 0) + nswbuf_max += uma_zone_set_max(zone, max); + else + uma_prealloc(pbuf_zone, uma_zone_set_max(zone, max)); + + return (zone); +} + +static void +pbuf_prealloc(void *arg __unused) +{ + + uma_prealloc(pbuf_zone, nswbuf_max); + nswbuf_max = -1; +} + +SYSINIT(pbuf, SI_SUB_KTHREAD_BUF, SI_ORDER_ANY, pbuf_prealloc, NULL); + /* * Allocate an instance of a pager of the given type. 
* Size, protection and offset parameters are passed in for pagers that @@ -347,110 +368,33 @@ vm_pager_object_lookup(struct pagerlst *pg_list, void *handle) return (object); } -/* - * initialize a physical buffer - */ - -/* - * XXX This probably belongs in vfs_bio.c - */ -static void -initpbuf(struct buf *bp) +static int +pbuf_ctor(void *mem, int size, void *arg, int flags) { + struct buf *bp = mem; - KASSERT(bp->b_bufobj == NULL, ("initpbuf with bufobj")); - KASSERT(bp->b_vp == NULL, ("initpbuf with vp")); + bp->b_vp = NULL; + bp->b_bufobj = NULL; + + /* copied from initpbuf() */ bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; - bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */ - bp->b_kvabase = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva; + bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */ bp->b_data = bp->b_kvabase; - bp->b_kvasize = MAXPHYS; - bp->b_flags = 0; bp->b_xflags = 0; + bp->b_flags = 0; bp->b_ioflags = 0; bp->b_iodone = NULL; bp->b_error = 0; BUF_LOCK(bp, LK_EXCLUSIVE, NULL); - buf_track(bp, __func__); + + return (0); } -/* - * allocate a physical buffer - * - * There are a limited number (nswbuf) of physical buffers. We need - * to make sure that no single subsystem is able to hog all of them, - * so each subsystem implements a counter which is typically initialized - * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and - * increments it on release, and blocks if the counter hits zero. A - * subsystem may initialize the counter to -1 to disable the feature, - * but it must still be sure to match up all uses of getpbuf() with - * relpbuf() using the same variable. - * - * NOTE: pfreecnt can be NULL, but this 'feature' will be removed - * relatively soon when the rest of the subsystems get smart about it. 
XXX - */ -struct buf * -getpbuf(int *pfreecnt) -{ - struct buf *bp; - - mtx_lock(&pbuf_mtx); - for (;;) { - if (pfreecnt != NULL) { - while (*pfreecnt == 0) { - msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0); - } - } - - /* get a bp from the swap buffer header pool */ - if ((bp = TAILQ_FIRST(&bswlist)) != NULL) - break; - - bswneeded = 1; - msleep(&bswneeded, &pbuf_mtx, PVM, "wswbuf1", 0); - /* loop in case someone else grabbed one */ - } - TAILQ_REMOVE(&bswlist, bp, b_freelist); - if (pfreecnt) - --*pfreecnt; - mtx_unlock(&pbuf_mtx); - initpbuf(bp); - return (bp); -} - -/* - * allocate a physical buffer, if one is available. - * - * Note that there is no NULL hack here - all subsystems using this - * call understand how to use pfreecnt. - */ -struct buf * -trypbuf(int *pfreecnt) -{ - struct buf *bp; - - mtx_lock(&pbuf_mtx); - if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { - mtx_unlock(&pbuf_mtx); - return NULL; - } - TAILQ_REMOVE(&bswlist, bp, b_freelist); - --*pfreecnt; - mtx_unlock(&pbuf_mtx); - initpbuf(bp); - return (bp); -} - -/* - * release a physical buffer - * - * NOTE: pfreecnt can be NULL, but this 'feature' will be removed - * relatively soon when the rest of the subsystems get smart about it. 
XXX - */ -void -relpbuf(struct buf *bp, int *pfreecnt) +static void +pbuf_dtor(void *mem, int size, void *arg) { + struct buf *bp = mem; if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); @@ -461,24 +405,24 @@ relpbuf(struct buf *bp, int *pfreecnt) bp->b_wcred = NOCRED; } - KASSERT(bp->b_vp == NULL, ("relpbuf with vp")); - KASSERT(bp->b_bufobj == NULL, ("relpbuf with bufobj")); - - buf_track(bp, __func__); BUF_UNLOCK(bp); +} - mtx_lock(&pbuf_mtx); - TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); +static int +pbuf_init(void *mem, int size, int flags) +{ + struct buf *bp = mem; - if (bswneeded) { - bswneeded = 0; - wakeup(&bswneeded); - } - if (pfreecnt) { - if (++*pfreecnt == 1) - wakeup(pfreecnt); - } - mtx_unlock(&pbuf_mtx); + bp->b_kvabase = (void *)kva_alloc(MAXPHYS); + if (bp->b_kvabase == NULL) + return (ENOMEM); + bp->b_kvasize = MAXPHYS; + BUF_LOCKINIT(bp); + LIST_INIT(&bp->b_dep); + bp->b_rcred = bp->b_wcred = NOCRED; + bp->b_xflags = 0; + + return (0); } /* diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index 52bac7ce373d..3e71ab4436cc 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include +#include #include #include #include @@ -82,6 +83,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); @@ -107,15 +109,22 @@ struct pagerops vnodepagerops = { .pgo_haspage = vnode_pager_haspage, }; -int vnode_pbuf_freecnt; -int vnode_async_pbuf_freecnt; - static struct domainset *vnode_domainset = NULL; SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_RW, &vnode_domainset, 0, sysctl_handle_domainset, "A", "Default vnode NUMA policy"); +static uma_zone_t vnode_pbuf_zone; + +static void +vnode_pager_init(void *dummy) +{ + + vnode_pbuf_zone = pbuf_zsecond_create("vnpbuf", nswbuf * 8); +} +SYSINIT(vnode_pager, SI_SUB_CPU, SI_ORDER_ANY, vnode_pager_init, NULL); + 
/* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) @@ -563,7 +572,7 @@ vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) break; } if (fileaddr != -1) { - bp = getpbuf(&vnode_pbuf_freecnt); + bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; @@ -595,7 +604,7 @@ vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) */ bp->b_vp = NULL; pbrelbo(bp); - relpbuf(bp, &vnode_pbuf_freecnt); + uma_zfree(vnode_pbuf_zone, bp); if (error) break; } else @@ -757,7 +766,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, #ifdef INVARIANTS off_t blkno0; #endif - int bsize, pagesperblock, *freecnt; + int bsize, pagesperblock; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; @@ -788,17 +797,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, return (VM_PAGER_OK); } - /* - * Synchronous and asynchronous paging operations use different - * free pbuf counters. This is done to avoid asynchronous requests - * to consume all pbufs. - * Allocate the pbuf at the very beginning of the function, so that - * if we are low on certain kind of pbufs don't even proceed to BMAP, - * but sleep. - */ - freecnt = iodone != NULL ? - &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt; - bp = getpbuf(freecnt); + bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); /* * Get the underlying device blocks for the file with VOP_BMAP(). 
@@ -807,7 +806,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { - relpbuf(bp, freecnt); + uma_zfree(vnode_pbuf_zone, bp); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); @@ -819,7 +818,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { - relpbuf(bp, freecnt); + uma_zfree(vnode_pbuf_zone, bp); return (VM_PAGER_ERROR); } @@ -828,7 +827,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { - relpbuf(bp, freecnt); + uma_zfree(vnode_pbuf_zone, bp); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); @@ -847,7 +846,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); - relpbuf(bp, freecnt); + uma_zfree(vnode_pbuf_zone, bp); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); @@ -1061,7 +1060,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); - relpbuf(bp, &vnode_pbuf_freecnt); + uma_zfree(vnode_pbuf_zone, bp); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } } @@ -1079,7 +1078,7 @@ vnode_pager_generic_getpages_done_async(struct buf *bp) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); - relpbuf(bp, &vnode_async_pbuf_freecnt); + uma_zfree(vnode_pbuf_zone, bp); } static int