diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 70dc565fae39..4cc9a4ed1889 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -100,6 +101,7 @@ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; +struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); @@ -116,11 +118,18 @@ static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int buf_flush(struct vnode *vp, int); +static int buf_recycle(bool); +static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); +static void bufkva_reclaim(vmem_t *, int); +static void bufkva_free(struct buf *); +static int buf_import(void *, void **, int, int); +static void buf_release(void *, void **, int); + #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); @@ -145,23 +154,23 @@ static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; -SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, - "Maximum allowed value of bufspace (including buf_daemon)"); +SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, + "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; -SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, - "Maximum amount of malloced memory for buffers"); +SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, + 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; -SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, +SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; -SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, - "Maximum allowed value of bufspace (excluding buf_daemon)"); -static int bufreusecnt; -SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, - "Number of times we have reused a buffer"); +SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, + "Maximum allowed value of bufspace (excluding metadata)"); +long bufspacethresh; +SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, + 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); @@ -205,10 +214,10 @@ SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, - "XXX Unused"); + "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, 
OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, - "XXX Complicatedly unused"); + "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); @@ -219,6 +228,9 @@ static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); +static int numbufallocfails; +SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, + "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); @@ -232,16 +244,6 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); -/* - * Lock for the non-dirty bufqueues - */ -static struct mtx_padalign bqclean; - -/* - * Lock for the dirty queue. - */ -static struct mtx_padalign bqdirty; - /* * This lock synchronizes access to bd_request. */ @@ -270,6 +272,11 @@ static struct mtx_padalign bdirtylock; */ static int bd_request; +/* + * Request/wakeup point for the bufspace daemon. + */ +static int bufspace_request; + /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or @@ -298,7 +305,7 @@ static int runningbufreq; * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. - * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(), + * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; @@ -311,30 +318,43 @@ static int bdirtywait; /* * Definitions for the buffer free lists. */ -#define BUFFER_QUEUES 4 /* number of free buffer queues */ - #define QUEUE_NONE 0 /* on no queue */ -#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ +#define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ -#define QUEUE_EMPTY 3 /* empty buffer headers */ +#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ +/* Maximum number of clean buffer queues. */ +#define CLEAN_QUEUES 16 + +/* Configured number of clean queues. */ +static int clean_queues; + +/* Maximum number of buffer queues. */ +#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) + /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif +/* + * Lock for each bufqueue + */ +static struct mtx_padalign bqlocks[BUFFER_QUEUES]; + +/* + * per-cpu empty buffer cache. + */ +uma_zone_t buf_zone; + /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. 
*/ const char *buf_wmesg = BUF_WMESG; -#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ -#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ -#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ - static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { @@ -382,6 +402,21 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS) } #endif +static int +bqcleanq(void) +{ + static int nextq; + + return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); +} + +static int +bqisclean(int qindex) +{ + + return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); +} + /* * bqlock: * @@ -391,9 +426,7 @@ static inline struct mtx * bqlock(int qindex) { - if (qindex == QUEUE_DIRTY) - return (struct mtx *)(&bqdirty); - return (struct mtx *)(&bqclean); + return (struct mtx *)&bqlocks[qindex]; } /* @@ -447,61 +480,254 @@ bdirtyadd(void) } /* - * bufspacewakeup: + * bufspace_wakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ -static __inline void -bufspacewakeup(void) +static void +bufspace_wakeup(void) { - int need_wakeup, on; /* - * If someone is waiting for bufspace, wake them up. Even - * though we may not have freed the kva space yet, the waiting - * process will be able to now. + * If someone is waiting for bufspace, wake them up. + * + * Since needsbuffer is set prior to doing an additional queue + * scan it is safe to check for the flag prior to acquiring the + * lock. The thread that is preparing to scan again before + * blocking would discover the buf we released. */ - rw_rlock(&nblock); - for (;;) { - need_wakeup = 0; - on = needsbuffer; - if ((on & VFS_BIO_NEED_BUFSPACE) == 0) - break; - need_wakeup = 1; - if (atomic_cmpset_rel_int(&needsbuffer, on, - on & ~VFS_BIO_NEED_BUFSPACE)) - break; + if (needsbuffer) { + rw_rlock(&nblock); + if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) + wakeup(__DEVOLATILE(void *, &needsbuffer)); + rw_runlock(&nblock); + } +} + +/* + * bufspace_daemonwakeup: + * + * Wakeup the daemon responsible for freeing clean bufs. + */ +static void +bufspace_daemonwakeup(void) +{ + rw_rlock(&nblock); + if (bufspace_request == 0) { + bufspace_request = 1; + wakeup(&bufspace_request); } - if (need_wakeup) - wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } /* - * bufspaceadjust: + * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void -bufspaceadjust(struct buf *bp, int bufsize) +bufspace_adjust(struct buf *bp, int bufsize) { + long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, - ("bufspaceadjust: malloc buf %p", bp)); + ("bufspace_adjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); - bufspacewakeup(); - } else - atomic_add_long(&bufspace, diff); + bufspace_wakeup(); + } else { + space = atomic_fetchadd_long(&bufspace, diff); + /* Wake up the daemon on the transition. */ + if (space < bufspacethresh && space + diff >= bufspacethresh) + bufspace_daemonwakeup(); + } bp->b_bufsize = bufsize; } +/* + * bufspace_reserve: + * + * Reserve bufspace before calling allocbuf(). metadata has a + * different space limit than data. 
+ */ +static int +bufspace_reserve(int size, bool metadata) +{ + long limit; + long space; + + if (metadata) + limit = maxbufspace; + else + limit = hibufspace; + do { + space = bufspace; + if (space + size > limit) + return (ENOSPC); + } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); + + /* Wake up the daemon on the transition. */ + if (space < bufspacethresh && space + size >= bufspacethresh) + bufspace_daemonwakeup(); + + return (0); +} + +/* + * bufspace_release: + * + * Release reserved bufspace after bufspace_adjust() has consumed it. + */ +static void +bufspace_release(int size) +{ + atomic_subtract_long(&bufspace, size); + bufspace_wakeup(); +} + +/* + * bufspace_wait: + * + * Wait for bufspace, acting as the buf daemon if a locked vnode is + * supplied. needsbuffer must be set in a safe fashion prior to + * polling for space. The operation must be re-tried on return. + */ +static void +bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) +{ + struct thread *td; + int error, fl, norunbuf; + + if ((gbflags & GB_NOWAIT_BD) != 0) + return; + + td = curthread; + rw_wlock(&nblock); + while (needsbuffer != 0) { + if (vp != NULL && vp->v_type != VCHR && + (td->td_pflags & TDP_BUFNEED) == 0) { + rw_wunlock(&nblock); + /* + * getblk() is called with a vnode locked, and + * some majority of the dirty buffers may as + * well belong to the vnode. Flushing the + * buffers there would make a progress that + * cannot be achieved by the buf_daemon, that + * cannot lock the vnode. + */ + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + + /* + * Play bufdaemon. The getnewbuf() function + * may be called while the thread owns lock + * for another dirty buffer for the same + * vnode, which makes it impossible to use + * VOP_FSYNC() there, due to the buffer lock + * recursion. + */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_flush(vp, flushbufqtarget); + td->td_pflags &= norunbuf; + rw_wlock(&nblock); + if (fl != 0) + continue; + if (needsbuffer == 0) + break; + } + error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, + (PRIBIO + 4) | slpflag, "newbuf", slptimeo); + if (error != 0) + break; + } + rw_wunlock(&nblock); +} + + +/* + * bufspace_daemon: + * + * buffer space management daemon. Tries to maintain some marginal + * amount of free buffer space so that requesting processes neither + * block nor work to reclaim buffers. + */ +static void +bufspace_daemon(void) +{ + for (;;) { + kproc_suspend_check(bufspacedaemonproc); + + /* + * Free buffers from the clean queue until we meet our + * targets. + * + * Theory of operation: The buffer cache is most efficient + * when some free buffer headers and space are always + * available to getnewbuf(). This daemon attempts to prevent + * the excessive blocking and synchronization associated + * with shortfall. It goes through three phases according + * demand: + * + * 1) The daemon wakes up voluntarily once per-second + * during idle periods when the counters are below + * the wakeup thresholds (bufspacethresh, lofreebuffers). + * + * 2) The daemon wakes up as we cross the thresholds + * ahead of any potential blocking. This may bounce + * slightly according to the rate of consumption and + * release. + * + * 3) The daemon and consumers are starved for working + * clean buffers. This is the 'bufspace' sleep below + * which will inefficiently trade bufs with bqrelse + * until we return to condition 2. 
+ */ + while (bufspace > lobufspace || + numfreebuffers < hifreebuffers) { + if (buf_recycle(false) != 0) { + atomic_set_int(&needsbuffer, 1); + if (buf_recycle(false) != 0) { + rw_wlock(&nblock); + if (needsbuffer) + rw_sleep(__DEVOLATILE(void *, + &needsbuffer), &nblock, + PRIBIO|PDROP, "bufspace", + hz/10); + else + rw_wunlock(&nblock); + } + } + maybe_yield(); + } + + /* + * Re-check our limits under the exclusive nblock. + */ + rw_wlock(&nblock); + if (bufspace < bufspacethresh && + numfreebuffers > lofreebuffers) { + bufspace_request = 0; + rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, + "-", hz); + } else + rw_wunlock(&nblock); + } +} + +static struct kproc_desc bufspace_kp = { + "bufspacedaemon", + bufspace_daemon, + &bufspacedaemonproc +}; +SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, + &bufspace_kp); + /* * bufmallocadjust: * @@ -516,10 +742,9 @@ bufmallocadjust(struct buf *bp, int bufsize) KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; - if (diff < 0) { + if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); - bufspacewakeup(); - } else + else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } @@ -570,67 +795,6 @@ runningbufwakeup(struct buf *bp) runningwakeup(); } -/* - * bufcountadd: - * - * Called when a buffer has been added to one of the free queues to - * account for the buffer and to wakeup anyone waiting for free buffers. - * This typically occurs when large amounts of metadata are being handled - * by the buffer cache ( else buffer space runs out first, usually ). - */ -static __inline void -bufcountadd(struct buf *bp) -{ - int mask, need_wakeup, old, on; - - KASSERT((bp->b_flags & B_INFREECNT) == 0, - ("buf %p already counted as free", bp)); - bp->b_flags |= B_INFREECNT; - old = atomic_fetchadd_int(&numfreebuffers, 1); - KASSERT(old >= 0 && old < nbuf, - ("numfreebuffers climbed to %d", old + 1)); - mask = VFS_BIO_NEED_ANY; - if (numfreebuffers >= hifreebuffers) - mask |= VFS_BIO_NEED_FREE; - rw_rlock(&nblock); - for (;;) { - need_wakeup = 0; - on = needsbuffer; - if (on == 0) - break; - need_wakeup = 1; - if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask)) - break; - } - if (need_wakeup) - wakeup(__DEVOLATILE(void *, &needsbuffer)); - rw_runlock(&nblock); -} - -/* - * bufcountsub: - * - * Decrement the numfreebuffers count as needed. - */ -static void -bufcountsub(struct buf *bp) -{ - int old; - - /* - * Fixup numfreebuffers count. If the buffer is invalid or not - * delayed-write, the buffer was free and we must decrement - * numfreebuffers. 
- */ - if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { - KASSERT((bp->b_flags & B_INFREECNT) != 0, - ("buf %p not counted in numfreebuffers", bp)); - bp->b_flags &= ~B_INFREECNT; - old = atomic_fetchadd_int(&numfreebuffers, -1); - KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); - } -} - /* * waitrunningbufspace() * @@ -847,8 +1011,10 @@ bufinit(void) int i; CTASSERT(MAXBCACHEBUF >= MAXBSIZE); - mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF); - mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF); + mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); + mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); + for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) + mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); @@ -864,7 +1030,7 @@ bufinit(void) for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); - bp->b_flags = B_INVAL | B_INFREECNT; + bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; @@ -881,18 +1047,19 @@ bufinit(void) /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum - * is nominally used by buf_daemon. hibufspace is the nominal maximum - * used by most other processes. The differential is required to - * ensure that buf_daemon is able to run when other processes might - * be blocked waiting for buffer space. + * is nominally used by metadata. hibufspace is the nominal maximum + * used by most other requests. The differential is required to + * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally - * by the system. + * by the system. XXX This is less true with vmem. We could use + * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10); - lobufspace = hibufspace - MAXBCACHEBUF; + lobufspace = (hibufspace / 20) * 19; /* 95% */ + bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen @@ -906,44 +1073,61 @@ bufinit(void) 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF); -/* - * Limit the amount of malloc memory since it is wired permanently into - * the kernel space. Even though this is accounted for in the buffer - * allocation, we don't want the malloced region to grow uncontrolled. - * The malloc scheme improves memory utilization significantly on average - * (small) directories. - */ + /* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on + * average (small) directories. + */ maxbufmallocspace = hibufspace / 20; -/* - * Reduce the chance of a deadlock occuring by limiting the number - * of delayed-write dirty buffers we allow to stack up. - */ + /* + * Reduce the chance of a deadlock occuring by limiting the number + * of delayed-write dirty buffers we allow to stack up. 
+ */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; -/* - * To support extreme low-memory systems, make sure hidirtybuffers cannot - * eat up all available buffer space. This occurs when our minimum cannot - * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming - * BKVASIZE'd buffers. - */ + /* + * To support extreme low-memory systems, make sure hidirtybuffers + * cannot eat up all available buffer space. This occurs when our + * minimum cannot be met. We try to size hidirtybuffers to 3/4 our + * buffer space assuming BKVASIZE'd buffers. + */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; -/* - * Try to keep the number of free buffers in the specified range, - * and give special processes (e.g. like buf_daemon) access to an - * emergency reserve. - */ - lofreebuffers = nbuf / 18 + 5; - hifreebuffers = 2 * lofreebuffers; + /* + * lofreebuffers should be sufficient to avoid stalling waiting on + * buf headers under heavy utilization. The bufs in per-cpu caches + * are counted as free but will be unavailable to threads executing + * on other cpus. + * + * hifreebuffers is the free target for the bufspace daemon. This + * should be set appropriately to limit work per-iteration. + */ + lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); + hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); + + /* Setup the kva and free list allocators. */ + vmem_set_reclaim(buffer_arena, bufkva_reclaim); + buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), + NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); + + /* + * Size the clean queue according to the amount of buffer space. + * One queue per-256mb up to the max. More queues gives better + * concurrency but less accurate LRU. + */ + clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); + } #ifdef INVARIANTS @@ -1129,10 +1313,25 @@ binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; - BUF_ASSERT_XLOCKED(bp); + if (qindex != QUEUE_EMPTY) { + BUF_ASSERT_XLOCKED(bp); + } + /* + * Stick to the same clean queue for the lifetime of the buf to + * limit locking below. Otherwise pick ont sequentially. + */ + if (qindex == QUEUE_CLEAN) { + if (bqisclean(bp->b_qindex)) + qindex = bp->b_qindex; + else + qindex = bqcleanq(); + } + + /* + * Handle delayed bremfree() processing. + */ nlock = bqlock(qindex); - /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); @@ -1156,15 +1355,263 @@ binsfree(struct buf *bp, int qindex) bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); +} + +/* + * buf_free: + * + * Free a buffer to the buf zone once it no longer has valid contents. + */ +static void +buf_free(struct buf *bp) +{ + + if (bp->b_flags & B_REMFREE) + bremfreef(bp); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("losing buffer 1"); + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (!LIST_EMPTY(&bp->b_dep)) + buf_deallocate(bp); + bufkva_free(bp); + BUF_UNLOCK(bp); + uma_zfree(buf_zone, bp); + atomic_add_int(&numfreebuffers, 1); + bufspace_wakeup(); +} + +/* + * buf_import: + * + * Import bufs into the uma cache from the buf list. 
The system still + * expects a static array of bufs and much of the synchronization + * around bufs assumes type stable storage. As a result, UMA is used + * only as a per-cpu cache of bufs still maintained on a global list. + */ +static int +buf_import(void *arg, void **store, int cnt, int flags) +{ + struct buf *bp; + int i; + + mtx_lock(&bqlocks[QUEUE_EMPTY]); + for (i = 0; i < cnt; i++) { + bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + if (bp == NULL) + break; + bremfreel(bp); + store[i] = bp; + } + mtx_unlock(&bqlocks[QUEUE_EMPTY]); + + return (i); +} + +/* + * buf_release: + * + * Release bufs from the uma cache back to the buffer queues. + */ +static void +buf_release(void *arg, void **store, int cnt) +{ + int i; + + for (i = 0; i < cnt; i++) + binsfree(store[i], QUEUE_EMPTY); +} + +/* + * buf_alloc: + * + * Allocate an empty buffer header. + */ +static struct buf * +buf_alloc(void) +{ + struct buf *bp; + + bp = uma_zalloc(buf_zone, M_NOWAIT); + if (bp == NULL) { + bufspace_daemonwakeup(); + atomic_add_int(&numbufallocfails, 1); + return (NULL); + } /* - * Something we can maybe free or reuse. + * Wake-up the bufspace daemon on transition. */ - if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) - bufspacewakeup(); + if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) + bufspace_daemonwakeup(); - if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) - bufcountadd(bp); + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + panic("getnewbuf_empty: Locked buf %p on free queue.", bp); + + KASSERT(bp->b_vp == NULL, + ("bp: %p still has vnode %p.", bp, bp->b_vp)); + KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, + ("invalid buffer %p flags %#x", bp, bp->b_flags)); + KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, + ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); + KASSERT(bp->b_npages == 0, + ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); + KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); + KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); + + bp->b_flags = 0; + bp->b_ioflags = 0; + bp->b_xflags = 0; + bp->b_vflags = 0; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_bufobj = NULL; + bp->b_pin_count = 0; + bp->b_data = bp->b_kvabase = unmapped_buf; + bp->b_fsprivate1 = NULL; + bp->b_fsprivate2 = NULL; + bp->b_fsprivate3 = NULL; + LIST_INIT(&bp->b_dep); + + return (bp); +} + +/* + * buf_qrecycle: + * + * Free a buffer from the given bufqueue. kva controls whether the + * freed buf must own some kva resources. This is used for + * defragmenting. + */ +static int +buf_qrecycle(int qindex, bool kva) +{ + struct buf *bp, *nbp; + + if (kva) + atomic_add_int(&bufdefragcnt, 1); + nbp = NULL; + mtx_lock(&bqlocks[qindex]); + nbp = TAILQ_FIRST(&bufqueues[qindex]); + + /* + * Run scan, possibly freeing data and/or kva mappings on the fly + * depending. + */ + while ((bp = nbp) != NULL) { + /* + * Calculate next bp (we can only use it if we do not + * release the bqlock). + */ + nbp = TAILQ_NEXT(bp, b_freelist); + + /* + * If we are defragging then we need a buffer with + * some kva to reclaim. + */ + if (kva && bp->b_kvasize == 0) + continue; + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + continue; + + /* + * Skip buffers with background writes in progress. 
+ */ + if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { + BUF_UNLOCK(bp); + continue; + } + + KASSERT(bp->b_qindex == qindex, + ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); + /* + * NOTE: nbp is now entirely invalid. We can only restart + * the scan from this point on. + */ + bremfreel(bp); + mtx_unlock(&bqlocks[qindex]); + + /* + * Requeue the background write buffer with error and + * restart the scan. + */ + if ((bp->b_vflags & BV_BKGRDERR) != 0) { + bqrelse(bp); + mtx_lock(&bqlocks[qindex]); + nbp = TAILQ_FIRST(&bufqueues[qindex]); + continue; + } + bp->b_flags |= B_INVAL; + brelse(bp); + return (0); + } + mtx_unlock(&bqlocks[qindex]); + + return (ENOBUFS); +} + +/* + * buf_recycle: + * + * Iterate through all clean queues until we find a buf to recycle or + * exhaust the search. + */ +static int +buf_recycle(bool kva) +{ + int qindex, first_qindex; + + qindex = first_qindex = bqcleanq(); + do { + if (buf_qrecycle(qindex, kva) == 0) + return (0); + if (++qindex == QUEUE_CLEAN + clean_queues) + qindex = QUEUE_CLEAN; + } while (qindex != first_qindex); + + return (ENOBUFS); +} + +/* + * buf_scan: + * + * Scan the clean queues looking for a buffer to recycle. needsbuffer + * is set on failure so that the caller may optionally bufspace_wait() + * in a race-free fashion. + */ +static int +buf_scan(bool defrag) +{ + int error; + + /* + * To avoid heavy synchronization and wakeup races we set + * needsbuffer and re-poll before failing. This ensures that + * no frees can be missed between an unsuccessful poll and + * going to sleep in a synchronized fashion. + */ + if ((error = buf_recycle(defrag)) != 0) { + atomic_set_int(&needsbuffer, 1); + bufspace_daemonwakeup(); + error = buf_recycle(defrag); + } + if (error == 0) + atomic_add_int(&getnewbufrestarts, 1); + return (error); } /* @@ -1185,7 +1632,6 @@ bremfree(struct buf *bp) BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; - bufcountsub(bp); } /* @@ -1219,7 +1665,9 @@ bremfreel(struct buf *bp) bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); - BUF_ASSERT_XLOCKED(bp); + if (bp->b_qindex != QUEUE_EMPTY) { + BUF_ASSERT_XLOCKED(bp); + } mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); @@ -1229,25 +1677,17 @@ bremfreel(struct buf *bp) bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; - /* - * If this was a delayed bremfree() we only need to remove the buffer - * from the queue and return the stats are already done. - */ - if (bp->b_flags & B_REMFREE) { - bp->b_flags &= ~B_REMFREE; - return; - } - bufcountsub(bp); + bp->b_flags &= ~B_REMFREE; } /* - * bufkvafree: + * bufkva_free: * * Free the kva allocation for a buffer. * */ static void -bufkvafree(struct buf *bp) +bufkva_free(struct buf *bp) { #ifdef INVARIANTS @@ -1271,12 +1711,12 @@ bufkvafree(struct buf *bp) } /* - * bufkvaalloc: + * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. 
*/ static int -bufkvaalloc(struct buf *bp, int maxsize, int gbflags) +bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; @@ -1284,7 +1724,7 @@ bufkvaalloc(struct buf *bp, int maxsize, int gbflags) KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); - bufkvafree(bp); + bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); @@ -1293,7 +1733,6 @@ bufkvaalloc(struct buf *bp, int maxsize, int gbflags) * Buffer map is too fragmented. Request the caller * to defragment the map. */ - atomic_add_int(&bufdefragcnt, 1); return (error); } bp->b_kvabase = (caddr_t)addr; @@ -1309,6 +1748,24 @@ bufkvaalloc(struct buf *bp, int maxsize, int gbflags) return (0); } +/* + * bufkva_reclaim: + * + * Reclaim buffer kva by freeing buffers holding kva. This is a vmem + * callback that fires to avoid returning failure. + */ +static void +bufkva_reclaim(vmem_t *vmem, int flags) +{ + int i; + + for (i = 0; i < 5; i++) + if (buf_scan(true) != 0) + break; + return; +} + + /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, @@ -1900,14 +2357,11 @@ brelse(struct buf *bp) /* buffers with no memory */ if (bp->b_bufsize == 0) { - bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); - if (bp->b_vflags & BV_BKGRDINPROG) - panic("losing buffer 1"); - bufkvafree(bp); - qindex = QUEUE_EMPTY; - bp->b_flags |= B_AGE; + buf_free(bp); + return; + } /* buffers with junk contents */ - } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || + if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) @@ -1927,6 +2381,8 @@ brelse(struct buf *bp) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); + if (qindex == QUEUE_CLEAN) + bufspace_wakeup(); } /* @@ -1949,6 +2405,7 @@ bqrelse(struct buf *bp) KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); @@ -1984,6 +2441,8 @@ bqrelse(struct buf *bp) out: /* unlock */ BUF_UNLOCK(bp); + if (qindex == QUEUE_CLEAN) + bufspace_wakeup(); } /* @@ -2383,297 +2842,26 @@ vfs_bio_awrite(struct buf *bp) } /* - * Ask the bufdaemon for help, or act as bufdaemon itself, when a - * locked vnode is supplied. + * getnewbuf_kva: + * + * Allocate KVA for an empty buf header according to gbflags. */ -static void -getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, - int defrag) -{ - struct thread *td; - char *waitmsg; - int error, fl, flags, norunbuf; - - mtx_assert(&bqclean, MA_OWNED); - - if (defrag) { - flags = VFS_BIO_NEED_BUFSPACE; - waitmsg = "nbufkv"; - } else if (bufspace >= hibufspace) { - waitmsg = "nbufbs"; - flags = VFS_BIO_NEED_BUFSPACE; - } else { - waitmsg = "newbuf"; - flags = VFS_BIO_NEED_ANY; - } - atomic_set_int(&needsbuffer, flags); - mtx_unlock(&bqclean); - - bd_speedup(); /* heeeelp */ - if ((gbflags & GB_NOWAIT_BD) != 0) - return; - - td = curthread; - rw_wlock(&nblock); - while ((needsbuffer & flags) != 0) { - if (vp != NULL && vp->v_type != VCHR && - (td->td_pflags & TDP_BUFNEED) == 0) { - rw_wunlock(&nblock); - /* - * getblk() is called with a vnode locked, and - * some majority of the dirty buffers may as - * well belong to the vnode. 
Flushing the - * buffers there would make a progress that - * cannot be achieved by the buf_daemon, that - * cannot lock the vnode. - */ - norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | - (td->td_pflags & TDP_NORUNNINGBUF); - - /* - * Play bufdaemon. The getnewbuf() function - * may be called while the thread owns lock - * for another dirty buffer for the same - * vnode, which makes it impossible to use - * VOP_FSYNC() there, due to the buffer lock - * recursion. - */ - td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; - fl = buf_flush(vp, flushbufqtarget); - td->td_pflags &= norunbuf; - rw_wlock(&nblock); - if (fl != 0) - continue; - if ((needsbuffer & flags) == 0) - break; - } - error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, - (PRIBIO + 4) | slpflag, waitmsg, slptimeo); - if (error != 0) - break; - } - rw_wunlock(&nblock); -} - -static void -getnewbuf_reuse_bp(struct buf *bp, int qindex) +static int +getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { - CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " - "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, - bp->b_kvasize, bp->b_bufsize, qindex); - mtx_assert(&bqclean, MA_NOTOWNED); + if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { + /* + * In order to keep fragmentation sane we only allocate kva + * in BKVASIZE chunks. XXX with vmem we can do page size. + */ + maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; - /* - * Note: we no longer distinguish between VMIO and non-VMIO - * buffers. - */ - KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, - ("invalid buffer %p flags %#x found in queue %d", bp, bp->b_flags, - qindex)); - - /* - * When recycling a clean buffer we have to truncate it and - * release the vnode. - */ - if (qindex == QUEUE_CLEAN) { - allocbuf(bp, 0); - if (bp->b_vp != NULL) - brelvp(bp); + if (maxsize != bp->b_kvasize && + bufkva_alloc(bp, maxsize, gbflags)) + return (ENOSPC); } - - /* - * Get the rest of the buffer freed up. b_kva* is still valid - * after this operation. - */ - if (bp->b_rcred != NOCRED) { - crfree(bp->b_rcred); - bp->b_rcred = NOCRED; - } - if (bp->b_wcred != NOCRED) { - crfree(bp->b_wcred); - bp->b_wcred = NOCRED; - } - if (!LIST_EMPTY(&bp->b_dep)) - buf_deallocate(bp); - if (bp->b_vflags & BV_BKGRDINPROG) - panic("losing buffer 3"); - KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", - bp, bp->b_vp, qindex)); - KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, - ("bp: %p still on a buffer list. 
xflags %X", bp, bp->b_xflags)); - KASSERT(bp->b_npages == 0, - ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); - - bp->b_flags = 0; - bp->b_ioflags = 0; - bp->b_xflags = 0; - KASSERT((bp->b_flags & B_INFREECNT) == 0, - ("buf %p still counted as free?", bp)); - bp->b_vflags = 0; - bp->b_vp = NULL; - bp->b_blkno = bp->b_lblkno = 0; - bp->b_offset = NOOFFSET; - bp->b_iodone = 0; - bp->b_error = 0; - bp->b_resid = 0; - bp->b_bcount = 0; - bp->b_npages = 0; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_bufobj = NULL; - bp->b_pin_count = 0; - bp->b_data = bp->b_kvabase; - bp->b_fsprivate1 = NULL; - bp->b_fsprivate2 = NULL; - bp->b_fsprivate3 = NULL; - - LIST_INIT(&bp->b_dep); -} - -static struct buf * -getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata) -{ - struct buf *bp, *nbp; - int nqindex, qindex, pass; - - KASSERT(!unmapped || !defrag, ("both unmapped and defrag")); - - pass = 0; -restart: - if (pass != 0) - atomic_add_int(&getnewbufrestarts, 1); - - nbp = NULL; - mtx_lock(&bqclean); - /* - * If we're not defragging or low on bufspace attempt to make a new - * buf from a header. - */ - if (defrag == 0 && bufspace + maxsize < hibufspace) { - nqindex = QUEUE_EMPTY; - nbp = TAILQ_FIRST(&bufqueues[nqindex]); - } - /* - * All available buffers might be clean or we need to start recycling. - */ - if (nbp == NULL) { - nqindex = QUEUE_CLEAN; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); - } - - /* - * Run scan, possibly freeing data and/or kva mappings on the fly - * depending. - */ - while ((bp = nbp) != NULL) { - qindex = nqindex; - - /* - * Calculate next bp (we can only use it if we do not - * release the bqlock) - */ - if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { - switch (qindex) { - case QUEUE_EMPTY: - nqindex = QUEUE_CLEAN; - nbp = TAILQ_FIRST(&bufqueues[nqindex]); - if (nbp != NULL) - break; - /* FALLTHROUGH */ - case QUEUE_CLEAN: - if (metadata && pass == 0) { - pass = 1; - nqindex = QUEUE_EMPTY; - nbp = TAILQ_FIRST(&bufqueues[nqindex]); - } - /* - * nbp is NULL. - */ - break; - } - } - /* - * If we are defragging then we need a buffer with - * b_kvasize != 0. This situation occurs when we - * have many unmapped bufs. - */ - if (defrag && bp->b_kvasize == 0) - continue; - - /* - * Start freeing the bp. This is somewhat involved. nbp - * remains valid only for QUEUE_EMPTY[KVA] bp's. - */ - if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) - continue; - /* - * BKGRDINPROG can only be set with the buf and bufobj - * locks both held. We tolerate a race to clear it here. - */ - if (bp->b_vflags & BV_BKGRDINPROG) { - BUF_UNLOCK(bp); - continue; - } - - /* - * Requeue the background write buffer with error. - */ - if ((bp->b_vflags & BV_BKGRDERR) != 0) { - bremfreel(bp); - mtx_unlock(&bqclean); - bqrelse(bp); - continue; - } - - KASSERT(bp->b_qindex == qindex, - ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); - - bremfreel(bp); - mtx_unlock(&bqclean); - - /* - * NOTE: nbp is now entirely invalid. We can only restart - * the scan from this point on. - */ - getnewbuf_reuse_bp(bp, qindex); - mtx_assert(&bqclean, MA_NOTOWNED); - - /* - * If we are defragging then free the buffer. - */ - if (defrag) { - bp->b_flags |= B_INVAL; - brelse(bp); - defrag = 0; - goto restart; - } - - /* - * Notify any waiters for the buffer lock about - * identity change by freeing the buffer. 
- */ - if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) { - bp->b_flags |= B_INVAL; - brelse(bp); - goto restart; - } - - if (metadata) - break; - - /* - * If we are overcomitted then recover the buffer and its - * KVM space. This occurs in rare situations when multiple - * processes are blocked in getnewbuf() or allocbuf(). - */ - if (bufspace >= hibufspace && bp->b_kvasize != 0) { - bp->b_flags |= B_INVAL; - brelse(bp); - goto restart; - } - break; - } - return (bp); + return (0); } /* @@ -2682,86 +2870,54 @@ getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata) * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * - * Important: B_INVAL is not set. If the caller wishes to throw the - * buffer away, the caller must set B_INVAL prior to calling brelse(). - * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) + * + * The caller is responsible for releasing the reserved bufspace after + * allocbuf() is called. */ static struct buf * -getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, - int gbflags) +getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct buf *bp; - int defrag, metadata; + bool metadata, reserved; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); - defrag = 0; if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) - metadata = 1; + metadata = true; else - metadata = 0; - /* - * We can't afford to block since we might be holding a vnode lock, - * which may prevent system daemons from running. We deal with - * low-memory situations by proactively returning memory and running - * async I/O rather then sync I/O. - */ + metadata = false; atomic_add_int(&getnewbufcalls, 1); -restart: - bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED | - GB_KVAALLOC)) == GB_UNMAPPED, metadata); - if (bp != NULL) - defrag = 0; + reserved = false; + do { + if (reserved == false && + bufspace_reserve(maxsize, metadata) != 0) + continue; + reserved = true; + if ((bp = buf_alloc()) == NULL) + continue; + if (getnewbuf_kva(bp, gbflags, maxsize) == 0) + return (bp); + break; + } while(buf_scan(false) == 0); - /* - * If we exhausted our list, sleep as appropriate. We may have to - * wakeup various daemons and write out some dirty buffers. - * - * Generally we are sleeping due to insufficient buffer space. - */ - if (bp == NULL) { - mtx_assert(&bqclean, MA_OWNED); - getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag); - mtx_assert(&bqclean, MA_NOTOWNED); - } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) { - mtx_assert(&bqclean, MA_NOTOWNED); - - bufkvafree(bp); - atomic_add_int(&bufreusecnt, 1); - } else { - mtx_assert(&bqclean, MA_NOTOWNED); - - /* - * We finally have a valid bp. We aren't quite out of the - * woods, we still have to reserve kva space. In order to - * keep fragmentation sane we only allocate kva in BKVASIZE - * chunks. 
- */ - maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; - - if (maxsize != bp->b_kvasize && - bufkvaalloc(bp, maxsize, gbflags)) { - defrag = 1; - bp->b_flags |= B_INVAL; - brelse(bp); - goto restart; - } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == - (GB_UNMAPPED | GB_KVAALLOC)) { - bp->b_data = unmapped_buf; - BUF_CHECK_UNMAPPED(bp); - } - atomic_add_int(&bufreusecnt, 1); + if (reserved) + bufspace_release(maxsize); + if (bp != NULL) { + bp->b_flags |= B_INVAL; + brelse(bp); } - return (bp); + bufspace_wait(vp, gbflags, slpflag, slptimeo); + + return (NULL); } /* @@ -2771,7 +2927,6 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ - static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, @@ -2902,19 +3057,19 @@ flushbufqueues(struct vnode *lvp, int target, int flushdeps) bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; - mtx_lock(&bqdirty); + mtx_lock(&bqlocks[queue]); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); - mtx_unlock(&bqdirty); + mtx_unlock(&bqlocks[queue]); while (flushed != target) { maybe_yield(); - mtx_lock(&bqdirty); + mtx_lock(&bqlocks[queue]); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { - mtx_unlock(&bqdirty); + mtx_unlock(&bqlocks[queue]); break; } /* @@ -2926,11 +3081,11 @@ flushbufqueues(struct vnode *lvp, int target, int flushdeps) */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { - mtx_unlock(&bqdirty); + mtx_unlock(&bqlocks[queue]); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); - mtx_unlock(&bqdirty); + mtx_unlock(&bqlocks[queue]); if (error != 0) continue; if (bp->b_pin_count > 0) { @@ -3013,9 +3168,9 @@ flushbufqueues(struct vnode *lvp, int target, int flushdeps) vn_finished_write(mp); BUF_UNLOCK(bp); } - mtx_lock(&bqdirty); + mtx_lock(&bqlocks[queue]); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); - mtx_unlock(&bqdirty); + mtx_unlock(&bqlocks[queue]); free(sentinel, M_TEMP); return (flushed); } @@ -3196,7 +3351,6 @@ vfs_setdirty_locked_object(struct buf *bp) static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { - struct buf *scratch_bp; int bsize, maxsize, need_mapping, need_kva; off_t offset; @@ -3229,37 +3383,16 @@ bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); -mapping_loop: - if (bufkvaalloc(bp, maxsize, gbflags)) { - /* - * Request defragmentation. getnewbuf() returns us the - * allocated space by the scratch buffer KVA. - */ - scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags | - (GB_UNMAPPED | GB_KVAALLOC)); - if (scratch_bp == NULL) { - if ((gbflags & GB_NOWAIT_BD) != 0) { - /* - * XXXKIB: defragmentation cannot - * succeed, not sure what else to do. - */ - panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); - } - atomic_add_int(&mappingrestarts, 1); - goto mapping_loop; + while (bufkva_alloc(bp, maxsize, gbflags) != 0) { + if ((gbflags & GB_NOWAIT_BD) != 0) { + /* + * XXXKIB: defragmentation cannot + * succeed, not sure what else to do. 
+ */ + panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } - KASSERT(scratch_bp->b_kvabase != unmapped_buf, - ("scratch bp has no KVA %p", scratch_bp)); - /* Grab pointers. */ - bp->b_kvabase = scratch_bp->b_kvabase; - bp->b_kvasize = scratch_bp->b_kvasize; - bp->b_data = scratch_bp->b_data; - - /* Get rid of the scratch buffer. */ - scratch_bp->b_kvasize = 0; - scratch_bp->b_flags |= B_INVAL; - scratch_bp->b_data = scratch_bp->b_kvabase = unmapped_buf; - brelse(scratch_bp); + atomic_add_int(&mappingrestarts, 1); + bufspace_wait(bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { @@ -3486,7 +3619,7 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, } maxsize = imax(maxsize, bsize); - bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); + bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; @@ -3510,6 +3643,7 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); + bufspace_release(maxsize); goto loop; } @@ -3543,6 +3677,7 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, } allocbuf(bp, size); + bufspace_release(maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); @@ -3564,12 +3699,13 @@ geteblk(int size, int flags) int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; - while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) { + while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); + bufspace_release(maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); @@ -3595,7 +3731,7 @@ vfs_nonvmio_truncate(struct buf *bp, int newbsize) return; } vm_hold_free_pages(bp, newbsize); - bufspaceadjust(bp, newbsize); + bufspace_adjust(bp, newbsize); } /* @@ -3646,7 +3782,7 @@ vfs_nonvmio_extend(struct buf *bp, int newbsize) bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } - bufspaceadjust(bp, newbsize); + bufspace_adjust(bp, newbsize); } /* @@ -3708,7 +3844,7 @@ allocbuf(struct buf *bp, int size) /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); - bufspaceadjust(bp, newbsize); + bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); @@ -4596,7 +4732,7 @@ DB_COMMAND(countfreebufs, db_coundfreebufs) for (i = 0; i < nbuf; i++) { bp = &buf[i]; - if ((bp->b_flags & B_INFREECNT) != 0) + if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c index be1038758824..e6645858ad37 100644 --- a/sys/vm/vm_init.c +++ b/sys/vm/vm_init.c @@ -74,6 +74,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -229,12 +230,15 @@ vm_ksubmap_init(struct kva_md_info *kmi) /* * Allocate the buffer arena. + * + * Enable the quantum cache if we have more than 4 cpus. This + * avoids lock contention at the expense of some fragmentation. */ size = (long)nbuf * BKVASIZE; kmi->buffer_sva = firstaddr; kmi->buffer_eva = kmi->buffer_sva + size; vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size, - PAGE_SIZE, 0, 0); + PAGE_SIZE, (mp_ncpus > 4) ? BKVASIZE * 8 : 0, 0); firstaddr += size; /*