2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-28 13:45:14 -07:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2015-06-26 15:14:45 -07:00
|
|
|
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
2016-05-15 08:02:28 -07:00
|
|
|
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
|
2015-06-26 15:14:45 -07:00
|
|
|
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
|
2014-05-22 10:11:57 +01:00
|
|
|
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* DVA-based Adjustable Replacement Cache
|
|
|
|
*
|
|
|
|
* While much of the theory of operation used here is
|
|
|
|
* based on the self-tuning, low overhead replacement cache
|
|
|
|
* presented by Megiddo and Modha at FAST 2003, there are some
|
|
|
|
* significant differences:
|
|
|
|
*
|
|
|
|
* 1. The Megiddo and Modha model assumes any page is evictable.
|
|
|
|
* Pages in its cache cannot be "locked" into memory. This makes
|
|
|
|
* the eviction algorithm simple: evict the last page in the list.
|
|
|
|
* This also makes the performance characteristics easy to reason
|
|
|
|
* about. Our cache is not so simple. At any given moment, some
|
|
|
|
* subset of the blocks in the cache are un-evictable because we
|
|
|
|
* have handed out a reference to them. Blocks are only evictable
|
|
|
|
* when there are no external references active. This makes
|
|
|
|
* eviction far more problematic: we choose to evict the evictable
|
|
|
|
* blocks that are the "lowest" in the list.
|
|
|
|
*
|
|
|
|
* There are times when it is not possible to evict the requested
|
|
|
|
* space. In these circumstances we are unable to adjust the cache
|
|
|
|
* size. To prevent the cache growing unbounded at these times we
|
|
|
|
* implement a "cache throttle" that slows the flow of new data
|
|
|
|
* into the cache until we can make space available.
|
|
|
|
*
|
|
|
|
* 2. The Megiddo and Modha model assumes a fixed cache size.
|
|
|
|
* Pages are evicted when the cache is full and there is a cache
|
|
|
|
* miss. Our model has a variable sized cache. It grows with
|
|
|
|
* high use, but also tries to react to memory pressure from the
|
|
|
|
* operating system: decreasing its size when system memory is
|
|
|
|
* tight.
|
|
|
|
*
|
|
|
|
* 3. The Megiddo and Modha model assumes a fixed page size. All
|
2013-06-11 09:12:34 -08:00
|
|
|
* elements of the cache are therefore exactly the same size. So
|
2008-11-20 12:01:55 -08:00
|
|
|
* when adjusting the cache size following a cache miss, it's simply
|
|
|
|
* a matter of choosing a single page to evict. In our model, we
|
|
|
|
* have variable sized cache blocks (ranging from 512 bytes to
|
2013-06-11 09:12:34 -08:00
|
|
|
* 128K bytes). We therefore choose a set of blocks to evict to make
|
2008-11-20 12:01:55 -08:00
|
|
|
* space for a cache miss that approximates as closely as possible
|
|
|
|
* the space used by the new block.
|
|
|
|
*
|
|
|
|
* See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
|
|
|
|
* by N. Megiddo & D. Modha, FAST 2003
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The locking model:
|
|
|
|
*
|
|
|
|
* A new reference to a cache buffer can be obtained in two
|
|
|
|
* ways: 1) via a hash table lookup using the DVA as a key,
|
|
|
|
* or 2) via one of the ARC lists. The arc_read() interface
|
|
|
|
* uses method 1, while the internal arc algorithms for
|
2013-06-11 09:12:34 -08:00
|
|
|
* adjusting the cache use method 2. We therefore provide two
|
2008-11-20 12:01:55 -08:00
|
|
|
* types of locks: 1) the hash table lock array, and 2) the
|
|
|
|
* arc list locks.
|
|
|
|
*
|
2013-01-11 08:54:18 -08:00
|
|
|
* Buffers do not have their own mutexes, rather they rely on the
|
|
|
|
* hash table mutexes for the bulk of their protection (i.e. most
|
|
|
|
* fields in the arc_buf_hdr_t are protected by these mutexes).
|
2008-11-20 12:01:55 -08:00
|
|
|
*
|
|
|
|
* buf_hash_find() returns the appropriate mutex (held) when it
|
|
|
|
* locates the requested buffer in the hash table. It returns
|
|
|
|
* NULL for the mutex if the buffer was not in the table.
|
|
|
|
*
|
|
|
|
* buf_hash_remove() expects the appropriate hash mutex to be
|
|
|
|
* already held before it is invoked.
|
|
|
|
*
|
|
|
|
* Each arc state also has a mutex which is used to protect the
|
|
|
|
* buffer list associated with the state. When attempting to
|
|
|
|
* obtain a hash table lock while holding an arc list lock you
|
|
|
|
* must use: mutex_tryenter() to avoid deadlock. Also note that
|
|
|
|
* the active state mutex must be held before the ghost state mutex.
|
|
|
|
*
|
|
|
|
* Arc buffers may have an associated eviction callback function.
|
|
|
|
* This function will be invoked prior to removing the buffer (e.g.
|
|
|
|
* in arc_do_user_evicts()). Note however that the data associated
|
|
|
|
* with the buffer may be evicted prior to the callback. The callback
|
|
|
|
* must be made with *no locks held* (to prevent deadlock). Additionally,
|
|
|
|
* the users of callbacks must ensure that their private data is
|
2014-07-15 03:43:18 -04:00
|
|
|
* protected from simultaneous callbacks from arc_clear_callback()
|
2008-11-20 12:01:55 -08:00
|
|
|
* and arc_do_user_evicts().
|
|
|
|
*
|
2011-12-22 12:20:43 -08:00
|
|
|
* It is also possible to register a callback which is run when the
|
|
|
|
* arc_meta_limit is reached and no buffers can be safely evicted. In
|
|
|
|
* this case the arc user should drop a reference on some arc buffers so
|
|
|
|
* they can be reclaimed and the arc_meta_limit honored. For example,
|
|
|
|
* when using the ZPL each dentry holds a reference on a znode. These
|
|
|
|
* dentries must be pruned before the arc buffer holding the znode can
|
|
|
|
* be safely evicted.
|
|
|
|
*
|
2008-11-20 12:01:55 -08:00
|
|
|
* Note that the majority of the performance stats are manipulated
|
|
|
|
* with atomic operations.
|
|
|
|
*
|
2014-12-29 19:12:23 -08:00
|
|
|
* The L2ARC uses the l2ad_mtx on each vdev for the following:
|
2008-11-20 12:01:55 -08:00
|
|
|
*
|
|
|
|
* - L2ARC buflist creation
|
|
|
|
* - L2ARC buflist eviction
|
|
|
|
* - L2ARC write completion, which walks L2ARC buflists
|
|
|
|
* - ARC header destruction, as it removes from L2ARC buflists
|
|
|
|
* - ARC header release, as it removes from L2ARC buflists
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/zio.h>
|
2013-08-01 13:02:10 -07:00
|
|
|
#include <sys/zio_compress.h>
|
2008-11-20 12:01:55 -08:00
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
#include <sys/arc.h>
|
2015-06-26 15:14:45 -07:00
|
|
|
#include <sys/refcount.h>
|
2008-12-03 12:09:06 -08:00
|
|
|
#include <sys/vdev.h>
|
2009-07-02 15:44:48 -07:00
|
|
|
#include <sys/vdev_impl.h>
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
#include <sys/dsl_pool.h>
|
2015-01-12 19:52:19 -08:00
|
|
|
#include <sys/multilist.h>
|
2008-11-20 12:01:55 -08:00
|
|
|
#ifdef _KERNEL
|
|
|
|
#include <sys/vmsystm.h>
|
|
|
|
#include <vm/anon.h>
|
|
|
|
#include <sys/fs/swapnode.h>
|
2011-12-22 12:20:43 -08:00
|
|
|
#include <sys/zpl.h>
|
2014-11-14 10:21:53 -08:00
|
|
|
#include <linux/mm_compat.h>
|
2008-11-20 12:01:55 -08:00
|
|
|
#endif
|
|
|
|
#include <sys/callb.h>
|
|
|
|
#include <sys/kstat.h>
|
2012-01-20 10:58:57 -08:00
|
|
|
#include <sys/dmu_tx.h>
|
2010-05-28 13:45:14 -07:00
|
|
|
#include <zfs_fletcher.h>
|
2014-10-21 17:59:33 -07:00
|
|
|
#include <sys/arc_impl.h>
|
2014-12-12 18:07:39 -08:00
|
|
|
#include <sys/trace_arc.h>
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-05-16 14:18:06 -07:00
|
|
|
#ifndef _KERNEL
|
|
|
|
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
|
|
|
|
boolean_t arc_watch = B_FALSE;
|
|
|
|
#endif
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
 * File-scope synchronization state, in two groups distinguished by the
 * "arc_reclaim" and "arc_user_evicts" prefixes: each group has a mutex,
 * one or more condition variables, and a boolean exit flag.
 *
 * NOTE(review): the code that uses these objects is outside this view;
 * which fields each mutex protects and who signals each condition
 * variable should be confirmed against the users before relying on it.
 */
static kmutex_t arc_reclaim_lock;
|
|
|
|
static kcondvar_t arc_reclaim_thread_cv;
|
|
|
|
static boolean_t arc_reclaim_thread_exit;
|
|
|
|
static kcondvar_t arc_reclaim_waiters_cv;
|
|
|
|
|
|
|
|
/*
 * Second group ("arc_user_evicts" prefix).
 * NOTE(review): presumably related to arc_do_user_evicts() -- confirm.
 */
static kmutex_t arc_user_evicts_lock;
|
|
|
|
static kcondvar_t arc_user_evicts_cv;
|
|
|
|
static boolean_t arc_user_evicts_thread_exit;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* The number of headers to evict in arc_evict_state_impl() before
|
|
|
|
* dropping the sublist lock and evicting from another sublist. A lower
|
|
|
|
* value means we're more likely to evict the "correct" header (i.e. the
|
|
|
|
* oldest header in the arc state), but comes with higher overhead
|
|
|
|
* (i.e. more invocations of arc_evict_state_impl()).
|
|
|
|
*/
|
|
|
|
int zfs_arc_evict_batch_limit = 10;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The number of sublists used for each of the arc state lists. If this
|
|
|
|
* is not set to a suitable value by the user, it will be configured to
|
|
|
|
* the number of CPUs on the system in arc_init().
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
int zfs_arc_num_sublists_per_state = 0;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/* number of seconds before growing cache again */
static int arc_grow_retry = 5;

/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
int zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.  (The defaults satisfy this: 5 < 7.)
 */
int arc_no_grow_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int arc_min_prefetch_lifespan;
|
2015-01-12 19:52:19 -08:00
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

/*
 * NOTE(review): undocumented in the original.  Zero at load time (static
 * storage); presumably set non-zero once the ARC is torn down -- confirm
 * at the use sites.
 */
static int arc_dead;
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* The arc has filled available memory and has now warmed up.
|
|
|
|
*/
|
|
|
|
static boolean_t arc_warm;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
 * These tunables are for performance analysis.
 *
 * NOTE(review): most default to 0, which presumably means "not set, use
 * the built-in value" -- confirm against the code that consumes them.
 * zfs_arc_grow_retry, zfs_arc_shrink_shift and zfs_arc_p_min_shift share
 * their suffix with the static arc_grow_retry, arc_shrink_shift and
 * arc_p_min_shift defaults declared earlier in this file.
 */
unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0;
unsigned long zfs_arc_meta_min = 0;
unsigned long zfs_arc_dnode_limit = 0;
unsigned long zfs_arc_dnode_reduce_percent = 10;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-03-17 15:08:22 -07:00
|
|
|
/*
|
2015-06-26 11:28:18 -07:00
|
|
|
* These tunables are Linux specific
|
2015-03-17 15:08:22 -07:00
|
|
|
*/
|
2015-07-27 13:17:32 -07:00
|
|
|
unsigned long zfs_arc_sys_free = 0;
|
2015-06-26 11:28:18 -07:00
|
|
|
int zfs_arc_min_prefetch_lifespan = 0;
|
|
|
|
int zfs_arc_p_aggressive_disable = 1;
|
|
|
|
int zfs_arc_p_dampener_disable = 1;
|
|
|
|
int zfs_arc_meta_prune = 10000;
|
|
|
|
int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
|
|
|
|
int zfs_arc_meta_adjust_restarts = 4096;
|
2015-07-28 11:30:00 -07:00
|
|
|
int zfs_arc_lotsfree_percent = 10;
|
2015-03-17 15:08:22 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/* The 6 states: */
|
|
|
|
static arc_state_t ARC_anon;
|
|
|
|
static arc_state_t ARC_mru;
|
|
|
|
static arc_state_t ARC_mru_ghost;
|
|
|
|
static arc_state_t ARC_mfu;
|
|
|
|
static arc_state_t ARC_mfu_ghost;
|
|
|
|
static arc_state_t ARC_l2c_only;
|
|
|
|
|
|
|
|
typedef struct arc_stats {
|
|
|
|
kstat_named_t arcstat_hits;
|
|
|
|
kstat_named_t arcstat_misses;
|
|
|
|
kstat_named_t arcstat_demand_data_hits;
|
|
|
|
kstat_named_t arcstat_demand_data_misses;
|
|
|
|
kstat_named_t arcstat_demand_metadata_hits;
|
|
|
|
kstat_named_t arcstat_demand_metadata_misses;
|
|
|
|
kstat_named_t arcstat_prefetch_data_hits;
|
|
|
|
kstat_named_t arcstat_prefetch_data_misses;
|
|
|
|
kstat_named_t arcstat_prefetch_metadata_hits;
|
|
|
|
kstat_named_t arcstat_prefetch_metadata_misses;
|
|
|
|
kstat_named_t arcstat_mru_hits;
|
|
|
|
kstat_named_t arcstat_mru_ghost_hits;
|
|
|
|
kstat_named_t arcstat_mfu_hits;
|
|
|
|
kstat_named_t arcstat_mfu_ghost_hits;
|
|
|
|
kstat_named_t arcstat_deleted;
|
2013-06-11 09:12:34 -08:00
|
|
|
/*
|
|
|
|
* Number of buffers that could not be evicted because the hash lock
|
|
|
|
* was held by another thread. The lock may not necessarily be held
|
|
|
|
* by something using the same buffer, since hash locks are shared
|
|
|
|
* by multiple buffers.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_mutex_miss;
|
2013-06-11 09:12:34 -08:00
|
|
|
/*
|
|
|
|
* Number of buffers skipped because they have I/O in progress, are
|
|
|
|
* indrect prefetch buffers that have not lived long enough, or are
|
|
|
|
* not from the spa we're trying to evict from.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_evict_skip;
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* Number of times arc_evict_state() was unable to evict enough
|
|
|
|
* buffers to reach its target amount.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_evict_not_enough;
|
2010-05-28 13:45:14 -07:00
|
|
|
kstat_named_t arcstat_evict_l2_cached;
|
|
|
|
kstat_named_t arcstat_evict_l2_eligible;
|
|
|
|
kstat_named_t arcstat_evict_l2_ineligible;
|
2015-01-12 19:52:19 -08:00
|
|
|
kstat_named_t arcstat_evict_l2_skip;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_hash_elements;
|
|
|
|
kstat_named_t arcstat_hash_elements_max;
|
|
|
|
kstat_named_t arcstat_hash_collisions;
|
|
|
|
kstat_named_t arcstat_hash_chains;
|
|
|
|
kstat_named_t arcstat_hash_chain_max;
|
|
|
|
kstat_named_t arcstat_p;
|
|
|
|
kstat_named_t arcstat_c;
|
|
|
|
kstat_named_t arcstat_c_min;
|
|
|
|
kstat_named_t arcstat_c_max;
|
|
|
|
kstat_named_t arcstat_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Number of bytes consumed by internal ARC structures necessary
|
|
|
|
* for tracking purposes; these structures are not actually
|
|
|
|
* backed by ARC buffers. This includes arc_buf_hdr_t structures
|
|
|
|
* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
|
|
|
|
* caches), and arc_buf_t structures (allocated via arc_buf_t
|
|
|
|
* cache).
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_hdr_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers of type equal to
|
|
|
|
* ARC_BUFC_DATA. This is generally consumed by buffers backing
|
|
|
|
* on disk user data (e.g. plain file contents).
|
|
|
|
*/
|
2009-02-18 12:51:31 -08:00
|
|
|
kstat_named_t arcstat_data_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers of type equal to
|
|
|
|
* ARC_BUFC_METADATA. This is generally consumed by buffers
|
|
|
|
* backing on disk data that is used for internal ZFS
|
|
|
|
* structures (e.g. ZAP, dnode, indirect blocks, etc).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_metadata_size;
|
|
|
|
/*
|
2016-07-13 07:42:40 -05:00
|
|
|
* Number of bytes consumed by dmu_buf_impl_t objects.
|
2015-06-26 14:54:17 -07:00
|
|
|
*/
|
2016-07-13 07:42:40 -05:00
|
|
|
kstat_named_t arcstat_dbuf_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by dnode_t objects.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_dnode_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by bonus buffers.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_bonus_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Total number of bytes consumed by ARC buffers residing in the
|
|
|
|
* arc_anon state. This includes *all* buffers in the arc_anon
|
|
|
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
|
|
|
* are all included in this value.
|
|
|
|
*/
|
2012-01-30 13:28:40 -08:00
|
|
|
kstat_named_t arcstat_anon_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that meet the
|
|
|
|
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
|
|
|
* residing in the arc_anon state, and are eligible for eviction
|
|
|
|
* (e.g. have no outstanding holds on the buffer).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_anon_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that meet the
|
|
|
|
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
|
|
|
* residing in the arc_anon state, and are eligible for eviction
|
|
|
|
* (e.g. have no outstanding holds on the buffer).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_anon_evictable_metadata;
|
|
|
|
/*
|
|
|
|
* Total number of bytes consumed by ARC buffers residing in the
|
|
|
|
* arc_mru state. This includes *all* buffers in the arc_mru
|
|
|
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
|
|
|
* are all included in this value.
|
|
|
|
*/
|
2012-01-30 13:28:40 -08:00
|
|
|
kstat_named_t arcstat_mru_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that meet the
|
|
|
|
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
|
|
|
* residing in the arc_mru state, and are eligible for eviction
|
|
|
|
* (e.g. have no outstanding holds on the buffer).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that meet the
|
|
|
|
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
|
|
|
* residing in the arc_mru state, and are eligible for eviction
|
|
|
|
* (e.g. have no outstanding holds on the buffer).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_evictable_metadata;
|
|
|
|
/*
|
|
|
|
* Total number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers in the arc_mru_ghost state. The key thing to note
|
|
|
|
* here, is the fact that this size doesn't actually indicate
|
|
|
|
* RAM consumption. The ghost lists only consist of headers and
|
|
|
|
* don't actually have ARC buffers linked off of these headers.
|
|
|
|
* Thus, *if* the headers had associated ARC buffers, these
|
|
|
|
* buffers *would have* consumed this number of bytes.
|
|
|
|
*/
|
2012-01-30 13:28:40 -08:00
|
|
|
kstat_named_t arcstat_mru_ghost_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers that are eligible for eviction, of type
|
|
|
|
* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_ghost_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers that are eligible for eviction, of type
|
|
|
|
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_ghost_evictable_metadata;
|
|
|
|
/*
|
|
|
|
* Total number of bytes consumed by ARC buffers residing in the
|
|
|
|
* arc_mfu state. This includes *all* buffers in the arc_mfu
|
|
|
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
|
|
|
* are all included in this value.
|
|
|
|
*/
|
2012-01-30 13:28:40 -08:00
|
|
|
kstat_named_t arcstat_mfu_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that are eligible for
|
|
|
|
* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
|
|
|
|
* state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that are eligible for
|
|
|
|
* eviction, of type ARC_BUFC_METADATA, and reside in the
|
|
|
|
* arc_mfu state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_evictable_metadata;
|
|
|
|
/*
|
|
|
|
* Total number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers in the arc_mfu_ghost state. See the comment above
|
|
|
|
* arcstat_mru_ghost_size for more details.
|
|
|
|
*/
|
2012-01-30 13:28:40 -08:00
|
|
|
kstat_named_t arcstat_mfu_ghost_size;
|
2015-06-26 14:54:17 -07:00
|
|
|
/*
|
|
|
|
* Number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers that are eligible for eviction, of type
|
|
|
|
* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_ghost_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers that are eligible for eviction, of type
|
|
|
|
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_l2_hits;
|
|
|
|
kstat_named_t arcstat_l2_misses;
|
|
|
|
kstat_named_t arcstat_l2_feeds;
|
|
|
|
kstat_named_t arcstat_l2_rw_clash;
|
2009-02-18 12:51:31 -08:00
|
|
|
kstat_named_t arcstat_l2_read_bytes;
|
|
|
|
kstat_named_t arcstat_l2_write_bytes;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_l2_writes_sent;
|
|
|
|
kstat_named_t arcstat_l2_writes_done;
|
|
|
|
kstat_named_t arcstat_l2_writes_error;
|
2015-01-12 19:52:19 -08:00
|
|
|
kstat_named_t arcstat_l2_writes_lock_retry;
|
2016-02-10 10:42:01 -08:00
|
|
|
kstat_named_t arcstat_l2_writes_skip_toobig;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_l2_evict_lock_retry;
|
|
|
|
kstat_named_t arcstat_l2_evict_reading;
|
2014-12-29 19:12:23 -08:00
|
|
|
kstat_named_t arcstat_l2_evict_l1cached;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_l2_free_on_write;
|
2015-01-12 19:52:19 -08:00
|
|
|
kstat_named_t arcstat_l2_cdata_free_on_write;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_l2_abort_lowmem;
|
|
|
|
kstat_named_t arcstat_l2_cksum_bad;
|
|
|
|
kstat_named_t arcstat_l2_io_error;
|
|
|
|
kstat_named_t arcstat_l2_size;
|
2013-08-01 13:02:10 -07:00
|
|
|
kstat_named_t arcstat_l2_asize;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_l2_hdr_size;
|
2013-08-01 13:02:10 -07:00
|
|
|
kstat_named_t arcstat_l2_compress_successes;
|
|
|
|
kstat_named_t arcstat_l2_compress_zeros;
|
|
|
|
kstat_named_t arcstat_l2_compress_failures;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_named_t arcstat_memory_throttle_count;
|
2012-12-21 14:57:09 -08:00
|
|
|
kstat_named_t arcstat_duplicate_buffers;
|
|
|
|
kstat_named_t arcstat_duplicate_buffers_size;
|
|
|
|
kstat_named_t arcstat_duplicate_reads;
|
2011-03-29 18:08:59 -07:00
|
|
|
kstat_named_t arcstat_memory_direct_count;
|
|
|
|
kstat_named_t arcstat_memory_indirect_count;
|
2011-03-24 12:13:55 -07:00
|
|
|
kstat_named_t arcstat_no_grow;
|
|
|
|
kstat_named_t arcstat_tempreserve;
|
|
|
|
kstat_named_t arcstat_loaned_bytes;
|
2011-12-22 12:20:43 -08:00
|
|
|
kstat_named_t arcstat_prune;
|
2011-03-24 12:13:55 -07:00
|
|
|
kstat_named_t arcstat_meta_used;
|
|
|
|
kstat_named_t arcstat_meta_limit;
|
2016-07-13 07:42:40 -05:00
|
|
|
kstat_named_t arcstat_dnode_limit;
|
2011-03-24 12:13:55 -07:00
|
|
|
kstat_named_t arcstat_meta_max;
|
2015-01-12 19:52:19 -08:00
|
|
|
kstat_named_t arcstat_meta_min;
|
2015-12-26 22:10:31 +01:00
|
|
|
kstat_named_t arcstat_sync_wait_for_async;
|
|
|
|
kstat_named_t arcstat_demand_hit_predictive_prefetch;
|
2015-07-27 13:17:32 -07:00
|
|
|
kstat_named_t arcstat_need_free;
|
|
|
|
kstat_named_t arcstat_sys_free;
|
2008-11-20 12:01:55 -08:00
|
|
|
} arc_stats_t;
|
|
|
|
|
|
|
|
static arc_stats_t arc_stats = {
|
|
|
|
{ "hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "misses", KSTAT_DATA_UINT64 },
|
|
|
|
{ "demand_data_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "demand_data_misses", KSTAT_DATA_UINT64 },
|
|
|
|
{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
|
|
|
|
{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
|
|
|
|
{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mru_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mfu_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "deleted", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mutex_miss", KSTAT_DATA_UINT64 },
|
|
|
|
{ "evict_skip", KSTAT_DATA_UINT64 },
|
2015-01-12 19:52:19 -08:00
|
|
|
{ "evict_not_enough", KSTAT_DATA_UINT64 },
|
2010-05-28 13:45:14 -07:00
|
|
|
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
|
|
|
|
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
|
|
|
|
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
|
2015-01-12 19:52:19 -08:00
|
|
|
{ "evict_l2_skip", KSTAT_DATA_UINT64 },
|
2008-11-20 12:01:55 -08:00
|
|
|
{ "hash_elements", KSTAT_DATA_UINT64 },
|
|
|
|
{ "hash_elements_max", KSTAT_DATA_UINT64 },
|
|
|
|
{ "hash_collisions", KSTAT_DATA_UINT64 },
|
|
|
|
{ "hash_chains", KSTAT_DATA_UINT64 },
|
|
|
|
{ "hash_chain_max", KSTAT_DATA_UINT64 },
|
|
|
|
{ "p", KSTAT_DATA_UINT64 },
|
|
|
|
{ "c", KSTAT_DATA_UINT64 },
|
|
|
|
{ "c_min", KSTAT_DATA_UINT64 },
|
|
|
|
{ "c_max", KSTAT_DATA_UINT64 },
|
|
|
|
{ "size", KSTAT_DATA_UINT64 },
|
|
|
|
{ "hdr_size", KSTAT_DATA_UINT64 },
|
2009-02-18 12:51:31 -08:00
|
|
|
{ "data_size", KSTAT_DATA_UINT64 },
|
2015-06-26 14:54:17 -07:00
|
|
|
{ "metadata_size", KSTAT_DATA_UINT64 },
|
2016-07-13 07:42:40 -05:00
|
|
|
{ "dbuf_size", KSTAT_DATA_UINT64 },
|
|
|
|
{ "dnode_size", KSTAT_DATA_UINT64 },
|
|
|
|
{ "bonus_size", KSTAT_DATA_UINT64 },
|
2012-01-30 13:28:40 -08:00
|
|
|
{ "anon_size", KSTAT_DATA_UINT64 },
|
2015-06-26 14:54:17 -07:00
|
|
|
{ "anon_evictable_data", KSTAT_DATA_UINT64 },
|
|
|
|
{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
|
2012-01-30 13:28:40 -08:00
|
|
|
{ "mru_size", KSTAT_DATA_UINT64 },
|
2015-06-26 14:54:17 -07:00
|
|
|
{ "mru_evictable_data", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
|
2012-01-30 13:28:40 -08:00
|
|
|
{ "mru_ghost_size", KSTAT_DATA_UINT64 },
|
2015-06-26 14:54:17 -07:00
|
|
|
{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
|
2012-01-30 13:28:40 -08:00
|
|
|
{ "mfu_size", KSTAT_DATA_UINT64 },
|
2015-06-26 14:54:17 -07:00
|
|
|
{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
|
2012-01-30 13:28:40 -08:00
|
|
|
{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
|
2015-06-26 14:54:17 -07:00
|
|
|
{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
|
|
|
|
{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
|
2008-11-20 12:01:55 -08:00
|
|
|
{ "l2_hits", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_misses", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_feeds", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_rw_clash", KSTAT_DATA_UINT64 },
|
2009-02-18 12:51:31 -08:00
|
|
|
{ "l2_read_bytes", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_write_bytes", KSTAT_DATA_UINT64 },
|
2008-11-20 12:01:55 -08:00
|
|
|
{ "l2_writes_sent", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_writes_done", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_writes_error", KSTAT_DATA_UINT64 },
|
2015-01-12 19:52:19 -08:00
|
|
|
{ "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
|
2016-02-10 10:42:01 -08:00
|
|
|
{ "l2_writes_skip_toobig", KSTAT_DATA_UINT64 },
|
2008-11-20 12:01:55 -08:00
|
|
|
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
|
2014-12-29 19:12:23 -08:00
|
|
|
{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
|
2008-11-20 12:01:55 -08:00
|
|
|
{ "l2_free_on_write", KSTAT_DATA_UINT64 },
|
2015-01-12 19:52:19 -08:00
|
|
|
{ "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
|
2008-11-20 12:01:55 -08:00
|
|
|
{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_io_error", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_size", KSTAT_DATA_UINT64 },
|
2013-08-01 13:02:10 -07:00
|
|
|
{ "l2_asize", KSTAT_DATA_UINT64 },
|
2008-11-20 12:01:55 -08:00
|
|
|
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
|
2013-08-01 13:02:10 -07:00
|
|
|
{ "l2_compress_successes", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_compress_zeros", KSTAT_DATA_UINT64 },
|
|
|
|
{ "l2_compress_failures", KSTAT_DATA_UINT64 },
|
2011-03-24 12:13:55 -07:00
|
|
|
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
|
2012-12-21 14:57:09 -08:00
|
|
|
{ "duplicate_buffers", KSTAT_DATA_UINT64 },
|
|
|
|
{ "duplicate_buffers_size", KSTAT_DATA_UINT64 },
|
|
|
|
{ "duplicate_reads", KSTAT_DATA_UINT64 },
|
2011-03-29 18:08:59 -07:00
|
|
|
{ "memory_direct_count", KSTAT_DATA_UINT64 },
|
|
|
|
{ "memory_indirect_count", KSTAT_DATA_UINT64 },
|
2011-03-24 12:13:55 -07:00
|
|
|
{ "arc_no_grow", KSTAT_DATA_UINT64 },
|
|
|
|
{ "arc_tempreserve", KSTAT_DATA_UINT64 },
|
|
|
|
{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
|
2011-12-22 12:20:43 -08:00
|
|
|
{ "arc_prune", KSTAT_DATA_UINT64 },
|
2011-03-24 12:13:55 -07:00
|
|
|
{ "arc_meta_used", KSTAT_DATA_UINT64 },
|
|
|
|
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
|
2016-07-13 07:42:40 -05:00
|
|
|
{ "arc_dnode_limit", KSTAT_DATA_UINT64 },
|
2011-03-24 12:13:55 -07:00
|
|
|
{ "arc_meta_max", KSTAT_DATA_UINT64 },
|
2015-07-27 13:17:32 -07:00
|
|
|
{ "arc_meta_min", KSTAT_DATA_UINT64 },
|
2015-12-26 22:10:31 +01:00
|
|
|
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
|
|
|
|
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
|
2015-07-27 13:17:32 -07:00
|
|
|
{ "arc_need_free", KSTAT_DATA_UINT64 },
|
|
|
|
{ "arc_sys_free", KSTAT_DATA_UINT64 }
|
2008-11-20 12:01:55 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
|
|
|
|
|
|
|
|
#define ARCSTAT_INCR(stat, val) \
|
2013-06-11 09:12:34 -08:00
|
|
|
atomic_add_64(&arc_stats.stat.value.ui64, (val))
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
|
2008-11-20 12:01:55 -08:00
|
|
|
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
|
|
|
|
|
|
|
|
/*
 * Raise arc_stats.<stat> to val if val is larger than the value currently
 * stored there.  The stored value is sampled into a local, and the update
 * is attempted with atomic_cas_64(); the loop repeats for as long as val is
 * still larger than the sampled value and the compare-and-swap did not
 * return that sampled value.
 *
 * val is evaluated more than once, so it must be free of side effects.
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon is exactly one statement, which keeps it safe as the body of
 * an unbraced if/else.
 */
#define	ARCSTAT_MAX(stat, val) do {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
} while (0)
|
|
|
|
|
|
|
|
#define ARCSTAT_MAXSTAT(stat) \
|
|
|
|
ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
|
|
|
|
|
|
|
|
/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 *
 * The counter that gets bumped is arcstat_<A>_<B>_<stat>, where <A> is
 * stat1 when cond1 is true and notstat1 otherwise, and <B> is stat2 when
 * cond2 is true and notstat2 otherwise.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon is a single statement; a bare if/else here would capture the
 * "else" of an enclosing unbraced if.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
do {									\
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}								\
} while (0)
|
|
|
|
|
|
|
|
kstat_t *arc_ksp;
|
2010-05-28 13:45:14 -07:00
|
|
|
static arc_state_t *arc_anon;
|
2008-11-20 12:01:55 -08:00
|
|
|
static arc_state_t *arc_mru;
|
|
|
|
static arc_state_t *arc_mru_ghost;
|
|
|
|
static arc_state_t *arc_mfu;
|
|
|
|
static arc_state_t *arc_mfu_ghost;
|
|
|
|
static arc_state_t *arc_l2c_only;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are several ARC variables that are critical to export as kstats --
|
|
|
|
* but we don't want to have to grovel around in the kstat whenever we wish to
|
|
|
|
* manipulate them. For these variables, we therefore define them to be in
|
|
|
|
* terms of the statistic variable. This assures that we are not introducing
|
|
|
|
* the possibility of inconsistency by having shadow copies of the variables,
|
|
|
|
* while still allowing the code to be readable.
|
|
|
|
*/
|
|
|
|
#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
|
|
|
|
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
|
|
|
|
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
|
|
|
|
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
|
|
|
|
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
|
2011-03-24 12:13:55 -07:00
|
|
|
#define arc_no_grow ARCSTAT(arcstat_no_grow)
|
|
|
|
#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
|
|
|
|
#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
|
2013-02-17 12:00:54 -08:00
|
|
|
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
|
2016-07-13 07:42:40 -05:00
|
|
|
#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
|
2015-01-12 19:52:19 -08:00
|
|
|
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
|
2013-02-17 12:00:54 -08:00
|
|
|
#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
|
|
|
|
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
|
2016-07-13 07:42:40 -05:00
|
|
|
#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
|
|
|
|
#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */
|
|
|
|
#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
|
2015-07-27 13:17:32 -07:00
|
|
|
#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
|
|
|
|
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
#define L2ARC_IS_VALID_COMPRESS(_c_) \
|
|
|
|
((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
|
|
|
|
|
2011-12-22 12:20:43 -08:00
|
|
|
static list_t arc_prune_list;
|
|
|
|
static kmutex_t arc_prune_mtx;
|
2015-05-30 09:57:53 -05:00
|
|
|
static taskq_t *arc_prune_taskq;
|
2008-11-20 12:01:55 -08:00
|
|
|
static arc_buf_t *arc_eviction_list;
|
|
|
|
static arc_buf_hdr_t arc_eviction_hdr;
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
#define GHOST_STATE(state) \
|
|
|
|
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
|
|
|
|
(state) == arc_l2c_only)
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
|
|
|
|
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
|
|
|
|
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
|
|
|
|
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
|
|
|
|
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
|
|
|
|
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
|
2014-12-29 19:12:23 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
|
2014-12-29 19:12:23 -08:00
|
|
|
#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
|
2014-12-06 09:24:32 -08:00
|
|
|
#define HDR_L2_READING(hdr) \
|
2014-12-29 19:12:23 -08:00
|
|
|
(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
|
|
|
|
((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
|
2014-12-06 09:24:32 -08:00
|
|
|
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
|
|
|
|
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
|
|
|
|
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
#define HDR_ISTYPE_METADATA(hdr) \
|
|
|
|
((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
|
|
|
|
#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
|
|
|
|
|
|
|
|
#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
|
|
|
|
#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Other sizes
|
|
|
|
*/
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
|
|
|
|
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Hash table routines
|
|
|
|
*/
|
|
|
|
|
2010-08-26 11:46:09 -07:00
|
|
|
#define HT_LOCK_ALIGN 64
|
|
|
|
#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
struct ht_lock {
|
|
|
|
kmutex_t ht_lock;
|
|
|
|
#ifdef _KERNEL
|
2010-08-26 11:46:09 -07:00
|
|
|
unsigned char pad[HT_LOCK_PAD];
|
2008-11-20 12:01:55 -08:00
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
2014-10-23 16:00:41 -07:00
|
|
|
#define BUF_LOCKS 8192
|
2008-11-20 12:01:55 -08:00
|
|
|
typedef struct buf_hash_table {
|
|
|
|
uint64_t ht_mask;
|
|
|
|
arc_buf_hdr_t **ht_table;
|
|
|
|
struct ht_lock ht_locks[BUF_LOCKS];
|
|
|
|
} buf_hash_table_t;
|
|
|
|
|
|
|
|
static buf_hash_table_t buf_hash_table;
|
|
|
|
|
|
|
|
/*
 * Hash table indexing and locking helpers.
 *
 * BUF_HASH_INDEX	bucket index: buf_hash() masked with ht_mask.
 * BUF_HASH_LOCK_NTRY	the ht_locks[] entry covering a bucket index; the
 *			index is reduced with (BUF_LOCKS - 1).
 * BUF_HASH_LOCK	pointer to the kmutex_t inside that entry.
 * HDR_LOCK		the lock covering the bucket a header hashes to.
 *
 * Every macro argument is parenthesized in the expansion so that
 * expression arguments (e.g. "a | b" or "p + 1") are not re-associated by
 * the operators used inside the macro.
 */
#define	BUF_HASH_INDEX(spa, dva, birth)	\
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx)	\
	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr)	\
	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
	(hdr)->b_birth)))
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
uint64_t zfs_crc64_table[256];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Level 2 ARC
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
|
2013-08-01 13:02:10 -07:00
|
|
|
#define L2ARC_HEADROOM 2 /* num of writes */
|
2016-02-10 10:42:01 -08:00
|
|
|
#define L2ARC_MAX_BLOCK_SIZE (16 * 1024 * 1024) /* max compress size */
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* If we discover during ARC scan any buffers to be compressed, we boost
|
|
|
|
* our headroom for the next scanning cycle by this percentage multiple.
|
|
|
|
*/
|
|
|
|
#define L2ARC_HEADROOM_BOOST 200
|
2009-02-18 12:51:31 -08:00
|
|
|
#define L2ARC_FEED_SECS 1 /* caching interval secs */
|
|
|
|
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2016-02-10 10:42:01 -08:00
|
|
|
|
2015-06-16 01:12:19 +02:00
|
|
|
/*
|
|
|
|
* Used to distinguish headers that are being processed by
|
|
|
|
* l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
|
|
|
|
* address. This can happen when the header is added to the l2arc's list
|
|
|
|
* of buffers to write in the first stage of l2arc_write_buffers(), but
|
|
|
|
* has not yet been written out which happens in the second stage of
|
|
|
|
* l2arc_write_buffers().
|
|
|
|
*/
|
|
|
|
#define L2ARC_ADDR_UNSET ((uint64_t)(-1))
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
|
|
|
|
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
|
|
|
|
|
2013-06-11 09:12:34 -08:00
|
|
|
/* L2ARC Performance Tunables */
|
2011-07-08 12:41:57 -07:00
|
|
|
unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
|
|
|
|
unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
|
|
|
|
unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
|
2013-08-01 13:02:10 -07:00
|
|
|
unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
|
2016-02-10 10:42:01 -08:00
|
|
|
unsigned long l2arc_max_block_size = L2ARC_MAX_BLOCK_SIZE;
|
2011-07-08 12:41:57 -07:00
|
|
|
unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
|
|
|
|
unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
|
|
|
|
int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
|
2013-08-01 13:02:10 -07:00
|
|
|
int l2arc_nocompress = B_FALSE; /* don't compress bufs */
|
2011-07-08 12:41:57 -07:00
|
|
|
int l2arc_feed_again = B_TRUE; /* turbo warmup */
|
2013-07-24 09:57:56 -07:00
|
|
|
int l2arc_norw = B_FALSE; /* no reads during writes */
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* L2ARC Internals
|
|
|
|
*/
|
|
|
|
static list_t L2ARC_dev_list; /* device list */
|
|
|
|
static list_t *l2arc_dev_list; /* device list pointer */
|
|
|
|
static kmutex_t l2arc_dev_mtx; /* device list mutex */
|
|
|
|
static l2arc_dev_t *l2arc_dev_last; /* last device used */
|
|
|
|
static list_t L2ARC_free_on_write; /* free after write buf list */
|
|
|
|
static list_t *l2arc_free_on_write; /* free after write list ptr */
|
|
|
|
static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
|
|
|
|
static uint64_t l2arc_ndev; /* number of devices */
|
|
|
|
|
|
|
|
typedef struct l2arc_read_callback {
|
2013-08-01 13:02:10 -07:00
|
|
|
arc_buf_t *l2rcb_buf; /* read buffer */
|
|
|
|
spa_t *l2rcb_spa; /* spa */
|
|
|
|
blkptr_t l2rcb_bp; /* original blkptr */
|
2014-06-25 10:37:59 -08:00
|
|
|
zbookmark_phys_t l2rcb_zb; /* original bookmark */
|
2013-08-01 13:02:10 -07:00
|
|
|
int l2rcb_flags; /* original flags */
|
|
|
|
enum zio_compress l2rcb_compress; /* applied compress */
|
2008-11-20 12:01:55 -08:00
|
|
|
} l2arc_read_callback_t;
|
|
|
|
|
|
|
|
typedef struct l2arc_data_free {
|
|
|
|
/* protected by l2arc_free_on_write_mtx */
|
|
|
|
void *l2df_data;
|
|
|
|
size_t l2df_size;
|
|
|
|
void (*l2df_func)(void *, size_t);
|
|
|
|
list_node_t l2df_list_node;
|
|
|
|
} l2arc_data_free_t;
|
|
|
|
|
|
|
|
static kmutex_t l2arc_feed_thr_lock;
|
|
|
|
static kcondvar_t l2arc_feed_thr_cv;
|
|
|
|
static uint8_t l2arc_thread_exit;
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
static void arc_get_data_buf(arc_buf_t *);
|
|
|
|
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
|
2015-01-12 19:52:19 -08:00
|
|
|
static boolean_t arc_is_overflowing(void);
|
2014-12-06 09:24:32 -08:00
|
|
|
static void arc_buf_watch(arc_buf_t *);
|
2015-06-26 11:28:18 -07:00
|
|
|
static void arc_tuning_update(void);
|
2016-07-13 07:42:40 -05:00
|
|
|
static void arc_prune_async(int64_t);
|
2014-12-06 09:24:32 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
|
|
|
|
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
|
|
|
|
static void l2arc_read_done(zio_t *);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
|
2014-12-06 09:24:32 -08:00
|
|
|
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
|
|
|
|
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static uint64_t
|
2009-02-18 12:51:31 -08:00
|
|
|
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
uint8_t *vdva = (uint8_t *)dva;
|
|
|
|
uint64_t crc = -1ULL;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
|
|
|
|
|
|
|
|
for (i = 0; i < sizeof (dva_t); i++)
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
|
|
|
|
|
2009-02-18 12:51:31 -08:00
|
|
|
crc ^= (spa>>8) ^ birth;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
return (crc);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define BUF_EMPTY(buf) \
|
|
|
|
((buf)->b_dva.dva_word[0] == 0 && \
|
2014-12-29 19:12:23 -08:00
|
|
|
(buf)->b_dva.dva_word[1] == 0)
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
 * True when the header "buf" carries the identity (spa, dva, birth): both
 * DVA words, the birth value and the spa must match.
 *
 * The whole expansion and every argument are parenthesized so that the
 * macro can be negated or combined with other operators, and so that
 * expression arguments are compared as a unit.
 */
#define	BUF_EQUAL(spa, dva, birth, buf)				\
	(((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
static void
|
|
|
|
buf_discard_identity(arc_buf_hdr_t *hdr)
|
|
|
|
{
|
|
|
|
hdr->b_dva.dva_word[0] = 0;
|
|
|
|
hdr->b_dva.dva_word[1] = 0;
|
|
|
|
hdr->b_birth = 0;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static arc_buf_hdr_t *
|
2014-06-05 13:19:08 -08:00
|
|
|
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2014-06-05 13:19:08 -08:00
|
|
|
const dva_t *dva = BP_IDENTITY(bp);
|
|
|
|
uint64_t birth = BP_PHYSICAL_BIRTH(bp);
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
|
|
|
|
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_buf_hdr_t *hdr;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_enter(hash_lock);
|
2014-12-06 09:24:32 -08:00
|
|
|
for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
|
|
|
|
hdr = hdr->b_hash_next) {
|
|
|
|
if (BUF_EQUAL(spa, dva, birth, hdr)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
*lockp = hash_lock;
|
2014-12-06 09:24:32 -08:00
|
|
|
return (hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
*lockp = NULL;
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Insert an entry into the hash table. If there is already an element
|
|
|
|
* equal to elem in the hash table, then the already existing element
|
|
|
|
* will be returned and the new element will not be inserted.
|
|
|
|
* Otherwise returns NULL.
|
2014-12-29 19:12:23 -08:00
|
|
|
* If lockp == NULL, the caller is assumed to already hold the hash lock.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
static arc_buf_hdr_t *
|
2014-12-06 09:24:32 -08:00
|
|
|
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2014-12-06 09:24:32 -08:00
|
|
|
uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
|
2008-11-20 12:01:55 -08:00
|
|
|
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_buf_hdr_t *fhdr;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint32_t i;
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
|
|
|
|
ASSERT(hdr->b_birth != 0);
|
|
|
|
ASSERT(!HDR_IN_HASH_TABLE(hdr));
|
2014-12-29 19:12:23 -08:00
|
|
|
|
|
|
|
if (lockp != NULL) {
|
|
|
|
*lockp = hash_lock;
|
|
|
|
mutex_enter(hash_lock);
|
|
|
|
} else {
|
|
|
|
ASSERT(MUTEX_HELD(hash_lock));
|
|
|
|
}
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
|
|
|
|
fhdr = fhdr->b_hash_next, i++) {
|
|
|
|
if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
|
|
|
|
return (fhdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_hash_next = buf_hash_table.ht_table[idx];
|
|
|
|
buf_hash_table.ht_table[idx] = hdr;
|
|
|
|
hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* collect some hash table performance data */
|
|
|
|
if (i > 0) {
|
|
|
|
ARCSTAT_BUMP(arcstat_hash_collisions);
|
|
|
|
if (i == 1)
|
|
|
|
ARCSTAT_BUMP(arcstat_hash_chains);
|
|
|
|
|
|
|
|
ARCSTAT_MAX(arcstat_hash_chain_max, i);
|
|
|
|
}
|
|
|
|
|
|
|
|
ARCSTAT_BUMP(arcstat_hash_elements);
|
|
|
|
ARCSTAT_MAXSTAT(arcstat_hash_elements);
|
|
|
|
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2014-12-06 09:24:32 -08:00
|
|
|
buf_hash_remove(arc_buf_hdr_t *hdr)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_buf_hdr_t *fhdr, **hdrp;
|
|
|
|
uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
|
2014-12-06 09:24:32 -08:00
|
|
|
ASSERT(HDR_IN_HASH_TABLE(hdr));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
hdrp = &buf_hash_table.ht_table[idx];
|
|
|
|
while ((fhdr = *hdrp) != hdr) {
|
|
|
|
ASSERT(fhdr != NULL);
|
|
|
|
hdrp = &fhdr->b_hash_next;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2014-12-06 09:24:32 -08:00
|
|
|
*hdrp = hdr->b_hash_next;
|
|
|
|
hdr->b_hash_next = NULL;
|
|
|
|
hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* collect some hash table performance data */
|
|
|
|
ARCSTAT_BUMPDOWN(arcstat_hash_elements);
|
|
|
|
|
|
|
|
if (buf_hash_table.ht_table[idx] &&
|
|
|
|
buf_hash_table.ht_table[idx]->b_hash_next == NULL)
|
|
|
|
ARCSTAT_BUMPDOWN(arcstat_hash_chains);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Global data structures and functions for the buf kmem cache.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
static kmem_cache_t *hdr_full_cache;
|
|
|
|
static kmem_cache_t *hdr_l2only_cache;
|
2008-11-20 12:01:55 -08:00
|
|
|
static kmem_cache_t *buf_cache;
|
|
|
|
|
|
|
|
static void
|
|
|
|
buf_fini(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2010-08-26 11:46:09 -07:00
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
|
2013-11-01 20:26:11 +01:00
|
|
|
/*
|
|
|
|
* Large allocations which do not require contiguous pages
|
|
|
|
* should be using vmem_free() in the linux kernel\
|
|
|
|
*/
|
2010-08-26 11:46:09 -07:00
|
|
|
vmem_free(buf_hash_table.ht_table,
|
|
|
|
(buf_hash_table.ht_mask + 1) * sizeof (void *));
|
|
|
|
#else
|
2008-11-20 12:01:55 -08:00
|
|
|
kmem_free(buf_hash_table.ht_table,
|
|
|
|
(buf_hash_table.ht_mask + 1) * sizeof (void *));
|
2010-08-26 11:46:09 -07:00
|
|
|
#endif
|
2008-11-20 12:01:55 -08:00
|
|
|
for (i = 0; i < BUF_LOCKS; i++)
|
|
|
|
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
|
2014-12-29 19:12:23 -08:00
|
|
|
kmem_cache_destroy(hdr_full_cache);
|
|
|
|
kmem_cache_destroy(hdr_l2only_cache);
|
2008-11-20 12:01:55 -08:00
|
|
|
kmem_cache_destroy(buf_cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Constructor callback - called when the cache is empty
|
|
|
|
* and a new buf is requested.
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr_full_cons(void *vbuf, void *unused, int kmflag)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr = vbuf;
|
|
|
|
|
|
|
|
bzero(hdr, HDR_FULL_SIZE);
|
|
|
|
cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
|
|
|
|
refcount_create(&hdr->b_l1hdr.b_refcnt);
|
|
|
|
mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
list_link_init(&hdr->b_l1hdr.b_arc_node);
|
|
|
|
list_link_init(&hdr->b_l2hdr.b_l2node);
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_link_init(&hdr->b_l1hdr.b_arc_node);
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_buf_hdr_t *hdr = vbuf;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
bzero(hdr, HDR_L2ONLY_SIZE);
|
|
|
|
arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
buf_cons(void *vbuf, void *unused, int kmflag)
|
|
|
|
{
|
|
|
|
arc_buf_t *buf = vbuf;
|
|
|
|
|
|
|
|
bzero(buf, sizeof (arc_buf_t));
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
|
2009-02-18 12:51:31 -08:00
|
|
|
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Destructor callback - called when a cached buf is
|
|
|
|
* no longer required.
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr_full_dest(void *vbuf, void *unused)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_buf_hdr_t *hdr = vbuf;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
ASSERT(BUF_EMPTY(hdr));
|
2014-12-29 19:12:23 -08:00
|
|
|
cv_destroy(&hdr->b_l1hdr.b_cv);
|
|
|
|
refcount_destroy(&hdr->b_l1hdr.b_refcnt);
|
|
|
|
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
hdr_l2only_dest(void *vbuf, void *unused)
|
|
|
|
{
|
|
|
|
ASSERTV(arc_buf_hdr_t *hdr = vbuf);
|
|
|
|
|
|
|
|
ASSERT(BUF_EMPTY(hdr));
|
|
|
|
arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
buf_dest(void *vbuf, void *unused)
|
|
|
|
{
|
|
|
|
arc_buf_t *buf = vbuf;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_destroy(&buf->b_evict_lock);
|
2009-02-18 12:51:31 -08:00
|
|
|
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
|
2008-12-03 12:09:06 -08:00
|
|
|
}
|
|
|
|
|
2015-06-29 10:34:47 -07:00
|
|
|
/*
|
|
|
|
* Reclaim callback -- invoked when memory is low.
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
hdr_recl(void *unused)
|
|
|
|
{
|
|
|
|
dprintf("hdr_recl called\n");
|
|
|
|
/*
|
|
|
|
* umem calls the reclaim func when we destroy the buf cache,
|
|
|
|
* which is after we do arc_fini().
|
|
|
|
*/
|
|
|
|
if (!arc_dead)
|
|
|
|
cv_signal(&arc_reclaim_thread_cv);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
|
|
|
buf_init(void)
|
|
|
|
{
|
|
|
|
uint64_t *ct;
|
|
|
|
uint64_t hsize = 1ULL << 12;
|
|
|
|
int i, j;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hash table is big enough to fill all of physical memory
|
2014-08-20 10:09:40 -07:00
|
|
|
* with an average block size of zfs_arc_average_blocksize (default 8K).
|
|
|
|
* By default, the table will take up
|
|
|
|
* totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2014-08-20 10:09:40 -07:00
|
|
|
while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
|
2008-11-20 12:01:55 -08:00
|
|
|
hsize <<= 1;
|
|
|
|
retry:
|
|
|
|
buf_hash_table.ht_mask = hsize - 1;
|
2010-08-26 11:46:09 -07:00
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
|
2013-11-01 20:26:11 +01:00
|
|
|
/*
|
|
|
|
* Large allocations which do not require contiguous pages
|
|
|
|
* should be using vmem_alloc() in the linux kernel
|
|
|
|
*/
|
2010-08-26 11:46:09 -07:00
|
|
|
buf_hash_table.ht_table =
|
|
|
|
vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
|
|
|
|
#else
|
2008-11-20 12:01:55 -08:00
|
|
|
buf_hash_table.ht_table =
|
|
|
|
kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
|
2010-08-26 11:46:09 -07:00
|
|
|
#endif
|
2008-11-20 12:01:55 -08:00
|
|
|
if (buf_hash_table.ht_table == NULL) {
|
|
|
|
ASSERT(hsize > (1ULL << 8));
|
|
|
|
hsize >>= 1;
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
|
2015-06-29 10:34:47 -07:00
|
|
|
0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
|
2015-06-29 10:34:47 -07:00
|
|
|
HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
|
2014-12-29 19:12:23 -08:00
|
|
|
NULL, NULL, 0);
|
2008-11-20 12:01:55 -08:00
|
|
|
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
|
2008-12-03 12:09:06 -08:00
|
|
|
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
for (i = 0; i < 256; i++)
|
|
|
|
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
|
|
|
|
*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
|
|
|
|
|
|
|
|
for (i = 0; i < BUF_LOCKS; i++) {
|
|
|
|
mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
|
2015-03-30 22:43:29 -05:00
|
|
|
NULL, MUTEX_DEFAULT, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
/*
|
|
|
|
* Transition between the two allocation states for the arc_buf_hdr struct.
|
|
|
|
* The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
|
|
|
|
* (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
|
|
|
|
* version is used when a cache buffer is only in the L2ARC in order to reduce
|
|
|
|
* memory usage.
|
|
|
|
*/
|
|
|
|
static arc_buf_hdr_t *
|
|
|
|
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *nhdr;
|
|
|
|
l2arc_dev_t *dev;
|
|
|
|
|
|
|
|
ASSERT(HDR_HAS_L2HDR(hdr));
|
|
|
|
ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
|
|
|
|
(old == hdr_l2only_cache && new == hdr_full_cache));
|
|
|
|
|
|
|
|
dev = hdr->b_l2hdr.b_dev;
|
|
|
|
nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
|
|
|
|
buf_hash_remove(hdr);
|
|
|
|
|
|
|
|
bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
|
2015-06-16 01:12:19 +02:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (new == hdr_full_cache) {
|
|
|
|
nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
|
|
|
|
/*
|
|
|
|
* arc_access and arc_change_state need to be aware that a
|
|
|
|
* header has just come out of L2ARC, so we set its state to
|
|
|
|
* l2c_only even though it's about to change.
|
|
|
|
*/
|
|
|
|
nhdr->b_l1hdr.b_state = arc_l2c_only;
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
/* Verify previous threads set to NULL before freeing */
|
|
|
|
ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
|
2014-12-29 19:12:23 -08:00
|
|
|
} else {
|
|
|
|
ASSERT(hdr->b_l1hdr.b_buf == NULL);
|
|
|
|
ASSERT0(hdr->b_l1hdr.b_datacnt);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we've reached here, We must have been called from
|
|
|
|
* arc_evict_hdr(), as such we should have already been
|
|
|
|
* removed from any ghost list we were previously on
|
|
|
|
* (which protects us from racing with arc_evict_state),
|
|
|
|
* thus no locking is needed during this check.
|
|
|
|
*/
|
|
|
|
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* A buffer must not be moved into the arc_l2c_only
|
|
|
|
* state if it's not finished being written out to the
|
|
|
|
* l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
|
|
|
|
* might try to be accessed, even though it was removed.
|
2014-12-29 19:12:23 -08:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
VERIFY(!HDR_L2_WRITING(hdr));
|
|
|
|
VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* The header has been reallocated so we need to re-insert it into any
|
|
|
|
* lists it was on.
|
|
|
|
*/
|
|
|
|
(void) buf_hash_insert(nhdr, NULL);
|
|
|
|
|
|
|
|
ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
|
|
|
|
|
|
|
|
mutex_enter(&dev->l2ad_mtx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We must place the realloc'ed header back into the list at
|
|
|
|
* the same spot. Otherwise, if it's placed earlier in the list,
|
|
|
|
* l2arc_write_buffers() could find it during the function's
|
|
|
|
* write phase, and try to write it out to the l2arc.
|
|
|
|
*/
|
|
|
|
list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
|
|
|
|
list_remove(&dev->l2ad_buflist, hdr);
|
|
|
|
|
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
|
|
|
|
2015-06-16 01:12:19 +02:00
|
|
|
/*
|
|
|
|
* Since we're using the pointer address as the tag when
|
|
|
|
* incrementing and decrementing the l2ad_alloc refcount, we
|
|
|
|
* must remove the old pointer (that we're about to destroy) and
|
|
|
|
* add the new pointer to the refcount. Otherwise we'd remove
|
|
|
|
* the wrong pointer address when calling arc_hdr_destroy() later.
|
|
|
|
*/
|
|
|
|
|
|
|
|
(void) refcount_remove_many(&dev->l2ad_alloc,
|
|
|
|
hdr->b_l2hdr.b_asize, hdr);
|
|
|
|
|
|
|
|
(void) refcount_add_many(&dev->l2ad_alloc,
|
|
|
|
nhdr->b_l2hdr.b_asize, nhdr);
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
buf_discard_identity(hdr);
|
|
|
|
hdr->b_freeze_cksum = NULL;
|
|
|
|
kmem_cache_free(old, hdr);
|
|
|
|
|
|
|
|
return (nhdr);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
#define ARC_MINTIME (hz>>4) /* 62 ms */
|
|
|
|
|
|
|
|
static void
|
|
|
|
arc_cksum_verify(arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
zio_cksum_t zc;
|
|
|
|
|
|
|
|
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
|
|
|
|
return;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
|
|
|
if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
|
|
|
|
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
|
|
|
|
if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
|
|
|
|
panic("buffer modified while frozen!");
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
arc_cksum_equal(arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
zio_cksum_t zc;
|
|
|
|
int equal;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
|
|
|
|
equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
return (equal);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
arc_cksum_compute(arc_buf_t *buf, boolean_t force)
|
|
|
|
{
|
|
|
|
if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
|
|
|
|
return;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
if (buf->b_hdr->b_freeze_cksum != NULL) {
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
return;
|
|
|
|
}
|
2015-06-29 10:02:03 -07:00
|
|
|
buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
|
2008-11-20 12:01:55 -08:00
|
|
|
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
|
|
|
|
buf->b_hdr->b_freeze_cksum);
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2013-05-16 14:18:06 -07:00
|
|
|
arc_buf_watch(buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef _KERNEL
|
|
|
|
void
|
|
|
|
arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
|
|
|
|
{
|
|
|
|
panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
arc_buf_unwatch(arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
#ifndef _KERNEL
|
|
|
|
if (arc_watch) {
|
|
|
|
ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
|
|
|
|
PROT_READ | PROT_WRITE));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
arc_buf_watch(arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
#ifndef _KERNEL
|
|
|
|
if (arc_watch)
|
|
|
|
ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
|
|
|
|
#endif
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
static arc_buf_contents_t
|
|
|
|
arc_buf_type(arc_buf_hdr_t *hdr)
|
|
|
|
{
|
|
|
|
if (HDR_ISTYPE_METADATA(hdr)) {
|
|
|
|
return (ARC_BUFC_METADATA);
|
|
|
|
} else {
|
|
|
|
return (ARC_BUFC_DATA);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
arc_bufc_to_flags(arc_buf_contents_t type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case ARC_BUFC_DATA:
|
|
|
|
/* metadata field is 0 if buffer contains normal data */
|
|
|
|
return (0);
|
|
|
|
case ARC_BUFC_METADATA:
|
|
|
|
return (ARC_FLAG_BUFC_METADATA);
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
panic("undefined ARC buffer type!");
|
|
|
|
return ((uint32_t)-1);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
void
|
|
|
|
arc_buf_thaw(arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
if (zfs_flags & ZFS_DEBUG_MODIFY) {
|
2014-12-29 19:12:23 -08:00
|
|
|
if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
|
2008-11-20 12:01:55 -08:00
|
|
|
panic("modifying non-anon buffer!");
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_IO_IN_PROGRESS(buf->b_hdr))
|
2008-11-20 12:01:55 -08:00
|
|
|
panic("modifying buffer while i/o in progress!");
|
|
|
|
arc_cksum_verify(buf);
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
if (buf->b_hdr->b_freeze_cksum != NULL) {
|
|
|
|
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
|
|
|
|
buf->b_hdr->b_freeze_cksum = NULL;
|
|
|
|
}
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
|
2013-05-16 14:18:06 -07:00
|
|
|
|
|
|
|
arc_buf_unwatch(buf);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
arc_buf_freeze(arc_buf_t *buf)
|
|
|
|
{
|
2010-05-28 13:45:14 -07:00
|
|
|
kmutex_t *hash_lock;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
|
|
|
|
return;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
hash_lock = HDR_LOCK(buf->b_hdr);
|
|
|
|
mutex_enter(hash_lock);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
|
2014-12-29 19:12:23 -08:00
|
|
|
buf->b_hdr->b_l1hdr.b_state == arc_anon);
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_cksum_compute(buf, B_FALSE);
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(hash_lock);
|
2013-05-16 14:18:06 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2014-12-06 09:24:32 -08:00
|
|
|
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_state_t *state;
|
|
|
|
|
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(MUTEX_HELD(hash_lock));
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
state = hdr->b_l1hdr.b_state;
|
|
|
|
|
|
|
|
if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
|
|
|
|
(state != arc_anon)) {
|
|
|
|
/* We don't use the L2-only state list. */
|
|
|
|
if (state != arc_l2c_only) {
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_contents_t type = arc_buf_type(hdr);
|
2014-12-29 19:12:23 -08:00
|
|
|
uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_t *list = &state->arcs_list[type];
|
|
|
|
uint64_t *size = &state->arcs_lsize[type];
|
|
|
|
|
|
|
|
multilist_remove(list, hdr);
|
2014-12-29 19:12:23 -08:00
|
|
|
|
|
|
|
if (GHOST_STATE(state)) {
|
|
|
|
ASSERT0(hdr->b_l1hdr.b_datacnt);
|
|
|
|
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
|
|
|
|
delta = hdr->b_size;
|
|
|
|
}
|
|
|
|
ASSERT(delta > 0);
|
|
|
|
ASSERT3U(*size, >=, delta);
|
|
|
|
atomic_add_64(size, -delta);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2008-12-03 12:09:06 -08:00
|
|
|
/* remove the prefetch flag if we get a reference */
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2014-12-06 09:24:32 -08:00
|
|
|
remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
int cnt;
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_state_t *state = hdr->b_l1hdr.b_state;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
|
|
|
|
ASSERT(!GHOST_STATE(state));
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
/*
|
|
|
|
* arc_l2c_only counts as a ghost state so we don't need to explicitly
|
|
|
|
* check to prevent usage of the arc_l2c_only list.
|
|
|
|
*/
|
|
|
|
if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
|
2008-11-20 12:01:55 -08:00
|
|
|
(state != arc_anon)) {
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_contents_t type = arc_buf_type(hdr);
|
|
|
|
multilist_t *list = &state->arcs_list[type];
|
|
|
|
uint64_t *size = &state->arcs_lsize[type];
|
|
|
|
|
|
|
|
multilist_insert(list, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_datacnt > 0);
|
|
|
|
atomic_add_64(size, hdr->b_size *
|
|
|
|
hdr->b_l1hdr.b_datacnt);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
return (cnt);
|
|
|
|
}
|
|
|
|
|
2013-10-02 17:11:19 -07:00
|
|
|
/*
|
|
|
|
* Returns detailed information about a specific arc buffer. When the
|
|
|
|
* state_index argument is set the function will calculate the arc header
|
|
|
|
* list position for its arc state. Since this requires a linear traversal
|
|
|
|
* callers are strongly encourage not to do this. However, it can be helpful
|
|
|
|
* for targeted analysis so the functionality is provided.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr = ab->b_hdr;
|
2014-12-29 19:12:23 -08:00
|
|
|
l1arc_buf_hdr_t *l1hdr = NULL;
|
|
|
|
l2arc_buf_hdr_t *l2hdr = NULL;
|
|
|
|
arc_state_t *state = NULL;
|
|
|
|
|
2016-07-10 09:09:02 -05:00
|
|
|
memset(abi, 0, sizeof (arc_buf_info_t));
|
|
|
|
|
|
|
|
if (hdr == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
abi->abi_flags = hdr->b_flags;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_HAS_L1HDR(hdr)) {
|
|
|
|
l1hdr = &hdr->b_l1hdr;
|
|
|
|
state = l1hdr->b_state;
|
|
|
|
}
|
|
|
|
if (HDR_HAS_L2HDR(hdr))
|
|
|
|
l2hdr = &hdr->b_l2hdr;
|
2013-10-02 17:11:19 -07:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (l1hdr) {
|
|
|
|
abi->abi_datacnt = l1hdr->b_datacnt;
|
|
|
|
abi->abi_access = l1hdr->b_arc_access;
|
|
|
|
abi->abi_mru_hits = l1hdr->b_mru_hits;
|
|
|
|
abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
|
|
|
|
abi->abi_mfu_hits = l1hdr->b_mfu_hits;
|
|
|
|
abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
|
|
|
|
abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (l2hdr) {
|
|
|
|
abi->abi_l2arc_dattr = l2hdr->b_daddr;
|
|
|
|
abi->abi_l2arc_asize = l2hdr->b_asize;
|
2015-09-11 09:18:56 -07:00
|
|
|
abi->abi_l2arc_compress = l2hdr->b_compress;
|
2014-12-29 19:12:23 -08:00
|
|
|
abi->abi_l2arc_hits = l2hdr->b_hits;
|
|
|
|
}
|
|
|
|
|
2013-10-02 17:11:19 -07:00
|
|
|
abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
|
2014-12-29 19:12:23 -08:00
|
|
|
abi->abi_state_contents = arc_buf_type(hdr);
|
2013-10-02 17:11:19 -07:00
|
|
|
abi->abi_size = hdr->b_size;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
 * Move the supplied buffer header to the indicated state.  The hash lock
 * for the buffer must be held by the caller.
 *
 * Two sets of accounting are updated:
 *  - if the header has no holds (it is evictable), it is moved from the
 *    old state's list to the new state's list and the per-type evictable
 *    sizes (arcs_lsize) are adjusted;
 *  - the per-state arcs_size refcounts are adjusted, tagged with the header
 *    itself for ghost states and with each arc_buf_t otherwise.
 * Neither is done for the arc_l2c_only state, and no lists or arcs_lsize
 * are touched for arc_anon.  A header moving to arc_anon is also removed
 * from the hash table if it is still in it.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
    kmutex_t *hash_lock)
{
	arc_state_t *old_state;
	int64_t refcnt;
	uint32_t datacnt;
	/* bytes to take from the old state / give to the new state */
	uint64_t from_delta, to_delta;
	arc_buf_contents_t buftype = arc_buf_type(hdr);

	/*
	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
	 * L1 hdr doesn't always exist when we change state to arc_anon before
	 * destroying a header, in which case reallocating to add the L1 hdr is
	 * pointless.
	 */
	if (HDR_HAS_L1HDR(hdr)) {
		old_state = hdr->b_l1hdr.b_state;
		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
		datacnt = hdr->b_l1hdr.b_datacnt;
	} else {
		/* No L1 portion: treat it as an unreferenced L2-only header. */
		old_state = arc_l2c_only;
		refcnt = 0;
		datacnt = 0;
	}

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT3P(new_state, !=, old_state);
	ASSERT(refcnt == 0 || datacnt > 0);
	ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
	ASSERT(old_state != arc_anon || datacnt <= 1);

	/* Default: every buffer attached to the header accounts for b_size. */
	from_delta = to_delta = datacnt * hdr->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon && old_state != arc_l2c_only) {
			uint64_t *size = &old_state->arcs_lsize[buftype];

			ASSERT(HDR_HAS_L1HDR(hdr));
			multilist_remove(&old_state->arcs_list[buftype], hdr);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-zero datacnt.
			 */
			if (GHOST_STATE(old_state) && datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(hdr->b_l1hdr.b_buf == NULL);
				from_delta = hdr->b_size;
			}
			ASSERT3U(*size, >=, from_delta);
			atomic_add_64(size, -from_delta);
		}
		if (new_state != arc_anon && new_state != arc_l2c_only) {
			uint64_t *size = &new_state->arcs_lsize[buftype];

			/*
			 * An L1 header always exists here, since if we're
			 * moving to some L1-cached state (i.e. not l2c_only or
			 * anonymous), we realloc the header to add an L1hdr
			 * beforehand.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));
			multilist_insert(&new_state->arcs_list[buftype], hdr);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT0(datacnt);
				ASSERT(hdr->b_l1hdr.b_buf == NULL);
				to_delta = hdr->b_size;
			}
			atomic_add_64(size, to_delta);
		}
	}

	ASSERT(!BUF_EMPTY(hdr));
	/* An anonymous header must not remain findable in the hash table. */
	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
		buf_hash_remove(hdr);

	/* adjust state sizes (ignore arc_l2c_only) */

	if (to_delta && new_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(new_state)) {
			ASSERT0(datacnt);

			/*
			 * When moving a header to a ghost state, we first
			 * remove all arc buffers.  Thus, we'll have a
			 * datacnt of zero, and no arc buffer to use for
			 * the reference.  As a result, we use the arc
			 * header pointer for the reference.
			 */
			(void) refcount_add_many(&new_state->arcs_size,
			    hdr->b_size, hdr);
		} else {
			arc_buf_t *buf;
			ASSERT3U(datacnt, !=, 0);

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must add each of these references one
			 * at a time.
			 */
			for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {
				(void) refcount_add_many(&new_state->arcs_size,
				    hdr->b_size, buf);
			}
		}
	}

	if (from_delta && old_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(old_state)) {
			/*
			 * When moving a header off of a ghost state,
			 * there's the possibility for datacnt to be
			 * non-zero.  This is because we first add the
			 * arc buffer to the header prior to changing
			 * the header's state.  Since we used the header
			 * for the reference when putting the header on
			 * the ghost state, we must balance that and use
			 * the header when removing off the ghost state
			 * (even though datacnt is non zero).
			 */

			IMPLY(datacnt == 0, new_state == arc_anon ||
			    new_state == arc_l2c_only);

			(void) refcount_remove_many(&old_state->arcs_size,
			    hdr->b_size, hdr);
		} else {
			arc_buf_t *buf;
			ASSERT3U(datacnt, !=, 0);

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must remove each of these references one
			 * at a time.
			 */
			for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {
				(void) refcount_remove_many(
				    &old_state->arcs_size, hdr->b_size, buf);
			}
		}
	}

	/* Only a header with an L1 portion has a b_state field to update. */
	if (HDR_HAS_L1HDR(hdr))
		hdr->b_l1hdr.b_state = new_state;

	/*
	 * L2 headers should never be on the L2 state list since they don't
	 * have L1 headers allocated.
	 */
	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
	    multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
|
|
|
|
|
|
|
|
void
|
2009-02-18 12:51:31 -08:00
|
|
|
arc_space_consume(uint64_t space, arc_space_type_t type)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2009-02-18 12:51:31 -08:00
|
|
|
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
|
|
|
|
|
|
|
|
switch (type) {
|
2010-08-26 09:52:41 -07:00
|
|
|
default:
|
|
|
|
break;
|
2009-02-18 12:51:31 -08:00
|
|
|
case ARC_SPACE_DATA:
|
|
|
|
ARCSTAT_INCR(arcstat_data_size, space);
|
|
|
|
break;
|
2014-02-03 12:41:47 -08:00
|
|
|
case ARC_SPACE_META:
|
2015-06-26 14:54:17 -07:00
|
|
|
ARCSTAT_INCR(arcstat_metadata_size, space);
|
2014-02-03 12:41:47 -08:00
|
|
|
break;
|
2016-07-13 07:42:40 -05:00
|
|
|
case ARC_SPACE_BONUS:
|
|
|
|
ARCSTAT_INCR(arcstat_bonus_size, space);
|
|
|
|
break;
|
|
|
|
case ARC_SPACE_DNODE:
|
|
|
|
ARCSTAT_INCR(arcstat_dnode_size, space);
|
|
|
|
break;
|
|
|
|
case ARC_SPACE_DBUF:
|
|
|
|
ARCSTAT_INCR(arcstat_dbuf_size, space);
|
2009-02-18 12:51:31 -08:00
|
|
|
break;
|
|
|
|
case ARC_SPACE_HDRS:
|
|
|
|
ARCSTAT_INCR(arcstat_hdr_size, space);
|
|
|
|
break;
|
|
|
|
case ARC_SPACE_L2HDRS:
|
|
|
|
ARCSTAT_INCR(arcstat_l2_hdr_size, space);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2015-06-26 14:54:17 -07:00
|
|
|
if (type != ARC_SPACE_DATA)
|
2014-02-03 12:41:47 -08:00
|
|
|
ARCSTAT_INCR(arcstat_meta_used, space);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
atomic_add_64(&arc_size, space);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2009-02-18 12:51:31 -08:00
|
|
|
arc_space_return(uint64_t space, arc_space_type_t type)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2009-02-18 12:51:31 -08:00
|
|
|
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
|
|
|
|
|
|
|
|
switch (type) {
|
2010-08-26 09:52:41 -07:00
|
|
|
default:
|
|
|
|
break;
|
2009-02-18 12:51:31 -08:00
|
|
|
case ARC_SPACE_DATA:
|
|
|
|
ARCSTAT_INCR(arcstat_data_size, -space);
|
|
|
|
break;
|
2014-02-03 12:41:47 -08:00
|
|
|
case ARC_SPACE_META:
|
2015-06-26 14:54:17 -07:00
|
|
|
ARCSTAT_INCR(arcstat_metadata_size, -space);
|
2014-02-03 12:41:47 -08:00
|
|
|
break;
|
2016-07-13 07:42:40 -05:00
|
|
|
case ARC_SPACE_BONUS:
|
|
|
|
ARCSTAT_INCR(arcstat_bonus_size, -space);
|
|
|
|
break;
|
|
|
|
case ARC_SPACE_DNODE:
|
|
|
|
ARCSTAT_INCR(arcstat_dnode_size, -space);
|
|
|
|
break;
|
|
|
|
case ARC_SPACE_DBUF:
|
|
|
|
ARCSTAT_INCR(arcstat_dbuf_size, -space);
|
2009-02-18 12:51:31 -08:00
|
|
|
break;
|
|
|
|
case ARC_SPACE_HDRS:
|
|
|
|
ARCSTAT_INCR(arcstat_hdr_size, -space);
|
|
|
|
break;
|
|
|
|
case ARC_SPACE_L2HDRS:
|
|
|
|
ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-02-03 12:41:47 -08:00
|
|
|
if (type != ARC_SPACE_DATA) {
|
|
|
|
ASSERT(arc_meta_used >= space);
|
2015-06-26 14:54:17 -07:00
|
|
|
if (arc_meta_max < arc_meta_used)
|
|
|
|
arc_meta_max = arc_meta_used;
|
2014-02-03 12:41:47 -08:00
|
|
|
ARCSTAT_INCR(arcstat_meta_used, -space);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(arc_size >= space);
|
|
|
|
atomic_add_64(&arc_size, -space);
|
|
|
|
}
|
|
|
|
|
|
|
|
arc_buf_t *
|
2014-09-10 11:59:03 -07:00
|
|
|
arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr;
|
|
|
|
arc_buf_t *buf;
|
|
|
|
|
2014-11-03 12:15:08 -08:00
|
|
|
VERIFY3U(size, <=, spa_maxblocksize(spa));
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(BUF_EMPTY(hdr));
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
hdr->b_size = size;
|
2011-11-11 14:07:54 -08:00
|
|
|
hdr->b_spa = spa_load_guid(spa);
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_mru_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_mru_ghost_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_mfu_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_l2_hits = 0;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
|
|
|
|
buf->b_hdr = hdr;
|
|
|
|
buf->b_data = NULL;
|
|
|
|
buf->b_efunc = NULL;
|
|
|
|
buf->b_private = NULL;
|
|
|
|
buf->b_next = NULL;
|
2014-12-29 19:12:23 -08:00
|
|
|
|
|
|
|
hdr->b_flags = arc_bufc_to_flags(type);
|
|
|
|
hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
|
|
|
|
|
|
|
|
hdr->b_l1hdr.b_buf = buf;
|
|
|
|
hdr->b_l1hdr.b_state = arc_anon;
|
|
|
|
hdr->b_l1hdr.b_arc_access = 0;
|
|
|
|
hdr->b_l1hdr.b_datacnt = 1;
|
2015-01-12 19:52:19 -08:00
|
|
|
hdr->b_l1hdr.b_tmp_cdata = NULL;
|
2014-12-29 19:12:23 -08:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_get_data_buf(buf);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
|
|
|
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
return (buf);
|
|
|
|
}
|
|
|
|
|
2009-07-02 15:44:48 -07:00
|
|
|
static char *arc_onloan_tag = "onloan";
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
|
|
|
|
* flight data by arc_tempreserve_space() until they are "returned". Loaned
|
|
|
|
* buffers must be returned to the arc before they can be used by the DMU or
|
|
|
|
* freed.
|
|
|
|
*/
|
|
|
|
arc_buf_t *
|
2014-09-10 11:59:03 -07:00
|
|
|
arc_loan_buf(spa_t *spa, uint64_t size)
|
2009-07-02 15:44:48 -07:00
|
|
|
{
|
|
|
|
arc_buf_t *buf;
|
|
|
|
|
|
|
|
buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
|
|
|
|
|
|
|
|
atomic_add_64(&arc_loaned_bytes, size);
|
|
|
|
return (buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return a loaned arc buffer to the arc.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
arc_return_buf(arc_buf_t *buf, void *tag)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
|
|
|
|
|
|
|
ASSERT(buf->b_data != NULL);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
|
|
|
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
|
|
|
|
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
|
2009-07-02 15:44:48 -07:00
|
|
|
|
|
|
|
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
|
|
|
|
}
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
/* Detach an arc_buf from a dbuf (tag) */
|
|
|
|
void
|
|
|
|
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
|
|
|
|
{
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
2010-05-28 13:45:14 -07:00
|
|
|
|
|
|
|
ASSERT(buf->b_data != NULL);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
|
|
|
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
|
|
|
|
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
|
2010-05-28 13:45:14 -07:00
|
|
|
buf->b_efunc = NULL;
|
|
|
|
buf->b_private = NULL;
|
|
|
|
|
|
|
|
atomic_add_64(&arc_loaned_bytes, hdr->b_size);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
 * Create an additional arc_buf_t on the header of 'from' and fill it
 * with a private copy of from's data.
 *
 * The new buffer is pushed onto the front of the header's buffer list,
 * given its own data buffer by arc_get_data_buf(), and the header's
 * b_datacnt is incremented.  The header must have an L1 portion and must
 * not be in the arc_anon state.
 */
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t nbytes = hdr->b_size;
	arc_buf_t *copy;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(hdr->b_l1hdr.b_state != arc_anon);

	copy = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	copy->b_hdr = hdr;
	copy->b_data = NULL;
	copy->b_efunc = NULL;
	copy->b_private = NULL;

	/* Link the copy in at the head of the header's buffer list. */
	copy->b_next = hdr->b_l1hdr.b_buf;
	hdr->b_l1hdr.b_buf = copy;

	arc_get_data_buf(copy);
	bcopy(from->b_data, copy->b_data, nbytes);

	/*
	 * The buffer was already present in the arc, so what the caller
	 * receives is a duplicate.  For headers holding user data, keep
	 * count of how many duplicates exist and how large they are; the
	 * same statistics are adjusted again as duplicates are created
	 * and destroyed.
	 */
	if (HDR_ISTYPE_DATA(hdr)) {
		ARCSTAT_BUMP(arcstat_duplicate_buffers);
		ARCSTAT_INCR(arcstat_duplicate_buffers_size, nbytes);
	}

	hdr->b_l1hdr.b_datacnt++;

	return (copy);
}
|
|
|
|
|
|
|
|
void
|
|
|
|
arc_buf_add_ref(arc_buf_t *buf, void* tag)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr;
|
|
|
|
kmutex_t *hash_lock;
|
|
|
|
|
|
|
|
/*
|
2008-12-03 12:09:06 -08:00
|
|
|
* Check to see if this buffer is evicted. Callers
|
|
|
|
* must verify b_data != NULL to know if the add_ref
|
|
|
|
* was successful.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_enter(&buf->b_evict_lock);
|
2008-12-03 12:09:06 -08:00
|
|
|
if (buf->b_data == NULL) {
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
return;
|
|
|
|
}
|
2010-05-28 13:45:14 -07:00
|
|
|
hash_lock = HDR_LOCK(buf->b_hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(hash_lock);
|
2010-05-28 13:45:14 -07:00
|
|
|
hdr = buf->b_hdr;
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
|
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
|
|
|
|
hdr->b_l1hdr.b_state == arc_mfu);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
add_reference(hdr, hash_lock, tag);
|
2009-02-18 12:51:31 -08:00
|
|
|
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_access(hdr, hash_lock);
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
ARCSTAT_BUMP(arcstat_hits);
|
2014-12-29 19:12:23 -08:00
|
|
|
ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
|
|
|
|
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
|
2008-11-20 12:01:55 -08:00
|
|
|
data, metadata, hits);
|
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
static void
|
|
|
|
arc_buf_free_on_write(void *data, size_t size,
|
|
|
|
void (*free_func)(void *, size_t))
|
|
|
|
{
|
|
|
|
l2arc_data_free_t *df;
|
|
|
|
|
|
|
|
df = kmem_alloc(sizeof (*df), KM_SLEEP);
|
|
|
|
df->l2df_data = data;
|
|
|
|
df->l2df_size = size;
|
|
|
|
df->l2df_func = free_func;
|
|
|
|
mutex_enter(&l2arc_free_on_write_mtx);
|
|
|
|
list_insert_head(l2arc_free_on_write, df);
|
|
|
|
mutex_exit(&l2arc_free_on_write_mtx);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Free the arc data buffer. If it is an l2arc write in progress,
|
|
|
|
* the buffer is placed on l2arc_free_on_write to be freed later.
|
|
|
|
*/
|
|
|
|
static void
|
2013-05-16 14:18:06 -07:00
|
|
|
arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2013-05-16 14:18:06 -07:00
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (HDR_L2_WRITING(hdr)) {
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_free_on_write);
|
|
|
|
} else {
|
2013-05-16 14:18:06 -07:00
|
|
|
free_func(buf->b_data, hdr->b_size);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
static void
|
|
|
|
arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
|
|
|
|
{
|
|
|
|
ASSERT(HDR_HAS_L2HDR(hdr));
|
|
|
|
ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The b_tmp_cdata field is linked off of the b_l1hdr, so if
|
|
|
|
* that doesn't exist, the header is in the arc_l2c_only state,
|
|
|
|
* and there isn't anything to free (it's already been freed).
|
|
|
|
*/
|
|
|
|
if (!HDR_HAS_L1HDR(hdr))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The header isn't being written to the l2arc device, thus it
|
|
|
|
* shouldn't have a b_tmp_cdata to free.
|
|
|
|
*/
|
|
|
|
if (!HDR_L2_WRITING(hdr)) {
|
|
|
|
ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The header does not have compression enabled. This can be due
|
|
|
|
* to the buffer not being compressible, or because we're
|
|
|
|
* freeing the buffer before the second phase of
|
|
|
|
* l2arc_write_buffer() has started (which does the compression
|
|
|
|
* step). In either case, b_tmp_cdata does not point to a
|
|
|
|
* separately compressed buffer, so there's nothing to free (it
|
|
|
|
* points to the same buffer as the arc_buf_t's b_data field).
|
|
|
|
*/
|
2015-09-11 09:18:56 -07:00
|
|
|
if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
|
2015-01-12 19:52:19 -08:00
|
|
|
hdr->b_l1hdr.b_tmp_cdata = NULL;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There's nothing to free since the buffer was all zero's and
|
|
|
|
* compressed to a zero length buffer.
|
|
|
|
*/
|
2015-09-11 09:18:56 -07:00
|
|
|
if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2015-09-11 09:18:56 -07:00
|
|
|
ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
|
|
|
|
hdr->b_size, zio_data_buf_free);
|
|
|
|
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
|
|
|
|
hdr->b_l1hdr.b_tmp_cdata = NULL;
|
|
|
|
}
|
|
|
|
|
2014-07-15 03:43:18 -04:00
|
|
|
/*
|
|
|
|
* Free up buf->b_data and if 'remove' is set, then pull the
|
|
|
|
* arc_buf_t off of the the arc_buf_hdr_t's list and free it.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
arc_buf_t **bufp;
|
|
|
|
|
|
|
|
/* free up data associated with the buf */
|
2014-12-29 19:12:23 -08:00
|
|
|
if (buf->b_data != NULL) {
|
|
|
|
arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t size = buf->b_hdr->b_size;
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
arc_cksum_verify(buf);
|
2013-05-16 14:18:06 -07:00
|
|
|
arc_buf_unwatch(buf);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
if (type == ARC_BUFC_METADATA) {
|
|
|
|
arc_buf_data_free(buf, zio_buf_free);
|
|
|
|
arc_space_return(size, ARC_SPACE_META);
|
|
|
|
} else {
|
|
|
|
ASSERT(type == ARC_BUFC_DATA);
|
|
|
|
arc_buf_data_free(buf, zio_data_buf_free);
|
|
|
|
arc_space_return(size, ARC_SPACE_DATA);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
/* protected by hash lock, if in the hash table */
|
|
|
|
if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t *cnt = &state->arcs_lsize[type];
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(refcount_is_zero(
|
|
|
|
&buf->b_hdr->b_l1hdr.b_refcnt));
|
|
|
|
ASSERT(state != arc_anon && state != arc_l2c_only);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT3U(*cnt, >=, size);
|
|
|
|
atomic_add_64(cnt, -size);
|
|
|
|
}
|
2015-06-26 15:14:45 -07:00
|
|
|
|
|
|
|
(void) refcount_remove_many(&state->arcs_size, size, buf);
|
2008-11-20 12:01:55 -08:00
|
|
|
buf->b_data = NULL;
|
2012-12-21 14:57:09 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're destroying a duplicate buffer make sure
|
|
|
|
* that the appropriate statistics are updated.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
|
|
|
|
HDR_ISTYPE_DATA(buf->b_hdr)) {
|
2012-12-21 14:57:09 -08:00
|
|
|
ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
|
|
|
|
ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
|
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
|
|
|
|
buf->b_hdr->b_l1hdr.b_datacnt -= 1;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* only remove the buf if requested */
|
2014-07-15 03:43:18 -04:00
|
|
|
if (!remove)
|
2008-11-20 12:01:55 -08:00
|
|
|
return;
|
|
|
|
|
|
|
|
/* remove the buf from the hdr list */
|
2014-12-29 19:12:23 -08:00
|
|
|
for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
|
|
|
|
bufp = &(*bufp)->b_next)
|
2008-11-20 12:01:55 -08:00
|
|
|
continue;
|
|
|
|
*bufp = buf->b_next;
|
2010-05-28 13:45:14 -07:00
|
|
|
buf->b_next = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(buf->b_efunc == NULL);
|
|
|
|
|
|
|
|
/* clean up the buf */
|
|
|
|
buf->b_hdr = NULL;
|
|
|
|
kmem_cache_free(buf_cache, buf);
|
|
|
|
}
|
|
|
|
|
2015-06-16 01:12:19 +02:00
|
|
|
static void
|
|
|
|
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
|
|
|
|
{
|
|
|
|
l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
|
|
|
|
l2arc_dev_t *dev = l2hdr->b_dev;
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
|
|
|
|
ASSERT(HDR_HAS_L2HDR(hdr));
|
|
|
|
|
|
|
|
list_remove(&dev->l2ad_buflist, hdr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't want to leak the b_tmp_cdata buffer that was
|
|
|
|
* allocated in l2arc_write_buffers()
|
|
|
|
*/
|
|
|
|
arc_buf_l2_cdata_free(hdr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
|
|
|
|
* this header is being processed by l2arc_write_buffers() (i.e.
|
|
|
|
* it's in the first stage of l2arc_write_buffers()).
|
|
|
|
* Re-affirming that truth here, just to serve as a reminder. If
|
|
|
|
* b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
|
|
|
|
* may not have its HDR_L2_WRITING flag set. (the write may have
|
|
|
|
* completed, in which case HDR_L2_WRITING will be false and the
|
|
|
|
* b_daddr field will point to the address of the buffer on disk).
|
|
|
|
*/
|
|
|
|
IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
|
|
|
|
* l2arc_write_buffers(). Since we've just removed this header
|
|
|
|
* from the l2arc buffer list, this header will never reach the
|
|
|
|
* second stage of l2arc_write_buffers(), which increments the
|
|
|
|
* accounting stats for this header. Thus, we must be careful
|
|
|
|
* not to decrement them for this header either.
|
|
|
|
*/
|
|
|
|
if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
|
|
|
|
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
|
|
|
|
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
|
|
|
|
|
|
|
|
vdev_space_update(dev->l2ad_vdev,
|
|
|
|
-l2hdr->b_asize, 0, 0);
|
|
|
|
|
|
|
|
(void) refcount_remove_many(&dev->l2ad_alloc,
|
|
|
|
l2hdr->b_asize, hdr);
|
|
|
|
}
|
|
|
|
|
|
|
|
hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
|
|
|
arc_hdr_destroy(arc_buf_hdr_t *hdr)
|
|
|
|
{
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_HAS_L1HDR(hdr)) {
|
|
|
|
ASSERT(hdr->b_l1hdr.b_buf == NULL ||
|
|
|
|
hdr->b_l1hdr.b_datacnt > 0);
|
|
|
|
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
|
|
|
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(!HDR_IN_HASH_TABLE(hdr));
|
|
|
|
|
|
|
|
if (HDR_HAS_L2HDR(hdr)) {
|
2015-06-16 01:12:19 +02:00
|
|
|
l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
|
|
|
|
boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2015-06-16 01:12:19 +02:00
|
|
|
if (!buflist_held)
|
|
|
|
mutex_enter(&dev->l2ad_mtx);
|
2014-12-29 19:12:23 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
2015-06-16 01:12:19 +02:00
|
|
|
* Even though we checked this conditional above, we
|
|
|
|
* need to check this again now that we have the
|
|
|
|
* l2ad_mtx. This is because we could be racing with
|
|
|
|
* another thread calling l2arc_evict() which might have
|
|
|
|
* destroyed this header's L2 portion as we were waiting
|
|
|
|
* to acquire the l2ad_mtx. If that happens, we don't
|
|
|
|
* want to re-destroy the header's L2 portion.
|
2015-01-12 19:52:19 -08:00
|
|
|
*/
|
2015-06-16 01:12:19 +02:00
|
|
|
if (HDR_HAS_L2HDR(hdr))
|
|
|
|
arc_hdr_l2hdr_destroy(hdr);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
|
|
|
if (!buflist_held)
|
2015-06-16 01:12:19 +02:00
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (!BUF_EMPTY(hdr))
|
2010-05-28 13:45:14 -07:00
|
|
|
buf_discard_identity(hdr);
|
2014-12-29 19:12:23 -08:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (hdr->b_freeze_cksum != NULL) {
|
|
|
|
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
|
|
|
|
hdr->b_freeze_cksum = NULL;
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_HAS_L1HDR(hdr)) {
|
|
|
|
while (hdr->b_l1hdr.b_buf) {
|
|
|
|
arc_buf_t *buf = hdr->b_l1hdr.b_buf;
|
|
|
|
|
|
|
|
if (buf->b_efunc != NULL) {
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&arc_user_evicts_lock);
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_enter(&buf->b_evict_lock);
|
|
|
|
ASSERT(buf->b_hdr != NULL);
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_buf = buf->b_next;
|
|
|
|
buf->b_hdr = &arc_eviction_hdr;
|
|
|
|
buf->b_next = arc_eviction_list;
|
|
|
|
arc_eviction_list = buf;
|
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2015-01-12 19:52:19 -08:00
|
|
|
cv_signal(&arc_user_evicts_cv);
|
|
|
|
mutex_exit(&arc_user_evicts_lock);
|
2014-12-29 19:12:23 -08:00
|
|
|
} else {
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
|
2014-12-29 19:12:23 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT3P(hdr->b_hash_next, ==, NULL);
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_HAS_L1HDR(hdr)) {
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
|
|
|
|
kmem_cache_free(hdr_full_cache, hdr);
|
|
|
|
} else {
|
|
|
|
kmem_cache_free(hdr_l2only_cache, hdr);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
arc_buf_free(arc_buf_t *buf, void *tag)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
2014-12-29 19:12:23 -08:00
|
|
|
int hashed = hdr->b_l1hdr.b_state != arc_anon;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(buf->b_efunc == NULL);
|
|
|
|
ASSERT(buf->b_data != NULL);
|
|
|
|
|
|
|
|
if (hashed) {
|
|
|
|
kmutex_t *hash_lock = HDR_LOCK(hdr);
|
|
|
|
|
|
|
|
mutex_enter(hash_lock);
|
2010-05-28 13:45:14 -07:00
|
|
|
hdr = buf->b_hdr;
|
|
|
|
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
(void) remove_reference(hdr, hash_lock, tag);
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_l1hdr.b_datacnt > 1) {
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_destroy(buf, TRUE);
|
2010-05-28 13:45:14 -07:00
|
|
|
} else {
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(buf == hdr->b_l1hdr.b_buf);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(buf->b_efunc == NULL);
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
|
2010-05-28 13:45:14 -07:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
} else if (HDR_IO_IN_PROGRESS(hdr)) {
|
|
|
|
int destroy_hdr;
|
|
|
|
/*
|
|
|
|
* We are in the middle of an async write. Don't destroy
|
|
|
|
* this buffer unless the write completes before we finish
|
|
|
|
* decrementing the reference count.
|
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&arc_user_evicts_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
(void) remove_reference(hdr, NULL, tag);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
2008-11-20 12:01:55 -08:00
|
|
|
destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_exit(&arc_user_evicts_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
if (destroy_hdr)
|
|
|
|
arc_hdr_destroy(hdr);
|
|
|
|
} else {
|
2010-05-28 13:45:14 -07:00
|
|
|
if (remove_reference(hdr, NULL, tag) > 0)
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_destroy(buf, TRUE);
|
2010-05-28 13:45:14 -07:00
|
|
|
else
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_hdr_destroy(hdr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-09-04 07:00:57 -05:00
|
|
|
boolean_t
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
2015-06-29 10:02:03 -07:00
|
|
|
kmutex_t *hash_lock = HDR_LOCK(hdr);
|
2013-09-04 07:00:57 -05:00
|
|
|
boolean_t no_callback = (buf->b_efunc == NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_l1hdr.b_state == arc_anon) {
|
|
|
|
ASSERT(hdr->b_l1hdr.b_datacnt == 1);
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_free(buf, tag);
|
|
|
|
return (no_callback);
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(hash_lock);
|
2010-05-28 13:45:14 -07:00
|
|
|
hdr = buf->b_hdr;
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_datacnt > 0);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_state != arc_anon);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(buf->b_data != NULL);
|
|
|
|
|
|
|
|
(void) remove_reference(hdr, hash_lock, tag);
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_l1hdr.b_datacnt > 1) {
|
2008-11-20 12:01:55 -08:00
|
|
|
if (no_callback)
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_destroy(buf, TRUE);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else if (no_callback) {
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(buf->b_efunc == NULL);
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
|
|
|
|
refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
return (no_callback);
|
|
|
|
}
|
|
|
|
|
2014-09-10 11:59:03 -07:00
|
|
|
uint64_t
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_size(arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
return (buf->b_hdr->b_size);
|
|
|
|
}
|
|
|
|
|
2012-12-21 14:57:09 -08:00
|
|
|
/*
|
|
|
|
* Called from the DMU to determine if the current buffer should be
|
|
|
|
* evicted. In order to ensure proper locking, the eviction must be initiated
|
|
|
|
* from the DMU. Return true if the buffer is associated with user data and
|
|
|
|
* duplicate buffers still exist.
|
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
arc_buf_eviction_needed(arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr;
|
|
|
|
boolean_t evict_needed = B_FALSE;
|
|
|
|
|
|
|
|
if (zfs_disable_dup_eviction)
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
mutex_enter(&buf->b_evict_lock);
|
|
|
|
hdr = buf->b_hdr;
|
|
|
|
if (hdr == NULL) {
|
|
|
|
/*
|
|
|
|
* We are in arc_do_user_evicts(); let that function
|
|
|
|
* perform the eviction.
|
|
|
|
*/
|
|
|
|
ASSERT(buf->b_data == NULL);
|
|
|
|
mutex_exit(&buf->b_evict_lock);
|
|
|
|
return (B_FALSE);
|
|
|
|
} else if (buf->b_data == NULL) {
|
|
|
|
/*
|
|
|
|
* We have already been added to the arc eviction list;
|
|
|
|
* recommend eviction.
|
|
|
|
*/
|
|
|
|
ASSERT3P(hdr, ==, &arc_eviction_hdr);
|
|
|
|
mutex_exit(&buf->b_evict_lock);
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
|
2012-12-21 14:57:09 -08:00
|
|
|
evict_needed = B_TRUE;
|
|
|
|
|
|
|
|
mutex_exit(&buf->b_evict_lock);
|
|
|
|
return (evict_needed);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* Evict the arc_buf_hdr that is provided as a parameter. The resultant
|
|
|
|
* state of the header is dependent on its state prior to entering this
|
|
|
|
* function. The following transitions are possible:
|
2008-11-20 12:01:55 -08:00
|
|
|
*
|
2015-01-12 19:52:19 -08:00
|
|
|
* - arc_mru -> arc_mru_ghost
|
|
|
|
* - arc_mfu -> arc_mfu_ghost
|
|
|
|
* - arc_mru_ghost -> arc_l2c_only
|
|
|
|
* - arc_mru_ghost -> deleted
|
|
|
|
* - arc_mfu_ghost -> arc_l2c_only
|
|
|
|
* - arc_mfu_ghost -> deleted
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
static int64_t
|
|
|
|
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_state_t *evicted_state, *state;
|
|
|
|
int64_t bytes_evicted = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT(MUTEX_HELD(hash_lock));
|
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
state = hdr->b_l1hdr.b_state;
|
|
|
|
if (GHOST_STATE(state)) {
|
|
|
|
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
|
|
|
|
ASSERT(hdr->b_l1hdr.b_buf == NULL);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
|
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* l2arc_write_buffers() relies on a header's L1 portion
|
|
|
|
* (i.e. its b_tmp_cdata field) during its write phase.
|
|
|
|
* Thus, we cannot push a header onto the arc_l2c_only
|
|
|
|
* state (removing its L1 piece) until the header is
|
|
|
|
* done being written to the l2arc.
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
|
|
|
|
ARCSTAT_BUMP(arcstat_evict_l2_skip);
|
|
|
|
return (bytes_evicted);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_deleted);
|
|
|
|
bytes_evicted += hdr->b_size;
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
if (HDR_HAS_L2HDR(hdr)) {
|
|
|
|
/*
|
|
|
|
* This buffer is cached on the 2nd Level ARC;
|
|
|
|
* don't destroy the header.
|
|
|
|
*/
|
|
|
|
arc_change_state(arc_l2c_only, hdr, hash_lock);
|
|
|
|
/*
|
|
|
|
* dropping from L1+L2 cached to L2-only,
|
|
|
|
* realloc to remove the L1 header.
|
|
|
|
*/
|
|
|
|
hdr = arc_hdr_realloc(hdr, hdr_full_cache,
|
|
|
|
hdr_l2only_cache);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_change_state(arc_anon, hdr, hash_lock);
|
|
|
|
arc_hdr_destroy(hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2015-01-12 19:52:19 -08:00
|
|
|
return (bytes_evicted);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT(state == arc_mru || state == arc_mfu);
|
|
|
|
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/* prefetch buffers have a minimum lifespan */
|
|
|
|
if (HDR_IO_IN_PROGRESS(hdr) ||
|
|
|
|
((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
|
|
|
|
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
|
|
|
|
arc_min_prefetch_lifespan)) {
|
|
|
|
ARCSTAT_BUMP(arcstat_evict_skip);
|
|
|
|
return (bytes_evicted);
|
Prioritize "metadata" in arc_get_data_buf
When the arc is at its size limit and a new buffer is added, data will
be evicted (or recycled) from the arc to make room for this new buffer.
As far as I can tell, this is to try and keep the arc from overstepping
its bounds (i.e. keep it below the size limitation placed on it).
This makes sense conceptually, but there appears to be a subtle flaw in
its current implementation, resulting in metadata buffers being
throttled. When it evicts from the arc's lists, it also passes in a
"type" so as to remove a buffer of the same type that it is adding. The
problem with this is that once the size limit is hit, the ratio of
"metadata" to "data" contained in the arc essentially becomes fixed.
For example, consider the following scenario:
* the size of the arc is capped at 10G
* the meta_limit is capped at 4G
* 9G of the arc contains "data"
* 1G of the arc contains "metadata"
Now, every time a new "metadata" buffer is created and added to the arc,
an older "metadata" buffer(s) will be removed from the arc; preserving
the 9G "data" to 1G "metadata" ratio that was in-place when the size
limit was reached. This occurs even though the amount of "metadata" is
far below the "metadata" limit. This can result in the arc behaving
pathologically for certain workloads.
To fix this, the arc_get_data_buf function was modified to evict "data"
from the arc even when adding a "metadata" buffer; unless it's at the
"metadata" limit. In addition, arc_evict now more closely resembles
arc_evict_ghost; such that when evicting "data" from the arc, it may
make a second pass over the arc lists and evict "metadata" if it cannot
meet the eviction size the first time around.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #2110
2013-12-30 09:30:00 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
|
|
|
|
ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
|
|
|
|
while (hdr->b_l1hdr.b_buf) {
|
|
|
|
arc_buf_t *buf = hdr->b_l1hdr.b_buf;
|
|
|
|
if (!mutex_tryenter(&buf->b_evict_lock)) {
|
|
|
|
ARCSTAT_BUMP(arcstat_mutex_miss);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (buf->b_data != NULL)
|
|
|
|
bytes_evicted += hdr->b_size;
|
|
|
|
if (buf->b_efunc != NULL) {
|
|
|
|
mutex_enter(&arc_user_evicts_lock);
|
|
|
|
arc_buf_destroy(buf, FALSE);
|
|
|
|
hdr->b_l1hdr.b_buf = buf->b_next;
|
|
|
|
buf->b_hdr = &arc_eviction_hdr;
|
|
|
|
buf->b_next = arc_eviction_list;
|
|
|
|
arc_eviction_list = buf;
|
|
|
|
cv_signal(&arc_user_evicts_cv);
|
|
|
|
mutex_exit(&arc_user_evicts_lock);
|
|
|
|
mutex_exit(&buf->b_evict_lock);
|
|
|
|
} else {
|
|
|
|
mutex_exit(&buf->b_evict_lock);
|
|
|
|
arc_buf_destroy(buf, TRUE);
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
if (HDR_HAS_L2HDR(hdr)) {
|
|
|
|
ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
|
|
|
|
} else {
|
|
|
|
if (l2arc_write_eligible(hdr->b_spa, hdr))
|
|
|
|
ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
|
|
|
|
else
|
|
|
|
ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
if (hdr->b_l1hdr.b_datacnt == 0) {
|
|
|
|
arc_change_state(evicted_state, hdr, hash_lock);
|
|
|
|
ASSERT(HDR_IN_HASH_TABLE(hdr));
|
|
|
|
hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
|
|
|
|
hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
|
|
|
|
DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
return (bytes_evicted);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
static uint64_t
|
|
|
|
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
|
|
|
|
uint64_t spa, int64_t bytes)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_sublist_t *mls;
|
|
|
|
uint64_t bytes_evicted = 0;
|
|
|
|
arc_buf_hdr_t *hdr;
|
2008-11-20 12:01:55 -08:00
|
|
|
kmutex_t *hash_lock;
|
2015-01-12 19:52:19 -08:00
|
|
|
int evict_count = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT3P(marker, !=, NULL);
|
2015-06-29 10:02:03 -07:00
|
|
|
IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
mls = multilist_sublist_lock(ml, idx);
|
2010-08-26 14:24:34 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
|
|
|
|
hdr = multilist_sublist_prev(mls, marker)) {
|
|
|
|
if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
|
|
|
|
(evict_count >= zfs_arc_evict_batch_limit))
|
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* To keep our iteration location, move the marker
|
|
|
|
* forward. Since we're not holding hdr's hash lock, we
|
|
|
|
* must be very careful and not remove 'hdr' from the
|
|
|
|
* sublist. Otherwise, other consumers might mistake the
|
|
|
|
* 'hdr' as not being on a sublist when they call the
|
|
|
|
* multilist_link_active() function (they all rely on
|
|
|
|
* the hash lock protecting concurrent insertions and
|
|
|
|
* removals). multilist_sublist_move_forward() was
|
|
|
|
* specifically implemented to ensure this is the case
|
|
|
|
* (only 'marker' will be removed and re-inserted).
|
|
|
|
*/
|
|
|
|
multilist_sublist_move_forward(mls, marker);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The only case where the b_spa field should ever be
|
|
|
|
* zero, is the marker headers inserted by
|
|
|
|
* arc_evict_state(). It's possible for multiple threads
|
|
|
|
* to be calling arc_evict_state() concurrently (e.g.
|
|
|
|
* dsl_pool_close() and zio_inject_fault()), so we must
|
|
|
|
* skip any markers we see from these other threads.
|
|
|
|
*/
|
2014-12-06 09:24:32 -08:00
|
|
|
if (hdr->b_spa == 0)
|
2010-08-26 14:24:34 -07:00
|
|
|
continue;
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/* we're only interested in evicting buffers of a certain spa */
|
|
|
|
if (spa != 0 && hdr->b_spa != spa) {
|
|
|
|
ARCSTAT_BUMP(arcstat_evict_skip);
|
2010-05-28 13:45:14 -07:00
|
|
|
continue;
|
2015-01-12 19:52:19 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
hash_lock = HDR_LOCK(hdr);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
|
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* We aren't calling this function from any code path
|
|
|
|
* that would already be holding a hash lock, so we're
|
|
|
|
* asserting on this assumption to be defensive in case
|
|
|
|
* this ever changes. Without this check, it would be
|
|
|
|
* possible to incorrectly increment arcstat_mutex_miss
|
|
|
|
* below (e.g. if the code changed such that we called
|
|
|
|
* this function with a hash lock held).
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT(!MUTEX_HELD(hash_lock));
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (mutex_tryenter(hash_lock)) {
|
2015-01-12 19:52:19 -08:00
|
|
|
uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
|
|
|
|
mutex_exit(hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
bytes_evicted += evicted;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* If evicted is zero, arc_evict_hdr() must have
|
|
|
|
* decided to skip this header, don't increment
|
|
|
|
* evict_count in this case.
|
2010-08-26 14:24:34 -07:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
if (evicted != 0)
|
|
|
|
evict_count++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If arc_size isn't overflowing, signal any
|
|
|
|
* threads that might happen to be waiting.
|
|
|
|
*
|
|
|
|
* For each header evicted, we wake up a single
|
|
|
|
* thread. If we used cv_broadcast, we could
|
|
|
|
* wake up "too many" threads causing arc_size
|
|
|
|
* to significantly overflow arc_c; since
|
|
|
|
* arc_get_data_buf() doesn't check for overflow
|
|
|
|
* when it's woken up (it doesn't because it's
|
|
|
|
* possible for the ARC to be overflowing while
|
|
|
|
* full of un-evictable buffers, and the
|
|
|
|
* function should proceed in this case).
|
|
|
|
*
|
|
|
|
* If threads are left sleeping, due to not
|
|
|
|
* using cv_broadcast, they will be woken up
|
|
|
|
* just before arc_reclaim_thread() sleeps.
|
|
|
|
*/
|
|
|
|
mutex_enter(&arc_reclaim_lock);
|
|
|
|
if (!arc_is_overflowing())
|
|
|
|
cv_signal(&arc_reclaim_waiters_cv);
|
|
|
|
mutex_exit(&arc_reclaim_lock);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
} else {
|
2015-01-12 19:52:19 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_mutex_miss);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_sublist_unlock(mls);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
return (bytes_evicted);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* Evict buffers from the given arc state, until we've removed the
|
|
|
|
* specified number of bytes. Move the removed buffers to the
|
|
|
|
* appropriate evict state.
|
|
|
|
*
|
|
|
|
* This function makes a "best effort". It skips over any buffers
|
|
|
|
* it can't get a hash_lock on, and so, may not catch all candidates.
|
|
|
|
* It may also return without evicting as much space as requested.
|
|
|
|
*
|
|
|
|
* If bytes is specified using the special value ARC_EVICT_ALL, this
|
|
|
|
* will evict all available (i.e. unlocked and evictable) buffers from
|
|
|
|
* the given arc state; which is used by arc_flush().
|
|
|
|
*/
|
|
|
|
static uint64_t
|
|
|
|
arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
|
|
|
|
arc_buf_contents_t type)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-01-12 19:52:19 -08:00
|
|
|
uint64_t total_evicted = 0;
|
|
|
|
multilist_t *ml = &state->arcs_list[type];
|
|
|
|
int num_sublists;
|
|
|
|
arc_buf_hdr_t **markers;
|
|
|
|
int i;
|
|
|
|
|
2015-06-29 10:02:03 -07:00
|
|
|
IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
num_sublists = multilist_get_num_sublists(ml);
|
2009-02-18 12:51:31 -08:00
|
|
|
|
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* If we've tried to evict from each sublist, made some
|
|
|
|
* progress, but still have not hit the target number of bytes
|
|
|
|
* to evict, we want to keep trying. The markers allow us to
|
|
|
|
* pick up where we left off for each individual sublist, rather
|
|
|
|
* than starting from the tail each time.
|
2009-02-18 12:51:31 -08:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
|
|
|
|
for (i = 0; i < num_sublists; i++) {
|
|
|
|
multilist_sublist_t *mls;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A b_spa of 0 is used to indicate that this header is
|
|
|
|
* a marker. This fact is used in arc_adjust_type() and
|
|
|
|
* arc_evict_state_impl().
|
|
|
|
*/
|
|
|
|
markers[i]->b_spa = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mls = multilist_sublist_lock(ml, i);
|
|
|
|
multilist_sublist_insert_tail(mls, markers[i]);
|
|
|
|
multilist_sublist_unlock(mls);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2009-02-18 12:51:31 -08:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* While we haven't hit our target number of bytes to evict, or
|
|
|
|
* we're evicting all available buffers.
|
2009-02-18 12:51:31 -08:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
|
2016-07-13 07:42:40 -05:00
|
|
|
int sublist_idx = multilist_get_random_index(ml);
|
|
|
|
uint64_t scan_evicted = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to reduce pinned dnodes with a floor of arc_dnode_limit.
|
|
|
|
* Request that 10% of the LRUs be scanned by the superblock
|
|
|
|
* shrinker.
|
|
|
|
*/
|
|
|
|
if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit)
|
|
|
|
arc_prune_async((arc_dnode_size - arc_dnode_limit) /
|
|
|
|
sizeof (dnode_t) / zfs_arc_dnode_reduce_percent);
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* Start eviction using a randomly selected sublist,
|
|
|
|
* this is to try and evenly balance eviction across all
|
|
|
|
* sublists. Always starting at the same sublist
|
|
|
|
* (e.g. index 0) would cause evictions to favor certain
|
|
|
|
* sublists over others.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < num_sublists; i++) {
|
|
|
|
uint64_t bytes_remaining;
|
|
|
|
uint64_t bytes_evicted;
|
2009-02-18 12:51:31 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
if (bytes == ARC_EVICT_ALL)
|
|
|
|
bytes_remaining = ARC_EVICT_ALL;
|
|
|
|
else if (total_evicted < bytes)
|
|
|
|
bytes_remaining = bytes - total_evicted;
|
|
|
|
else
|
|
|
|
break;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
|
|
|
|
markers[sublist_idx], spa, bytes_remaining);
|
|
|
|
|
|
|
|
scan_evicted += bytes_evicted;
|
|
|
|
total_evicted += bytes_evicted;
|
|
|
|
|
|
|
|
/* we've reached the end, wrap to the beginning */
|
|
|
|
if (++sublist_idx >= num_sublists)
|
|
|
|
sublist_idx = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we didn't evict anything during this scan, we have
|
|
|
|
* no reason to believe we'll evict more during another
|
|
|
|
* scan, so break the loop.
|
|
|
|
*/
|
|
|
|
if (scan_evicted == 0) {
|
|
|
|
/* This isn't possible, let's make that obvious */
|
|
|
|
ASSERT3S(bytes, !=, 0);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* When bytes is ARC_EVICT_ALL, the only way to
|
|
|
|
* break the loop is when scan_evicted is zero.
|
|
|
|
* In that case, we actually have evicted enough,
|
|
|
|
* so we don't want to increment the kstat.
|
|
|
|
*/
|
|
|
|
if (bytes != ARC_EVICT_ALL) {
|
|
|
|
ASSERT3S(total_evicted, <, bytes);
|
|
|
|
ARCSTAT_BUMP(arcstat_evict_not_enough);
|
|
|
|
}
|
2009-02-18 12:51:31 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
break;
|
|
|
|
}
|
2009-02-18 12:51:31 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
for (i = 0; i < num_sublists; i++) {
|
|
|
|
multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
|
|
|
|
multilist_sublist_remove(mls, markers[i]);
|
|
|
|
multilist_sublist_unlock(mls);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
kmem_cache_free(hdr_full_cache, markers[i]);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2015-01-12 19:52:19 -08:00
|
|
|
kmem_free(markers, sizeof (*markers) * num_sublists);
|
|
|
|
|
|
|
|
return (total_evicted);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush all "evictable" data of the given type from the arc state
|
|
|
|
* specified. This will not evict any "active" buffers (i.e. referenced).
|
|
|
|
*
|
|
|
|
* When 'retry' is set to FALSE, the function will make a single pass
|
|
|
|
* over the state and evict any buffers that it can. Since it doesn't
|
|
|
|
* continually retry the eviction, it might end up leaving some buffers
|
|
|
|
* in the ARC due to lock misses.
|
|
|
|
*
|
|
|
|
* When 'retry' is set to TRUE, the function will continually retry the
|
|
|
|
* eviction until *all* evictable buffers have been removed from the
|
|
|
|
* state. As a result, if concurrent insertions into the state are
|
|
|
|
* allowed (e.g. if the ARC isn't shutting down), this function might
|
|
|
|
* wind up in an infinite loop, continually trying to evict buffers.
|
|
|
|
*/
|
|
|
|
static uint64_t
|
|
|
|
arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
|
|
|
|
boolean_t retry)
|
|
|
|
{
|
|
|
|
uint64_t evicted = 0;
|
|
|
|
|
|
|
|
while (state->arcs_lsize[type] != 0) {
|
|
|
|
evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
|
|
|
|
|
|
|
|
if (!retry)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (evicted);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2011-12-22 12:20:43 -08:00
|
|
|
/*
|
2015-09-23 15:59:04 -07:00
|
|
|
* Helper function for arc_prune_async() it is responsible for safely
|
|
|
|
* handling the execution of a registered arc_prune_func_t.
|
2011-12-22 12:20:43 -08:00
|
|
|
*/
|
|
|
|
static void
|
2015-05-30 09:57:53 -05:00
|
|
|
arc_prune_task(void *ptr)
|
2011-12-22 12:20:43 -08:00
|
|
|
{
|
2015-05-30 09:57:53 -05:00
|
|
|
arc_prune_t *ap = (arc_prune_t *)ptr;
|
|
|
|
arc_prune_func_t *func = ap->p_pfunc;
|
2011-12-22 12:20:43 -08:00
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
if (func != NULL)
|
|
|
|
func(ap->p_adjust, ap->p_private);
|
2011-12-22 12:20:43 -08:00
|
|
|
|
2016-05-23 11:58:21 -07:00
|
|
|
refcount_remove(&ap->p_refcnt, func);
|
2015-05-30 09:57:53 -05:00
|
|
|
}
|
2011-12-22 12:20:43 -08:00
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
/*
|
|
|
|
* Notify registered consumers they must drop holds on a portion of the ARC
|
|
|
|
* buffered they reference. This provides a mechanism to ensure the ARC can
|
|
|
|
* honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
|
|
|
|
* is analogous to dnlc_reduce_cache() but more generic.
|
|
|
|
*
|
2015-09-23 15:59:04 -07:00
|
|
|
* This operation is performed asynchronously so it may be safely called
|
2015-06-26 11:28:18 -07:00
|
|
|
* in the context of the arc_reclaim_thread(). A reference is taken here
|
2015-05-30 09:57:53 -05:00
|
|
|
* for each registered arc_prune_t and the arc_prune_task() is responsible
|
|
|
|
* for releasing it once the registered arc_prune_func_t has completed.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
arc_prune_async(int64_t adjust)
|
|
|
|
{
|
|
|
|
arc_prune_t *ap;
|
2011-12-22 12:20:43 -08:00
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
mutex_enter(&arc_prune_mtx);
|
|
|
|
for (ap = list_head(&arc_prune_list); ap != NULL;
|
|
|
|
ap = list_next(&arc_prune_list, ap)) {
|
2011-12-22 12:20:43 -08:00
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
if (refcount_count(&ap->p_refcnt) >= 2)
|
|
|
|
continue;
|
2011-12-22 12:20:43 -08:00
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
refcount_add(&ap->p_refcnt, ap->p_pfunc);
|
|
|
|
ap->p_adjust = adjust;
|
|
|
|
taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
|
|
|
|
ARCSTAT_BUMP(arcstat_prune);
|
2011-12-22 12:20:43 -08:00
|
|
|
}
|
|
|
|
mutex_exit(&arc_prune_mtx);
|
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* Evict the specified number of bytes from the state specified,
|
|
|
|
* restricting eviction to the spa and type given. This function
|
|
|
|
* prevents us from trying to evict more from a state's list than
|
|
|
|
* is "evictable", and to skip evicting altogether when passed a
|
|
|
|
* negative value for "bytes". In contrast, arc_evict_state() will
|
|
|
|
* evict everything it can, when passed a negative value for "bytes".
|
|
|
|
*/
|
|
|
|
static uint64_t
|
|
|
|
arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
|
|
|
|
arc_buf_contents_t type)
|
|
|
|
{
|
|
|
|
int64_t delta;
|
|
|
|
|
|
|
|
if (bytes > 0 && state->arcs_lsize[type] > 0) {
|
|
|
|
delta = MIN(state->arcs_lsize[type], bytes);
|
|
|
|
return (arc_evict_state(state, spa, delta, type));
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The goal of this function is to evict enough meta data buffers from the
|
|
|
|
* ARC in order to enforce the arc_meta_limit. Achieving this is slightly
|
|
|
|
* more complicated than it appears because it is common for data buffers
|
|
|
|
* to have holds on meta data buffers. In addition, dnode meta data buffers
|
|
|
|
* will be held by the dnodes in the block preventing them from being freed.
|
|
|
|
* This means we can't simply traverse the ARC and expect to always find
|
|
|
|
* enough unheld meta data buffer to release.
|
|
|
|
*
|
|
|
|
* Therefore, this function has been updated to make alternating passes
|
|
|
|
* over the ARC releasing data buffers and then newly unheld meta data
|
|
|
|
* buffers. This ensures forward progress is maintained and arc_meta_used
|
|
|
|
* will decrease. Normally this is sufficient, but if required the ARC
|
|
|
|
* will call the registered prune callbacks causing dentry and inodes to
|
|
|
|
* be dropped from the VFS cache. This will make dnode meta data buffers
|
|
|
|
* available for reclaim.
|
|
|
|
*/
|
|
|
|
static uint64_t
|
2015-05-30 09:57:53 -05:00
|
|
|
arc_adjust_meta_balanced(void)
|
2015-01-12 19:52:19 -08:00
|
|
|
{
|
|
|
|
int64_t adjustmnt, delta, prune = 0;
|
|
|
|
uint64_t total_evicted = 0;
|
|
|
|
arc_buf_contents_t type = ARC_BUFC_DATA;
|
2015-06-26 11:28:18 -07:00
|
|
|
int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
restart:
|
|
|
|
/*
|
|
|
|
* This slightly differs than the way we evict from the mru in
|
|
|
|
* arc_adjust because we don't have a "target" value (i.e. no
|
|
|
|
* "meta" arc_p). As a result, I think we can completely
|
|
|
|
* cannibalize the metadata in the MRU before we evict the
|
|
|
|
* metadata from the MFU. I think we probably need to implement a
|
|
|
|
* "metadata arc_p" value to do this properly.
|
|
|
|
*/
|
|
|
|
adjustmnt = arc_meta_used - arc_meta_limit;
|
|
|
|
|
|
|
|
if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
|
|
|
|
delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
|
|
|
|
total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
|
|
|
|
adjustmnt -= delta;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can't afford to recalculate adjustmnt here. If we do,
|
|
|
|
* new metadata buffers can sneak into the MRU or ANON lists,
|
|
|
|
* thus penalize the MFU metadata. Although the fudge factor is
|
|
|
|
* small, it has been empirically shown to be significant for
|
|
|
|
* certain workloads (e.g. creating many empty directories). As
|
|
|
|
* such, we use the original calculation for adjustmnt, and
|
|
|
|
* simply decrement the amount of data evicted from the MRU.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
|
|
|
|
delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
|
|
|
|
total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
|
|
|
|
}
|
|
|
|
|
|
|
|
adjustmnt = arc_meta_used - arc_meta_limit;
|
|
|
|
|
|
|
|
if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
|
|
|
|
delta = MIN(adjustmnt,
|
|
|
|
arc_mru_ghost->arcs_lsize[type]);
|
|
|
|
total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
|
|
|
|
adjustmnt -= delta;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
|
|
|
|
delta = MIN(adjustmnt,
|
|
|
|
arc_mfu_ghost->arcs_lsize[type]);
|
|
|
|
total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If after attempting to make the requested adjustment to the ARC
|
|
|
|
* the meta limit is still being exceeded then request that the
|
|
|
|
* higher layers drop some cached objects which have holds on ARC
|
|
|
|
* meta buffers. Requests to the upper layers will be made with
|
|
|
|
* increasingly large scan sizes until the ARC is below the limit.
|
|
|
|
*/
|
|
|
|
if (arc_meta_used > arc_meta_limit) {
|
|
|
|
if (type == ARC_BUFC_DATA) {
|
|
|
|
type = ARC_BUFC_METADATA;
|
|
|
|
} else {
|
|
|
|
type = ARC_BUFC_DATA;
|
|
|
|
|
|
|
|
if (zfs_arc_meta_prune) {
|
|
|
|
prune += zfs_arc_meta_prune;
|
2015-05-30 09:57:53 -05:00
|
|
|
arc_prune_async(prune);
|
2015-01-12 19:52:19 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (restarts > 0) {
|
|
|
|
restarts--;
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (total_evicted);
|
|
|
|
}
|
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
/*
|
|
|
|
* Evict metadata buffers from the cache, such that arc_meta_used is
|
|
|
|
* capped by the arc_meta_limit tunable.
|
|
|
|
*/
|
|
|
|
static uint64_t
|
|
|
|
arc_adjust_meta_only(void)
|
|
|
|
{
|
|
|
|
uint64_t total_evicted = 0;
|
|
|
|
int64_t target;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're over the meta limit, we want to evict enough
|
|
|
|
* metadata to get back under the meta limit. We don't want to
|
|
|
|
* evict so much that we drop the MRU below arc_p, though. If
|
|
|
|
* we're over the meta limit more than we're over arc_p, we
|
|
|
|
* evict some from the MRU here, and some from the MFU below.
|
|
|
|
*/
|
|
|
|
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
|
2015-06-26 15:14:45 -07:00
|
|
|
(int64_t)(refcount_count(&arc_anon->arcs_size) +
|
|
|
|
refcount_count(&arc_mru->arcs_size) - arc_p));
|
2015-05-30 09:57:53 -05:00
|
|
|
|
|
|
|
total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Similar to the above, we want to evict enough bytes to get us
|
|
|
|
* below the meta limit, but not so much as to drop us below the
|
|
|
|
* space alloted to the MFU (which is defined as arc_c - arc_p).
|
|
|
|
*/
|
|
|
|
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
|
2015-06-26 15:14:45 -07:00
|
|
|
(int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
|
2015-05-30 09:57:53 -05:00
|
|
|
|
|
|
|
total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
|
|
|
|
|
|
|
return (total_evicted);
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint64_t
|
|
|
|
arc_adjust_meta(void)
|
|
|
|
{
|
|
|
|
if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
|
|
|
|
return (arc_adjust_meta_only());
|
|
|
|
else
|
|
|
|
return (arc_adjust_meta_balanced());
|
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* Return the type of the oldest buffer in the given arc state
|
|
|
|
*
|
|
|
|
* This function will select a random sublist of type ARC_BUFC_DATA and
|
|
|
|
* a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
|
|
|
|
* is compared, and the type which contains the "older" buffer will be
|
|
|
|
* returned.
|
|
|
|
*/
|
|
|
|
static arc_buf_contents_t
|
|
|
|
arc_adjust_type(arc_state_t *state)
|
|
|
|
{
|
|
|
|
multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
|
|
|
|
multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
|
|
|
|
int data_idx = multilist_get_random_index(data_ml);
|
|
|
|
int meta_idx = multilist_get_random_index(meta_ml);
|
|
|
|
multilist_sublist_t *data_mls;
|
|
|
|
multilist_sublist_t *meta_mls;
|
|
|
|
arc_buf_contents_t type;
|
|
|
|
arc_buf_hdr_t *data_hdr;
|
|
|
|
arc_buf_hdr_t *meta_hdr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We keep the sublist lock until we're finished, to prevent
|
|
|
|
* the headers from being destroyed via arc_evict_state().
|
|
|
|
*/
|
|
|
|
data_mls = multilist_sublist_lock(data_ml, data_idx);
|
|
|
|
meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These two loops are to ensure we skip any markers that
|
|
|
|
* might be at the tail of the lists due to arc_evict_state().
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
|
|
|
|
data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
|
|
|
|
if (data_hdr->b_spa != 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
|
|
|
|
meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
|
|
|
|
if (meta_hdr->b_spa != 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data_hdr == NULL && meta_hdr == NULL) {
|
|
|
|
type = ARC_BUFC_DATA;
|
|
|
|
} else if (data_hdr == NULL) {
|
|
|
|
ASSERT3P(meta_hdr, !=, NULL);
|
|
|
|
type = ARC_BUFC_METADATA;
|
|
|
|
} else if (meta_hdr == NULL) {
|
|
|
|
ASSERT3P(data_hdr, !=, NULL);
|
|
|
|
type = ARC_BUFC_DATA;
|
|
|
|
} else {
|
|
|
|
ASSERT3P(data_hdr, !=, NULL);
|
|
|
|
ASSERT3P(meta_hdr, !=, NULL);
|
|
|
|
|
|
|
|
/* The headers can't be on the sublist without an L1 header */
|
|
|
|
ASSERT(HDR_HAS_L1HDR(data_hdr));
|
|
|
|
ASSERT(HDR_HAS_L1HDR(meta_hdr));
|
|
|
|
|
|
|
|
if (data_hdr->b_l1hdr.b_arc_access <
|
|
|
|
meta_hdr->b_l1hdr.b_arc_access) {
|
|
|
|
type = ARC_BUFC_DATA;
|
|
|
|
} else {
|
|
|
|
type = ARC_BUFC_METADATA;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
multilist_sublist_unlock(meta_mls);
|
|
|
|
multilist_sublist_unlock(data_mls);
|
|
|
|
|
|
|
|
return (type);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Evict buffers from the cache, such that arc_size is capped by arc_c.
|
|
|
|
*/
|
|
|
|
static uint64_t
|
|
|
|
arc_adjust(void)
|
|
|
|
{
|
|
|
|
uint64_t total_evicted = 0;
|
|
|
|
uint64_t bytes;
|
|
|
|
int64_t target;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're over arc_meta_limit, we want to correct that before
|
|
|
|
* potentially evicting data buffers below.
|
|
|
|
*/
|
|
|
|
total_evicted += arc_adjust_meta();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Adjust MRU size
|
|
|
|
*
|
|
|
|
* If we're over the target cache size, we want to evict enough
|
|
|
|
* from the list to get back to our target size. We don't want
|
|
|
|
* to evict too much from the MRU, such that it drops below
|
|
|
|
* arc_p. So, if we're over our target cache size more than
|
|
|
|
* the MRU is over arc_p, we'll evict enough to get back to
|
|
|
|
* arc_p here, and then evict more from the MFU below.
|
|
|
|
*/
|
|
|
|
target = MIN((int64_t)(arc_size - arc_c),
|
2015-06-26 15:14:45 -07:00
|
|
|
(int64_t)(refcount_count(&arc_anon->arcs_size) +
|
|
|
|
refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're below arc_meta_min, always prefer to evict data.
|
|
|
|
* Otherwise, try to satisfy the requested number of bytes to
|
|
|
|
* evict from the type which contains older buffers; in an
|
|
|
|
* effort to keep newer buffers in the cache regardless of their
|
|
|
|
* type. If we cannot satisfy the number of bytes from this
|
|
|
|
* type, spill over into the next type.
|
|
|
|
*/
|
|
|
|
if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
|
|
|
|
arc_meta_used > arc_meta_min) {
|
|
|
|
bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
|
|
|
|
total_evicted += bytes;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we couldn't evict our target number of bytes from
|
|
|
|
* metadata, we try to get the rest from data.
|
|
|
|
*/
|
|
|
|
target -= bytes;
|
|
|
|
|
|
|
|
total_evicted +=
|
|
|
|
arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
|
|
|
|
} else {
|
|
|
|
bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
|
|
|
|
total_evicted += bytes;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we couldn't evict our target number of bytes from
|
|
|
|
* data, we try to get the rest from metadata.
|
|
|
|
*/
|
|
|
|
target -= bytes;
|
|
|
|
|
|
|
|
total_evicted +=
|
|
|
|
arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Adjust MFU size
|
|
|
|
*
|
|
|
|
* Now that we've tried to evict enough from the MRU to get its
|
|
|
|
* size back to arc_p, if we're still above the target cache
|
|
|
|
* size, we evict the rest from the MFU.
|
|
|
|
*/
|
|
|
|
target = arc_size - arc_c;
|
|
|
|
|
2015-07-01 17:18:08 +02:00
|
|
|
if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_meta_used > arc_meta_min) {
|
|
|
|
bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
|
|
|
total_evicted += bytes;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we couldn't evict our target number of bytes from
|
|
|
|
* metadata, we try to get the rest from data.
|
|
|
|
*/
|
|
|
|
target -= bytes;
|
|
|
|
|
|
|
|
total_evicted +=
|
|
|
|
arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
|
|
|
|
} else {
|
|
|
|
bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
|
|
|
|
total_evicted += bytes;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we couldn't evict our target number of bytes from
|
|
|
|
* data, we try to get the rest from data.
|
|
|
|
*/
|
|
|
|
target -= bytes;
|
|
|
|
|
|
|
|
total_evicted +=
|
|
|
|
arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Adjust ghost lists
|
|
|
|
*
|
|
|
|
* In addition to the above, the ARC also defines target values
|
|
|
|
* for the ghost lists. The sum of the mru list and mru ghost
|
|
|
|
* list should never exceed the target size of the cache, and
|
|
|
|
* the sum of the mru list, mfu list, mru ghost list, and mfu
|
|
|
|
* ghost list should never exceed twice the target size of the
|
|
|
|
* cache. The following logic enforces these limits on the ghost
|
|
|
|
* caches, and evicts from them as needed.
|
|
|
|
*/
|
2015-06-26 15:14:45 -07:00
|
|
|
target = refcount_count(&arc_mru->arcs_size) +
|
|
|
|
refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
|
|
|
|
total_evicted += bytes;
|
|
|
|
|
|
|
|
target -= bytes;
|
|
|
|
|
|
|
|
total_evicted +=
|
|
|
|
arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We assume the sum of the mru list and mfu list is less than
|
|
|
|
* or equal to arc_c (we enforced this above), which means we
|
|
|
|
* can use the simpler of the two equations below:
|
|
|
|
*
|
|
|
|
* mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
|
|
|
|
* mru ghost + mfu ghost <= arc_c
|
|
|
|
*/
|
2015-06-26 15:14:45 -07:00
|
|
|
target = refcount_count(&arc_mru_ghost->arcs_size) +
|
|
|
|
refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
|
|
|
|
total_evicted += bytes;
|
|
|
|
|
|
|
|
target -= bytes;
|
|
|
|
|
|
|
|
total_evicted +=
|
|
|
|
arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
|
|
|
|
|
|
|
|
return (total_evicted);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
|
|
|
arc_do_user_evicts(void)
|
|
|
|
{
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&arc_user_evicts_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
while (arc_eviction_list != NULL) {
|
|
|
|
arc_buf_t *buf = arc_eviction_list;
|
|
|
|
arc_eviction_list = buf->b_next;
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_enter(&buf->b_evict_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
buf->b_hdr = NULL;
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_exit(&arc_user_evicts_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (buf->b_efunc != NULL)
|
2014-07-15 03:43:18 -04:00
|
|
|
VERIFY0(buf->b_efunc(buf->b_private));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
buf->b_efunc = NULL;
|
|
|
|
buf->b_private = NULL;
|
|
|
|
kmem_cache_free(buf_cache, buf);
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&arc_user_evicts_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_exit(&arc_user_evicts_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
void
|
|
|
|
arc_flush(spa_t *spa, boolean_t retry)
|
2011-12-22 12:20:43 -08:00
|
|
|
{
|
2015-01-12 19:52:19 -08:00
|
|
|
uint64_t guid = 0;
|
2014-01-03 11:40:52 -08:00
|
|
|
|
2015-03-17 15:08:22 -07:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* If retry is TRUE, a spa must not be specified since we have
|
|
|
|
* no good way to determine if all of a spa's buffers have been
|
|
|
|
* evicted from an arc state.
|
2015-03-17 15:08:22 -07:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT(!retry || spa == 0);
|
2009-02-18 12:51:31 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (spa != NULL)
|
2011-11-11 14:07:54 -08:00
|
|
|
guid = spa_load_guid(spa);
|
2009-02-18 12:51:31 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
|
|
|
|
(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
|
|
|
|
|
|
|
|
(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
|
|
|
|
(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
|
|
|
|
|
|
|
|
(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
|
|
|
|
(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
|
|
|
|
(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
arc_do_user_evicts();
|
|
|
|
ASSERT(spa || arc_eviction_list == NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_shrink(int64_t to_free)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2016-01-22 07:37:37 -06:00
|
|
|
uint64_t c = arc_c;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2016-01-22 07:37:37 -06:00
|
|
|
if (c > to_free && c - to_free > arc_c_min) {
|
|
|
|
arc_c = c - to_free;
|
2015-06-26 11:28:18 -07:00
|
|
|
atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
|
2008-11-20 12:01:55 -08:00
|
|
|
if (arc_c > arc_size)
|
|
|
|
arc_c = MAX(arc_size, arc_c_min);
|
|
|
|
if (arc_p > arc_c)
|
|
|
|
arc_p = (arc_c >> 1);
|
|
|
|
ASSERT(arc_c >= arc_c_min);
|
|
|
|
ASSERT((int64_t)arc_p >= 0);
|
2016-01-22 07:37:37 -06:00
|
|
|
} else {
|
|
|
|
arc_c = arc_c_min;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (arc_size > arc_c)
|
2015-01-12 19:52:19 -08:00
|
|
|
(void) arc_adjust();
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/*
 * Identifies which of the checks in arc_available_memory() produced the
 * lowest "available memory" figure.
 */
typedef enum free_memory_reason_t {
	FMR_UNKNOWN = 0,		/* no check lowered the figure */
	FMR_NEEDFREE = 1,		/* needfree pages are outstanding */
	FMR_LOTSFREE = 2,		/* freemem vs. lotsfree/needfree/desfree */
	FMR_SWAPFS_MINFREE = 3,		/* availrmem vs. swapfs reserves */
	FMR_PAGES_PP_MAXIMUM = 4,	/* availrmem vs. pages_pp_maximum */
	FMR_HEAP_ARENA = 5,		/* kernel heap arena (i386 only) */
	FMR_ZIO_ARENA = 6		/* zio arena */
} free_memory_reason_t;

/*
 * Result and reason recorded by the most recent call to
 * arc_available_memory().
 */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

#ifdef _KERNEL
/*
 * Additional reserve of pages for pp_reserve.
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs.
 */
int64_t arc_swapfs_reserve = 64;
#endif /* _KERNEL */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the amount of memory that can be consumed before reclaim will be
|
|
|
|
* needed. Positive if there is sufficient free memory, negative indicates
|
|
|
|
* the amount of memory that needs to be freed up.
|
|
|
|
*/
|
|
|
|
static int64_t
|
|
|
|
arc_available_memory(void)
|
|
|
|
{
|
|
|
|
int64_t lowest = INT64_MAX;
|
|
|
|
free_memory_reason_t r = FMR_UNKNOWN;
|
|
|
|
#ifdef _KERNEL
|
|
|
|
int64_t n;
|
2015-07-27 13:17:32 -07:00
|
|
|
#ifdef __linux__
|
|
|
|
pgcnt_t needfree = btop(arc_need_free);
|
|
|
|
pgcnt_t lotsfree = btop(arc_sys_free);
|
|
|
|
pgcnt_t desfree = 0;
|
|
|
|
#endif
|
2015-06-26 11:28:18 -07:00
|
|
|
|
|
|
|
if (needfree > 0) {
|
|
|
|
n = PAGESIZE * (-needfree);
|
|
|
|
if (n < lowest) {
|
|
|
|
lowest = n;
|
|
|
|
r = FMR_NEEDFREE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* check that we're out of range of the pageout scanner. It starts to
|
|
|
|
* schedule paging if freemem is less than lotsfree and needfree.
|
|
|
|
* lotsfree is the high-water mark for pageout, and needfree is the
|
|
|
|
* number of needed free pages. We add extra pages here to make sure
|
|
|
|
* the scanner doesn't start up while we're freeing memory.
|
|
|
|
*/
|
|
|
|
n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
|
|
|
|
if (n < lowest) {
|
|
|
|
lowest = n;
|
|
|
|
r = FMR_LOTSFREE;
|
|
|
|
}
|
|
|
|
|
2015-07-27 13:17:32 -07:00
|
|
|
#ifndef __linux__
|
2015-06-26 11:28:18 -07:00
|
|
|
/*
|
|
|
|
* check to make sure that swapfs has enough space so that anon
|
|
|
|
* reservations can still succeed. anon_resvmem() checks that the
|
|
|
|
* availrmem is greater than swapfs_minfree, and the number of reserved
|
|
|
|
* swap pages. We also add a bit of extra here just to prevent
|
|
|
|
* circumstances from getting really dire.
|
|
|
|
*/
|
|
|
|
n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
|
|
|
|
desfree - arc_swapfs_reserve);
|
|
|
|
if (n < lowest) {
|
|
|
|
lowest = n;
|
|
|
|
r = FMR_SWAPFS_MINFREE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check that we have enough availrmem that memory locking (e.g., via
|
|
|
|
* mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
|
|
|
|
* stores the number of pages that cannot be locked; when availrmem
|
|
|
|
* drops below pages_pp_maximum, page locking mechanisms such as
|
|
|
|
* page_pp_lock() will fail.)
|
|
|
|
*/
|
|
|
|
n = PAGESIZE * (availrmem - pages_pp_maximum -
|
|
|
|
arc_pages_pp_reserve);
|
|
|
|
if (n < lowest) {
|
|
|
|
lowest = n;
|
|
|
|
r = FMR_PAGES_PP_MAXIMUM;
|
|
|
|
}
|
2015-07-27 13:17:32 -07:00
|
|
|
#endif
|
2015-06-26 11:28:18 -07:00
|
|
|
|
|
|
|
#if defined(__i386)
|
|
|
|
/*
|
|
|
|
* If we're on an i386 platform, it's possible that we'll exhaust the
|
|
|
|
* kernel heap space before we ever run out of available physical
|
|
|
|
* memory. Most checks of the size of the heap_area compare against
|
|
|
|
* tune.t_minarmem, which is the minimum available real memory that we
|
|
|
|
* can have in the system. However, this is generally fixed at 25 pages
|
|
|
|
* which is so low that it's useless. In this comparison, we seek to
|
|
|
|
* calculate the total heap-size, and reclaim if more than 3/4ths of the
|
|
|
|
* heap is allocated. (Or, in the calculation, if less than 1/4th is
|
|
|
|
* free)
|
|
|
|
*/
|
|
|
|
n = vmem_size(heap_arena, VMEM_FREE) -
|
|
|
|
(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
|
|
|
|
if (n < lowest) {
|
|
|
|
lowest = n;
|
|
|
|
r = FMR_HEAP_ARENA;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If zio data pages are being allocated out of a separate heap segment,
|
|
|
|
* then enforce that the size of available vmem for this arena remains
|
|
|
|
* above about 1/16th free.
|
|
|
|
*
|
|
|
|
* Note: The 1/16th arena free requirement was put in place
|
|
|
|
* to aggressively evict memory from the arc in order to avoid
|
|
|
|
* memory fragmentation issues.
|
|
|
|
*/
|
|
|
|
if (zio_arena != NULL) {
|
|
|
|
n = vmem_size(zio_arena, VMEM_FREE) -
|
|
|
|
(vmem_size(zio_arena, VMEM_ALLOC) >> 4);
|
|
|
|
if (n < lowest) {
|
|
|
|
lowest = n;
|
|
|
|
r = FMR_ZIO_ARENA;
|
|
|
|
}
|
|
|
|
}
|
2015-07-27 13:17:32 -07:00
|
|
|
#else /* _KERNEL */
|
2015-06-26 11:28:18 -07:00
|
|
|
/* Every 100 calls, free a small amount */
|
|
|
|
if (spa_get_random(100) == 0)
|
|
|
|
lowest = -1024;
|
2015-07-27 13:17:32 -07:00
|
|
|
#endif /* _KERNEL */
|
2015-06-26 11:28:18 -07:00
|
|
|
|
|
|
|
last_free_memory = lowest;
|
|
|
|
last_free_reason = r;
|
|
|
|
|
|
|
|
return (lowest);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine if the system is under memory pressure and is asking
|
|
|
|
* to reclaim memory. A return value of TRUE indicates that the system
|
|
|
|
* is under memory pressure and that the arc should adjust accordingly.
|
|
|
|
*/
|
|
|
|
static boolean_t
|
|
|
|
arc_reclaim_needed(void)
|
|
|
|
{
|
|
|
|
return (arc_available_memory() < 0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_kmem_reap_now(void)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
kmem_cache_t *prev_cache = NULL;
|
|
|
|
kmem_cache_t *prev_data_cache = NULL;
|
|
|
|
extern kmem_cache_t *zio_buf_cache[];
|
|
|
|
extern kmem_cache_t *zio_data_buf_cache[];
|
2015-06-24 15:48:22 -07:00
|
|
|
extern kmem_cache_t *range_seg_cache;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
|
|
|
|
/*
|
|
|
|
* We are exceeding our meta-data cache limit.
|
|
|
|
* Prune some entries to release holds on meta-data.
|
|
|
|
*/
|
2015-09-23 15:59:04 -07:00
|
|
|
arc_prune_async(zfs_arc_meta_prune);
|
2015-05-30 09:57:53 -05:00
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
|
2015-10-30 14:34:22 -07:00
|
|
|
#ifdef _ILP32
|
|
|
|
/* reach upper limit of cache size on 32-bit */
|
|
|
|
if (zio_buf_cache[i] == NULL)
|
|
|
|
break;
|
|
|
|
#endif
|
2008-11-20 12:01:55 -08:00
|
|
|
if (zio_buf_cache[i] != prev_cache) {
|
|
|
|
prev_cache = zio_buf_cache[i];
|
|
|
|
kmem_cache_reap_now(zio_buf_cache[i]);
|
|
|
|
}
|
|
|
|
if (zio_data_buf_cache[i] != prev_data_cache) {
|
|
|
|
prev_data_cache = zio_data_buf_cache[i];
|
|
|
|
kmem_cache_reap_now(zio_data_buf_cache[i]);
|
|
|
|
}
|
|
|
|
}
|
2015-01-12 19:52:19 -08:00
|
|
|
kmem_cache_reap_now(buf_cache);
|
2014-12-29 19:12:23 -08:00
|
|
|
kmem_cache_reap_now(hdr_full_cache);
|
|
|
|
kmem_cache_reap_now(hdr_l2only_cache);
|
2015-06-24 15:48:22 -07:00
|
|
|
kmem_cache_reap_now(range_seg_cache);
|
2015-06-26 11:28:18 -07:00
|
|
|
|
|
|
|
if (zio_arena != NULL) {
|
|
|
|
/*
|
|
|
|
* Ask the vmem arena to reclaim unused memory from its
|
|
|
|
* quantum caches.
|
|
|
|
*/
|
|
|
|
vmem_qcache_reap(zio_arena);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2012-03-13 14:29:16 -07:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* Threads can block in arc_get_data_buf() waiting for this thread to evict
|
|
|
|
* enough data and signal them to proceed. When this happens, the threads in
|
|
|
|
* arc_get_data_buf() are sleeping while holding the hash lock for their
|
|
|
|
* particular arc header. Thus, we must be careful to never sleep on a
|
|
|
|
* hash lock in this thread. This is to prevent the following deadlock:
|
|
|
|
*
|
|
|
|
* - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
|
|
|
|
* waiting for the reclaim thread to signal it.
|
|
|
|
*
|
|
|
|
* - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
|
|
|
|
* fails, and goes to sleep forever.
|
|
|
|
*
|
|
|
|
* This possible deadlock is avoided by always acquiring a hash lock
|
|
|
|
* using mutex_tryenter() from arc_reclaim_thread().
|
2012-03-13 14:29:16 -07:00
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_reclaim_thread(void)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-06-26 11:28:18 -07:00
|
|
|
fstrans_cookie_t cookie = spl_fstrans_mark();
|
2016-05-06 12:35:52 -04:00
|
|
|
hrtime_t growtime = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
callb_cpr_t cpr;
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&arc_reclaim_lock);
|
2015-06-26 11:28:18 -07:00
|
|
|
while (!arc_reclaim_thread_exit) {
|
|
|
|
int64_t to_free;
|
|
|
|
int64_t free_memory = arc_available_memory();
|
|
|
|
uint64_t evicted = 0;
|
2012-03-13 14:29:16 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_tuning_update();
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
mutex_exit(&arc_reclaim_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
if (free_memory < 0) {
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_no_grow = B_TRUE;
|
2008-12-03 12:09:06 -08:00
|
|
|
arc_warm = B_TRUE;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/*
|
|
|
|
* Wait at least zfs_grow_retry (default 5) seconds
|
|
|
|
* before considering growing.
|
|
|
|
*/
|
2016-05-06 12:35:52 -04:00
|
|
|
growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
|
2011-03-30 18:59:17 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_kmem_reap_now();
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/*
|
|
|
|
* If we are still low on memory, shrink the ARC
|
|
|
|
* so that we have arc_shrink_min free space.
|
|
|
|
*/
|
|
|
|
free_memory = arc_available_memory();
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
to_free = (arc_c >> arc_shrink_shift) - free_memory;
|
|
|
|
if (to_free > 0) {
|
|
|
|
#ifdef _KERNEL
|
2015-07-27 13:17:32 -07:00
|
|
|
to_free = MAX(to_free, arc_need_free);
|
2015-06-26 11:28:18 -07:00
|
|
|
#endif
|
|
|
|
arc_shrink(to_free);
|
|
|
|
}
|
|
|
|
} else if (free_memory < arc_c >> arc_no_grow_shift) {
|
|
|
|
arc_no_grow = B_TRUE;
|
2016-05-06 12:35:52 -04:00
|
|
|
} else if (gethrtime() >= growtime) {
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_no_grow = B_FALSE;
|
|
|
|
}
|
2013-07-24 10:14:11 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
evicted = arc_adjust();
|
2013-07-24 10:14:11 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
mutex_enter(&arc_reclaim_lock);
|
2013-07-24 10:14:11 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/*
|
|
|
|
* If evicted is zero, we couldn't evict anything via
|
|
|
|
* arc_adjust(). This could be due to hash lock
|
|
|
|
* collisions, but more likely due to the majority of
|
|
|
|
* arc buffers being unevictable. Therefore, even if
|
|
|
|
* arc_size is above arc_c, another pass is unlikely to
|
|
|
|
* be helpful and could potentially cause us to enter an
|
|
|
|
* infinite loop.
|
|
|
|
*/
|
|
|
|
if (arc_size <= arc_c || evicted == 0) {
|
|
|
|
/*
|
|
|
|
* We're either no longer overflowing, or we
|
|
|
|
* can't evict anything more, so we should wake
|
2015-07-27 13:17:32 -07:00
|
|
|
* up any threads before we go to sleep and clear
|
|
|
|
* arc_need_free since nothing more can be done.
|
2015-06-26 11:28:18 -07:00
|
|
|
*/
|
|
|
|
cv_broadcast(&arc_reclaim_waiters_cv);
|
2015-07-27 13:17:32 -07:00
|
|
|
arc_need_free = 0;
|
2013-07-24 10:14:11 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/*
|
|
|
|
* Block until signaled, or after one second (we
|
|
|
|
* might need to perform arc_kmem_reap_now()
|
|
|
|
* even if we aren't being signalled)
|
|
|
|
*/
|
|
|
|
CALLB_CPR_SAFE_BEGIN(&cpr);
|
2016-05-11 16:55:48 -07:00
|
|
|
(void) cv_timedwait_sig_hires(&arc_reclaim_thread_cv,
|
2016-05-06 12:35:52 -04:00
|
|
|
&arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
|
2015-06-26 11:28:18 -07:00
|
|
|
CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
|
|
|
|
}
|
2015-01-12 19:52:19 -08:00
|
|
|
}
|
2013-07-24 10:14:11 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_reclaim_thread_exit = FALSE;
|
2015-01-12 19:52:19 -08:00
|
|
|
cv_broadcast(&arc_reclaim_thread_cv);
|
|
|
|
CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
|
|
|
|
spl_fstrans_unmark(cookie);
|
|
|
|
thread_exit();
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
arc_user_evicts_thread(void)
|
|
|
|
{
|
2015-06-26 11:28:18 -07:00
|
|
|
fstrans_cookie_t cookie = spl_fstrans_mark();
|
2015-01-12 19:52:19 -08:00
|
|
|
callb_cpr_t cpr;
|
2013-07-24 10:14:11 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
|
2013-07-24 10:14:11 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&arc_user_evicts_lock);
|
|
|
|
while (!arc_user_evicts_thread_exit) {
|
|
|
|
mutex_exit(&arc_user_evicts_lock);
|
|
|
|
|
|
|
|
arc_do_user_evicts();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is necessary in order for the mdb ::arc dcmd to
|
|
|
|
* show up to date information. Since the ::arc command
|
|
|
|
* does not call the kstat's update function, without
|
|
|
|
* this call, the command may show stale stats for the
|
|
|
|
* anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
|
|
|
|
* with this change, the data might be up to 1 second
|
|
|
|
* out of date; but that should suffice. The arc_state_t
|
|
|
|
* structures can be queried directly if more accurate
|
|
|
|
* information is needed.
|
|
|
|
*/
|
|
|
|
if (arc_ksp != NULL)
|
|
|
|
arc_ksp->ks_update(arc_ksp, KSTAT_READ);
|
|
|
|
|
|
|
|
mutex_enter(&arc_user_evicts_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Block until signaled, or after one second (we need to
|
|
|
|
* call the arc's kstat update function regularly).
|
|
|
|
*/
|
|
|
|
CALLB_CPR_SAFE_BEGIN(&cpr);
|
2015-06-11 10:47:19 -07:00
|
|
|
(void) cv_timedwait_sig(&arc_user_evicts_cv,
|
2015-01-12 19:52:19 -08:00
|
|
|
&arc_user_evicts_lock, ddi_get_lbolt() + hz);
|
|
|
|
CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_user_evicts_thread_exit = FALSE;
|
|
|
|
cv_broadcast(&arc_user_evicts_cv);
|
|
|
|
CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */
|
2015-03-30 22:43:29 -05:00
|
|
|
spl_fstrans_unmark(cookie);
|
2008-11-20 12:01:55 -08:00
|
|
|
thread_exit();
|
|
|
|
}
|
|
|
|
|
2011-03-29 18:08:59 -07:00
|
|
|
#ifdef _KERNEL
|
|
|
|
/*
|
2012-03-13 14:29:16 -07:00
|
|
|
* Determine the amount of memory eligible for eviction contained in the
|
|
|
|
* ARC. All clean data reported by the ghost lists can always be safely
|
|
|
|
* evicted. Due to arc_c_min, the same does not hold for all clean data
|
|
|
|
* contained by the regular mru and mfu lists.
|
|
|
|
*
|
|
|
|
* In the case of the regular mru and mfu lists, we need to report as
|
|
|
|
* much clean data as possible, such that evicting that same reported
|
|
|
|
* data will not bring arc_size below arc_c_min. Thus, in certain
|
|
|
|
* circumstances, the total amount of clean data in the mru and mfu
|
|
|
|
* lists might not actually be evictable.
|
|
|
|
*
|
|
|
|
* The following two distinct cases are accounted for:
|
|
|
|
*
|
|
|
|
* 1. The sum of the amount of dirty data contained by both the mru and
|
|
|
|
* mfu lists, plus the ARC's other accounting (e.g. the anon list),
|
|
|
|
* is greater than or equal to arc_c_min.
|
|
|
|
* (i.e. amount of dirty data >= arc_c_min)
|
|
|
|
*
|
|
|
|
* This is the easy case; all clean data contained by the mru and mfu
|
|
|
|
* lists is evictable. Evicting all clean data can only drop arc_size
|
|
|
|
* to the amount of dirty data, which is greater than arc_c_min.
|
|
|
|
*
|
|
|
|
* 2. The sum of the amount of dirty data contained by both the mru and
|
|
|
|
* mfu lists, plus the ARC's other accounting (e.g. the anon list),
|
|
|
|
* is less than arc_c_min.
|
|
|
|
* (i.e. arc_c_min > amount of dirty data)
|
|
|
|
*
|
|
|
|
* 2.1. arc_size is greater than or equal arc_c_min.
|
|
|
|
* (i.e. arc_size >= arc_c_min > amount of dirty data)
|
|
|
|
*
|
|
|
|
* In this case, not all clean data from the regular mru and mfu
|
|
|
|
* lists is actually evictable; we must leave enough clean data
|
|
|
|
* to keep arc_size above arc_c_min. Thus, the maximum amount of
|
|
|
|
* evictable data from the two lists combined, is exactly the
|
|
|
|
* difference between arc_size and arc_c_min.
|
|
|
|
*
|
|
|
|
* 2.2. arc_size is less than arc_c_min
|
|
|
|
* (i.e. arc_c_min > arc_size > amount of dirty data)
|
|
|
|
*
|
|
|
|
* In this case, none of the data contained in the mru and mfu
|
|
|
|
* lists is evictable, even if it's clean. Since arc_size is
|
|
|
|
* already below arc_c_min, evicting any more would only
|
|
|
|
* increase this negative difference.
|
2011-03-29 18:08:59 -07:00
|
|
|
*/
|
2012-03-13 14:29:16 -07:00
|
|
|
static uint64_t
|
|
|
|
arc_evictable_memory(void) {
|
|
|
|
uint64_t arc_clean =
|
|
|
|
arc_mru->arcs_lsize[ARC_BUFC_DATA] +
|
|
|
|
arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
|
|
|
|
arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
|
|
|
|
arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
|
|
|
|
uint64_t ghost_clean =
|
|
|
|
arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
|
|
|
|
arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
|
|
|
|
arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
|
|
|
|
arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
|
|
|
|
uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
|
|
|
|
|
|
|
|
if (arc_dirty >= arc_c_min)
|
|
|
|
return (ghost_clean + arc_clean);
|
|
|
|
|
|
|
|
return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
|
|
|
|
}
|
|
|
|
|
2014-10-02 07:21:08 -05:00
|
|
|
/*
|
|
|
|
* If sc->nr_to_scan is zero, the caller is requesting a query of the
|
|
|
|
* number of objects which can potentially be freed. If it is nonzero,
|
|
|
|
* the request is to free that many objects.
|
|
|
|
*
|
|
|
|
* Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
|
|
|
|
* in struct shrinker and also require the shrinker to return the number
|
|
|
|
* of objects freed.
|
|
|
|
*
|
|
|
|
* Older kernels require the shrinker to return the number of freeable
|
|
|
|
* objects following the freeing of nr_to_free.
|
|
|
|
*/
|
|
|
|
static spl_shrinker_t
|
2011-06-21 14:26:51 -07:00
|
|
|
__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
|
2011-03-29 18:08:59 -07:00
|
|
|
{
|
2014-10-02 07:21:08 -05:00
|
|
|
int64_t pages;
|
2011-03-29 18:08:59 -07:00
|
|
|
|
2012-03-13 14:29:16 -07:00
|
|
|
/* The arc is considered warm once reclaim has occurred */
|
|
|
|
if (unlikely(arc_warm == B_FALSE))
|
|
|
|
arc_warm = B_TRUE;
|
2011-03-29 18:08:59 -07:00
|
|
|
|
2012-03-13 14:29:16 -07:00
|
|
|
/* Return the potential number of reclaimable pages */
|
2014-10-02 07:21:08 -05:00
|
|
|
pages = btop((int64_t)arc_evictable_memory());
|
2012-03-13 14:29:16 -07:00
|
|
|
if (sc->nr_to_scan == 0)
|
|
|
|
return (pages);
|
2011-05-09 12:18:46 -07:00
|
|
|
|
|
|
|
/* Not allowed to perform filesystem reclaim */
|
2011-06-21 14:26:51 -07:00
|
|
|
if (!(sc->gfp_mask & __GFP_FS))
|
2014-10-02 07:21:08 -05:00
|
|
|
return (SHRINK_STOP);
|
2011-05-09 12:18:46 -07:00
|
|
|
|
2011-03-29 18:08:59 -07:00
|
|
|
/* Reclaim in progress */
|
2015-01-12 19:52:19 -08:00
|
|
|
if (mutex_tryenter(&arc_reclaim_lock) == 0)
|
2014-10-02 07:21:08 -05:00
|
|
|
return (SHRINK_STOP);
|
2011-03-29 18:08:59 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_exit(&arc_reclaim_lock);
|
|
|
|
|
2012-03-13 14:29:16 -07:00
|
|
|
/*
|
|
|
|
* Evict the requested number of pages by shrinking arc_c the
|
|
|
|
* requested amount. If there is nothing left to evict just
|
|
|
|
* reap whatever we can from the various arc slabs.
|
|
|
|
*/
|
|
|
|
if (pages > 0) {
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_shrink(ptob(sc->nr_to_scan));
|
|
|
|
arc_kmem_reap_now();
|
2014-10-02 07:21:08 -05:00
|
|
|
#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
|
|
|
|
pages = MAX(pages - btop(arc_evictable_memory()), 0);
|
|
|
|
#else
|
2013-12-23 11:34:20 -08:00
|
|
|
pages = btop(arc_evictable_memory());
|
2014-10-02 07:21:08 -05:00
|
|
|
#endif
|
2012-03-13 14:29:16 -07:00
|
|
|
} else {
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_kmem_reap_now();
|
2014-10-02 07:21:08 -05:00
|
|
|
pages = SHRINK_STOP;
|
2012-03-13 14:29:16 -07:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* We've reaped what we can, wake up threads.
|
|
|
|
*/
|
|
|
|
cv_broadcast(&arc_reclaim_waiters_cv);
|
|
|
|
|
2012-03-13 14:29:16 -07:00
|
|
|
/*
|
|
|
|
* When direct reclaim is observed it usually indicates a rapid
|
|
|
|
* increase in memory pressure. This occurs because the kswapd
|
|
|
|
* threads were unable to asynchronously keep enough free memory
|
|
|
|
* available. In this case set arc_no_grow to briefly pause arc
|
|
|
|
* growth to avoid compounding the memory pressure.
|
|
|
|
*/
|
2011-03-29 18:08:59 -07:00
|
|
|
if (current_is_kswapd()) {
|
2012-03-13 14:29:16 -07:00
|
|
|
ARCSTAT_BUMP(arcstat_memory_indirect_count);
|
2011-03-29 18:08:59 -07:00
|
|
|
} else {
|
2012-03-13 14:29:16 -07:00
|
|
|
arc_no_grow = B_TRUE;
|
2015-07-27 13:17:32 -07:00
|
|
|
arc_need_free = ptob(sc->nr_to_scan);
|
2012-03-13 14:29:16 -07:00
|
|
|
ARCSTAT_BUMP(arcstat_memory_direct_count);
|
2011-03-29 18:08:59 -07:00
|
|
|
}
|
|
|
|
|
2013-12-23 11:34:20 -08:00
|
|
|
return (pages);
|
2011-03-29 18:08:59 -07:00
|
|
|
}
|
2011-06-21 14:26:51 -07:00
|
|
|
SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
|
2011-03-29 18:08:59 -07:00
|
|
|
|
|
|
|
SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
|
|
|
|
#endif /* _KERNEL */
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Adapt arc info given the number of bytes we are trying to add and
|
|
|
|
* the state that we are comming from. This function is only called
|
|
|
|
* when we are adding new content to the cache.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
arc_adapt(int bytes, arc_state_t *state)
|
|
|
|
{
|
|
|
|
int mult;
|
2015-06-26 15:59:23 -07:00
|
|
|
uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
|
2015-06-26 15:14:45 -07:00
|
|
|
int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
|
|
|
|
int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (state == arc_l2c_only)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT(bytes > 0);
|
|
|
|
/*
|
|
|
|
* Adapt the target size of the MRU list:
|
|
|
|
* - if we just hit in the MRU ghost list, then increase
|
|
|
|
* the target size of the MRU list.
|
|
|
|
* - if we just hit in the MFU ghost list, then increase
|
|
|
|
* the target size of the MFU list by decreasing the
|
|
|
|
* target size of the MRU list.
|
|
|
|
*/
|
|
|
|
if (state == arc_mru_ghost) {
|
2015-06-26 15:14:45 -07:00
|
|
|
mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
|
2014-01-03 10:36:26 -08:00
|
|
|
if (!zfs_arc_p_dampener_disable)
|
|
|
|
mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 15:59:23 -07:00
|
|
|
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else if (state == arc_mfu_ghost) {
|
2009-02-18 12:51:31 -08:00
|
|
|
uint64_t delta;
|
|
|
|
|
2015-06-26 15:14:45 -07:00
|
|
|
mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
|
2014-01-03 10:36:26 -08:00
|
|
|
if (!zfs_arc_p_dampener_disable)
|
|
|
|
mult = MIN(mult, 10);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2009-02-18 12:51:31 -08:00
|
|
|
delta = MIN(bytes * mult, arc_p);
|
2015-06-26 15:59:23 -07:00
|
|
|
arc_p = MAX(arc_p_min, arc_p - delta);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
ASSERT((int64_t)arc_p >= 0);
|
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
if (arc_reclaim_needed()) {
|
|
|
|
cv_signal(&arc_reclaim_thread_cv);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (arc_no_grow)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (arc_c >= arc_c_max)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're within (2 * maxblocksize) bytes of the target
|
|
|
|
* cache size, increment the target cache size
|
|
|
|
*/
|
2015-10-13 09:17:01 -07:00
|
|
|
ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
|
2015-06-04 08:06:27 -05:00
|
|
|
if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
atomic_add_64(&arc_c, (int64_t)bytes);
|
|
|
|
if (arc_c > arc_c_max)
|
|
|
|
arc_c = arc_c_max;
|
|
|
|
else if (state == arc_anon)
|
|
|
|
atomic_add_64(&arc_p, (int64_t)bytes);
|
|
|
|
if (arc_p > arc_c)
|
|
|
|
arc_p = arc_c;
|
|
|
|
}
|
|
|
|
ASSERT((int64_t)arc_p >= 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* Check if arc_size has grown past our upper threshold, determined by
|
|
|
|
* zfs_arc_overflow_shift.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
static boolean_t
|
|
|
|
arc_is_overflowing(void)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-01-12 19:52:19 -08:00
|
|
|
/* Always allow at least one block of overflow */
|
|
|
|
uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
|
|
|
|
arc_c >> zfs_arc_overflow_shift);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
return (arc_size >= arc_c + overflow);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* The buffer, supplied as the first argument, needs a data block. If we
|
|
|
|
* are hitting the hard limit for the cache size, we must sleep, waiting
|
|
|
|
* for the eviction thread to catch up. If we're past the target size
|
|
|
|
* but below the hard limit, we'll only signal the reclaim thread and
|
|
|
|
* continue on.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
arc_get_data_buf(arc_buf_t *buf)
|
|
|
|
{
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t size = buf->b_hdr->b_size;
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
arc_adapt(size, state);
|
|
|
|
|
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* If arc_size is currently overflowing, and has grown past our
|
|
|
|
* upper limit, we must be adding data faster than the evict
|
|
|
|
* thread can evict. Thus, to ensure we don't compound the
|
|
|
|
* problem by adding more data and forcing arc_size to grow even
|
|
|
|
* further past it's target size, we halt and wait for the
|
|
|
|
* eviction thread to catch up.
|
|
|
|
*
|
|
|
|
* It's also possible that the reclaim thread is unable to evict
|
|
|
|
* enough buffers to get arc_size below the overflow limit (e.g.
|
|
|
|
* due to buffers being un-evictable, or hash lock collisions).
|
|
|
|
* In this case, we want to proceed regardless if we're
|
|
|
|
* overflowing; thus we don't use a while loop here.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
if (arc_is_overflowing()) {
|
|
|
|
mutex_enter(&arc_reclaim_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now that we've acquired the lock, we may no longer be
|
|
|
|
* over the overflow limit, lets check.
|
|
|
|
*
|
|
|
|
* We're ignoring the case of spurious wake ups. If that
|
|
|
|
* were to happen, it'd let this thread consume an ARC
|
|
|
|
* buffer before it should have (i.e. before we're under
|
|
|
|
* the overflow limit and were signalled by the reclaim
|
|
|
|
* thread). As long as that is a rare occurrence, it
|
|
|
|
* shouldn't cause any harm.
|
|
|
|
*/
|
|
|
|
if (arc_is_overflowing()) {
|
|
|
|
cv_signal(&arc_reclaim_thread_cv);
|
|
|
|
cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_exit(&arc_reclaim_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2011-12-22 12:20:43 -08:00
|
|
|
|
Prioritize "metadata" in arc_get_data_buf
When the arc is at it's size limit and a new buffer is added, data will
be evicted (or recycled) from the arc to make room for this new buffer.
As far as I can tell, this is to try and keep the arc from over stepping
it's bounds (i.e. keep it below the size limitation placed on it).
This makes sense conceptually, but there appears to be a subtle flaw in
its current implementation, resulting in metadata buffers being
throttled. When it evicts from the arc's lists, it also passes in a
"type" so as to remove a buffer of the same type that it is adding. The
problem with this is that once the size limit is hit, the ratio of
"metadata" to "data" contained in the arc essentially becomes fixed.
For example, consider the following scenario:
* the size of the arc is capped at 10G
* the meta_limit is capped at 4G
* 9G of the arc contains "data"
* 1G of the arc contains "metadata"
Now, every time a new "metadata" buffer is created and added to the arc,
an older "metadata" buffer(s) will be removed from the arc; preserving
the 9G "data" to 1G "metadata" ratio that was in-place when the size
limit was reached. This occurs even though the amount of "metadata" is
far below the "metadata" limit. This can result in the arc behaving
pathologically for certain workloads.
To fix this, the arc_get_data_buf function was modified to evict "data"
from the arc even when adding a "metadata" buffer; unless it's at the
"metadata" limit. In addition, arc_evict now more closely resembles
arc_evict_ghost; such that when evicting "data" from the arc, it may
make a second pass over the arc lists and evict "metadata" if it cannot
meet the eviction size the first time around.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #2110
2013-12-30 09:30:00 -08:00
|
|
|
if (type == ARC_BUFC_METADATA) {
|
2015-01-12 19:52:19 -08:00
|
|
|
buf->b_data = zio_buf_alloc(size);
|
|
|
|
arc_space_consume(size, ARC_SPACE_META);
|
|
|
|
} else {
|
|
|
|
ASSERT(type == ARC_BUFC_DATA);
|
|
|
|
buf->b_data = zio_data_buf_alloc(size);
|
|
|
|
arc_space_consume(size, ARC_SPACE_DATA);
|
Prioritize "metadata" in arc_get_data_buf
When the arc is at it's size limit and a new buffer is added, data will
be evicted (or recycled) from the arc to make room for this new buffer.
As far as I can tell, this is to try and keep the arc from over stepping
it's bounds (i.e. keep it below the size limitation placed on it).
This makes sense conceptually, but there appears to be a subtle flaw in
its current implementation, resulting in metadata buffers being
throttled. When it evicts from the arc's lists, it also passes in a
"type" so as to remove a buffer of the same type that it is adding. The
problem with this is that once the size limit is hit, the ratio of
"metadata" to "data" contained in the arc essentially becomes fixed.
For example, consider the following scenario:
* the size of the arc is capped at 10G
* the meta_limit is capped at 4G
* 9G of the arc contains "data"
* 1G of the arc contains "metadata"
Now, every time a new "metadata" buffer is created and added to the arc,
an older "metadata" buffer(s) will be removed from the arc; preserving
the 9G "data" to 1G "metadata" ratio that was in-place when the size
limit was reached. This occurs even though the amount of "metadata" is
far below the "metadata" limit. This can result in the arc behaving
pathologically for certain workloads.
To fix this, the arc_get_data_buf function was modified to evict "data"
from the arc even when adding a "metadata" buffer; unless it's at the
"metadata" limit. In addition, arc_evict now more closely resembles
arc_evict_ghost; such that when evicting "data" from the arc, it may
make a second pass over the arc lists and evict "metadata" if it cannot
meet the eviction size the first time around.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #2110
2013-12-30 09:30:00 -08:00
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Update the state size. Note that ghost states have a
|
|
|
|
* "ghost size" and so don't need to be updated.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
2015-06-26 15:14:45 -07:00
|
|
|
arc_state_t *state = hdr->b_l1hdr.b_state;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 15:14:45 -07:00
|
|
|
(void) refcount_add_many(&state->arcs_size, size, buf);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is reached via arc_read, the link is
|
|
|
|
* protected by the hash lock. If reached via
|
|
|
|
* arc_buf_alloc, the header should not be accessed by
|
|
|
|
* any other thread. And, if reached via arc_read_done,
|
|
|
|
* the hash lock will protect it if it's found in the
|
|
|
|
* hash table; otherwise no other thread should be
|
|
|
|
* trying to [add|remove]_reference it.
|
|
|
|
*/
|
|
|
|
if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
|
|
|
atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
|
|
|
|
size);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we are growing the cache, and we are adding anonymous
|
|
|
|
* data, and we have outgrown arc_p, update arc_p
|
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
|
2015-06-26 15:14:45 -07:00
|
|
|
(refcount_count(&arc_anon->arcs_size) +
|
|
|
|
refcount_count(&arc_mru->arcs_size) > arc_p))
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_p = MIN(arc_c, arc_p + size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This routine is called whenever a buffer is accessed.
|
|
|
|
* NOTE: the hash lock is dropped in this function.
|
|
|
|
*/
|
|
|
|
static void
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2010-05-28 13:45:14 -07:00
|
|
|
clock_t now;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(MUTEX_HELD(hash_lock));
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_l1hdr.b_state == arc_anon) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* This buffer is not in the cache, and does not
|
|
|
|
* appear in our "ghost" list. Add the new buffer
|
|
|
|
* to the MRU state.
|
|
|
|
*/
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT0(hdr->b_l1hdr.b_arc_access);
|
|
|
|
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
|
2014-12-06 09:24:32 -08:00
|
|
|
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
|
|
|
|
arc_change_state(arc_mru, hdr, hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
} else if (hdr->b_l1hdr.b_state == arc_mru) {
|
2010-05-28 13:45:14 -07:00
|
|
|
now = ddi_get_lbolt();
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* If this buffer is here because of a prefetch, then either:
|
|
|
|
* - clear the flag if this is a "referencing" read
|
|
|
|
* (any subsequent access will bump this into the MFU state).
|
|
|
|
* or
|
|
|
|
* - move the buffer to the head of the list if this is
|
|
|
|
* another prefetch (to make it less likely to be evicted).
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_PREFETCH(hdr)) {
|
|
|
|
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
|
2015-01-12 19:52:19 -08:00
|
|
|
/* link protected by hash lock */
|
|
|
|
ASSERT(multilist_link_active(
|
2014-12-29 19:12:23 -08:00
|
|
|
&hdr->b_l1hdr.b_arc_node));
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
|
2014-12-29 19:12:23 -08:00
|
|
|
atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_mru_hits);
|
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_arc_access = now;
|
2008-11-20 12:01:55 -08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This buffer has been "accessed" only once so far,
|
|
|
|
* but it is still in the cache. Move it to the MFU
|
|
|
|
* state.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
|
|
|
|
ARC_MINTIME)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* More than 125ms have passed since we
|
|
|
|
* instantiated this buffer. Move it to the
|
|
|
|
* most frequently used state.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_arc_access = now;
|
2014-12-06 09:24:32 -08:00
|
|
|
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
|
|
|
|
arc_change_state(arc_mfu, hdr, hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_mru_hits);
|
2014-12-29 19:12:23 -08:00
|
|
|
} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_state_t *new_state;
|
|
|
|
/*
|
|
|
|
* This buffer has been "accessed" recently, but
|
|
|
|
* was evicted from the cache. Move it to the
|
|
|
|
* MFU state.
|
|
|
|
*/
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_PREFETCH(hdr)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
new_state = arc_mru;
|
2014-12-29 19:12:23 -08:00
|
|
|
if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
|
|
|
|
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
|
|
|
new_state = arc_mfu;
|
2014-12-06 09:24:32 -08:00
|
|
|
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_change_state(new_state, hdr, hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
|
2014-12-29 19:12:23 -08:00
|
|
|
} else if (hdr->b_l1hdr.b_state == arc_mfu) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* This buffer has been accessed more than once and is
|
|
|
|
* still in the cache. Keep it in the MFU state.
|
|
|
|
*
|
|
|
|
* NOTE: an add_reference() that occurred when we did
|
|
|
|
* the arc_read() will have kicked this off the list.
|
|
|
|
* If it was a prefetch, we will explicitly move it to
|
|
|
|
* the head of the list now.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if ((HDR_PREFETCH(hdr)) != 0) {
|
|
|
|
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
2015-01-12 19:52:19 -08:00
|
|
|
/* link protected by hash_lock */
|
|
|
|
ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_mfu_hits);
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
|
|
|
|
} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_state_t *new_state = arc_mfu;
|
|
|
|
/*
|
|
|
|
* This buffer has been accessed more than once but has
|
|
|
|
* been evicted from the cache. Move it back to the
|
|
|
|
* MFU state.
|
|
|
|
*/
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_PREFETCH(hdr)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* This is a prefetch access...
|
|
|
|
* move this block back to the MRU state.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
|
2008-11-20 12:01:55 -08:00
|
|
|
new_state = arc_mru;
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
|
2014-12-06 09:24:32 -08:00
|
|
|
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
|
|
|
|
arc_change_state(new_state, hdr, hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
|
2014-12-29 19:12:23 -08:00
|
|
|
} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* This buffer is on the 2nd Level ARC.
|
|
|
|
*/
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
|
2014-12-06 09:24:32 -08:00
|
|
|
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
|
|
|
|
arc_change_state(arc_mfu, hdr, hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
2014-12-29 19:12:23 -08:00
|
|
|
cmn_err(CE_PANIC, "invalid arc state 0x%p",
|
|
|
|
hdr->b_l1hdr.b_state);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* a generic arc_done_func_t which you can use */
|
|
|
|
/* ARGSUSED */
|
|
|
|
void
|
|
|
|
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
|
|
|
|
{
|
2010-05-28 13:45:14 -07:00
|
|
|
if (zio == NULL || zio->io_error == 0)
|
|
|
|
bcopy(buf->b_data, arg, buf->b_hdr->b_size);
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, arg));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* a generic arc_done_func_t */
|
|
|
|
void
|
|
|
|
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
|
|
|
|
{
|
|
|
|
arc_buf_t **bufp = arg;
|
|
|
|
if (zio && zio->io_error) {
|
2013-09-04 07:00:57 -05:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, arg));
|
2008-11-20 12:01:55 -08:00
|
|
|
*bufp = NULL;
|
|
|
|
} else {
|
|
|
|
*bufp = buf;
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(buf->b_data);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
arc_read_done(zio_t *zio)
|
|
|
|
{
|
2014-06-05 13:19:08 -08:00
|
|
|
arc_buf_hdr_t *hdr;
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_t *buf;
|
|
|
|
arc_buf_t *abuf; /* buffer we're assigning to callback */
|
2014-06-05 13:19:08 -08:00
|
|
|
kmutex_t *hash_lock = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_callback_t *callback_list, *acb;
|
|
|
|
int freeable = FALSE;
|
|
|
|
|
|
|
|
buf = zio->io_private;
|
|
|
|
hdr = buf->b_hdr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hdr was inserted into hash-table and removed from lists
|
|
|
|
* prior to starting I/O. We should find this header, since
|
|
|
|
* it's in the hash table, and it should be legit since it's
|
|
|
|
* not possible to evict it during the I/O. The only possible
|
|
|
|
* reason for it not to be found is if we were freed during the
|
|
|
|
* read.
|
|
|
|
*/
|
2014-06-05 13:19:08 -08:00
|
|
|
if (HDR_IN_HASH_TABLE(hdr)) {
|
|
|
|
arc_buf_hdr_t *found;
|
|
|
|
|
|
|
|
ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
|
|
|
|
ASSERT3U(hdr->b_dva.dva_word[0], ==,
|
|
|
|
BP_IDENTITY(zio->io_bp)->dva_word[0]);
|
|
|
|
ASSERT3U(hdr->b_dva.dva_word[1], ==,
|
|
|
|
BP_IDENTITY(zio->io_bp)->dva_word[1]);
|
|
|
|
|
|
|
|
found = buf_hash_find(hdr->b_spa, zio->io_bp,
|
|
|
|
&hash_lock);
|
|
|
|
|
|
|
|
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
|
|
|
|
hash_lock == NULL) ||
|
|
|
|
(found == hdr &&
|
|
|
|
DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
|
|
|
|
(found == hdr && HDR_L2_READING(hdr)));
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
|
2014-12-29 19:12:23 -08:00
|
|
|
if (l2arc_noprefetch && HDR_PREFETCH(hdr))
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_L2CACHE;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* byteswap if necessary */
|
2014-12-29 19:12:23 -08:00
|
|
|
callback_list = hdr->b_l1hdr.b_acb;
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(callback_list != NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
|
2012-12-13 15:24:15 -08:00
|
|
|
dmu_object_byteswap_t bswap =
|
|
|
|
DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
|
2013-02-14 23:37:43 -05:00
|
|
|
if (BP_GET_LEVEL(zio->io_bp) > 0)
|
|
|
|
byteswap_uint64_array(buf->b_data, hdr->b_size);
|
|
|
|
else
|
|
|
|
dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
|
2008-12-03 12:09:06 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
arc_cksum_compute(buf, B_FALSE);
|
2013-05-16 14:18:06 -07:00
|
|
|
arc_buf_watch(buf);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hash_lock && zio->io_error == 0 &&
|
|
|
|
hdr->b_l1hdr.b_state == arc_anon) {
|
2010-05-28 13:45:14 -07:00
|
|
|
/*
|
|
|
|
* Only call arc_access on anonymous buffers. This is because
|
|
|
|
* if we've issued an I/O for an evicted buffer, we've already
|
|
|
|
* called arc_access (to prevent any simultaneous readers from
|
|
|
|
* getting confused).
|
|
|
|
*/
|
|
|
|
arc_access(hdr, hash_lock);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/* create copies of the data buffer for the callers */
|
|
|
|
abuf = buf;
|
|
|
|
for (acb = callback_list; acb; acb = acb->acb_next) {
|
|
|
|
if (acb->acb_done) {
|
2012-12-21 14:57:09 -08:00
|
|
|
if (abuf == NULL) {
|
|
|
|
ARCSTAT_BUMP(arcstat_duplicate_reads);
|
2008-11-20 12:01:55 -08:00
|
|
|
abuf = arc_buf_clone(buf);
|
2012-12-21 14:57:09 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
acb->acb_buf = abuf;
|
|
|
|
abuf = NULL;
|
|
|
|
}
|
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_acb = NULL;
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(!HDR_BUF_AVAILABLE(hdr));
|
2010-05-28 13:45:14 -07:00
|
|
|
if (abuf == buf) {
|
|
|
|
ASSERT(buf->b_efunc == NULL);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_datacnt == 1);
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
|
2010-05-28 13:45:14 -07:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
|
|
|
|
callback_list != NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (zio->io_error != 0) {
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_IO_ERROR;
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_l1hdr.b_state != arc_anon)
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_change_state(arc_anon, hdr, hash_lock);
|
|
|
|
if (HDR_IN_HASH_TABLE(hdr))
|
|
|
|
buf_hash_remove(hdr);
|
2014-12-29 19:12:23 -08:00
|
|
|
freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Broadcast before we drop the hash_lock to avoid the possibility
|
|
|
|
* that the hdr (and hence the cv) might be freed before we get to
|
|
|
|
* the cv_broadcast().
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
cv_broadcast(&hdr->b_l1hdr.b_cv);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hash_lock != NULL) {
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* This block was freed while we waited for the read to
|
|
|
|
* complete. It has been removed from the hash table and
|
|
|
|
* moved to the anonymous state (so that it won't show up
|
|
|
|
* in the cache).
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
|
|
|
|
freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* execute each callback and free its structure */
|
|
|
|
while ((acb = callback_list) != NULL) {
|
|
|
|
if (acb->acb_done)
|
|
|
|
acb->acb_done(zio, acb->acb_buf, acb->acb_private);
|
|
|
|
|
|
|
|
if (acb->acb_zio_dummy != NULL) {
|
|
|
|
acb->acb_zio_dummy->io_error = zio->io_error;
|
|
|
|
zio_nowait(acb->acb_zio_dummy);
|
|
|
|
}
|
|
|
|
|
|
|
|
callback_list = acb->acb_next;
|
|
|
|
kmem_free(acb, sizeof (arc_callback_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (freeable)
|
|
|
|
arc_hdr_destroy(hdr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2013-01-11 08:54:18 -08:00
|
|
|
* "Read" the block at the specified DVA (in bp) via the
|
2008-11-20 12:01:55 -08:00
|
|
|
* cache. If the block is found in the cache, invoke the provided
|
|
|
|
* callback immediately and return. Note that the `zio' parameter
|
|
|
|
* in the callback will be NULL in this case, since no IO was
|
|
|
|
* required. If the block is not in the cache pass the read request
|
|
|
|
* on to the spa with a substitute callback function, so that the
|
|
|
|
* requested block will be added to the cache.
|
|
|
|
*
|
|
|
|
* If a read request arrives for a block that has a read in-progress,
|
|
|
|
* either wait for the in-progress read to complete (and return the
|
|
|
|
* results); or, if this is a read with a "done" func, add a record
|
|
|
|
* to the read to invoke the "done" func when the read completes,
|
|
|
|
* and return; or just return.
|
|
|
|
*
|
|
|
|
* arc_read_done() will invoke all the requested "done" functions
|
|
|
|
* for readers of this block.
|
|
|
|
*/
|
|
|
|
int
|
2013-07-02 13:26:24 -07:00
|
|
|
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
2014-12-06 09:24:32 -08:00
|
|
|
void *private, zio_priority_t priority, int zio_flags,
|
|
|
|
arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2014-06-05 13:19:08 -08:00
|
|
|
arc_buf_hdr_t *hdr = NULL;
|
2010-08-26 09:58:04 -07:00
|
|
|
arc_buf_t *buf = NULL;
|
2014-06-05 13:19:08 -08:00
|
|
|
kmutex_t *hash_lock = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
zio_t *rzio;
|
2011-11-11 14:07:54 -08:00
|
|
|
uint64_t guid = spa_load_guid(spa);
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 16:09:05 -07:00
|
|
|
int rc = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-06-05 13:19:08 -08:00
|
|
|
ASSERT(!BP_IS_EMBEDDED(bp) ||
|
|
|
|
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
top:
|
2014-06-05 13:19:08 -08:00
|
|
|
if (!BP_IS_EMBEDDED(bp)) {
|
|
|
|
/*
|
|
|
|
* Embedded BP's have no DVA and require no I/O to "read".
|
|
|
|
* Create an anonymous arc buf to back it.
|
|
|
|
*/
|
|
|
|
hdr = buf_hash_find(guid, bp, &hash_lock);
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
*arc_flags |= ARC_FLAG_CACHED;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (HDR_IO_IN_PROGRESS(hdr)) {
|
|
|
|
|
2015-12-26 22:10:31 +01:00
|
|
|
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
|
|
|
|
priority == ZIO_PRIORITY_SYNC_READ) {
|
|
|
|
/*
|
|
|
|
* This sync read must wait for an
|
|
|
|
* in-progress async read (e.g. a predictive
|
|
|
|
* prefetch). Async reads are queued
|
|
|
|
* separately at the vdev_queue layer, so
|
|
|
|
* this is a form of priority inversion.
|
|
|
|
* Ideally, we would "inherit" the demand
|
|
|
|
* i/o's priority by moving the i/o from
|
|
|
|
* the async queue to the synchronous queue,
|
|
|
|
* but there is currently no mechanism to do
|
|
|
|
* so. Track this so that we can evaluate
|
|
|
|
* the magnitude of this potential performance
|
|
|
|
* problem.
|
|
|
|
*
|
|
|
|
* Note that if the prefetch i/o is already
|
|
|
|
* active (has been issued to the device),
|
|
|
|
* the prefetch improved performance, because
|
|
|
|
* we issued it sooner than we would have
|
|
|
|
* without the prefetch.
|
|
|
|
*/
|
|
|
|
DTRACE_PROBE1(arc__sync__wait__for__async,
|
|
|
|
arc_buf_hdr_t *, hdr);
|
|
|
|
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
|
|
|
|
}
|
|
|
|
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
|
|
|
|
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
|
|
|
|
}
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
if (*arc_flags & ARC_FLAG_WAIT) {
|
2014-12-29 19:12:23 -08:00
|
|
|
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
goto top;
|
|
|
|
}
|
2014-12-06 09:24:32 -08:00
|
|
|
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (done) {
|
2015-12-26 22:10:31 +01:00
|
|
|
arc_callback_t *acb = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
acb = kmem_zalloc(sizeof (arc_callback_t),
|
2014-11-20 19:09:39 -05:00
|
|
|
KM_SLEEP);
|
2008-11-20 12:01:55 -08:00
|
|
|
acb->acb_done = done;
|
|
|
|
acb->acb_private = private;
|
|
|
|
if (pio != NULL)
|
|
|
|
acb->acb_zio_dummy = zio_null(pio,
|
2009-02-18 12:51:31 -08:00
|
|
|
spa, NULL, NULL, NULL, zio_flags);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(acb->acb_done != NULL);
|
2014-12-29 19:12:23 -08:00
|
|
|
acb->acb_next = hdr->b_l1hdr.b_acb;
|
|
|
|
hdr->b_l1hdr.b_acb = acb;
|
2008-11-20 12:01:55 -08:00
|
|
|
add_reference(hdr, hash_lock, private);
|
|
|
|
mutex_exit(hash_lock);
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 16:09:05 -07:00
|
|
|
goto out;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
mutex_exit(hash_lock);
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 16:09:05 -07:00
|
|
|
goto out;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
|
|
|
|
hdr->b_l1hdr.b_state == arc_mfu);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (done) {
|
2015-12-26 22:10:31 +01:00
|
|
|
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
|
|
|
|
/*
|
|
|
|
* This is a demand read which does not have to
|
|
|
|
* wait for i/o because we did a predictive
|
|
|
|
* prefetch i/o for it, which has completed.
|
|
|
|
*/
|
|
|
|
DTRACE_PROBE1(
|
|
|
|
arc__demand__hit__predictive__prefetch,
|
|
|
|
arc_buf_hdr_t *, hdr);
|
|
|
|
ARCSTAT_BUMP(
|
|
|
|
arcstat_demand_hit_predictive_prefetch);
|
|
|
|
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
add_reference(hdr, hash_lock, private);
|
|
|
|
/*
|
|
|
|
* If this block is already in use, create a new
|
|
|
|
* copy of the data so that we will be guaranteed
|
|
|
|
* that arc_release() will always succeed.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
buf = hdr->b_l1hdr.b_buf;
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(buf);
|
|
|
|
ASSERT(buf->b_data);
|
|
|
|
if (HDR_BUF_AVAILABLE(hdr)) {
|
|
|
|
ASSERT(buf->b_efunc == NULL);
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
|
|
|
buf = arc_buf_clone(buf);
|
|
|
|
}
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
|
2014-12-29 19:12:23 -08:00
|
|
|
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_PREFETCH;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
|
|
|
|
arc_access(hdr, hash_lock);
|
2014-12-06 09:24:32 -08:00
|
|
|
if (*arc_flags & ARC_FLAG_L2CACHE)
|
|
|
|
hdr->b_flags |= ARC_FLAG_L2CACHE;
|
|
|
|
if (*arc_flags & ARC_FLAG_L2COMPRESS)
|
|
|
|
hdr->b_flags |= ARC_FLAG_L2COMPRESS;
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
ARCSTAT_BUMP(arcstat_hits);
|
2014-12-29 19:12:23 -08:00
|
|
|
ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
|
|
|
|
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
|
2008-11-20 12:01:55 -08:00
|
|
|
data, metadata, hits);
|
|
|
|
|
|
|
|
if (done)
|
|
|
|
done(NULL, buf, private);
|
|
|
|
} else {
|
|
|
|
uint64_t size = BP_GET_LSIZE(bp);
|
2014-06-05 13:19:08 -08:00
|
|
|
arc_callback_t *acb;
|
2008-12-03 12:09:06 -08:00
|
|
|
vdev_t *vd = NULL;
|
2013-02-10 22:21:05 -08:00
|
|
|
uint64_t addr = 0;
|
2009-02-18 12:51:31 -08:00
|
|
|
boolean_t devw = B_FALSE;
|
2014-03-20 16:55:09 -07:00
|
|
|
enum zio_compress b_compress = ZIO_COMPRESS_OFF;
|
2014-12-29 19:12:23 -08:00
|
|
|
int32_t b_asize = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-09-10 11:59:03 -07:00
|
|
|
/*
|
|
|
|
* Gracefully handle a damaged logical block size as a
|
2015-12-09 11:00:35 -08:00
|
|
|
* checksum error.
|
2014-09-10 11:59:03 -07:00
|
|
|
*/
|
2014-11-03 12:15:08 -08:00
|
|
|
if (size > spa_maxblocksize(spa)) {
|
2015-12-09 11:00:35 -08:00
|
|
|
ASSERT3P(buf, ==, NULL);
|
|
|
|
rc = SET_ERROR(ECKSUM);
|
2014-09-10 11:59:03 -07:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
if (hdr == NULL) {
|
|
|
|
/* this block is not in the cache */
|
2014-06-05 13:19:08 -08:00
|
|
|
arc_buf_hdr_t *exists = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
|
|
|
|
buf = arc_buf_alloc(spa, size, private, type);
|
|
|
|
hdr = buf->b_hdr;
|
2014-06-05 13:19:08 -08:00
|
|
|
if (!BP_IS_EMBEDDED(bp)) {
|
|
|
|
hdr->b_dva = *BP_IDENTITY(bp);
|
|
|
|
hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
|
|
|
|
exists = buf_hash_insert(hdr, &hash_lock);
|
|
|
|
}
|
|
|
|
if (exists != NULL) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/* somebody beat us to the hash insert */
|
|
|
|
mutex_exit(hash_lock);
|
2010-05-28 13:45:14 -07:00
|
|
|
buf_discard_identity(hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
(void) arc_buf_remove_ref(buf, private);
|
|
|
|
goto top; /* restart the IO request */
|
|
|
|
}
|
2014-12-06 09:24:32 -08:00
|
|
|
|
2015-12-26 22:10:31 +01:00
|
|
|
/*
|
|
|
|
* If there is a callback, we pass our reference to
|
|
|
|
* it; otherwise we remove our reference.
|
|
|
|
*/
|
|
|
|
if (done == NULL) {
|
2008-11-20 12:01:55 -08:00
|
|
|
(void) remove_reference(hdr, hash_lock,
|
|
|
|
private);
|
|
|
|
}
|
2015-12-26 22:10:31 +01:00
|
|
|
if (*arc_flags & ARC_FLAG_PREFETCH)
|
|
|
|
hdr->b_flags |= ARC_FLAG_PREFETCH;
|
2014-12-06 09:24:32 -08:00
|
|
|
if (*arc_flags & ARC_FLAG_L2CACHE)
|
|
|
|
hdr->b_flags |= ARC_FLAG_L2CACHE;
|
|
|
|
if (*arc_flags & ARC_FLAG_L2COMPRESS)
|
|
|
|
hdr->b_flags |= ARC_FLAG_L2COMPRESS;
|
2008-11-20 12:01:55 -08:00
|
|
|
if (BP_GET_LEVEL(bp) > 0)
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_INDIRECT;
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
2014-12-29 19:12:23 -08:00
|
|
|
/*
|
|
|
|
* This block is in the ghost cache. If it was L2-only
|
|
|
|
* (and thus didn't have an L1 hdr), we realloc the
|
|
|
|
* header to add an L1 hdr.
|
|
|
|
*/
|
|
|
|
if (!HDR_HAS_L1HDR(hdr)) {
|
|
|
|
hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
|
|
|
|
hdr_full_cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-12-26 22:10:31 +01:00
|
|
|
/*
|
|
|
|
* If there is a callback, we pass a reference to it.
|
|
|
|
*/
|
|
|
|
if (done != NULL)
|
|
|
|
add_reference(hdr, hash_lock, private);
|
2014-12-06 09:24:32 -08:00
|
|
|
if (*arc_flags & ARC_FLAG_PREFETCH)
|
|
|
|
hdr->b_flags |= ARC_FLAG_PREFETCH;
|
|
|
|
if (*arc_flags & ARC_FLAG_L2CACHE)
|
|
|
|
hdr->b_flags |= ARC_FLAG_L2CACHE;
|
|
|
|
if (*arc_flags & ARC_FLAG_L2COMPRESS)
|
|
|
|
hdr->b_flags |= ARC_FLAG_L2COMPRESS;
|
2008-11-20 12:01:55 -08:00
|
|
|
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
|
|
|
|
buf->b_hdr = hdr;
|
|
|
|
buf->b_data = NULL;
|
|
|
|
buf->b_efunc = NULL;
|
|
|
|
buf->b_private = NULL;
|
|
|
|
buf->b_next = NULL;
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_buf = buf;
|
|
|
|
ASSERT0(hdr->b_l1hdr.b_datacnt);
|
|
|
|
hdr->b_l1hdr.b_datacnt = 1;
|
2010-05-28 13:45:14 -07:00
|
|
|
arc_get_data_buf(buf);
|
|
|
|
arc_access(hdr, hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2015-12-26 22:10:31 +01:00
|
|
|
if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
|
|
|
|
hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
|
2010-05-28 13:45:14 -07:00
|
|
|
|
2014-11-20 19:09:39 -05:00
|
|
|
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
|
2008-11-20 12:01:55 -08:00
|
|
|
acb->acb_done = done;
|
|
|
|
acb->acb_private = private;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_acb == NULL);
|
|
|
|
hdr->b_l1hdr.b_acb = acb;
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_HAS_L2HDR(hdr) &&
|
|
|
|
(vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
|
|
|
|
devw = hdr->b_l2hdr.b_dev->l2ad_writing;
|
|
|
|
addr = hdr->b_l2hdr.b_daddr;
|
2015-09-11 09:18:56 -07:00
|
|
|
b_compress = hdr->b_l2hdr.b_compress;
|
2014-12-29 19:12:23 -08:00
|
|
|
b_asize = hdr->b_l2hdr.b_asize;
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Lock out device removal.
|
|
|
|
*/
|
|
|
|
if (vdev_is_dead(vd) ||
|
|
|
|
!spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
|
|
|
|
vd = NULL;
|
|
|
|
}
|
|
|
|
|
2014-06-05 13:19:08 -08:00
|
|
|
if (hash_lock != NULL)
|
|
|
|
mutex_exit(hash_lock);
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2013-06-11 09:12:34 -08:00
|
|
|
/*
|
|
|
|
* At this point, we have a level 1 cache miss. Try again in
|
|
|
|
* L2ARC if possible.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT3U(hdr->b_size, ==, size);
|
2010-05-28 13:45:14 -07:00
|
|
|
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
|
2014-06-25 10:37:59 -08:00
|
|
|
uint64_t, size, zbookmark_phys_t *, zb);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_misses);
|
2014-12-29 19:12:23 -08:00
|
|
|
ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
|
|
|
|
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
|
2008-11-20 12:01:55 -08:00
|
|
|
data, metadata, misses);
|
|
|
|
|
2015-12-26 22:10:31 +01:00
|
|
|
if (priority == ZIO_PRIORITY_ASYNC_READ)
|
|
|
|
hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
|
|
|
|
else
|
|
|
|
hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
|
|
|
|
|
2009-02-18 12:51:31 -08:00
|
|
|
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Read from the L2ARC if the following are true:
|
2008-12-03 12:09:06 -08:00
|
|
|
* 1. The L2ARC vdev was previously cached.
|
|
|
|
* 2. This buffer still has L2ARC metadata.
|
|
|
|
* 3. This buffer isn't currently writing to the L2ARC.
|
|
|
|
* 4. The L2ARC entry wasn't evicted, which may
|
|
|
|
* also have invalidated the vdev.
|
2009-02-18 12:51:31 -08:00
|
|
|
* 5. This isn't prefetch or l2arc_noprefetch is 0.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_HAS_L2HDR(hdr) &&
|
2009-02-18 12:51:31 -08:00
|
|
|
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
|
|
|
|
!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
|
2008-11-20 12:01:55 -08:00
|
|
|
l2arc_read_callback_t *cb;
|
|
|
|
|
|
|
|
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_hits);
|
2014-12-29 19:12:23 -08:00
|
|
|
atomic_inc_32(&hdr->b_l2hdr.b_hits);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
|
2014-11-20 19:09:39 -05:00
|
|
|
KM_SLEEP);
|
2008-11-20 12:01:55 -08:00
|
|
|
cb->l2rcb_buf = buf;
|
|
|
|
cb->l2rcb_spa = spa;
|
|
|
|
cb->l2rcb_bp = *bp;
|
|
|
|
cb->l2rcb_zb = *zb;
|
2008-12-03 12:09:06 -08:00
|
|
|
cb->l2rcb_flags = zio_flags;
|
2014-03-20 16:55:09 -07:00
|
|
|
cb->l2rcb_compress = b_compress;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-02-10 22:21:05 -08:00
|
|
|
ASSERT(addr >= VDEV_LABEL_START_SIZE &&
|
|
|
|
addr + size < vd->vdev_psize -
|
|
|
|
VDEV_LABEL_END_SIZE);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
2008-12-03 12:09:06 -08:00
|
|
|
* l2arc read. The SCL_L2ARC lock will be
|
|
|
|
* released by l2arc_read_done().
|
2013-08-01 13:02:10 -07:00
|
|
|
* Issue a null zio if the underlying buffer
|
|
|
|
* was squashed to zero size by compression.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2014-03-20 16:55:09 -07:00
|
|
|
if (b_compress == ZIO_COMPRESS_EMPTY) {
|
2013-08-01 13:02:10 -07:00
|
|
|
rzio = zio_null(pio, spa, vd,
|
|
|
|
l2arc_read_done, cb,
|
|
|
|
zio_flags | ZIO_FLAG_DONT_CACHE |
|
|
|
|
ZIO_FLAG_CANFAIL |
|
|
|
|
ZIO_FLAG_DONT_PROPAGATE |
|
|
|
|
ZIO_FLAG_DONT_RETRY);
|
|
|
|
} else {
|
|
|
|
rzio = zio_read_phys(pio, vd, addr,
|
2014-03-20 16:55:09 -07:00
|
|
|
b_asize, buf->b_data,
|
|
|
|
ZIO_CHECKSUM_OFF,
|
2013-08-01 13:02:10 -07:00
|
|
|
l2arc_read_done, cb, priority,
|
|
|
|
zio_flags | ZIO_FLAG_DONT_CACHE |
|
|
|
|
ZIO_FLAG_CANFAIL |
|
|
|
|
ZIO_FLAG_DONT_PROPAGATE |
|
|
|
|
ZIO_FLAG_DONT_RETRY, B_FALSE);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
|
|
|
|
zio_t *, rzio);
|
2014-03-20 16:55:09 -07:00
|
|
|
ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
if (*arc_flags & ARC_FLAG_NOWAIT) {
|
2008-12-03 12:09:06 -08:00
|
|
|
zio_nowait(rzio);
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 16:09:05 -07:00
|
|
|
goto out;
|
2008-12-03 12:09:06 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
ASSERT(*arc_flags & ARC_FLAG_WAIT);
|
2008-12-03 12:09:06 -08:00
|
|
|
if (zio_wait(rzio) == 0)
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 16:09:05 -07:00
|
|
|
goto out;
|
2008-12-03 12:09:06 -08:00
|
|
|
|
|
|
|
/* l2arc read error; goto zio_read() */
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
|
|
|
DTRACE_PROBE1(l2arc__miss,
|
|
|
|
arc_buf_hdr_t *, hdr);
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_misses);
|
|
|
|
if (HDR_L2_WRITING(hdr))
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_rw_clash);
|
2008-12-03 12:09:06 -08:00
|
|
|
spa_config_exit(spa, SCL_L2ARC, vd);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2009-02-18 12:51:31 -08:00
|
|
|
} else {
|
|
|
|
if (vd != NULL)
|
|
|
|
spa_config_exit(spa, SCL_L2ARC, vd);
|
|
|
|
if (l2arc_ndev != 0) {
|
|
|
|
DTRACE_PROBE1(l2arc__miss,
|
|
|
|
arc_buf_hdr_t *, hdr);
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_misses);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
rzio = zio_read(pio, spa, bp, buf->b_data, size,
|
2008-12-03 12:09:06 -08:00
|
|
|
arc_read_done, buf, priority, zio_flags, zb);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
if (*arc_flags & ARC_FLAG_WAIT) {
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 16:09:05 -07:00
|
|
|
rc = zio_wait(rzio);
|
|
|
|
goto out;
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
|
2008-11-20 12:01:55 -08:00
|
|
|
zio_nowait(rzio);
|
|
|
|
}
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 16:09:05 -07:00
|
|
|
|
|
|
|
out:
|
|
|
|
spa_read_history_add(spa, zb, *arc_flags);
|
|
|
|
return (rc);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2011-12-22 12:20:43 -08:00
|
|
|
arc_prune_t *
|
|
|
|
arc_add_prune_callback(arc_prune_func_t *func, void *private)
|
|
|
|
{
|
|
|
|
arc_prune_t *p;
|
|
|
|
|
2013-11-01 20:26:11 +01:00
|
|
|
p = kmem_alloc(sizeof (*p), KM_SLEEP);
|
2011-12-22 12:20:43 -08:00
|
|
|
p->p_pfunc = func;
|
|
|
|
p->p_private = private;
|
|
|
|
list_link_init(&p->p_node);
|
|
|
|
refcount_create(&p->p_refcnt);
|
|
|
|
|
|
|
|
mutex_enter(&arc_prune_mtx);
|
|
|
|
refcount_add(&p->p_refcnt, &arc_prune_list);
|
|
|
|
list_insert_head(&arc_prune_list, p);
|
|
|
|
mutex_exit(&arc_prune_mtx);
|
|
|
|
|
|
|
|
return (p);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
arc_remove_prune_callback(arc_prune_t *p)
|
|
|
|
{
|
2016-05-23 11:58:21 -07:00
|
|
|
boolean_t wait = B_FALSE;
|
2011-12-22 12:20:43 -08:00
|
|
|
mutex_enter(&arc_prune_mtx);
|
|
|
|
list_remove(&arc_prune_list, p);
|
2016-05-23 11:58:21 -07:00
|
|
|
if (refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
|
|
|
|
wait = B_TRUE;
|
2011-12-22 12:20:43 -08:00
|
|
|
mutex_exit(&arc_prune_mtx);
|
2016-05-23 11:58:21 -07:00
|
|
|
|
|
|
|
/* wait for arc_prune_task to finish */
|
|
|
|
if (wait)
|
|
|
|
taskq_wait_outstanding(arc_prune_taskq, 0);
|
|
|
|
ASSERT0(refcount_count(&p->p_refcnt));
|
|
|
|
refcount_destroy(&p->p_refcnt);
|
|
|
|
kmem_free(p, sizeof (*p));
|
2011-12-22 12:20:43 -08:00
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
void
|
|
|
|
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
|
|
|
|
{
|
|
|
|
ASSERT(buf->b_hdr != NULL);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
|
|
|
|
ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
|
|
|
|
func == NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(buf->b_efunc == NULL);
|
|
|
|
ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
buf->b_efunc = func;
|
|
|
|
buf->b_private = private;
|
|
|
|
}
|
|
|
|
|
Illumos #3805 arc shouldn't cache freed blocks
3805 arc shouldn't cache freed blocks
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Elling <richard.elling@dey-sys.com>
Reviewed by: Will Andrews <will@firepipe.net>
Approved by: Dan McDonald <danmcd@nexenta.com>
References:
illumos/illumos-gate@6e6d5868f52089b9026785bd90257a3d3f6e5ee2
https://www.illumos.org/issues/3805
ZFS should proactively evict freed blocks from the cache.
On dcenter, we saw that we were caching ~256GB of metadata, while the
pool only had <4GB of metadata on disk. We were wasting about half the
system's RAM (252GB) on blocks that have been freed.
Even though these freed blocks will never be used again, and thus will
eventually be evicted, this causes us to use memory inefficiently for 2
reasons:
1. A block that is freed has no chance of being accessed again, but will
be kept in memory preferentially to a block that was accessed before it
(and is thus older) but has not been freed and thus has at least some
chance of being accessed again.
2. We partition the ARC into several buckets:
user data that has been accessed only once (MRU)
metadata that has been accessed only once (MRU)
user data that has been accessed more than once (MFU)
metadata that has been accessed more than once (MFU)
The user data vs metadata split is somewhat arbitrary, and the primary
control on how much memory is used to cache data vs metadata is to
simply try to keep the proportion the same as it has been in the past
(each bucket "evicts against" itself). The secondary control is to
evict data before evicting metadata.
Because of this bucketing, we may end up with one bucket mostly
containing freed blocks that are very old, while another bucket has more
recently accessed, still-allocated blocks. Data in the useful bucket
(with still-allocated blocks) may be evicted in preference to data in
the useless bucket (with old, freed blocks).
On dcenter, we saw that the MFU metadata bucket was 230MB, while the MFU
data bucket was 27GB and the MRU metadata bucket was 256GB. However,
the vast majority of data in the MRU metadata bucket (256GB) was freed
blocks, and thus useless. Meanwhile, the MFU metadata bucket (230MB)
was constantly evicting useful blocks that will be soon needed.
The problem of cache segmentation is a larger problem that needs more
investigation. However, if we stop caching freed blocks, it should
reduce the impact of this more fundamental issue.
Ported-by: Richard Yao <ryao@cs.stonybrook.edu>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1503
2013-06-06 18:46:55 -04:00
|
|
|
/*
|
|
|
|
* Notify the arc that a block was freed, and thus will never be used again.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
arc_freed(spa_t *spa, const blkptr_t *bp)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr;
|
|
|
|
kmutex_t *hash_lock;
|
|
|
|
uint64_t guid = spa_load_guid(spa);
|
|
|
|
|
2014-06-05 13:19:08 -08:00
|
|
|
ASSERT(!BP_IS_EMBEDDED(bp));
|
|
|
|
|
|
|
|
hdr = buf_hash_find(guid, bp, &hash_lock);
|
Illumos #3805 arc shouldn't cache freed blocks
3805 arc shouldn't cache freed blocks
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Elling <richard.elling@dey-sys.com>
Reviewed by: Will Andrews <will@firepipe.net>
Approved by: Dan McDonald <danmcd@nexenta.com>
References:
illumos/illumos-gate@6e6d5868f52089b9026785bd90257a3d3f6e5ee2
https://www.illumos.org/issues/3805
ZFS should proactively evict freed blocks from the cache.
On dcenter, we saw that we were caching ~256GB of metadata, while the
pool only had <4GB of metadata on disk. We were wasting about half the
system's RAM (252GB) on blocks that have been freed.
Even though these freed blocks will never be used again, and thus will
eventually be evicted, this causes us to use memory inefficiently for 2
reasons:
1. A block that is freed has no chance of being accessed again, but will
be kept in memory preferentially to a block that was accessed before it
(and is thus older) but has not been freed and thus has at least some
chance of being accessed again.
2. We partition the ARC into several buckets:
user data that has been accessed only once (MRU)
metadata that has been accessed only once (MRU)
user data that has been accessed more than once (MFU)
metadata that has been accessed more than once (MFU)
The user data vs metadata split is somewhat arbitrary, and the primary
control on how much memory is used to cache data vs metadata is to
simply try to keep the proportion the same as it has been in the past
(each bucket "evicts against" itself). The secondary control is to
evict data before evicting metadata.
Because of this bucketing, we may end up with one bucket mostly
containing freed blocks that are very old, while another bucket has more
recently accessed, still-allocated blocks. Data in the useful bucket
(with still-allocated blocks) may be evicted in preference to data in
the useless bucket (with old, freed blocks).
On dcenter, we saw that the MFU metadata bucket was 230MB, while the MFU
data bucket was 27GB and the MRU metadata bucket was 256GB. However,
the vast majority of data in the MRU metadata bucket (256GB) was freed
blocks, and thus useless. Meanwhile, the MFU metadata bucket (230MB)
was constantly evicting useful blocks that will be soon needed.
The problem of cache segmentation is a larger problem that needs more
investigation. However, if we stop caching freed blocks, it should
reduce the impact of this more fundamental issue.
Ported-by: Richard Yao <ryao@cs.stonybrook.edu>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1503
2013-06-06 18:46:55 -04:00
|
|
|
if (hdr == NULL)
|
|
|
|
return;
|
|
|
|
if (HDR_BUF_AVAILABLE(hdr)) {
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_buf_t *buf = hdr->b_l1hdr.b_buf;
|
Illumos #3805 arc shouldn't cache freed blocks
3805 arc shouldn't cache freed blocks
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Elling <richard.elling@dey-sys.com>
Reviewed by: Will Andrews <will@firepipe.net>
Approved by: Dan McDonald <danmcd@nexenta.com>
References:
illumos/illumos-gate@6e6d5868f52089b9026785bd90257a3d3f6e5ee2
https://www.illumos.org/issues/3805
ZFS should proactively evict freed blocks from the cache.
On dcenter, we saw that we were caching ~256GB of metadata, while the
pool only had <4GB of metadata on disk. We were wasting about half the
system's RAM (252GB) on blocks that have been freed.
Even though these freed blocks will never be used again, and thus will
eventually be evicted, this causes us to use memory inefficiently for 2
reasons:
1. A block that is freed has no chance of being accessed again, but will
be kept in memory preferentially to a block that was accessed before it
(and is thus older) but has not been freed and thus has at least some
chance of being accessed again.
2. We partition the ARC into several buckets:
user data that has been accessed only once (MRU)
metadata that has been accessed only once (MRU)
user data that has been accessed more than once (MFU)
metadata that has been accessed more than once (MFU)
The user data vs metadata split is somewhat arbitrary, and the primary
control on how much memory is used to cache data vs metadata is to
simply try to keep the proportion the same as it has been in the past
(each bucket "evicts against" itself). The secondary control is to
evict data before evicting metadata.
Because of this bucketing, we may end up with one bucket mostly
containing freed blocks that are very old, while another bucket has more
recently accessed, still-allocated blocks. Data in the useful bucket
(with still-allocated blocks) may be evicted in preference to data in
the useless bucket (with old, freed blocks).
On dcenter, we saw that the MFU metadata bucket was 230MB, while the MFU
data bucket was 27GB and the MRU metadata bucket was 256GB. However,
the vast majority of data in the MRU metadata bucket (256GB) was freed
blocks, and thus useless. Meanwhile, the MFU metadata bucket (230MB)
was constantly evicting useful blocks that will be soon needed.
The problem of cache segmentation is a larger problem that needs more
investigation. However, if we stop caching freed blocks, it should
reduce the impact of this more fundamental issue.
Ported-by: Richard Yao <ryao@cs.stonybrook.edu>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1503
2013-06-06 18:46:55 -04:00
|
|
|
add_reference(hdr, hash_lock, FTAG);
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
|
Illumos #3805 arc shouldn't cache freed blocks
3805 arc shouldn't cache freed blocks
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Elling <richard.elling@dey-sys.com>
Reviewed by: Will Andrews <will@firepipe.net>
Approved by: Dan McDonald <danmcd@nexenta.com>
References:
illumos/illumos-gate@6e6d5868f52089b9026785bd90257a3d3f6e5ee2
https://www.illumos.org/issues/3805
ZFS should proactively evict freed blocks from the cache.
On dcenter, we saw that we were caching ~256GB of metadata, while the
pool only had <4GB of metadata on disk. We were wasting about half the
system's RAM (252GB) on blocks that have been freed.
Even though these freed blocks will never be used again, and thus will
eventually be evicted, this causes us to use memory inefficiently for 2
reasons:
1. A block that is freed has no chance of being accessed again, but will
be kept in memory preferentially to a block that was accessed before it
(and is thus older) but has not been freed and thus has at least some
chance of being accessed again.
2. We partition the ARC into several buckets:
user data that has been accessed only once (MRU)
metadata that has been accessed only once (MRU)
user data that has been accessed more than once (MFU)
metadata that has been accessed more than once (MFU)
The user data vs metadata split is somewhat arbitrary, and the primary
control on how much memory is used to cache data vs metadata is to
simply try to keep the proportion the same as it has been in the past
(each bucket "evicts against" itself). The secondary control is to
evict data before evicting metadata.
Because of this bucketing, we may end up with one bucket mostly
containing freed blocks that are very old, while another bucket has more
recently accessed, still-allocated blocks. Data in the useful bucket
(with still-allocated blocks) may be evicted in preference to data in
the useless bucket (with old, freed blocks).
On dcenter, we saw that the MFU metadata bucket was 230MB, while the MFU
data bucket was 27GB and the MRU metadata bucket was 256GB. However,
the vast majority of data in the MRU metadata bucket (256GB) was freed
blocks, and thus useless. Meanwhile, the MFU metadata bucket (230MB)
was constantly evicting useful blocks that will be soon needed.
The problem of cache segmentation is a larger problem that needs more
investigation. However, if we stop caching freed blocks, it should
reduce the impact of this more fundamental issue.
Ported-by: Richard Yao <ryao@cs.stonybrook.edu>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1503
2013-06-06 18:46:55 -04:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
|
|
|
|
arc_release(buf, FTAG);
|
|
|
|
(void) arc_buf_remove_ref(buf, FTAG);
|
|
|
|
} else {
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
2014-07-15 03:43:18 -04:00
|
|
|
* Clear the user eviction callback set by arc_set_callback(), first calling
|
|
|
|
* it if it exists. Because the presence of a callback keeps an arc_buf cached
|
|
|
|
* clearing the callback may result in the arc_buf being destroyed. However,
|
|
|
|
* it will not result in the *last* arc_buf being destroyed, hence the data
|
|
|
|
* will remain cached in the ARC. We make a copy of the arc buffer here so
|
|
|
|
* that we can process the callback without holding any locks.
|
|
|
|
*
|
|
|
|
* It's possible that the callback is already in the process of being cleared
|
|
|
|
* by another thread. In this case we can not clear the callback.
|
|
|
|
*
|
|
|
|
* Returns B_TRUE if the callback was successfully called and cleared.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2014-07-15 03:43:18 -04:00
|
|
|
boolean_t
|
|
|
|
arc_clear_callback(arc_buf_t *buf)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr;
|
|
|
|
kmutex_t *hash_lock;
|
2014-07-15 03:43:18 -04:00
|
|
|
arc_evict_func_t *efunc = buf->b_efunc;
|
|
|
|
void *private = buf->b_private;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_enter(&buf->b_evict_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
hdr = buf->b_hdr;
|
|
|
|
if (hdr == NULL) {
|
|
|
|
/*
|
|
|
|
* We are in arc_do_user_evicts().
|
|
|
|
*/
|
|
|
|
ASSERT(buf->b_data == NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2014-07-15 03:43:18 -04:00
|
|
|
return (B_FALSE);
|
2008-12-03 12:09:06 -08:00
|
|
|
} else if (buf->b_data == NULL) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
2008-12-03 12:09:06 -08:00
|
|
|
* We are on the eviction list; process this buffer now
|
|
|
|
* but let arc_do_user_evicts() do the reaping.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2008-12-03 12:09:06 -08:00
|
|
|
buf->b_efunc = NULL;
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2014-07-15 03:43:18 -04:00
|
|
|
VERIFY0(efunc(private));
|
|
|
|
return (B_TRUE);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2008-12-03 12:09:06 -08:00
|
|
|
hash_lock = HDR_LOCK(hdr);
|
|
|
|
mutex_enter(hash_lock);
|
2010-05-28 13:45:14 -07:00
|
|
|
hdr = buf->b_hdr;
|
|
|
|
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
|
|
|
|
hdr->b_l1hdr.b_datacnt);
|
|
|
|
ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
|
|
|
|
hdr->b_l1hdr.b_state == arc_mfu);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-07-15 03:43:18 -04:00
|
|
|
buf->b_efunc = NULL;
|
|
|
|
buf->b_private = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_l1hdr.b_datacnt > 1) {
|
2014-07-15 03:43:18 -04:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_buf_destroy(buf, TRUE);
|
2014-07-15 03:43:18 -04:00
|
|
|
} else {
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(buf == hdr->b_l1hdr.b_buf);
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
|
2014-07-15 03:43:18 -04:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-07-15 03:43:18 -04:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
VERIFY0(efunc(private));
|
|
|
|
return (B_TRUE);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2013-06-11 09:12:34 -08:00
|
|
|
* Release this buffer from the cache, making it an anonymous buffer. This
|
|
|
|
* must be done after a read and prior to modifying the buffer contents.
|
2008-11-20 12:01:55 -08:00
|
|
|
* If the buffer has more than one reference, we must make
|
2008-12-03 12:09:06 -08:00
|
|
|
* a new hdr for the buffer.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
arc_release(arc_buf_t *buf, void *tag)
|
|
|
|
{
|
2014-12-29 19:12:23 -08:00
|
|
|
kmutex_t *hash_lock;
|
|
|
|
arc_state_t *state;
|
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* It would be nice to assert that if it's DMU metadata (level >
|
2010-05-28 13:45:14 -07:00
|
|
|
* 0 || it's the dnode file), then it must be syncing context.
|
|
|
|
* But we don't know that information at this level.
|
|
|
|
*/
|
|
|
|
|
|
|
|
mutex_enter(&buf->b_evict_lock);
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
/*
|
|
|
|
* We don't grab the hash lock prior to this check, because if
|
|
|
|
* the buffer's header is in the arc_anon state, it won't be
|
|
|
|
* linked into the hash table.
|
|
|
|
*/
|
|
|
|
if (hdr->b_l1hdr.b_state == arc_anon) {
|
|
|
|
mutex_exit(&buf->b_evict_lock);
|
|
|
|
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
|
|
|
|
ASSERT(!HDR_IN_HASH_TABLE(hdr));
|
|
|
|
ASSERT(!HDR_HAS_L2HDR(hdr));
|
|
|
|
ASSERT(BUF_EMPTY(hdr));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
|
|
|
|
ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
|
|
|
|
ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
|
|
|
|
|
|
|
|
ASSERT3P(buf->b_efunc, ==, NULL);
|
|
|
|
ASSERT3P(buf->b_private, ==, NULL);
|
|
|
|
|
|
|
|
hdr->b_l1hdr.b_arc_access = 0;
|
|
|
|
arc_buf_thaw(buf);
|
|
|
|
|
|
|
|
return;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
hash_lock = HDR_LOCK(hdr);
|
|
|
|
mutex_enter(hash_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This assignment is only valid as long as the hash_lock is
|
|
|
|
* held, we must be careful not to reference state or the
|
|
|
|
* b_state field after dropping the lock.
|
|
|
|
*/
|
|
|
|
state = hdr->b_l1hdr.b_state;
|
|
|
|
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
|
|
|
|
ASSERT3P(state, !=, arc_anon);
|
|
|
|
|
|
|
|
/* this buffer is not on any list */
|
|
|
|
ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
|
|
|
|
|
|
|
|
if (HDR_HAS_L2HDR(hdr)) {
|
|
|
|
mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
/*
|
2015-06-16 01:12:19 +02:00
|
|
|
* We have to recheck this conditional again now that
|
|
|
|
* we're holding the l2ad_mtx to prevent a race with
|
|
|
|
* another thread which might be concurrently calling
|
|
|
|
* l2arc_evict(). In that case, l2arc_evict() might have
|
|
|
|
* destroyed the header's L2 portion as we were waiting
|
|
|
|
* to acquire the l2ad_mtx.
|
2015-01-12 19:52:19 -08:00
|
|
|
*/
|
2015-06-16 01:12:19 +02:00
|
|
|
if (HDR_HAS_L2HDR(hdr))
|
|
|
|
arc_hdr_l2hdr_destroy(hdr);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
|
2008-12-03 12:09:06 -08:00
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Do we have more than one buf?
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_l1hdr.b_datacnt > 1) {
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_hdr_t *nhdr;
|
|
|
|
arc_buf_t **bufp;
|
|
|
|
uint64_t blksz = hdr->b_size;
|
2009-02-18 12:51:31 -08:00
|
|
|
uint64_t spa = hdr->b_spa;
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_buf_contents_t type = arc_buf_type(hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
uint32_t flags = hdr->b_flags;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
2010-05-28 13:45:14 -07:00
|
|
|
* Pull the data off of this hdr and attach it to
|
|
|
|
* a new anonymous hdr.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
(void) remove_reference(hdr, hash_lock, tag);
|
2014-12-29 19:12:23 -08:00
|
|
|
bufp = &hdr->b_l1hdr.b_buf;
|
2008-11-20 12:01:55 -08:00
|
|
|
while (*bufp != buf)
|
|
|
|
bufp = &(*bufp)->b_next;
|
2010-05-28 13:45:14 -07:00
|
|
|
*bufp = buf->b_next;
|
2008-11-20 12:01:55 -08:00
|
|
|
buf->b_next = NULL;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT3P(state, !=, arc_l2c_only);
|
2015-06-26 15:14:45 -07:00
|
|
|
|
|
|
|
(void) refcount_remove_many(
|
|
|
|
&state->arcs_size, hdr->b_size, buf);
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
|
|
|
|
uint64_t *size;
|
|
|
|
|
|
|
|
ASSERT3P(state, !=, arc_l2c_only);
|
|
|
|
size = &state->arcs_lsize[type];
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT3U(*size, >=, hdr->b_size);
|
|
|
|
atomic_add_64(size, -hdr->b_size);
|
|
|
|
}
|
2012-12-21 14:57:09 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We're releasing a duplicate user data buffer, update
|
|
|
|
* our statistics accordingly.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if (HDR_ISTYPE_DATA(hdr)) {
|
2012-12-21 14:57:09 -08:00
|
|
|
ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
|
|
|
|
ARCSTAT_INCR(arcstat_duplicate_buffers_size,
|
|
|
|
-hdr->b_size);
|
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_datacnt -= 1;
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_cksum_verify(buf);
|
2013-05-16 14:18:06 -07:00
|
|
|
arc_buf_unwatch(buf);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
|
2008-11-20 12:01:55 -08:00
|
|
|
nhdr->b_size = blksz;
|
|
|
|
nhdr->b_spa = spa;
|
2014-12-29 19:12:23 -08:00
|
|
|
|
|
|
|
nhdr->b_l1hdr.b_mru_hits = 0;
|
|
|
|
nhdr->b_l1hdr.b_mru_ghost_hits = 0;
|
|
|
|
nhdr->b_l1hdr.b_mfu_hits = 0;
|
|
|
|
nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
|
|
|
|
nhdr->b_l1hdr.b_l2_hits = 0;
|
2014-12-06 09:24:32 -08:00
|
|
|
nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
|
2014-12-29 19:12:23 -08:00
|
|
|
nhdr->b_flags |= arc_bufc_to_flags(type);
|
|
|
|
nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
|
|
|
|
|
|
|
|
nhdr->b_l1hdr.b_buf = buf;
|
|
|
|
nhdr->b_l1hdr.b_datacnt = 1;
|
|
|
|
nhdr->b_l1hdr.b_state = arc_anon;
|
|
|
|
nhdr->b_l1hdr.b_arc_access = 0;
|
2015-01-12 19:52:19 -08:00
|
|
|
nhdr->b_l1hdr.b_tmp_cdata = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
nhdr->b_freeze_cksum = NULL;
|
2014-12-29 19:12:23 -08:00
|
|
|
|
|
|
|
(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
|
2008-11-20 12:01:55 -08:00
|
|
|
buf->b_hdr = nhdr;
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2015-06-26 15:14:45 -07:00
|
|
|
(void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
|
2015-01-12 19:52:19 -08:00
|
|
|
/* protected by hash lock, or hdr is on arc_anon */
|
|
|
|
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_mru_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_mru_ghost_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_mfu_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_l2_hits = 0;
|
|
|
|
arc_change_state(arc_anon, hdr, hash_lock);
|
|
|
|
hdr->b_l1hdr.b_arc_access = 0;
|
|
|
|
mutex_exit(hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
buf_discard_identity(hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_buf_thaw(buf);
|
|
|
|
}
|
|
|
|
buf->b_efunc = NULL;
|
|
|
|
buf->b_private = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
arc_released(arc_buf_t *buf)
|
|
|
|
{
|
2008-12-03 12:09:06 -08:00
|
|
|
int released;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_enter(&buf->b_evict_lock);
|
2014-12-29 19:12:23 -08:00
|
|
|
released = (buf->b_data != NULL &&
|
|
|
|
buf->b_hdr->b_l1hdr.b_state == arc_anon);
|
2010-05-28 13:45:14 -07:00
|
|
|
mutex_exit(&buf->b_evict_lock);
|
2008-12-03 12:09:06 -08:00
|
|
|
return (released);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the current count of the reference counter
 * on the header backing 'buf'.  The header pointer and the count are
 * both read while holding the buffer's b_evict_lock.
 */
int
arc_referenced(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	int holds;

	mutex_enter(&buf->b_evict_lock);
	hdr = buf->b_hdr;
	holds = refcount_count(&hdr->b_l1hdr.b_refcnt);
	mutex_exit(&buf->b_evict_lock);

	return (holds);
}
#endif
|
|
|
|
|
|
|
|
static void
|
|
|
|
arc_write_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
arc_write_callback_t *callback = zio->io_private;
|
|
|
|
arc_buf_t *buf = callback->awcb_buf;
|
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
|
|
|
ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
|
|
|
|
ASSERT(hdr->b_l1hdr.b_datacnt > 0);
|
2008-12-03 12:09:06 -08:00
|
|
|
callback->awcb_ready(zio, buf, callback->awcb_private);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* If the IO is already in progress, then this is a re-write
|
2008-12-03 12:09:06 -08:00
|
|
|
* attempt, so we need to thaw and re-compute the cksum.
|
|
|
|
* It is the responsibility of the callback to handle the
|
|
|
|
* accounting for any re-write attempt.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
if (HDR_IO_IN_PROGRESS(hdr)) {
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
if (hdr->b_freeze_cksum != NULL) {
|
|
|
|
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
|
|
|
|
hdr->b_freeze_cksum = NULL;
|
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
arc_cksum_compute(buf, B_FALSE);
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2016-05-15 08:02:28 -07:00
|
|
|
static void
|
|
|
|
arc_write_children_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
arc_write_callback_t *callback = zio->io_private;
|
|
|
|
arc_buf_t *buf = callback->awcb_buf;
|
|
|
|
|
|
|
|
callback->awcb_children_ready(zio, buf, callback->awcb_private);
|
|
|
|
}
|
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
/*
|
|
|
|
* The SPA calls this callback for each physical write that happens on behalf
|
|
|
|
* of a logical write. See the comment in dbuf_write_physdone() for details.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
arc_write_physdone(zio_t *zio)
|
|
|
|
{
|
|
|
|
arc_write_callback_t *cb = zio->io_private;
|
|
|
|
if (cb->awcb_physdone != NULL)
|
|
|
|
cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
static void
|
|
|
|
arc_write_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
arc_write_callback_t *callback = zio->io_private;
|
|
|
|
arc_buf_t *buf = callback->awcb_buf;
|
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_acb == NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
|
|
|
|
if (zio->io_error == 0) {
|
2014-06-05 13:19:08 -08:00
|
|
|
if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
|
2013-12-09 10:37:51 -08:00
|
|
|
buf_discard_identity(hdr);
|
|
|
|
} else {
|
|
|
|
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
|
|
|
|
hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
|
|
|
|
}
|
2010-05-28 13:45:14 -07:00
|
|
|
} else {
|
|
|
|
ASSERT(BUF_EMPTY(hdr));
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
2014-06-05 13:19:08 -08:00
|
|
|
* If the block to be written was all-zero or compressed enough to be
|
|
|
|
* embedded in the BP, no write was performed so there will be no
|
|
|
|
* dva/birth/checksum. The buffer must therefore remain anonymous
|
|
|
|
* (and uncached).
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
if (!BUF_EMPTY(hdr)) {
|
|
|
|
arc_buf_hdr_t *exists;
|
|
|
|
kmutex_t *hash_lock;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_cksum_verify(buf);
|
|
|
|
|
|
|
|
exists = buf_hash_insert(hdr, &hash_lock);
|
2014-12-29 19:12:23 -08:00
|
|
|
if (exists != NULL) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* This can only happen if we overwrite for
|
|
|
|
* sync-to-convergence, because we remove
|
|
|
|
* buffers from the hash table when we arc_free().
|
|
|
|
*/
|
2010-05-28 13:45:14 -07:00
|
|
|
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
|
|
|
|
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
|
|
|
|
panic("bad overwrite, hdr=%p exists=%p",
|
|
|
|
(void *)hdr, (void *)exists);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(refcount_is_zero(
|
|
|
|
&exists->b_l1hdr.b_refcnt));
|
2010-05-28 13:45:14 -07:00
|
|
|
arc_change_state(arc_anon, exists, hash_lock);
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
arc_hdr_destroy(exists);
|
|
|
|
exists = buf_hash_insert(hdr, &hash_lock);
|
|
|
|
ASSERT3P(exists, ==, NULL);
|
2013-05-10 12:47:54 -07:00
|
|
|
} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
|
|
|
|
/* nopwrite */
|
|
|
|
ASSERT(zio->io_prop.zp_nopwrite);
|
|
|
|
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
|
|
|
|
panic("bad nopwrite, hdr=%p exists=%p",
|
|
|
|
(void *)hdr, (void *)exists);
|
2010-05-28 13:45:14 -07:00
|
|
|
} else {
|
|
|
|
/* Dedup */
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_datacnt == 1);
|
|
|
|
ASSERT(hdr->b_l1hdr.b_state == arc_anon);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(BP_GET_DEDUP(zio->io_bp));
|
|
|
|
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
|
2008-12-03 12:09:06 -08:00
|
|
|
/* if it's not anon, we are doing a scrub */
|
2014-12-29 19:12:23 -08:00
|
|
|
if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
|
2008-12-03 12:09:06 -08:00
|
|
|
arc_access(hdr, hash_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
} else {
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
2010-05-28 13:45:14 -07:00
|
|
|
callback->awcb_done(zio, buf, callback->awcb_private);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
kmem_free(callback, sizeof (arc_write_callback_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2010-05-28 13:45:14 -07:00
|
|
|
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
|
2013-08-01 13:02:10 -07:00
|
|
|
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
|
2016-05-15 08:02:28 -07:00
|
|
|
const zio_prop_t *zp, arc_done_func_t *ready,
|
|
|
|
arc_done_func_t *children_ready, arc_done_func_t *physdone,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
arc_done_func_t *done, void *private, zio_priority_t priority,
|
2014-06-25 10:37:59 -08:00
|
|
|
int zio_flags, const zbookmark_phys_t *zb)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr = buf->b_hdr;
|
|
|
|
arc_write_callback_t *callback;
|
2008-12-03 12:09:06 -08:00
|
|
|
zio_t *zio;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
ASSERT(ready != NULL);
|
2010-05-28 13:45:14 -07:00
|
|
|
ASSERT(done != NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(!HDR_IO_ERROR(hdr));
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
|
|
|
|
ASSERT(hdr->b_l1hdr.b_acb == NULL);
|
|
|
|
ASSERT(hdr->b_l1hdr.b_datacnt > 0);
|
2008-12-03 12:09:06 -08:00
|
|
|
if (l2arc)
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_L2CACHE;
|
2013-08-01 13:02:10 -07:00
|
|
|
if (l2arc_compress)
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_L2COMPRESS;
|
2014-11-20 19:09:39 -05:00
|
|
|
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
|
2008-11-20 12:01:55 -08:00
|
|
|
callback->awcb_ready = ready;
|
2016-05-15 08:02:28 -07:00
|
|
|
callback->awcb_children_ready = children_ready;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
callback->awcb_physdone = physdone;
|
2008-11-20 12:01:55 -08:00
|
|
|
callback->awcb_done = done;
|
|
|
|
callback->awcb_private = private;
|
|
|
|
callback->awcb_buf = buf;
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
|
2016-05-15 08:02:28 -07:00
|
|
|
arc_write_ready,
|
|
|
|
(children_ready != NULL) ? arc_write_children_ready : NULL,
|
|
|
|
arc_write_physdone, arc_write_done, callback,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
priority, zio_flags, zb);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Decide whether a reservation of 'reserve' bytes for transaction group
 * 'txg' should be held back because memory is tight.
 *
 * Returns 0 when the caller may proceed.  In kernel builds it may instead
 * return (via SET_ERROR) ERESTART when the caller is kswapd and the load
 * accumulated for the current txg is already too high, or EAGAIN when a
 * non-kswapd caller arrives while load is outstanding and
 * arc_reclaim_needed() reports that reclaim is needed.  Outside of
 * _KERNEL the function always returns 0.
 *
 * The running tally (page_load) and the txg it belongs to (last_txg) are
 * function-local statics, so they persist across calls; this function
 * takes no lock around them.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
	uint64_t available_memory = ptob(freemem);
	static uint64_t page_load = 0;
	static uint64_t last_txg = 0;
#ifdef __linux__
	pgcnt_t minfree = btop(arc_sys_free / 4);
#endif

	/*
	 * Free memory is above arc_lotsfree_percent percent of physmem,
	 * so there is nothing to throttle.
	 */
	if (freemem > physmem * arc_lotsfree_percent / 100)
		return (0);

	/* A newer txg starts again with an empty tally. */
	if (txg > last_txg) {
		last_txg = txg;
		page_load = 0;
	}

	if (current_is_kswapd()) {
		/*
		 * We are running in the pageout path, so memory is already
		 * known to be tight and the ARC will be evicting anyway.
		 * Let page writes keep flowing as fast as possible, and only
		 * push back once the tally passes a quarter of the larger of
		 * ptob(minfree) and the memory that was free on entry.
		 */
		if (page_load > MAX(ptob(minfree), available_memory) / 4) {
			DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
			return (SET_ERROR(ERESTART));
		}

		/* 'reserve' is an inflated figure, so scale it back down. */
		page_load += reserve / 8;
		return (0);
	}

	if (page_load > 0 && arc_reclaim_needed()) {
		/* Memory is low: have the caller wait before retrying. */
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
		return (SET_ERROR(EAGAIN));
	}

	page_load = 0;
#endif
	return (0);
}
|
|
|
|
|
|
|
|
void
|
|
|
|
arc_tempreserve_clear(uint64_t reserve)
|
|
|
|
{
|
|
|
|
atomic_add_64(&arc_tempreserve, -reserve);
|
|
|
|
ASSERT((int64_t)arc_tempreserve >= 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
|
|
|
|
{
|
|
|
|
int error;
|
2009-07-02 15:44:48 -07:00
|
|
|
uint64_t anon_size;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2016-01-22 07:37:37 -06:00
|
|
|
if (!arc_no_grow &&
|
|
|
|
reserve > arc_c/4 &&
|
|
|
|
reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_c = MIN(arc_c_max, reserve * 4);
|
2014-04-28 13:56:47 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Throttle when the calculated memory footprint for the TXG
|
|
|
|
* exceeds the target ARC size.
|
|
|
|
*/
|
2012-01-20 10:58:57 -08:00
|
|
|
if (reserve > arc_c) {
|
|
|
|
DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
|
2014-04-28 13:56:47 -07:00
|
|
|
return (SET_ERROR(ERESTART));
|
2012-01-20 10:58:57 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2009-07-02 15:44:48 -07:00
|
|
|
/*
|
|
|
|
* Don't count loaned bufs as in flight dirty data to prevent long
|
|
|
|
* network delays from blocking transactions that are ready to be
|
|
|
|
* assigned to a txg.
|
|
|
|
*/
|
2015-06-26 15:14:45 -07:00
|
|
|
anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
|
|
|
|
arc_loaned_bytes), 0);
|
2009-07-02 15:44:48 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Writes will, almost always, require additional memory allocations
|
2013-06-11 09:12:34 -08:00
|
|
|
* in order to compress/encrypt/etc the data. We therefore need to
|
2008-11-20 12:01:55 -08:00
|
|
|
* make sure that there is sufficient available memory for this.
|
|
|
|
*/
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
error = arc_memory_throttle(reserve, txg);
|
|
|
|
if (error != 0)
|
2008-11-20 12:01:55 -08:00
|
|
|
return (error);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Throttle writes when the amount of dirty data in the cache
|
|
|
|
* gets too large. We try to keep the cache less than half full
|
|
|
|
* of dirty blocks so that our sync times don't grow too large.
|
|
|
|
* Note: if two requests come in concurrently, we might let them
|
|
|
|
* both succeed, when one of them should fail. Not a huge deal.
|
|
|
|
*/
|
2009-07-02 15:44:48 -07:00
|
|
|
|
|
|
|
if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
|
|
|
|
anon_size > arc_c / 4) {
|
2008-11-20 12:01:55 -08:00
|
|
|
dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
|
|
|
|
"anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
|
|
|
|
arc_tempreserve>>10,
|
|
|
|
arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
|
|
|
|
arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
|
|
|
|
reserve>>10, arc_c>>10);
|
2012-01-20 10:58:57 -08:00
|
|
|
DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
|
2013-03-08 10:41:28 -08:00
|
|
|
return (SET_ERROR(ERESTART));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
atomic_add_64(&arc_tempreserve, reserve);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2012-01-30 13:28:40 -08:00
|
|
|
static void
|
|
|
|
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
|
|
|
|
kstat_named_t *evict_data, kstat_named_t *evict_metadata)
|
|
|
|
{
|
2015-06-26 15:14:45 -07:00
|
|
|
size->value.ui64 = refcount_count(&state->arcs_size);
|
2012-01-30 13:28:40 -08:00
|
|
|
evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
|
|
|
|
evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
arc_kstat_update(kstat_t *ksp, int rw)
|
|
|
|
{
|
|
|
|
arc_stats_t *as = ksp->ks_data;
|
|
|
|
|
|
|
|
if (rw == KSTAT_WRITE) {
|
2015-06-26 14:54:17 -07:00
|
|
|
return (EACCES);
|
2012-01-30 13:28:40 -08:00
|
|
|
} else {
|
|
|
|
arc_kstat_update_state(arc_anon,
|
|
|
|
&as->arcstat_anon_size,
|
2015-06-26 14:54:17 -07:00
|
|
|
&as->arcstat_anon_evictable_data,
|
|
|
|
&as->arcstat_anon_evictable_metadata);
|
2012-01-30 13:28:40 -08:00
|
|
|
arc_kstat_update_state(arc_mru,
|
|
|
|
&as->arcstat_mru_size,
|
2015-06-26 14:54:17 -07:00
|
|
|
&as->arcstat_mru_evictable_data,
|
|
|
|
&as->arcstat_mru_evictable_metadata);
|
2012-01-30 13:28:40 -08:00
|
|
|
arc_kstat_update_state(arc_mru_ghost,
|
|
|
|
&as->arcstat_mru_ghost_size,
|
2015-06-26 14:54:17 -07:00
|
|
|
&as->arcstat_mru_ghost_evictable_data,
|
|
|
|
&as->arcstat_mru_ghost_evictable_metadata);
|
2012-01-30 13:28:40 -08:00
|
|
|
arc_kstat_update_state(arc_mfu,
|
|
|
|
&as->arcstat_mfu_size,
|
2015-06-26 14:54:17 -07:00
|
|
|
&as->arcstat_mfu_evictable_data,
|
|
|
|
&as->arcstat_mfu_evictable_metadata);
|
2012-03-27 10:10:26 -07:00
|
|
|
arc_kstat_update_state(arc_mfu_ghost,
|
2012-01-30 13:28:40 -08:00
|
|
|
&as->arcstat_mfu_ghost_size,
|
2015-06-26 14:54:17 -07:00
|
|
|
&as->arcstat_mfu_ghost_evictable_data,
|
|
|
|
&as->arcstat_mfu_ghost_evictable_metadata);
|
2012-01-30 13:28:40 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* This function *must* return indices evenly distributed between all
|
|
|
|
* sublists of the multilist. This is needed due to how the ARC eviction
|
|
|
|
* code is laid out; arc_evict_state() assumes ARC buffers are evenly
|
|
|
|
* distributed between all sublists and uses this assumption when
|
|
|
|
* deciding which sublist to evict from and how much to evict from it.
|
|
|
|
*/
|
|
|
|
unsigned int
|
|
|
|
arc_state_multilist_index_func(multilist_t *ml, void *obj)
|
|
|
|
{
|
|
|
|
arc_buf_hdr_t *hdr = obj;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We rely on b_dva to generate evenly distributed index
|
|
|
|
* numbers using buf_hash below. So, as an added precaution,
|
|
|
|
* let's make sure we never add empty buffers to the arc lists.
|
|
|
|
*/
|
|
|
|
ASSERT(!BUF_EMPTY(hdr));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The assumption here, is the hash value for a given
|
|
|
|
* arc_buf_hdr_t will remain constant throughout its lifetime
|
|
|
|
* (i.e. its b_spa, b_dva, and b_birth fields don't change).
|
|
|
|
* Thus, we don't need to store the header's sublist index
|
|
|
|
* on insertion, as this index can be recalculated on removal.
|
|
|
|
*
|
|
|
|
* Also, the low order bits of the hash value are thought to be
|
|
|
|
* distributed evenly. Otherwise, in the case that the multilist
|
|
|
|
* has a power of two number of sublists, each sublists' usage
|
|
|
|
* would not be evenly distributed.
|
|
|
|
*/
|
|
|
|
return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
|
|
|
|
multilist_get_num_sublists(ml));
|
|
|
|
}
|
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/*
|
|
|
|
* Called during module initialization and periodically thereafter to
|
|
|
|
* apply reasonable changes to the exposed performance tunings. Non-zero
|
|
|
|
* zfs_* values which differ from the currently set values will be applied.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
arc_tuning_update(void)
|
|
|
|
{
|
|
|
|
/* Valid range: 64M - <all physical memory> */
|
|
|
|
if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
|
|
|
|
(zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) &&
|
|
|
|
(zfs_arc_max > arc_c_min)) {
|
|
|
|
arc_c_max = zfs_arc_max;
|
|
|
|
arc_c = arc_c_max;
|
|
|
|
arc_p = (arc_c >> 1);
|
2016-07-27 14:27:31 -07:00
|
|
|
arc_meta_limit = (3 * arc_c_max) / 4;
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_dnode_limit = arc_meta_limit / 10;
|
2015-06-26 11:28:18 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Valid range: 32M - <arc_c_max> */
|
|
|
|
if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
|
|
|
|
(zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
|
|
|
|
(zfs_arc_min <= arc_c_max)) {
|
|
|
|
arc_c_min = zfs_arc_min;
|
|
|
|
arc_c = MAX(arc_c, arc_c_min);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Valid range: 16M - <arc_c_max> */
|
|
|
|
if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
|
|
|
|
(zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
|
|
|
|
(zfs_arc_meta_min <= arc_c_max)) {
|
|
|
|
arc_meta_min = zfs_arc_meta_min;
|
|
|
|
arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
|
2016-07-13 07:42:40 -05:00
|
|
|
arc_dnode_limit = arc_meta_limit / 10;
|
2015-06-26 11:28:18 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Valid range: <arc_meta_min> - <arc_c_max> */
|
|
|
|
if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) &&
|
|
|
|
(zfs_arc_meta_limit >= zfs_arc_meta_min) &&
|
|
|
|
(zfs_arc_meta_limit <= arc_c_max))
|
|
|
|
arc_meta_limit = zfs_arc_meta_limit;
|
|
|
|
|
2016-07-13 07:42:40 -05:00
|
|
|
/* Valid range: <arc_meta_min> - <arc_c_max> */
|
|
|
|
if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
|
|
|
|
(zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
|
|
|
|
(zfs_arc_dnode_limit <= arc_c_max))
|
|
|
|
arc_dnode_limit = zfs_arc_dnode_limit;
|
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/* Valid range: 1 - N */
|
|
|
|
if (zfs_arc_grow_retry)
|
|
|
|
arc_grow_retry = zfs_arc_grow_retry;
|
|
|
|
|
|
|
|
/* Valid range: 1 - N */
|
|
|
|
if (zfs_arc_shrink_shift) {
|
|
|
|
arc_shrink_shift = zfs_arc_shrink_shift;
|
|
|
|
arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
|
|
|
|
}
|
|
|
|
|
2015-06-26 15:59:23 -07:00
|
|
|
/* Valid range: 1 - N */
|
|
|
|
if (zfs_arc_p_min_shift)
|
|
|
|
arc_p_min_shift = zfs_arc_p_min_shift;
|
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/* Valid range: 1 - N ticks */
|
|
|
|
if (zfs_arc_min_prefetch_lifespan)
|
|
|
|
arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
|
2015-07-27 13:17:32 -07:00
|
|
|
|
2015-07-28 11:30:00 -07:00
|
|
|
/* Valid range: 0 - 100 */
|
|
|
|
if ((zfs_arc_lotsfree_percent >= 0) &&
|
|
|
|
(zfs_arc_lotsfree_percent <= 100))
|
|
|
|
arc_lotsfree_percent = zfs_arc_lotsfree_percent;
|
|
|
|
|
2015-07-27 13:17:32 -07:00
|
|
|
/* Valid range: 0 - <all physical memory> */
|
|
|
|
if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
|
|
|
|
arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), ptob(physmem));
|
2015-07-28 11:30:00 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
void
|
|
|
|
arc_init(void)
|
|
|
|
{
|
2015-06-26 11:28:18 -07:00
|
|
|
/*
|
|
|
|
* allmem is "all memory that we could possibly use".
|
|
|
|
*/
|
|
|
|
#ifdef _KERNEL
|
|
|
|
uint64_t allmem = ptob(physmem);
|
|
|
|
#else
|
|
|
|
uint64_t allmem = (physmem * PAGESIZE) / 2;
|
|
|
|
#endif
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
|
|
|
|
cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
|
|
|
|
|
|
|
|
mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* Convert seconds to clock ticks */
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_min_prefetch_lifespan = 1 * hz;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/* Start out with 1/8 of all memory */
|
2015-06-26 11:28:18 -07:00
|
|
|
arc_c = allmem / 8;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
#ifdef _KERNEL
|
|
|
|
/*
|
|
|
|
* On architectures where the physical memory can be larger
|
|
|
|
* than the addressable space (intel in 32-bit mode), we may
|
|
|
|
* need to limit the cache to 1/8 of VM size.
|
|
|
|
*/
|
|
|
|
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
|
2015-06-26 11:28:18 -07:00
|
|
|
|
2011-03-29 18:08:59 -07:00
|
|
|
/*
|
|
|
|
* Register a shrinker to support synchronous (direct) memory
|
|
|
|
* reclaim from the arc. This is done to prevent kswapd from
|
|
|
|
* swapping out pages when it is preferable to shrink the arc.
|
|
|
|
*/
|
|
|
|
spl_register_shrinker(&arc_shrinker);
|
2015-07-27 13:17:32 -07:00
|
|
|
|
|
|
|
/* Set to 1/64 of all memory or a minimum of 512K */
|
|
|
|
arc_sys_free = MAX(ptob(physmem / 64), (512 * 1024));
|
|
|
|
arc_need_free = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
#endif
|
|
|
|
|
2016-01-24 13:11:15 -06:00
|
|
|
/* Set max to 1/2 of all memory */
|
|
|
|
arc_c_max = allmem / 2;
|
|
|
|
|
2016-01-11 13:52:17 -08:00
|
|
|
/*
|
|
|
|
* In userland, there's only the memory pressure that we artificially
|
|
|
|
* create (see arc_available_memory()). Don't let arc_c get too
|
|
|
|
* small, because it can cause transactions to be larger than
|
|
|
|
* arc_c, causing arc_tempreserve_space() to fail.
|
|
|
|
*/
|
|
|
|
#ifndef _KERNEL
|
2016-01-24 13:11:15 -06:00
|
|
|
arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
|
2016-01-11 13:52:17 -08:00
|
|
|
#else
|
2015-06-04 08:06:27 -05:00
|
|
|
arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT;
|
2016-01-11 13:52:17 -08:00
|
|
|
#endif
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_c = arc_c_max;
|
|
|
|
arc_p = (arc_c >> 1);
|
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/* Set min to 1/2 of arc_c_min */
|
|
|
|
arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
|
|
|
|
/* Initialize maximum observed usage to zero */
|
2011-03-24 12:13:55 -07:00
|
|
|
arc_meta_max = 0;
|
2015-06-26 11:28:18 -07:00
|
|
|
/* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */
|
|
|
|
arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min);
|
2016-07-13 07:42:40 -05:00
|
|
|
/* Default dnode limit is 10% of overall meta limit */
|
|
|
|
arc_dnode_limit = arc_meta_limit / 10;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
/* Apply user specified tunings */
|
|
|
|
arc_tuning_update();
|
2015-06-24 15:49:08 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
if (zfs_arc_num_sublists_per_state < 1)
|
2015-06-26 11:28:18 -07:00
|
|
|
zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/* if kmem_flags are set, lets try to use less memory */
|
|
|
|
if (kmem_debugging())
|
|
|
|
arc_c = arc_c / 2;
|
|
|
|
if (arc_c < arc_c_min)
|
|
|
|
arc_c = arc_c_min;
|
|
|
|
|
|
|
|
arc_anon = &ARC_anon;
|
|
|
|
arc_mru = &ARC_mru;
|
|
|
|
arc_mru_ghost = &ARC_mru_ghost;
|
|
|
|
arc_mfu = &ARC_mfu;
|
|
|
|
arc_mfu_ghost = &ARC_mfu_ghost;
|
|
|
|
arc_l2c_only = &ARC_l2c_only;
|
|
|
|
arc_size = 0;
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
|
|
|
multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
|
2014-12-29 19:12:23 -08:00
|
|
|
sizeof (arc_buf_hdr_t),
|
2015-01-12 19:52:19 -08:00
|
|
|
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
|
|
|
|
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-10-02 17:11:19 -07:00
|
|
|
arc_anon->arcs_state = ARC_STATE_ANON;
|
|
|
|
arc_mru->arcs_state = ARC_STATE_MRU;
|
|
|
|
arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
|
|
|
|
arc_mfu->arcs_state = ARC_STATE_MFU;
|
|
|
|
arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
|
|
|
|
arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
|
|
|
|
|
2015-06-26 15:14:45 -07:00
|
|
|
refcount_create(&arc_anon->arcs_size);
|
|
|
|
refcount_create(&arc_mru->arcs_size);
|
|
|
|
refcount_create(&arc_mru_ghost->arcs_size);
|
|
|
|
refcount_create(&arc_mfu->arcs_size);
|
|
|
|
refcount_create(&arc_mfu_ghost->arcs_size);
|
|
|
|
refcount_create(&arc_l2c_only->arcs_size);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
buf_init();
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
arc_reclaim_thread_exit = FALSE;
|
|
|
|
arc_user_evicts_thread_exit = FALSE;
|
2011-12-22 12:20:43 -08:00
|
|
|
list_create(&arc_prune_list, sizeof (arc_prune_t),
|
|
|
|
offsetof(arc_prune_t, p_node));
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_eviction_list = NULL;
|
2011-12-22 12:20:43 -08:00
|
|
|
mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
|
|
|
|
|
2015-07-24 10:08:31 -07:00
|
|
|
arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri,
|
2015-06-03 11:43:30 -07:00
|
|
|
max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
|
2015-05-30 09:57:53 -05:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
|
|
|
|
sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
|
|
|
|
|
|
|
|
if (arc_ksp != NULL) {
|
|
|
|
arc_ksp->ks_data = &arc_stats;
|
2012-01-30 13:28:40 -08:00
|
|
|
arc_ksp->ks_update = arc_kstat_update;
|
2008-11-20 12:01:55 -08:00
|
|
|
kstat_install(arc_ksp);
|
|
|
|
}
|
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
|
2015-07-24 10:08:31 -07:00
|
|
|
TS_RUN, defclsyspri);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
(void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
|
2015-07-24 10:08:31 -07:00
|
|
|
TS_RUN, defclsyspri);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_dead = FALSE;
|
2008-12-03 12:09:06 -08:00
|
|
|
arc_warm = B_FALSE;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
/*
|
|
|
|
* Calculate maximum amount of dirty data per pool.
|
|
|
|
*
|
|
|
|
* If it has been set by a module parameter, take that.
|
|
|
|
* Otherwise, use a percentage of physical memory defined by
|
|
|
|
* zfs_dirty_data_max_percent (default 10%) with a cap at
|
|
|
|
* zfs_dirty_data_max_max (default 25% of physical memory).
|
|
|
|
*/
|
|
|
|
if (zfs_dirty_data_max_max == 0)
|
2015-10-30 16:10:01 -07:00
|
|
|
zfs_dirty_data_max_max = (uint64_t)physmem * PAGESIZE *
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
zfs_dirty_data_max_max_percent / 100;
|
|
|
|
|
|
|
|
if (zfs_dirty_data_max == 0) {
|
2015-10-30 16:10:01 -07:00
|
|
|
zfs_dirty_data_max = (uint64_t)physmem * PAGESIZE *
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-28 20:01:20 -07:00
|
|
|
zfs_dirty_data_max_percent / 100;
|
|
|
|
zfs_dirty_data_max = MIN(zfs_dirty_data_max,
|
|
|
|
zfs_dirty_data_max_max);
|
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
arc_fini(void)
|
|
|
|
{
|
2011-12-22 12:20:43 -08:00
|
|
|
arc_prune_t *p;
|
|
|
|
|
2011-03-29 18:08:59 -07:00
|
|
|
#ifdef _KERNEL
|
|
|
|
spl_unregister_shrinker(&arc_shrinker);
|
|
|
|
#endif /* _KERNEL */
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&arc_reclaim_lock);
|
|
|
|
arc_reclaim_thread_exit = TRUE;
|
|
|
|
/*
|
|
|
|
* The reclaim thread will set arc_reclaim_thread_exit back to
|
|
|
|
* FALSE when it is finished exiting; we're waiting for that.
|
|
|
|
*/
|
|
|
|
while (arc_reclaim_thread_exit) {
|
|
|
|
cv_signal(&arc_reclaim_thread_cv);
|
|
|
|
cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
|
|
|
|
}
|
|
|
|
mutex_exit(&arc_reclaim_lock);
|
|
|
|
|
|
|
|
mutex_enter(&arc_user_evicts_lock);
|
|
|
|
arc_user_evicts_thread_exit = TRUE;
|
|
|
|
/*
|
|
|
|
* The user evicts thread will set arc_user_evicts_thread_exit
|
|
|
|
* to FALSE when it is finished exiting; we're waiting for that.
|
|
|
|
*/
|
|
|
|
while (arc_user_evicts_thread_exit) {
|
|
|
|
cv_signal(&arc_user_evicts_cv);
|
|
|
|
cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
|
|
|
|
}
|
|
|
|
mutex_exit(&arc_user_evicts_lock);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/* Use TRUE to ensure *all* buffers are evicted */
|
|
|
|
arc_flush(NULL, TRUE);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
arc_dead = TRUE;
|
|
|
|
|
|
|
|
if (arc_ksp != NULL) {
|
|
|
|
kstat_delete(arc_ksp);
|
|
|
|
arc_ksp = NULL;
|
|
|
|
}
|
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
taskq_wait(arc_prune_taskq);
|
|
|
|
taskq_destroy(arc_prune_taskq);
|
|
|
|
|
2011-12-22 12:20:43 -08:00
|
|
|
mutex_enter(&arc_prune_mtx);
|
|
|
|
while ((p = list_head(&arc_prune_list)) != NULL) {
|
|
|
|
list_remove(&arc_prune_list, p);
|
|
|
|
refcount_remove(&p->p_refcnt, &arc_prune_list);
|
|
|
|
refcount_destroy(&p->p_refcnt);
|
|
|
|
kmem_free(p, sizeof (*p));
|
|
|
|
}
|
|
|
|
mutex_exit(&arc_prune_mtx);
|
|
|
|
|
|
|
|
list_destroy(&arc_prune_list);
|
|
|
|
mutex_destroy(&arc_prune_mtx);
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_destroy(&arc_reclaim_lock);
|
|
|
|
cv_destroy(&arc_reclaim_thread_cv);
|
|
|
|
cv_destroy(&arc_reclaim_waiters_cv);
|
|
|
|
|
|
|
|
mutex_destroy(&arc_user_evicts_lock);
|
|
|
|
cv_destroy(&arc_user_evicts_cv);
|
|
|
|
|
2015-06-26 15:14:45 -07:00
|
|
|
refcount_destroy(&arc_anon->arcs_size);
|
|
|
|
refcount_destroy(&arc_mru->arcs_size);
|
|
|
|
refcount_destroy(&arc_mru_ghost->arcs_size);
|
|
|
|
refcount_destroy(&arc_mfu->arcs_size);
|
|
|
|
refcount_destroy(&arc_mfu_ghost->arcs_size);
|
|
|
|
refcount_destroy(&arc_l2c_only->arcs_size);
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
|
|
|
|
multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
|
|
|
|
multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
|
|
|
|
multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
|
|
|
|
multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
|
|
|
|
multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
|
|
|
|
multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
|
|
|
|
multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
|
|
|
|
multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
|
|
|
|
multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
buf_fini();
|
2009-07-02 15:44:48 -07:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT0(arc_loaned_bytes);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Level 2 ARC
|
|
|
|
*
|
|
|
|
* The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
|
|
|
|
* It uses dedicated storage devices to hold cached data, which are populated
|
|
|
|
* using large infrequent writes. The main role of this cache is to boost
|
|
|
|
* the performance of random read workloads. The intended L2ARC devices
|
|
|
|
* include short-stroked disks, solid state disks, and other media with
|
|
|
|
* substantially faster read latency than disk.
|
|
|
|
*
|
|
|
|
* +-----------------------+
|
|
|
|
* | ARC |
|
|
|
|
* +-----------------------+
|
|
|
|
* | ^ ^
|
|
|
|
* | | |
|
|
|
|
* l2arc_feed_thread() arc_read()
|
|
|
|
* | | |
|
|
|
|
* | l2arc read |
|
|
|
|
* V | |
|
|
|
|
* +---------------+ |
|
|
|
|
* | L2ARC | |
|
|
|
|
* +---------------+ |
|
|
|
|
* | ^ |
|
|
|
|
* l2arc_write() | |
|
|
|
|
* | | |
|
|
|
|
* V | |
|
|
|
|
* +-------+ +-------+
|
|
|
|
* | vdev | | vdev |
|
|
|
|
* | cache | | cache |
|
|
|
|
* +-------+ +-------+
|
|
|
|
* +=========+ .-----.
|
|
|
|
* : L2ARC : |-_____-|
|
|
|
|
* : devices : | Disks |
|
|
|
|
* +=========+ `-_____-'
|
|
|
|
*
|
|
|
|
* Read requests are satisfied from the following sources, in order:
|
|
|
|
*
|
|
|
|
* 1) ARC
|
|
|
|
* 2) vdev cache of L2ARC devices
|
|
|
|
* 3) L2ARC devices
|
|
|
|
* 4) vdev cache of disks
|
|
|
|
* 5) disks
|
|
|
|
*
|
|
|
|
* Some L2ARC device types exhibit extremely slow write performance.
|
|
|
|
* To accommodate this, there are some significant differences between
|
|
|
|
* the L2ARC and traditional cache design:
|
|
|
|
*
|
|
|
|
* 1. There is no eviction path from the ARC to the L2ARC. Evictions from
|
|
|
|
* the ARC behave as usual, freeing buffers and placing headers on ghost
|
|
|
|
* lists. The ARC does not send buffers to the L2ARC during eviction as
|
|
|
|
* this would add inflated write latencies for all ARC memory pressure.
|
|
|
|
*
|
|
|
|
* 2. The L2ARC attempts to cache data from the ARC before it is evicted.
|
|
|
|
* It does this by periodically scanning buffers from the eviction-end of
|
|
|
|
* the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
|
2013-08-01 13:02:10 -07:00
|
|
|
* not already there. It scans until a headroom of buffers is satisfied,
|
|
|
|
* which itself is a buffer for ARC eviction. If a compressible buffer is
|
|
|
|
* found during scanning and selected for writing to an L2ARC device, we
|
|
|
|
* temporarily boost scanning headroom during the next scan cycle to make
|
|
|
|
* sure we adapt to compression effects (which might significantly reduce
|
|
|
|
* the data volume we write to L2ARC). The thread that does this is
|
2008-11-20 12:01:55 -08:00
|
|
|
* l2arc_feed_thread(), illustrated below; example sizes are included to
|
|
|
|
* provide a better sense of ratio than this diagram:
|
|
|
|
*
|
|
|
|
* head --> tail
|
|
|
|
* +---------------------+----------+
|
|
|
|
* ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
|
|
|
|
* +---------------------+----------+ | o L2ARC eligible
|
|
|
|
* ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
|
|
|
|
* +---------------------+----------+ |
|
|
|
|
* 15.9 Gbytes ^ 32 Mbytes |
|
|
|
|
* headroom |
|
|
|
|
* l2arc_feed_thread()
|
|
|
|
* |
|
|
|
|
* l2arc write hand <--[oooo]--'
|
|
|
|
* | 8 Mbyte
|
|
|
|
* | write max
|
|
|
|
* V
|
|
|
|
* +==============================+
|
|
|
|
* L2ARC dev |####|#|###|###| |####| ... |
|
|
|
|
* +==============================+
|
|
|
|
* 32 Gbytes
|
|
|
|
*
|
|
|
|
* 3. If an ARC buffer is copied to the L2ARC but then hit instead of
|
|
|
|
* evicted, then the L2ARC has cached a buffer much sooner than it probably
|
|
|
|
* needed to, potentially wasting L2ARC device bandwidth and storage. It is
|
|
|
|
* safe to say that this is an uncommon case, since buffers at the end of
|
|
|
|
* the ARC lists have moved there due to inactivity.
|
|
|
|
*
|
|
|
|
* 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
|
|
|
|
* then the L2ARC simply misses copying some buffers. This serves as a
|
|
|
|
* pressure valve to prevent heavy read workloads from both stalling the ARC
|
|
|
|
* with waits and clogging the L2ARC with writes. This also helps prevent
|
|
|
|
* the potential for the L2ARC to churn if it attempts to cache content too
|
|
|
|
* quickly, such as during backups of the entire pool.
|
|
|
|
*
|
2008-12-03 12:09:06 -08:00
|
|
|
* 5. After system boot and before the ARC has filled main memory, there are
|
|
|
|
* no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
|
|
|
|
* lists can remain mostly static. Instead of searching from tail of these
|
|
|
|
* lists as pictured, the l2arc_feed_thread() will search from the list heads
|
|
|
|
* for eligible buffers, greatly increasing its chance of finding them.
|
|
|
|
*
|
|
|
|
* The L2ARC device write speed is also boosted during this time so that
|
|
|
|
* the L2ARC warms up faster. Since there have been no ARC evictions yet,
|
|
|
|
* there are no L2ARC reads, and no fear of degrading read performance
|
|
|
|
* through increased writes.
|
|
|
|
*
|
|
|
|
* 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
|
2008-11-20 12:01:55 -08:00
|
|
|
* the vdev queue can aggregate them into larger and fewer writes. Each
|
|
|
|
* device is written to in a rotor fashion, sweeping writes through
|
|
|
|
* available space then repeating.
|
|
|
|
*
|
2008-12-03 12:09:06 -08:00
|
|
|
* 7. The L2ARC does not store dirty content. It never needs to flush
|
2008-11-20 12:01:55 -08:00
|
|
|
* write buffers back to disk based storage.
|
|
|
|
*
|
2008-12-03 12:09:06 -08:00
|
|
|
* 8. If an ARC buffer is written (and dirtied) which also exists in the
|
2008-11-20 12:01:55 -08:00
|
|
|
* L2ARC, the now stale L2ARC buffer is immediately dropped.
|
|
|
|
*
|
|
|
|
* The performance of the L2ARC can be tweaked by a number of tunables, which
|
|
|
|
* may be necessary for different workloads:
|
|
|
|
*
|
|
|
|
* l2arc_write_max max write bytes per interval
|
2008-12-03 12:09:06 -08:00
|
|
|
* l2arc_write_boost extra write bytes during device warmup
|
2008-11-20 12:01:55 -08:00
|
|
|
* l2arc_noprefetch skip caching prefetched buffers
|
2013-08-01 13:02:10 -07:00
|
|
|
* l2arc_nocompress skip compressing buffers
|
2008-11-20 12:01:55 -08:00
|
|
|
* l2arc_headroom number of max device writes to precache
|
2013-08-01 13:02:10 -07:00
|
|
|
* l2arc_headroom_boost when we find compressed buffers during ARC
|
|
|
|
* scanning, we multiply headroom by this
|
|
|
|
* percentage factor for the next scan cycle,
|
|
|
|
* since more compressed buffers are likely to
|
|
|
|
* be present
|
2008-11-20 12:01:55 -08:00
|
|
|
* l2arc_feed_secs seconds between L2ARC writing
|
|
|
|
*
|
|
|
|
* Tunables may be removed or added as future performance improvements are
|
|
|
|
* integrated, and also may become zpool properties.
|
2009-02-18 12:51:31 -08:00
|
|
|
*
|
|
|
|
* There are three key functions that control how the L2ARC warms up:
|
|
|
|
*
|
|
|
|
* l2arc_write_eligible() check if a buffer is eligible to cache
|
|
|
|
* l2arc_write_size() calculate how much to write
|
|
|
|
* l2arc_write_interval() calculate sleep delay between writes
|
|
|
|
*
|
|
|
|
* These three functions determine what to write, how much, and how quickly
|
|
|
|
* to send writes.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
|
2009-02-18 12:51:31 -08:00
|
|
|
static boolean_t
|
2014-12-06 09:24:32 -08:00
|
|
|
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
|
2009-02-18 12:51:31 -08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* A buffer is *not* eligible for the L2ARC if it:
|
|
|
|
* 1. belongs to a different spa.
|
2010-05-28 13:45:14 -07:00
|
|
|
* 2. is already cached on the L2ARC.
|
|
|
|
* 3. has an I/O in progress (it may be an incomplete read).
|
|
|
|
* 4. is flagged not eligible (zfs property).
|
2009-02-18 12:51:31 -08:00
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
|
2014-12-06 09:24:32 -08:00
|
|
|
HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
|
2009-02-18 12:51:31 -08:00
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint64_t
|
2013-08-01 13:02:10 -07:00
|
|
|
l2arc_write_size(void)
|
2009-02-18 12:51:31 -08:00
|
|
|
{
|
|
|
|
uint64_t size;
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* Make sure our globals have meaningful values in case the user
|
|
|
|
* altered them.
|
|
|
|
*/
|
|
|
|
size = l2arc_write_max;
|
|
|
|
if (size == 0) {
|
|
|
|
cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
|
|
|
|
"be greater than zero, resetting it to the default (%d)",
|
|
|
|
L2ARC_WRITE_SIZE);
|
|
|
|
size = l2arc_write_max = L2ARC_WRITE_SIZE;
|
|
|
|
}
|
2009-02-18 12:51:31 -08:00
|
|
|
|
|
|
|
if (arc_warm == B_FALSE)
|
2013-08-01 13:02:10 -07:00
|
|
|
size += l2arc_write_boost;
|
2009-02-18 12:51:31 -08:00
|
|
|
|
|
|
|
return (size);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
static clock_t
|
|
|
|
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
|
|
|
|
{
|
2010-05-28 13:45:14 -07:00
|
|
|
clock_t interval, next, now;
|
2009-02-18 12:51:31 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the ARC lists are busy, increase our write rate; if the
|
|
|
|
* lists are stale, idle back. This is achieved by checking
|
|
|
|
* how much we previously wrote - if it was more than half of
|
|
|
|
* what we wanted, schedule the next write much sooner.
|
|
|
|
*/
|
|
|
|
if (l2arc_feed_again && wrote > (wanted / 2))
|
|
|
|
interval = (hz * l2arc_feed_min_ms) / 1000;
|
|
|
|
else
|
|
|
|
interval = hz * l2arc_feed_secs;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
now = ddi_get_lbolt();
|
|
|
|
next = MAX(now, MIN(now + interval, began + interval));
|
2009-02-18 12:51:31 -08:00
|
|
|
|
|
|
|
return (next);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Cycle through L2ARC devices. This is how L2ARC load balances.
|
2008-12-03 12:09:06 -08:00
|
|
|
* If a device is returned, this also returns holding the spa config lock.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
static l2arc_dev_t *
|
|
|
|
l2arc_dev_get_next(void)
|
|
|
|
{
|
2008-12-03 12:09:06 -08:00
|
|
|
l2arc_dev_t *first, *next = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Lock out the removal of spas (spa_namespace_lock), then removal
|
|
|
|
* of cache devices (l2arc_dev_mtx). Once a device has been selected,
|
|
|
|
* both locks will be dropped and a spa config lock held instead.
|
|
|
|
*/
|
|
|
|
mutex_enter(&spa_namespace_lock);
|
|
|
|
mutex_enter(&l2arc_dev_mtx);
|
|
|
|
|
|
|
|
/* if there are no vdevs, there is nothing to do */
|
|
|
|
if (l2arc_ndev == 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
first = NULL;
|
|
|
|
next = l2arc_dev_last;
|
|
|
|
do {
|
|
|
|
/* loop around the list looking for a non-faulted vdev */
|
|
|
|
if (next == NULL) {
|
2008-11-20 12:01:55 -08:00
|
|
|
next = list_head(l2arc_dev_list);
|
2008-12-03 12:09:06 -08:00
|
|
|
} else {
|
|
|
|
next = list_next(l2arc_dev_list, next);
|
|
|
|
if (next == NULL)
|
|
|
|
next = list_head(l2arc_dev_list);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if we have come back to the start, bail out */
|
|
|
|
if (first == NULL)
|
|
|
|
first = next;
|
|
|
|
else if (next == first)
|
|
|
|
break;
|
|
|
|
|
|
|
|
} while (vdev_is_dead(next->l2ad_vdev));
|
|
|
|
|
|
|
|
/* if we were unable to find any usable vdevs, return NULL */
|
|
|
|
if (vdev_is_dead(next->l2ad_vdev))
|
|
|
|
next = NULL;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
l2arc_dev_last = next;
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
out:
|
|
|
|
mutex_exit(&l2arc_dev_mtx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Grab the config lock to prevent the 'next' device from being
|
|
|
|
* removed while we are writing to it.
|
|
|
|
*/
|
|
|
|
if (next != NULL)
|
|
|
|
spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
|
|
|
|
mutex_exit(&spa_namespace_lock);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
return (next);
|
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Free buffers that were tagged for destruction.
|
|
|
|
*/
|
|
|
|
static void
|
2010-08-26 09:52:41 -07:00
|
|
|
l2arc_do_free_on_write(void)
|
2008-12-03 12:09:06 -08:00
|
|
|
{
|
|
|
|
list_t *buflist;
|
|
|
|
l2arc_data_free_t *df, *df_prev;
|
|
|
|
|
|
|
|
mutex_enter(&l2arc_free_on_write_mtx);
|
|
|
|
buflist = l2arc_free_on_write;
|
|
|
|
|
|
|
|
for (df = list_tail(buflist); df; df = df_prev) {
|
|
|
|
df_prev = list_prev(buflist, df);
|
|
|
|
ASSERT(df->l2df_data != NULL);
|
|
|
|
ASSERT(df->l2df_func != NULL);
|
|
|
|
df->l2df_func(df->l2df_data, df->l2df_size);
|
|
|
|
list_remove(buflist, df);
|
|
|
|
kmem_free(df, sizeof (l2arc_data_free_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&l2arc_free_on_write_mtx);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* A write to a cache device has completed. Update all headers to allow
|
|
|
|
* reads from these buffers to begin.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
l2arc_write_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
l2arc_write_callback_t *cb;
|
|
|
|
l2arc_dev_t *dev;
|
|
|
|
list_t *buflist;
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_buf_hdr_t *head, *hdr, *hdr_prev;
|
2008-11-20 12:01:55 -08:00
|
|
|
kmutex_t *hash_lock;
|
2014-05-22 10:11:57 +01:00
|
|
|
int64_t bytes_dropped = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
cb = zio->io_private;
|
|
|
|
ASSERT(cb != NULL);
|
|
|
|
dev = cb->l2wcb_dev;
|
|
|
|
ASSERT(dev != NULL);
|
|
|
|
head = cb->l2wcb_head;
|
|
|
|
ASSERT(head != NULL);
|
2014-12-29 19:12:23 -08:00
|
|
|
buflist = &dev->l2ad_buflist;
|
2008-11-20 12:01:55 -08:00
|
|
|
ASSERT(buflist != NULL);
|
|
|
|
DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
|
|
|
|
l2arc_write_callback_t *, cb);
|
|
|
|
|
|
|
|
if (zio->io_error != 0)
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_writes_error);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* All writes completed, or an error was hit.
|
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
top:
|
|
|
|
mutex_enter(&dev->l2ad_mtx);
|
2014-12-06 09:24:32 -08:00
|
|
|
for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
|
|
|
|
hdr_prev = list_prev(buflist, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
hash_lock = HDR_LOCK(hdr);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We cannot use mutex_enter or else we can deadlock
|
|
|
|
* with l2arc_write_buffers (due to swapping the order
|
|
|
|
* the hash lock and l2ad_mtx are taken).
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
if (!mutex_tryenter(hash_lock)) {
|
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* Missed the hash lock. We must retry so we
|
|
|
|
* don't leave the ARC_FLAG_L2_WRITING bit set.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't want to rescan the headers we've
|
|
|
|
* already marked as having been written out, so
|
|
|
|
* we reinsert the head node so we can pick up
|
|
|
|
* where we left off.
|
|
|
|
*/
|
|
|
|
list_remove(buflist, head);
|
|
|
|
list_insert_after(buflist, hdr, head);
|
|
|
|
|
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We wait for the hash lock to become available
|
|
|
|
* to try and prevent busy waiting, and increase
|
|
|
|
* the chance we'll be able to acquire the lock
|
|
|
|
* the next time around.
|
|
|
|
*/
|
|
|
|
mutex_enter(hash_lock);
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
goto top;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* We could not have been moved into the arc_l2c_only
|
|
|
|
* state while in-flight due to our ARC_FLAG_L2_WRITING
|
|
|
|
* bit being set. Let's just ensure that's being enforced.
|
|
|
|
*/
|
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We may have allocated a buffer for L2ARC compression,
|
|
|
|
* we must release it to avoid leaking this data.
|
2014-12-29 19:12:23 -08:00
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
l2arc_release_cdata_buf(hdr);
|
2014-12-29 19:12:23 -08:00
|
|
|
|
2016-02-10 10:42:01 -08:00
|
|
|
/*
|
|
|
|
* Skipped - drop L2ARC entry and mark the header as no
|
|
|
|
* longer L2 eligibile.
|
|
|
|
*/
|
|
|
|
if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) {
|
|
|
|
list_remove(buflist, hdr);
|
|
|
|
hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
|
|
|
|
hdr->b_flags &= ~ARC_FLAG_L2CACHE;
|
|
|
|
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_writes_skip_toobig);
|
|
|
|
|
|
|
|
(void) refcount_remove_many(&dev->l2ad_alloc,
|
|
|
|
hdr->b_l2hdr.b_asize, hdr);
|
|
|
|
} else if (zio->io_error != 0) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
2008-12-03 12:09:06 -08:00
|
|
|
* Error - drop L2ARC entry.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2014-12-06 09:24:32 -08:00
|
|
|
list_remove(buflist, hdr);
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
|
|
|
|
|
|
|
|
ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
|
2014-12-06 09:24:32 -08:00
|
|
|
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
|
2015-06-16 01:12:19 +02:00
|
|
|
|
|
|
|
bytes_dropped += hdr->b_l2hdr.b_asize;
|
|
|
|
(void) refcount_remove_many(&dev->l2ad_alloc,
|
|
|
|
hdr->b_l2hdr.b_asize, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-01-12 19:52:19 -08:00
|
|
|
* Allow ARC to begin reads and ghost list evictions to
|
|
|
|
* this L2ARC entry.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
atomic_inc_64(&l2arc_writes_done);
|
|
|
|
list_remove(buflist, head);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(!HDR_HAS_L1HDR(head));
|
|
|
|
kmem_cache_free(hdr_l2only_cache, head);
|
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-05-22 10:11:57 +01:00
|
|
|
vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
l2arc_do_free_on_write();
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
kmem_free(cb, sizeof (l2arc_write_callback_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A read to a cache device completed. Validate buffer contents before
|
|
|
|
* handing over to the regular ARC routines.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
l2arc_read_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
l2arc_read_callback_t *cb;
|
|
|
|
arc_buf_hdr_t *hdr;
|
|
|
|
arc_buf_t *buf;
|
|
|
|
kmutex_t *hash_lock;
|
2008-12-03 12:09:06 -08:00
|
|
|
int equal;
|
|
|
|
|
|
|
|
ASSERT(zio->io_vd != NULL);
|
|
|
|
ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
|
|
|
|
|
|
|
|
spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
cb = zio->io_private;
|
|
|
|
ASSERT(cb != NULL);
|
|
|
|
buf = cb->l2rcb_buf;
|
|
|
|
ASSERT(buf != NULL);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
hash_lock = HDR_LOCK(buf->b_hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(hash_lock);
|
2010-05-28 13:45:14 -07:00
|
|
|
hdr = buf->b_hdr;
|
|
|
|
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* If the buffer was compressed, decompress it first.
|
|
|
|
*/
|
|
|
|
if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
|
|
|
|
l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
|
|
|
|
ASSERT(zio->io_data != NULL);
|
2015-09-11 09:18:56 -07:00
|
|
|
ASSERT3U(zio->io_size, ==, hdr->b_size);
|
|
|
|
ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size);
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Check this survived the L2ARC journey.
|
|
|
|
*/
|
|
|
|
equal = arc_cksum_equal(buf);
|
|
|
|
if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
zio->io_private = buf;
|
2008-12-03 12:09:06 -08:00
|
|
|
zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
|
|
|
|
zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
|
2008-11-20 12:01:55 -08:00
|
|
|
arc_read_done(zio);
|
|
|
|
} else {
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
/*
|
|
|
|
* Buffer didn't survive caching. Increment stats and
|
|
|
|
* reissue to the original storage device.
|
|
|
|
*/
|
2008-12-03 12:09:06 -08:00
|
|
|
if (zio->io_error != 0) {
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_io_error);
|
2008-12-03 12:09:06 -08:00
|
|
|
} else {
|
2013-03-08 10:41:28 -08:00
|
|
|
zio->io_error = SET_ERROR(EIO);
|
2008-12-03 12:09:06 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
if (!equal)
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_cksum_bad);
|
|
|
|
|
|
|
|
/*
|
2008-12-03 12:09:06 -08:00
|
|
|
* If there's no waiter, issue an async i/o to the primary
|
|
|
|
* storage now. If there *is* a waiter, the caller must
|
|
|
|
* issue the i/o in a context where it's OK to block.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2009-02-18 12:51:31 -08:00
|
|
|
if (zio->io_waiter == NULL) {
|
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
|
|
|
|
|
|
|
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
|
2015-09-11 09:18:56 -07:00
|
|
|
buf->b_data, hdr->b_size, arc_read_done, buf,
|
2008-12-03 12:09:06 -08:00
|
|
|
zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
|
2009-02-18 12:51:31 -08:00
|
|
|
}
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
kmem_free(cb, sizeof (l2arc_read_callback_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the list priority from which the L2ARC will search for pages to
|
|
|
|
* cache. This is used within loops (0..3) to cycle through lists in the
|
|
|
|
* desired order. This order can have a significant effect on cache
|
|
|
|
* performance.
|
|
|
|
*
|
|
|
|
* Currently the metadata lists are hit first, MFU then MRU, followed by
|
|
|
|
* the data lists. This function returns a locked list, and also returns
|
|
|
|
* the lock pointer.
|
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
static multilist_sublist_t *
|
|
|
|
l2arc_sublist_lock(int list_num)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_t *ml = NULL;
|
|
|
|
unsigned int idx;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(list_num >= 0 && list_num <= 3);
|
|
|
|
|
|
|
|
switch (list_num) {
|
|
|
|
case 0:
|
2015-01-12 19:52:19 -08:00
|
|
|
ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
|
2008-11-20 12:01:55 -08:00
|
|
|
break;
|
|
|
|
case 1:
|
2015-01-12 19:52:19 -08:00
|
|
|
ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
|
2008-11-20 12:01:55 -08:00
|
|
|
break;
|
|
|
|
case 2:
|
2015-01-12 19:52:19 -08:00
|
|
|
ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
|
2008-11-20 12:01:55 -08:00
|
|
|
break;
|
|
|
|
case 3:
|
2015-01-12 19:52:19 -08:00
|
|
|
ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
|
2008-11-20 12:01:55 -08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* Return a randomly-selected sublist. This is acceptable
|
|
|
|
* because the caller feeds only a little bit of data for each
|
|
|
|
* call (8MB). Subsequent calls will result in different
|
|
|
|
* sublists being selected.
|
|
|
|
*/
|
|
|
|
idx = multilist_get_random_index(ml);
|
|
|
|
return (multilist_sublist_lock(ml, idx));
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Evict buffers from the device write hand to the distance specified in
|
|
|
|
* bytes. This distance may span populated buffers, it may span nothing.
|
|
|
|
* This is clearing a region on the L2ARC device ready for writing.
|
|
|
|
* If the 'all' boolean is set, every buffer is evicted.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
|
|
|
|
{
|
|
|
|
list_t *buflist;
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_buf_hdr_t *hdr, *hdr_prev;
|
2008-11-20 12:01:55 -08:00
|
|
|
kmutex_t *hash_lock;
|
|
|
|
uint64_t taddr;
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
buflist = &dev->l2ad_buflist;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
if (!all && dev->l2ad_first) {
|
|
|
|
/*
|
|
|
|
* This is the first sweep through the device. There is
|
|
|
|
* nothing to evict.
|
|
|
|
*/
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* When nearing the end of the device, evict to the end
|
|
|
|
* before the device write hand jumps to the start.
|
|
|
|
*/
|
|
|
|
taddr = dev->l2ad_end;
|
|
|
|
} else {
|
|
|
|
taddr = dev->l2ad_hand + distance;
|
|
|
|
}
|
|
|
|
DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
|
|
|
|
uint64_t, taddr, boolean_t, all);
|
|
|
|
|
|
|
|
top:
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_enter(&dev->l2ad_mtx);
|
2014-12-06 09:24:32 -08:00
|
|
|
for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
|
|
|
|
hdr_prev = list_prev(buflist, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
hash_lock = HDR_LOCK(hdr);
|
2015-01-12 19:52:19 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We cannot use mutex_enter or else we can deadlock
|
|
|
|
* with l2arc_write_buffers (due to swapping the order
|
|
|
|
* the hash lock and l2ad_mtx are taken).
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
if (!mutex_tryenter(hash_lock)) {
|
|
|
|
/*
|
|
|
|
* Missed the hash lock. Retry.
|
|
|
|
*/
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_enter(hash_lock);
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
if (HDR_L2_WRITE_HEAD(hdr)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* We hit a write head node. Leave it for
|
|
|
|
* l2arc_write_done().
|
|
|
|
*/
|
2014-12-06 09:24:32 -08:00
|
|
|
list_remove(buflist, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if (!all && HDR_HAS_L2HDR(hdr) &&
|
|
|
|
(hdr->b_l2hdr.b_daddr > taddr ||
|
|
|
|
hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* We've evicted to the target address,
|
|
|
|
* or the end of the device.
|
|
|
|
*/
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L2HDR(hdr));
|
|
|
|
if (!HDR_HAS_L1HDR(hdr)) {
|
2014-12-06 09:24:32 -08:00
|
|
|
ASSERT(!HDR_L2_READING(hdr));
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* This doesn't exist in the ARC. Destroy.
|
|
|
|
* arc_hdr_destroy() will call list_remove()
|
|
|
|
* and decrement arcstat_l2_size.
|
|
|
|
*/
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_change_state(arc_anon, hdr, hash_lock);
|
|
|
|
arc_hdr_destroy(hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
} else {
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Invalidate issued or about to be issued
|
|
|
|
* reads, since we may be about to write
|
|
|
|
* over this location.
|
|
|
|
*/
|
2014-12-06 09:24:32 -08:00
|
|
|
if (HDR_L2_READING(hdr)) {
|
2008-12-03 12:09:06 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_evict_reading);
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_L2_EVICTED;
|
2008-12-03 12:09:06 -08:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/* Ensure this header has finished being written */
|
|
|
|
ASSERT(!HDR_L2_WRITING(hdr));
|
|
|
|
ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
|
2015-06-16 01:12:19 +02:00
|
|
|
|
|
|
|
arc_hdr_l2hdr_destroy(hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
}
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find and write ARC buffers to the L2ARC device.
|
|
|
|
*
|
2014-12-06 09:24:32 -08:00
|
|
|
* An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
|
2008-11-20 12:01:55 -08:00
|
|
|
* for reading until they have completed writing.
|
2013-08-01 13:02:10 -07:00
|
|
|
* The headroom_boost is an in-out parameter used to maintain headroom boost
|
|
|
|
* state between calls to this function.
|
|
|
|
*
|
|
|
|
* Returns the number of bytes actually written (which may be smaller than
|
|
|
|
* the delta by which the device hand has changed due to alignment).
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2009-02-18 12:51:31 -08:00
|
|
|
static uint64_t
|
2013-08-01 13:02:10 -07:00
|
|
|
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
|
|
|
|
boolean_t *headroom_boost)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2014-12-06 09:24:32 -08:00
|
|
|
arc_buf_hdr_t *hdr, *hdr_prev, *head;
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
uint64_t write_asize, write_sz, headroom, buf_compress_minsz,
|
|
|
|
stats_size;
|
2008-11-20 12:01:55 -08:00
|
|
|
void *buf_data;
|
2013-08-01 13:02:10 -07:00
|
|
|
boolean_t full;
|
2008-11-20 12:01:55 -08:00
|
|
|
l2arc_write_callback_t *cb;
|
|
|
|
zio_t *pio, *wzio;
|
2011-11-11 14:07:54 -08:00
|
|
|
uint64_t guid = spa_load_guid(spa);
|
2010-08-26 09:52:39 -07:00
|
|
|
int try;
|
2013-08-01 13:02:10 -07:00
|
|
|
const boolean_t do_headroom_boost = *headroom_boost;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
ASSERT(dev->l2ad_vdev != NULL);
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
/* Lower the flag now, we might want to raise it again later. */
|
|
|
|
*headroom_boost = B_FALSE;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
pio = NULL;
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
write_sz = write_asize = 0;
|
2008-11-20 12:01:55 -08:00
|
|
|
full = B_FALSE;
|
2014-12-29 19:12:23 -08:00
|
|
|
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
|
2014-12-06 09:24:32 -08:00
|
|
|
head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
|
2014-12-29 19:12:23 -08:00
|
|
|
head->b_flags |= ARC_FLAG_HAS_L2HDR;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* We will want to try to compress buffers that are at least 2x the
|
|
|
|
* device sector size.
|
|
|
|
*/
|
|
|
|
buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Copy buffers for L2ARC writing.
|
|
|
|
*/
|
2010-08-26 09:52:39 -07:00
|
|
|
for (try = 0; try <= 3; try++) {
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_sublist_t *mls = l2arc_sublist_lock(try);
|
2013-08-01 13:02:10 -07:00
|
|
|
uint64_t passed_sz = 0;
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* L2ARC fast warmup.
|
|
|
|
*
|
|
|
|
* Until the ARC is warm and starts to evict, read from the
|
|
|
|
* head of the ARC lists rather than the tail.
|
|
|
|
*/
|
|
|
|
if (arc_warm == B_FALSE)
|
2015-01-12 19:52:19 -08:00
|
|
|
hdr = multilist_sublist_head(mls);
|
2008-12-03 12:09:06 -08:00
|
|
|
else
|
2015-01-12 19:52:19 -08:00
|
|
|
hdr = multilist_sublist_tail(mls);
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
headroom = target_sz * l2arc_headroom;
|
|
|
|
if (do_headroom_boost)
|
|
|
|
headroom = (headroom * l2arc_headroom_boost) / 100;
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
for (; hdr; hdr = hdr_prev) {
|
2013-08-01 13:02:10 -07:00
|
|
|
kmutex_t *hash_lock;
|
|
|
|
uint64_t buf_sz;
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
uint64_t buf_a_sz;
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
if (arc_warm == B_FALSE)
|
2015-01-12 19:52:19 -08:00
|
|
|
hdr_prev = multilist_sublist_next(mls, hdr);
|
2008-12-03 12:09:06 -08:00
|
|
|
else
|
2015-01-12 19:52:19 -08:00
|
|
|
hdr_prev = multilist_sublist_prev(mls, hdr);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
hash_lock = HDR_LOCK(hdr);
|
2013-08-01 13:02:10 -07:00
|
|
|
if (!mutex_tryenter(hash_lock)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Skip this buffer rather than waiting.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
passed_sz += hdr->b_size;
|
2008-11-20 12:01:55 -08:00
|
|
|
if (passed_sz > headroom) {
|
|
|
|
/*
|
|
|
|
* Searched too far.
|
|
|
|
*/
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-12-06 09:24:32 -08:00
|
|
|
if (!l2arc_write_eligible(guid, hdr)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
mutex_exit(hash_lock);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
/*
|
|
|
|
* Assume that the buffer is not going to be compressed
|
|
|
|
* and could take more space on disk because of a larger
|
|
|
|
* disk block size.
|
|
|
|
*/
|
|
|
|
buf_sz = hdr->b_size;
|
|
|
|
buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
|
|
|
|
|
|
|
|
if ((write_asize + buf_a_sz) > target_sz) {
|
2008-11-20 12:01:55 -08:00
|
|
|
full = B_TRUE;
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (pio == NULL) {
|
|
|
|
/*
|
|
|
|
* Insert a dummy header on the buflist so
|
|
|
|
* l2arc_write_done() can find where the
|
|
|
|
* write buffers begin without searching.
|
|
|
|
*/
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&dev->l2ad_mtx);
|
2014-12-29 19:12:23 -08:00
|
|
|
list_insert_head(&dev->l2ad_buflist, head);
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2015-06-29 10:02:03 -07:00
|
|
|
cb = kmem_alloc(
|
|
|
|
sizeof (l2arc_write_callback_t), KM_SLEEP);
|
2008-11-20 12:01:55 -08:00
|
|
|
cb->l2wcb_dev = dev;
|
|
|
|
cb->l2wcb_head = head;
|
|
|
|
pio = zio_root(spa, l2arc_write_done, cb,
|
|
|
|
ZIO_FLAG_CANFAIL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create and add a new L2ARC header.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l2hdr.b_dev = dev;
|
2014-12-06 09:24:32 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_L2_WRITING;
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* Temporarily stash the data buffer in b_tmp_cdata.
|
|
|
|
* The subsequent write step will pick it up from
|
2014-12-29 19:12:23 -08:00
|
|
|
* there. This is because we can't access b_l1hdr.b_buf
|
2013-08-01 13:02:10 -07:00
|
|
|
* without holding the hash_lock, which we in turn
|
|
|
|
* can't access without holding the ARC list locks
|
|
|
|
* (which we want to avoid during compression/writing)
|
|
|
|
*/
|
2015-09-11 09:18:56 -07:00
|
|
|
hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF;
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l2hdr.b_asize = hdr->b_size;
|
|
|
|
hdr->b_l2hdr.b_hits = 0;
|
|
|
|
hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2015-06-16 01:12:19 +02:00
|
|
|
/*
|
|
|
|
* Explicitly set the b_daddr field to a known
|
|
|
|
* value which means "invalid address". This
|
|
|
|
* enables us to differentiate which stage of
|
|
|
|
* l2arc_write_buffers() the particular header
|
|
|
|
* is in (e.g. this loop, or the one below).
|
|
|
|
* ARC_FLAG_L2_WRITING is not enough to make
|
|
|
|
* this distinction, and we need to know in
|
|
|
|
* order to do proper l2arc vdev accounting in
|
|
|
|
* arc_release() and arc_hdr_destroy().
|
|
|
|
*
|
|
|
|
* Note, we can't use a new flag to distinguish
|
|
|
|
* the two stages because we don't hold the
|
|
|
|
* header's hash_lock below, in the second stage
|
|
|
|
* of this function. Thus, we can't simply
|
|
|
|
* change the b_flags field to denote that the
|
|
|
|
* IO has been sent. We can change the b_daddr
|
|
|
|
* field of the L2 portion, though, since we'll
|
|
|
|
* be holding the l2ad_mtx; which is why we're
|
|
|
|
* using it to denote the header's state change.
|
|
|
|
*/
|
|
|
|
hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&dev->l2ad_mtx);
|
2014-12-29 19:12:23 -08:00
|
|
|
list_insert_head(&dev->l2ad_buflist, hdr);
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Compute and store the buffer cksum before
|
|
|
|
* writing. On debug the cksum is verified first.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
arc_cksum_verify(hdr->b_l1hdr.b_buf);
|
|
|
|
arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_exit(hash_lock);
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
write_sz += buf_sz;
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
write_asize += buf_a_sz;
|
2013-08-01 13:02:10 -07:00
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
multilist_sublist_unlock(mls);
|
2013-08-01 13:02:10 -07:00
|
|
|
|
|
|
|
if (full == B_TRUE)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* No buffers selected for writing? */
|
|
|
|
if (pio == NULL) {
|
|
|
|
ASSERT0(write_sz);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(!HDR_HAS_L1HDR(head));
|
|
|
|
kmem_cache_free(hdr_l2only_cache, head);
|
2013-08-01 13:02:10 -07:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
mutex_enter(&dev->l2ad_mtx);
|
|
|
|
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
/*
|
|
|
|
* Note that elsewhere in this file arcstat_l2_asize
|
|
|
|
* and the used space on l2ad_vdev are updated using b_asize,
|
|
|
|
* which is not necessarily rounded up to the device block size.
|
|
|
|
* To keep accounting consistent we do the same here as well:
|
|
|
|
* stats_size accumulates the sum of b_asize of the written buffers,
|
|
|
|
* while write_asize accumulates the sum of b_asize rounded up
|
|
|
|
* to the device block size.
|
|
|
|
* The latter sum is used only to validate the correctness of the code.
|
|
|
|
*/
|
|
|
|
stats_size = 0;
|
|
|
|
write_asize = 0;
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* Now start writing the buffers. We're starting at the write head
|
|
|
|
* and work backwards, retracing the course of the buffer selector
|
|
|
|
* loop above.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
|
|
|
|
hdr = list_prev(&dev->l2ad_buflist, hdr)) {
|
2013-08-01 13:02:10 -07:00
|
|
|
uint64_t buf_sz;
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
/*
|
|
|
|
* We rely on the L1 portion of the header below, so
|
|
|
|
* it's invalid for this header to have been evicted out
|
|
|
|
* of the ghost cache, prior to being written out. The
|
|
|
|
* ARC_FLAG_L2_WRITING bit ensures this won't happen.
|
|
|
|
*/
|
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* We shouldn't need to lock the buffer here, since we flagged
|
2014-12-06 09:24:32 -08:00
|
|
|
* it as ARC_FLAG_L2_WRITING in the previous step, but we must
|
|
|
|
* take care to only access its L2 cache parameters. In
|
2014-12-29 19:12:23 -08:00
|
|
|
* particular, hdr->l1hdr.b_buf may be invalid by now due to
|
2014-12-06 09:24:32 -08:00
|
|
|
* ARC eviction.
|
2013-08-01 13:02:10 -07:00
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
|
|
|
|
hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
|
|
|
|
if (l2arc_compress_buf(hdr)) {
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* If compression succeeded, enable headroom
|
|
|
|
* boost on the next scan cycle.
|
|
|
|
*/
|
|
|
|
*headroom_boost = B_TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pick up the buffer data we had previously stashed away
|
|
|
|
* (and now potentially also compressed).
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
buf_data = hdr->b_l1hdr.b_tmp_cdata;
|
|
|
|
buf_sz = hdr->b_l2hdr.b_asize;
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2015-06-16 01:12:19 +02:00
|
|
|
/*
|
|
|
|
* We need to do this regardless if buf_sz is zero or
|
|
|
|
* not, otherwise, when this l2hdr is evicted we'll
|
|
|
|
* remove a reference that was never added.
|
|
|
|
*/
|
|
|
|
(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
/* Compression may have squashed the buffer to zero length. */
|
|
|
|
if (buf_sz != 0) {
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
uint64_t buf_a_sz;
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2016-02-10 10:42:01 -08:00
|
|
|
/*
|
|
|
|
* Buffers which are larger than l2arc_max_block_size
|
|
|
|
* after compression are skipped and removed from L2
|
|
|
|
* eligibility.
|
|
|
|
*/
|
|
|
|
if (buf_sz > l2arc_max_block_size) {
|
|
|
|
hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
wzio = zio_write_phys(pio, dev->l2ad_vdev,
|
|
|
|
dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
|
|
|
|
NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
|
|
|
|
ZIO_FLAG_CANFAIL, B_FALSE);
|
|
|
|
|
|
|
|
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
|
|
|
|
zio_t *, wzio);
|
|
|
|
(void) zio_nowait(wzio);
|
|
|
|
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
stats_size += buf_sz;
|
2015-06-16 01:12:19 +02:00
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Keep the clock hand suitably device-aligned.
|
|
|
|
*/
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
|
|
|
|
write_asize += buf_a_sz;
|
|
|
|
dev->l2ad_hand += buf_a_sz;
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_exit(&dev->l2ad_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
ASSERT3U(write_asize, <=, target_sz);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_writes_sent);
|
2013-08-01 13:02:10 -07:00
|
|
|
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_INCR(arcstat_l2_size, write_sz);
|
Account for ashift when gathering buffers to be written to l2arc device
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.
The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.
The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.
Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size. The latter accounts for a
possible size reduction because of the compression, whereas the
former accounts for a possible subsequent size expansion because of
the alignment requirements.
The code still assumes that either underlying storage subsystems or
hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of a disk's block size. This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.
Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size. The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).
Porting Notes:
Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.
References:
https://reviews.csiden.org/r/229/
https://reviews.freebsd.org/D2764
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451
2015-06-12 21:20:29 +02:00
|
|
|
ARCSTAT_INCR(arcstat_l2_asize, stats_size);
|
|
|
|
vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Bump device hand to the device start if it is approaching the end.
|
|
|
|
* l2arc_evict() will already have evicted ahead for this case.
|
|
|
|
*/
|
2008-12-03 12:09:06 -08:00
|
|
|
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
|
2008-11-20 12:01:55 -08:00
|
|
|
dev->l2ad_hand = dev->l2ad_start;
|
|
|
|
dev->l2ad_first = B_FALSE;
|
|
|
|
}
|
|
|
|
|
2009-02-18 12:51:31 -08:00
|
|
|
dev->l2ad_writing = B_TRUE;
|
2008-11-20 12:01:55 -08:00
|
|
|
(void) zio_wait(pio);
|
2009-02-18 12:51:31 -08:00
|
|
|
dev->l2ad_writing = B_FALSE;
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
return (write_asize);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Compresses an L2ARC buffer.
|
2014-12-29 19:12:23 -08:00
|
|
|
* The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
|
2013-08-01 13:02:10 -07:00
|
|
|
* size in l2hdr->b_asize. This routine tries to compress the data and
|
|
|
|
* depending on the compression result there are three possible outcomes:
|
|
|
|
* *) The buffer was incompressible. The original l2hdr contents were left
|
|
|
|
* untouched and are ready for writing to an L2 device.
|
|
|
|
* *) The buffer was all-zeros, so there is no need to write it to an L2
|
|
|
|
* device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
|
|
|
|
* set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
|
|
|
|
* *) Compression succeeded and b_tmp_cdata was replaced with a temporary
|
|
|
|
* data buffer which holds the compressed data to be written, and b_asize
|
|
|
|
* tells us how much data there is. b_compress is set to the appropriate
|
|
|
|
* compression algorithm. Once writing is done, invoke
|
|
|
|
* l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
|
|
|
|
*
|
|
|
|
* Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
|
|
|
|
* buffer was incompressible).
|
|
|
|
*/
|
|
|
|
static boolean_t
|
2014-12-29 19:12:23 -08:00
|
|
|
l2arc_compress_buf(arc_buf_hdr_t *hdr)
|
2013-08-01 13:02:10 -07:00
|
|
|
{
|
|
|
|
void *cdata;
|
2014-06-05 13:19:08 -08:00
|
|
|
size_t csize, len, rounded;
|
2014-12-29 19:12:23 -08:00
|
|
|
l2arc_buf_hdr_t *l2hdr;
|
2013-08-01 13:02:10 -07:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L2HDR(hdr));
|
|
|
|
|
|
|
|
l2hdr = &hdr->b_l2hdr;
|
|
|
|
|
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
2015-09-11 09:18:56 -07:00
|
|
|
ASSERT3U(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
|
2013-08-01 13:02:10 -07:00
|
|
|
|
|
|
|
len = l2hdr->b_asize;
|
|
|
|
cdata = zio_data_buf_alloc(len);
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT3P(cdata, !=, NULL);
|
|
|
|
csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
|
2013-08-01 13:02:10 -07:00
|
|
|
cdata, l2hdr->b_asize);
|
|
|
|
|
2014-06-05 13:19:08 -08:00
|
|
|
rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
|
|
|
|
if (rounded > csize) {
|
|
|
|
bzero((char *)cdata + csize, rounded - csize);
|
|
|
|
csize = rounded;
|
|
|
|
}
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
if (csize == 0) {
|
|
|
|
/* zero block, indicate that there's nothing to write */
|
|
|
|
zio_data_buf_free(cdata, len);
|
2015-09-11 09:18:56 -07:00
|
|
|
l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
|
2013-08-01 13:02:10 -07:00
|
|
|
l2hdr->b_asize = 0;
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_tmp_cdata = NULL;
|
2013-08-01 13:02:10 -07:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_compress_zeros);
|
|
|
|
return (B_TRUE);
|
|
|
|
} else if (csize > 0 && csize < len) {
|
|
|
|
/*
|
|
|
|
* Compression succeeded, we'll keep the cdata around for
|
|
|
|
* writing and release it afterwards.
|
|
|
|
*/
|
2015-09-11 09:18:56 -07:00
|
|
|
l2hdr->b_compress = ZIO_COMPRESS_LZ4;
|
2013-08-01 13:02:10 -07:00
|
|
|
l2hdr->b_asize = csize;
|
2014-12-29 19:12:23 -08:00
|
|
|
hdr->b_l1hdr.b_tmp_cdata = cdata;
|
2013-08-01 13:02:10 -07:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_compress_successes);
|
|
|
|
return (B_TRUE);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Compression failed, release the compressed buffer.
|
|
|
|
* l2hdr will be left unmodified.
|
|
|
|
*/
|
|
|
|
zio_data_buf_free(cdata, len);
|
|
|
|
ARCSTAT_BUMP(arcstat_l2_compress_failures);
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Decompresses a zio read back from an l2arc device. On success, the
|
|
|
|
* underlying zio's io_data buffer is overwritten by the uncompressed
|
|
|
|
* version. On decompression error (corrupt compressed stream), the
|
|
|
|
* zio->io_error value is set to signal an I/O error.
|
|
|
|
*
|
|
|
|
* Please note that the compressed data stream is not checksummed, so
|
|
|
|
* if the underlying device is experiencing data corruption, we may feed
|
|
|
|
* corrupt data to the decompressor, so the decompressor needs to be
|
|
|
|
* able to handle this situation (LZ4 does).
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
|
|
|
|
{
|
|
|
|
uint64_t csize;
|
|
|
|
void *cdata;
|
|
|
|
|
|
|
|
ASSERT(L2ARC_IS_VALID_COMPRESS(c));
|
|
|
|
|
|
|
|
if (zio->io_error != 0) {
|
|
|
|
/*
|
|
|
|
* An io error has occured, just restore the original io
|
|
|
|
* size in preparation for a main pool read.
|
|
|
|
*/
|
|
|
|
zio->io_orig_size = zio->io_size = hdr->b_size;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (c == ZIO_COMPRESS_EMPTY) {
|
|
|
|
/*
|
|
|
|
* An empty buffer results in a null zio, which means we
|
|
|
|
* need to fill its io_data after we're done restoring the
|
|
|
|
* buffer's contents.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_buf != NULL);
|
|
|
|
bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
|
|
|
|
zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
|
2013-08-01 13:02:10 -07:00
|
|
|
} else {
|
|
|
|
ASSERT(zio->io_data != NULL);
|
|
|
|
/*
|
|
|
|
* We copy the compressed data from the start of the arc buffer
|
|
|
|
* (the zio_read will have pulled in only what we need, the
|
|
|
|
* rest is garbage which we will overwrite at decompression)
|
|
|
|
* and then decompress back to the ARC data buffer. This way we
|
|
|
|
* can minimize copying by simply decompressing back over the
|
|
|
|
* original compressed data (rather than decompressing to an
|
|
|
|
* aux buffer and then copying back the uncompressed buffer,
|
|
|
|
* which is likely to be much larger).
|
|
|
|
*/
|
|
|
|
csize = zio->io_size;
|
|
|
|
cdata = zio_data_buf_alloc(csize);
|
|
|
|
bcopy(zio->io_data, cdata, csize);
|
|
|
|
if (zio_decompress_data(c, cdata, zio->io_data, csize,
|
|
|
|
hdr->b_size) != 0)
|
2015-06-29 10:02:03 -07:00
|
|
|
zio->io_error = EIO;
|
2013-08-01 13:02:10 -07:00
|
|
|
zio_data_buf_free(cdata, csize);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Restore the expected uncompressed IO size. */
|
|
|
|
zio->io_orig_size = zio->io_size = hdr->b_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
|
|
|
|
* This buffer serves as a temporary holder of compressed data while
|
|
|
|
* the buffer entry is being written to an l2arc device. Once that is
|
|
|
|
* done, we can dispose of it.
|
|
|
|
*/
|
|
|
|
static void
|
2014-12-06 09:24:32 -08:00
|
|
|
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
|
2013-08-01 13:02:10 -07:00
|
|
|
{
|
2015-09-11 09:18:56 -07:00
|
|
|
enum zio_compress comp;
|
2015-01-12 19:52:19 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
2015-09-11 09:18:56 -07:00
|
|
|
ASSERT(HDR_HAS_L2HDR(hdr));
|
|
|
|
comp = hdr->b_l2hdr.b_compress;
|
2015-01-12 19:52:19 -08:00
|
|
|
ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
|
|
|
|
|
|
|
|
if (comp == ZIO_COMPRESS_OFF) {
|
|
|
|
/*
|
|
|
|
* In this case, b_tmp_cdata points to the same buffer
|
|
|
|
* as the arc_buf_t's b_data field. We don't want to
|
|
|
|
* free it, since the arc_buf_t will handle that.
|
|
|
|
*/
|
|
|
|
hdr->b_l1hdr.b_tmp_cdata = NULL;
|
|
|
|
} else if (comp == ZIO_COMPRESS_EMPTY) {
|
|
|
|
/*
|
|
|
|
* In this case, b_tmp_cdata was compressed to an empty
|
|
|
|
* buffer, thus there's nothing to free and b_tmp_cdata
|
|
|
|
* should have been set to NULL in l2arc_write_buffers().
|
|
|
|
*/
|
|
|
|
ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
|
|
|
|
} else {
|
2013-08-01 13:02:10 -07:00
|
|
|
/*
|
|
|
|
* If the data was compressed, then we've allocated a
|
|
|
|
* temporary buffer for it, so now we need to release it.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
|
|
|
|
zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
|
|
|
|
hdr->b_size);
|
2015-01-12 19:52:19 -08:00
|
|
|
hdr->b_l1hdr.b_tmp_cdata = NULL;
|
2013-08-01 13:02:10 -07:00
|
|
|
}
|
2015-01-12 19:52:19 -08:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This thread feeds the L2ARC at regular intervals. This is the beating
|
|
|
|
* heart of the L2ARC.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
l2arc_feed_thread(void)
|
|
|
|
{
|
|
|
|
callb_cpr_t cpr;
|
|
|
|
l2arc_dev_t *dev;
|
|
|
|
spa_t *spa;
|
2009-02-18 12:51:31 -08:00
|
|
|
uint64_t size, wrote;
|
2010-05-28 13:45:14 -07:00
|
|
|
clock_t begin, next = ddi_get_lbolt();
|
2013-08-01 13:02:10 -07:00
|
|
|
boolean_t headroom_boost = B_FALSE;
|
2015-03-30 22:43:29 -05:00
|
|
|
fstrans_cookie_t cookie;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
|
|
|
|
|
|
|
|
mutex_enter(&l2arc_feed_thr_lock);
|
|
|
|
|
2015-03-30 22:43:29 -05:00
|
|
|
cookie = spl_fstrans_mark();
|
2008-11-20 12:01:55 -08:00
|
|
|
while (l2arc_thread_exit == 0) {
|
|
|
|
CALLB_CPR_SAFE_BEGIN(&cpr);
|
2015-06-11 10:47:19 -07:00
|
|
|
(void) cv_timedwait_sig(&l2arc_feed_thr_cv,
|
2010-12-10 12:00:00 -08:00
|
|
|
&l2arc_feed_thr_lock, next);
|
2008-11-20 12:01:55 -08:00
|
|
|
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
|
2010-05-28 13:45:14 -07:00
|
|
|
next = ddi_get_lbolt() + hz;
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
2008-12-03 12:09:06 -08:00
|
|
|
* Quick check for L2ARC devices.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
mutex_enter(&l2arc_dev_mtx);
|
|
|
|
if (l2arc_ndev == 0) {
|
|
|
|
mutex_exit(&l2arc_dev_mtx);
|
|
|
|
continue;
|
|
|
|
}
|
2008-12-03 12:09:06 -08:00
|
|
|
mutex_exit(&l2arc_dev_mtx);
|
2010-05-28 13:45:14 -07:00
|
|
|
begin = ddi_get_lbolt();
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
2008-12-03 12:09:06 -08:00
|
|
|
* This selects the next l2arc device to write to, and in
|
|
|
|
* doing so the next spa to feed from: dev->l2ad_spa. This
|
|
|
|
* will return NULL if there are now no l2arc devices or if
|
|
|
|
* they are all faulted.
|
|
|
|
*
|
|
|
|
* If a device is returned, its spa's config lock is also
|
|
|
|
* held to prevent device removal. l2arc_dev_get_next()
|
|
|
|
* will grab and release l2arc_dev_mtx.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2008-12-03 12:09:06 -08:00
|
|
|
if ((dev = l2arc_dev_get_next()) == NULL)
|
2008-11-20 12:01:55 -08:00
|
|
|
continue;
|
2008-12-03 12:09:06 -08:00
|
|
|
|
|
|
|
spa = dev->l2ad_spa;
|
|
|
|
ASSERT(spa != NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-08-26 14:24:34 -07:00
|
|
|
/*
|
|
|
|
* If the pool is read-only then force the feed thread to
|
|
|
|
* sleep a little longer.
|
|
|
|
*/
|
|
|
|
if (!spa_writeable(spa)) {
|
|
|
|
next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
|
|
|
|
spa_config_exit(spa, SCL_L2ARC, dev);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
2008-12-03 12:09:06 -08:00
|
|
|
* Avoid contributing to memory pressure.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
2015-06-26 11:28:18 -07:00
|
|
|
if (arc_reclaim_needed()) {
|
2008-12-03 12:09:06 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
|
|
|
|
spa_config_exit(spa, SCL_L2ARC, dev);
|
2008-11-20 12:01:55 -08:00
|
|
|
continue;
|
|
|
|
}
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
ARCSTAT_BUMP(arcstat_l2_feeds);
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
size = l2arc_write_size();
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Evict L2ARC buffers that will be overwritten.
|
|
|
|
*/
|
2008-12-03 12:09:06 -08:00
|
|
|
l2arc_evict(dev, size, B_FALSE);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Write ARC buffers.
|
|
|
|
*/
|
2013-08-01 13:02:10 -07:00
|
|
|
wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
|
2009-02-18 12:51:31 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate interval between writes.
|
|
|
|
*/
|
|
|
|
next = l2arc_write_interval(begin, size, wrote);
|
2008-12-03 12:09:06 -08:00
|
|
|
spa_config_exit(spa, SCL_L2ARC, dev);
|
2008-11-20 12:01:55 -08:00
|
|
|
}
|
2015-03-30 22:43:29 -05:00
|
|
|
spl_fstrans_unmark(cookie);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
l2arc_thread_exit = 0;
|
|
|
|
cv_broadcast(&l2arc_feed_thr_cv);
|
|
|
|
CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
|
|
|
|
thread_exit();
|
|
|
|
}
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
boolean_t
|
|
|
|
l2arc_vdev_present(vdev_t *vd)
|
|
|
|
{
|
|
|
|
l2arc_dev_t *dev;
|
|
|
|
|
|
|
|
mutex_enter(&l2arc_dev_mtx);
|
|
|
|
for (dev = list_head(l2arc_dev_list); dev != NULL;
|
|
|
|
dev = list_next(l2arc_dev_list, dev)) {
|
|
|
|
if (dev->l2ad_vdev == vd)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
mutex_exit(&l2arc_dev_mtx);
|
|
|
|
|
|
|
|
return (dev != NULL);
|
|
|
|
}
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Add a vdev for use by the L2ARC. By this point the spa has already
|
|
|
|
* validated the vdev and opened it.
|
|
|
|
*/
|
|
|
|
void
|
2009-07-02 15:44:48 -07:00
|
|
|
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
l2arc_dev_t *adddev;
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
ASSERT(!l2arc_vdev_present(vd));
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Create a new l2arc device entry.
|
|
|
|
*/
|
|
|
|
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
|
|
|
|
adddev->l2ad_spa = spa;
|
|
|
|
adddev->l2ad_vdev = vd;
|
2009-07-02 15:44:48 -07:00
|
|
|
adddev->l2ad_start = VDEV_LABEL_START_SIZE;
|
|
|
|
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
|
2008-11-20 12:01:55 -08:00
|
|
|
adddev->l2ad_hand = adddev->l2ad_start;
|
|
|
|
adddev->l2ad_first = B_TRUE;
|
2009-02-18 12:51:31 -08:00
|
|
|
adddev->l2ad_writing = B_FALSE;
|
2010-08-26 10:26:44 -07:00
|
|
|
list_link_init(&adddev->l2ad_node);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2014-12-29 19:12:23 -08:00
|
|
|
mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* This is a list of all ARC buffers that are still valid on the
|
|
|
|
* device.
|
|
|
|
*/
|
2014-12-29 19:12:23 -08:00
|
|
|
list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
|
|
|
|
offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
|
2015-06-16 01:12:19 +02:00
|
|
|
refcount_create(&adddev->l2ad_alloc);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Add device to global list
|
|
|
|
*/
|
|
|
|
mutex_enter(&l2arc_dev_mtx);
|
|
|
|
list_insert_head(l2arc_dev_list, adddev);
|
|
|
|
atomic_inc_64(&l2arc_ndev);
|
|
|
|
mutex_exit(&l2arc_dev_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove a vdev from the L2ARC.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
l2arc_remove_vdev(vdev_t *vd)
|
|
|
|
{
|
|
|
|
l2arc_dev_t *dev, *nextdev, *remdev = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the device by vdev
|
|
|
|
*/
|
|
|
|
mutex_enter(&l2arc_dev_mtx);
|
|
|
|
for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
|
|
|
|
nextdev = list_next(l2arc_dev_list, dev);
|
|
|
|
if (vd == dev->l2ad_vdev) {
|
|
|
|
remdev = dev;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT(remdev != NULL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove device from global list
|
|
|
|
*/
|
|
|
|
list_remove(l2arc_dev_list, remdev);
|
|
|
|
l2arc_dev_last = NULL; /* may have been invalidated */
|
2008-12-03 12:09:06 -08:00
|
|
|
atomic_dec_64(&l2arc_ndev);
|
|
|
|
mutex_exit(&l2arc_dev_mtx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear all buflists and ARC references. L2ARC device flush.
|
|
|
|
*/
|
|
|
|
l2arc_evict(remdev, 0, B_TRUE);
|
2014-12-29 19:12:23 -08:00
|
|
|
list_destroy(&remdev->l2ad_buflist);
|
|
|
|
mutex_destroy(&remdev->l2ad_mtx);
|
2015-06-16 01:12:19 +02:00
|
|
|
refcount_destroy(&remdev->l2ad_alloc);
|
2008-11-20 12:01:55 -08:00
|
|
|
kmem_free(remdev, sizeof (l2arc_dev_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2008-12-03 12:09:06 -08:00
|
|
|
l2arc_init(void)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
|
|
|
l2arc_thread_exit = 0;
|
|
|
|
l2arc_ndev = 0;
|
|
|
|
l2arc_writes_sent = 0;
|
|
|
|
l2arc_writes_done = 0;
|
|
|
|
|
|
|
|
mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
|
|
|
|
mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
|
|
|
|
l2arc_dev_list = &L2ARC_dev_list;
|
|
|
|
l2arc_free_on_write = &L2ARC_free_on_write;
|
|
|
|
list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
|
|
|
|
offsetof(l2arc_dev_t, l2ad_node));
|
|
|
|
list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
|
|
|
|
offsetof(l2arc_data_free_t, l2df_list_node));
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2008-12-03 12:09:06 -08:00
|
|
|
l2arc_fini(void)
|
2008-11-20 12:01:55 -08:00
|
|
|
{
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* This is called from dmu_fini(), which is called from spa_fini();
|
|
|
|
* Because of this, we can assume that all l2arc devices have
|
|
|
|
* already been removed when the pools themselves were removed.
|
|
|
|
*/
|
|
|
|
|
|
|
|
l2arc_do_free_on_write();
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
mutex_destroy(&l2arc_feed_thr_lock);
|
|
|
|
cv_destroy(&l2arc_feed_thr_cv);
|
|
|
|
mutex_destroy(&l2arc_dev_mtx);
|
|
|
|
mutex_destroy(&l2arc_free_on_write_mtx);
|
|
|
|
|
|
|
|
list_destroy(l2arc_dev_list);
|
|
|
|
list_destroy(l2arc_free_on_write);
|
|
|
|
}
|
2008-12-03 12:09:06 -08:00
|
|
|
|
|
|
|
void
|
|
|
|
l2arc_start(void)
|
|
|
|
{
|
2009-01-15 13:59:39 -08:00
|
|
|
if (!(spa_mode_global & FWRITE))
|
2008-12-03 12:09:06 -08:00
|
|
|
return;
|
|
|
|
|
|
|
|
(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
|
2015-07-24 10:08:31 -07:00
|
|
|
TS_RUN, defclsyspri);
|
2008-12-03 12:09:06 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
l2arc_stop(void)
|
|
|
|
{
|
2009-01-15 13:59:39 -08:00
|
|
|
if (!(spa_mode_global & FWRITE))
|
2008-12-03 12:09:06 -08:00
|
|
|
return;
|
|
|
|
|
|
|
|
mutex_enter(&l2arc_feed_thr_lock);
|
|
|
|
cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
|
|
|
|
l2arc_thread_exit = 1;
|
|
|
|
while (l2arc_thread_exit != 0)
|
|
|
|
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
|
|
|
|
mutex_exit(&l2arc_feed_thr_lock);
|
|
|
|
}
|
2010-08-26 11:49:16 -07:00
|
|
|
|
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
|
2014-11-13 10:09:05 -08:00
|
|
|
EXPORT_SYMBOL(arc_buf_size);
|
|
|
|
EXPORT_SYMBOL(arc_write);
|
2010-08-26 11:49:16 -07:00
|
|
|
EXPORT_SYMBOL(arc_read);
|
|
|
|
EXPORT_SYMBOL(arc_buf_remove_ref);
|
2013-10-02 17:11:19 -07:00
|
|
|
EXPORT_SYMBOL(arc_buf_info);
|
2010-08-26 11:49:16 -07:00
|
|
|
EXPORT_SYMBOL(arc_getbuf_func);
|
2011-12-22 12:20:43 -08:00
|
|
|
EXPORT_SYMBOL(arc_add_prune_callback);
|
|
|
|
EXPORT_SYMBOL(arc_remove_prune_callback);
|
2010-08-26 11:49:16 -07:00
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(zfs_arc_min, ulong, 0644);
|
2011-05-03 15:09:28 -07:00
|
|
|
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
|
2010-08-26 11:49:16 -07:00
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(zfs_arc_max, ulong, 0644);
|
2011-05-03 15:09:28 -07:00
|
|
|
MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
|
2010-08-26 11:49:16 -07:00
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(zfs_arc_meta_limit, ulong, 0644);
|
2010-08-26 11:49:16 -07:00
|
|
|
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
|
2011-03-30 18:59:17 -07:00
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
module_param(zfs_arc_meta_min, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(zfs_arc_meta_prune, int, 0644);
|
2015-03-17 15:07:47 -07:00
|
|
|
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
|
2011-05-03 15:09:28 -07:00
|
|
|
|
2015-06-26 11:28:18 -07:00
|
|
|
module_param(zfs_arc_meta_adjust_restarts, int, 0644);
|
2015-03-17 15:08:22 -07:00
|
|
|
MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
|
|
|
|
"Limit number of restarts in arc_adjust_meta");
|
|
|
|
|
2015-05-30 09:57:53 -05:00
|
|
|
module_param(zfs_arc_meta_strategy, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(zfs_arc_grow_retry, int, 0644);
|
2011-05-03 15:09:28 -07:00
|
|
|
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
|
|
|
|
|
Disable aggressive arc_p growth by default
For specific workloads consisting mainly of mfu data and new anon data
buffers, the aggressive growth of arc_p found in the arc_get_data_buf()
function can have detrimental effects on the mfu list size and ghost
list hit rate.
Running a workload consisting of two processes:
* Process 1 is creating many small files
* Process 2 is tar'ing a directory consisting of many small files
I've seen arc_p and the mru grow to their maximum size, while the mru
ghost list receives 100K times fewer hits than the mfu ghost list.
Ideally, as the mfu ghost list receives hits, arc_p should be driven
down and the size of the mfu should increase. Given the specific
workload I was testing with, the mfu list size should grow to a point
where almost no mfu ghost list hits would occur. Unfortunately, this
does not happen because the newly dirtied anon buffers constantly drive
arc_p to its maximum value and keep it there (effectively prioritizing
the mru list and starving the mfu list down to a negligible size).
The logic to increment arc_p from within the arc_get_data_buf() function
was introduced many years ago in this upstream commit:
commit 641fbdae3a027d12b3c3dcd18927ccafae6d58bc
Author: maybee <none@none>
Date: Wed Dec 20 15:46:12 2006 -0800
6505658 target MRU size (arc.p) needs to be adjusted more aggressively
and since I don't fully understand the motivation for the change, I am
reluctant to completely remove it.
As a way to test out how its removal might affect performance, I've
disabled that code by default, but left it tunable via a module option.
Thus, if its removal is found to be grossly detrimental for certain
workloads, it can be re-enabled on the fly, without a code change.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #2110
2013-12-11 09:40:13 -08:00
|
|
|
module_param(zfs_arc_p_aggressive_disable, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");
|
|
|
|
|
2014-01-03 10:36:26 -08:00
|
|
|
module_param(zfs_arc_p_dampener_disable, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(zfs_arc_shrink_shift, int, 0644);
|
2011-05-03 15:09:28 -07:00
|
|
|
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
|
|
|
|
|
2015-06-26 15:59:23 -07:00
|
|
|
module_param(zfs_arc_p_min_shift, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
|
|
|
|
|
2013-02-01 09:18:45 -08:00
|
|
|
module_param(zfs_disable_dup_eviction, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
|
|
|
|
|
2014-08-20 10:09:40 -07:00
|
|
|
module_param(zfs_arc_average_blocksize, int, 0444);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
|
|
|
|
|
2015-01-12 19:52:19 -08:00
|
|
|
module_param(zfs_arc_num_sublists_per_state, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_num_sublists_per_state,
|
|
|
|
"Number of sublists used in each of the ARC state lists");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(l2arc_write_max, ulong, 0644);
|
2011-07-08 12:41:57 -07:00
|
|
|
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(l2arc_write_boost, ulong, 0644);
|
2011-07-08 12:41:57 -07:00
|
|
|
MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(l2arc_headroom, ulong, 0644);
|
2011-07-08 12:41:57 -07:00
|
|
|
MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
module_param(l2arc_headroom_boost, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
|
|
|
|
|
2016-02-10 10:42:01 -08:00
|
|
|
module_param(l2arc_max_block_size, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(l2arc_max_block_size, "Skip L2ARC buffers larger than N");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(l2arc_feed_secs, ulong, 0644);
|
2011-07-08 12:41:57 -07:00
|
|
|
MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(l2arc_feed_min_ms, ulong, 0644);
|
2011-07-08 12:41:57 -07:00
|
|
|
MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(l2arc_noprefetch, int, 0644);
|
2011-07-08 12:41:57 -07:00
|
|
|
MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
|
|
|
|
|
2013-08-01 13:02:10 -07:00
|
|
|
module_param(l2arc_nocompress, int, 0644);
|
|
|
|
MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(l2arc_feed_again, int, 0644);
|
2011-07-08 12:41:57 -07:00
|
|
|
MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
|
|
|
|
|
2013-07-24 10:14:11 -07:00
|
|
|
module_param(l2arc_norw, int, 0644);
|
2011-07-08 12:41:57 -07:00
|
|
|
MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
|
|
|
|
|
2015-07-28 11:30:00 -07:00
|
|
|
module_param(zfs_arc_lotsfree_percent, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_lotsfree_percent,
|
|
|
|
"System free memory I/O throttle in bytes");
|
|
|
|
|
2015-07-27 13:17:32 -07:00
|
|
|
module_param(zfs_arc_sys_free, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_sys_free, "System free memory target size in bytes");
|
|
|
|
|
2016-07-13 07:42:40 -05:00
|
|
|
module_param(zfs_arc_dnode_limit, ulong, 0644);
|
|
|
|
/*
 * NOTE(review): the parameter is named as a limit and its companion
 * zfs_arc_dnode_reduce_percent refers to "excess dnodes", so it is
 * described as an upper bound -- confirm against the code that reads it.
 */
MODULE_PARM_DESC(zfs_arc_dnode_limit, "Maximum bytes of dnodes in arc");
|
|
|
|
|
|
|
|
module_param(zfs_arc_dnode_reduce_percent, ulong, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_arc_dnode_reduce_percent,
|
|
|
|
"Percentage of excess dnodes to try to unpin");
|
|
|
|
|
2010-08-26 11:49:16 -07:00
|
|
|
#endif
|