This originated from ZFS On Linux, as

d4a72f2386

During scans (scrubs or resilvers), it sorts the blocks in each transaction
group by block offset; the result can be a significant improvement. (On my
test system just now, which I put some effort to introduce fragmentation into
the pool since I set it up yesterday, a scrub went from 1h2m to 33.5m with the
changes.) I've seen similar rations on production systems.

Approved by:	Alexander Motin
Obtained from:	ZFS On Linux
Relnotes:	Yes (improved scrub performance, with tunables)
Differential Revision:	https://reviews.freebsd.org/D15562
This commit is contained in:
Sean Eric Fagan 2018-06-08 17:38:28 +00:00
parent 3db28e6656
commit 69724399c4
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=334844
39 changed files with 3119 additions and 776 deletions

View File

@ -2281,14 +2281,14 @@ dump_dir(objset_t *os)
object_count++;
}
ASSERT3U(object_count, ==, usedobjs);
(void) printf("\n");
if (error != ESRCH) {
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
abort();
}
ASSERT3U(object_count, ==, usedobjs);
}
static void
@ -2788,6 +2788,7 @@ zdb_blkptr_done(zio_t *zio)
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
spa->spa_load_verify_ios--;
cv_broadcast(&spa->spa_scrub_io_cv);
if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@ -2859,9 +2860,10 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
flags |= ZIO_FLAG_SPECULATIVE;
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight > max_inflight)
while (spa->spa_load_verify_ios > max_inflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
spa->spa_load_verify_ios++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(NULL, spa, bp, abd, size,

View File

@ -1643,7 +1643,7 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &c);
if (ps && ps->pss_state == DSS_SCANNING &&
if (ps != NULL && ps->pss_state == DSS_SCANNING &&
vs->vs_scan_processed != 0 && children == 0) {
(void) printf(gettext(" (%s)"),
(ps->pss_func == POOL_SCAN_RESILVER) ?
@ -4254,11 +4254,13 @@ static void
print_scan_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
uint64_t elapsed, mins_left, hours_left;
uint64_t pass_exam, examined, total;
uint_t rate;
uint64_t total_secs_left;
uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
uint64_t pass_scanned, scanned, pass_issued, issued, total;
uint_t scan_rate, issue_rate;
double fraction_done;
char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
char srate_buf[7], irate_buf[7];
(void) printf(gettext(" scan: "));
@ -4272,30 +4274,37 @@ print_scan_status(pool_scan_stat_t *ps)
start = ps->pss_start_time;
end = ps->pss_end_time;
pause = ps->pss_pass_scrub_pause;
zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
assert(ps->pss_func == POOL_SCAN_SCRUB ||
ps->pss_func == POOL_SCAN_RESILVER);
/*
* Scan is finished or canceled.
*/
if (ps->pss_state == DSS_FINISHED) {
uint64_t minutes_taken = (end - start) / 60;
char *fmt = NULL;
/* Scan is finished or canceled. */
if (ps->pss_state == DSS_FINISHED) {
total_secs_left = end - start;
days_left = total_secs_left / 60 / 60 / 24;
hours_left = (total_secs_left / 60 / 60) % 24;
mins_left = (total_secs_left / 60) % 60;
secs_left = (total_secs_left % 60);
if (ps->pss_func == POOL_SCAN_SCRUB) {
fmt = gettext("scrub repaired %s in %lluh%um with "
"%llu errors on %s");
(void) printf(gettext("scrub repaired %s "
"in %llu days %02llu:%02llu:%02llu "
"with %llu errors on %s"), processed_buf,
(u_longlong_t)days_left, (u_longlong_t)hours_left,
(u_longlong_t)mins_left, (u_longlong_t)secs_left,
(u_longlong_t)ps->pss_errors, ctime(&end));
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
fmt = gettext("resilvered %s in %lluh%um with "
"%llu errors on %s");
(void) printf(gettext("resilvered %s "
"in %llu days %02llu:%02llu:%02llu "
"with %llu errors on %s"), processed_buf,
(u_longlong_t)days_left, (u_longlong_t)hours_left,
(u_longlong_t)mins_left, (u_longlong_t)secs_left,
(u_longlong_t)ps->pss_errors, ctime(&end));
}
/* LINTED */
(void) printf(fmt, processed_buf,
(u_longlong_t)(minutes_taken / 60),
(uint_t)(minutes_taken % 60),
(u_longlong_t)ps->pss_errors,
ctime((time_t *)&end));
return;
} else if (ps->pss_state == DSS_CANCELED) {
if (ps->pss_func == POOL_SCAN_SCRUB) {
@ -4310,19 +4319,15 @@ print_scan_status(pool_scan_stat_t *ps)
assert(ps->pss_state == DSS_SCANNING);
/*
* Scan is in progress.
*/
/* Scan is in progress. Resilvers can't be paused. */
if (ps->pss_func == POOL_SCAN_SCRUB) {
if (pause == 0) {
(void) printf(gettext("scrub in progress since %s"),
ctime(&start));
} else {
char buf[32];
struct tm *p = localtime(&pause);
(void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p);
(void) printf(gettext("scrub paused since %s\n"), buf);
(void) printf(gettext("\tscrub started on %s"),
(void) printf(gettext("scrub paused since %s"),
ctime(&pause));
(void) printf(gettext("\tscrub started on %s"),
ctime(&start));
}
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
@ -4330,50 +4335,68 @@ print_scan_status(pool_scan_stat_t *ps)
ctime(&start));
}
examined = ps->pss_examined ? ps->pss_examined : 1;
scanned = ps->pss_examined;
pass_scanned = ps->pss_pass_exam;
issued = ps->pss_issued;
pass_issued = ps->pss_pass_issued;
total = ps->pss_to_examine;
fraction_done = (double)examined / total;
/* elapsed time for this pass */
/* we are only done with a block once we have issued the IO for it */
fraction_done = (double)issued / total;
/* elapsed time for this pass, rounding up to 1 if it's 0 */
elapsed = time(NULL) - ps->pss_pass_start;
elapsed -= ps->pss_pass_scrub_spent_paused;
elapsed = elapsed ? elapsed : 1;
pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
rate = pass_exam / elapsed;
rate = rate ? rate : 1;
mins_left = ((total - examined) / rate) / 60;
hours_left = mins_left / 60;
elapsed = (elapsed != 0) ? elapsed : 1;
zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
scan_rate = pass_scanned / elapsed;
issue_rate = pass_issued / elapsed;
total_secs_left = (issue_rate != 0) ?
((total - issued) / issue_rate) : UINT64_MAX;
days_left = total_secs_left / 60 / 60 / 24;
hours_left = (total_secs_left / 60 / 60) % 24;
mins_left = (total_secs_left / 60) % 60;
secs_left = (total_secs_left % 60);
/* format all of the numbers we will be reporting */
zfs_nicenum(scanned, scanned_buf, sizeof (scanned_buf));
zfs_nicenum(issued, issued_buf, sizeof (issued_buf));
zfs_nicenum(total, total_buf, sizeof (total_buf));
zfs_nicenum(scan_rate, srate_buf, sizeof (srate_buf));
zfs_nicenum(issue_rate, irate_buf, sizeof (irate_buf));
/*
* do not print estimated time if hours_left is more than 30 days
* or we have a paused scrub
*/
/* doo not print estimated time if we have a paused scrub */
if (pause == 0) {
zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
(void) printf(gettext("\t%s scanned out of %s at %s/s"),
examined_buf, total_buf, rate_buf);
if (hours_left < (30 * 24)) {
(void) printf(gettext(", %lluh%um to go\n"),
(u_longlong_t)hours_left, (uint_t)(mins_left % 60));
} else {
(void) printf(gettext(
", (scan is slow, no estimated time)\n"));
}
(void) printf(gettext("\t%s scanned at %s/s, "
"%s issued at %s/s, %s total\n"),
scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
} else {
(void) printf(gettext("\t%s scanned out of %s\n"),
examined_buf, total_buf);
(void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
scanned_buf, issued_buf, total_buf);
}
if (ps->pss_func == POOL_SCAN_RESILVER) {
(void) printf(gettext(" %s resilvered, %.2f%% done\n"),
(void) printf(gettext("\t%s resilvered, %.2f%% done"),
processed_buf, 100 * fraction_done);
} else if (ps->pss_func == POOL_SCAN_SCRUB) {
(void) printf(gettext(" %s repaired, %.2f%% done\n"),
(void) printf(gettext("\t%s repaired, %.2f%% done"),
processed_buf, 100 * fraction_done);
}
if (pause == 0) {
if (issue_rate >= 10 * 1024 * 1024) {
(void) printf(gettext(", %llu days "
"%02llu:%02llu:%02llu to go\n"),
(u_longlong_t)days_left, (u_longlong_t)hours_left,
(u_longlong_t)mins_left, (u_longlong_t)secs_left);
} else {
(void) printf(gettext(", no estimated "
"completion time\n"));
}
} else {
(void) printf(gettext("\n"));
}
}
/*

View File

@ -374,15 +374,15 @@ ztest_info_t ztest_info[] = {
{ ztest_fzap, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
{ ztest_spa_create_destroy, 1, &zopt_sometimes },
{ ztest_fault_inject, 1, &zopt_sometimes },
{ ztest_fault_inject, 1, &zopt_incessant },
{ ztest_ddt_repair, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
{ ztest_reguid, 1, &zopt_rarely },
{ ztest_spa_rename, 1, &zopt_rarely },
{ ztest_scrub, 1, &zopt_rarely },
{ ztest_scrub, 1, &zopt_often },
{ ztest_spa_upgrade, 1, &zopt_rarely },
{ ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
{ ztest_vdev_attach_detach, 1, &zopt_sometimes },
{ ztest_vdev_attach_detach, 1, &zopt_incessant },
{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
{ ztest_vdev_add_remove, 1,
&ztest_opts.zo_vdevtime },

View File

@ -219,7 +219,7 @@ check_status(nvlist_t *config, boolean_t isimport)
*/
(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &psc);
if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER &&
ps->pss_state == DSS_SCANNING)
return (ZPOOL_STATUS_RESILVERING);

View File

@ -408,6 +408,7 @@ typedef struct taskq_ent {
#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
#define TQ_FRONT 0x08 /* Queue in front */
#define TASKQID_INVALID ((taskqid_t)0)
extern taskq_t *system_taskq;
@ -421,6 +422,7 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t);
extern int taskq_member(taskq_t *, void *);
extern void system_taskq_init(void);
extern void system_taskq_fini(void);

View File

@ -187,6 +187,12 @@ taskq_wait(taskq_t *tq)
mutex_exit(&tq->tq_lock);
}
void
taskq_wait_id(taskq_t *tq, taskqid_t id)
{
taskq_wait(tq);
}
static void *
taskq_thread(void *arg)
{

View File

@ -173,3 +173,9 @@ taskq_wait(taskq_t *tq)
{
taskqueue_drain_all(tq->tq_queue);
}
void
taskq_wait_id(taskq_t *tq, taskqid_t id)
{
taskq_wait(tq);
}

View File

@ -339,7 +339,8 @@ int arc_no_grow_shift = 5;
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
static int arc_min_prefetch_lifespan;
static int zfs_arc_min_prefetch_ms = 1;
static int zfs_arc_min_prescient_prefetch_ms = 6;
/*
* If this percent of memory is free, don't throttle.
@ -779,8 +780,9 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_sync_wait_for_async;
kstat_named_t arcstat_async_upgrade_sync;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
kstat_named_t arcstat_demand_hit_prescient_prefetch;
} arc_stats_t;
static arc_stats_t arc_stats = {
@ -877,8 +879,9 @@ static arc_stats_t arc_stats = {
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 },
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
{ "async_upgrade_sync", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@ -974,22 +977,23 @@ typedef struct arc_callback arc_callback_t;
struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
arc_read_done_func_t *acb_done;
arc_buf_t *acb_buf;
boolean_t acb_compressed;
zio_t *acb_zio_dummy;
zio_t *acb_zio_head;
arc_callback_t *acb_next;
};
typedef struct arc_write_callback arc_write_callback_t;
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
arc_done_func_t *awcb_children_ready;
arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
void *awcb_private;
arc_write_done_func_t *awcb_ready;
arc_write_done_func_t *awcb_children_ready;
arc_write_done_func_t *awcb_physdone;
arc_write_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
/*
@ -1229,6 +1233,8 @@ sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_PRESCIENT_PREFETCH(hdr) \
((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define HDR_COMPRESSION_ENABLED(hdr) \
((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
@ -1392,6 +1398,11 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
&ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prfetch_ms, CTLFLAG_RW,
&zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
&zfs_arc_min_prescient_prefetch_ms, 0, "Min life oof prescient prefetched block in ms");
/*
* L2ARC Internals
*/
@ -3544,6 +3555,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr));
@ -3596,8 +3609,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(hdr) ||
((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
arc_min_prefetch_lifespan)) {
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
ARCSTAT_BUMP(arcstat_evict_skip);
return (bytes_evicted);
}
@ -4968,13 +4980,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* - move the buffer to the head of the list if this is
* another prefetch (to make it less likely to be evicted).
*/
if (HDR_PREFETCH(hdr)) {
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
/* link protected by hash lock */
ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH);
ARCSTAT_BUMP(arcstat_mru_hits);
}
hdr->b_l1hdr.b_arc_access = now;
@ -5005,10 +5019,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
if (HDR_PREFETCH(hdr)) {
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
new_state = arc_mru;
if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH);
}
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
new_state = arc_mfu;
@ -5029,11 +5046,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If it was a prefetch, we will explicitly move it to
* the head of the list now.
*/
if ((HDR_PREFETCH(hdr)) != 0) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
/* link protected by hash_lock */
ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
@ -5044,12 +5057,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
if (HDR_PREFETCH(hdr)) {
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
/*
* This is a prefetch access...
* move this block back to the MRU state.
*/
ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
new_state = arc_mru;
}
@ -5116,23 +5128,28 @@ arc_buf_access(arc_buf_t *buf)
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
}
/* a generic arc_done_func_t which you can use */
/* a generic arc_read_done_func_t which you can use */
/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *arg)
{
if (zio == NULL || zio->io_error == 0)
bcopy(buf->b_data, arg, arc_buf_size(buf));
if (buf == NULL)
return;
bcopy(buf->b_data, arg, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
/* a generic arc_done_func_t */
/* a generic arc_read_done_func_t */
/* ARGSUSED */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
if (zio && zio->io_error) {
arc_buf_destroy(buf, arg);
if (buf == NULL) {
*bufp = NULL;
} else {
*bufp = buf;
@ -5164,7 +5181,6 @@ arc_read_done(zio_t *zio)
arc_callback_t *callback_list;
arc_callback_t *acb;
boolean_t freeable = B_FALSE;
boolean_t no_zio_error = (zio->io_error == 0);
/*
* The hdr was inserted into hash-table and removed from lists
@ -5190,7 +5206,7 @@ arc_read_done(zio_t *zio)
ASSERT3P(hash_lock, !=, NULL);
}
if (no_zio_error) {
if (zio->io_error == 0) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
if (BP_GET_LEVEL(zio->io_bp) > 0) {
@ -5211,7 +5227,8 @@ arc_read_done(zio_t *zio)
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
if (hash_lock && zio->io_error == 0 &&
hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@ -5232,14 +5249,21 @@ arc_read_done(zio_t *zio)
if (!acb->acb_done)
continue;
/* This is a demand read since prefetches don't use callbacks */
callback_cnt++;
if (zio->io_error != 0)
continue;
int error = arc_buf_alloc_impl(hdr, acb->acb_private,
acb->acb_compressed, no_zio_error, &acb->acb_buf);
if (no_zio_error) {
zio->io_error = error;
acb->acb_compressed,
B_TRUE, &acb->acb_buf);
if (error != 0) {
arc_buf_destroy(acb->acb_buf, acb->acb_private);
acb->acb_buf = NULL;
}
if (zio->io_error == 0)
zio->io_error = error;
}
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
@ -5252,7 +5276,7 @@ arc_read_done(zio_t *zio)
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
callback_list != NULL);
if (no_zio_error) {
if (zio->io_error == 0) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@ -5285,8 +5309,10 @@ arc_read_done(zio_t *zio)
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
if (acb->acb_done)
acb->acb_done(zio, acb->acb_buf, acb->acb_private);
if (acb->acb_done) {
acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
acb->acb_buf, acb->acb_private);
}
if (acb->acb_zio_dummy != NULL) {
acb->acb_zio_dummy->io_error = zio->io_error;
@ -5320,7 +5346,7 @@ arc_read_done(zio_t *zio)
* for readers of this block.
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags,
arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
{
@ -5329,7 +5355,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
int rc = 0;
ASSERT(!BP_IS_EMBEDDED(bp) ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
@ -5347,32 +5374,20 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
*arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
ASSERT3P(head_zio, !=, NULL);
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
* This sync read must wait for an
* in-progress async read (e.g. a predictive
* prefetch). Async reads are queued
* separately at the vdev_queue layer, so
* this is a form of priority inversion.
* Ideally, we would "inherit" the demand
* i/o's priority by moving the i/o from
* the async queue to the synchronous queue,
* but there is currently no mechanism to do
* so. Track this so that we can evaluate
* the magnitude of this potential performance
* problem.
*
* Note that if the prefetch i/o is already
* active (has been issued to the device),
* the prefetch improved performance, because
* we issued it sooner than we would have
* without the prefetch.
* This is a sync read that needs to wait for
* an in-flight async read. Request that the
* zio have its priority upgraded.
*/
DTRACE_PROBE1(arc__sync__wait__for__async,
zio_change_priority(head_zio, priority);
DTRACE_PROBE1(arc__async__upgrade__sync,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
ARCSTAT_BUMP(arcstat_async_upgrade_sync);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
arc_hdr_clear_flags(hdr,
@ -5399,6 +5414,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
spa, NULL, NULL, NULL, zio_flags);
ASSERT3P(acb->acb_done, !=, NULL);
acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
@ -5426,17 +5442,32 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREDICTIVE_PREFETCH);
}
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
ARCSTAT_BUMP(
arcstat_demand_hit_prescient_prefetch);
arc_hdr_clear_flags(hdr,
ARC_FLAG_PRESCIENT_PREFETCH);
}
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
/* Get a buf with the desired data in it. */
VERIFY0(arc_buf_alloc_impl(hdr, private,
compressed_read, B_TRUE, &buf));
rc = arc_buf_alloc_impl(hdr, private,
compressed_read, B_TRUE, &buf);
if (rc != 0) {
arc_buf_destroy(buf, private);
buf = NULL;
}
ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
rc == 0 || rc != ENOENT);
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
@ -5446,7 +5477,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
data, metadata, hits);
if (done)
done(NULL, buf, private);
done(NULL, zb, bp, buf, private);
} else {
uint64_t lsize = BP_GET_LSIZE(bp);
uint64_t psize = BP_GET_PSIZE(bp);
@ -5520,6 +5551,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
if (*arc_flags & ARC_FLAG_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
if (BP_GET_LEVEL(bp) > 0)
@ -5549,14 +5583,17 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
vd = NULL;
}
if (priority == ZIO_PRIORITY_ASYNC_READ)
/*
* We count both async reads and scrub IOs as asynchronous so
* that both can be upgraded in the event of a cache hit while
* the read IO is still in-flight.
*/
if (priority == ZIO_PRIORITY_ASYNC_READ ||
priority == ZIO_PRIORITY_SCRUB)
arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
else
arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
if (hash_lock != NULL)
mutex_exit(hash_lock);
/*
* At this point, we have a level 1 cache miss. Try again in
* L2ARC if possible.
@ -5637,6 +5674,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
acb->acb_zio_head = rzio;
if (hash_lock != NULL)
mutex_exit(hash_lock);
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
ARCSTAT_INCR(arcstat_l2_read_bytes, size);
@ -5651,6 +5693,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
return (0);
/* l2arc read error; goto zio_read() */
if (hash_lock != NULL)
mutex_enter(hash_lock);
} else {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
@ -5671,6 +5715,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb);
acb->acb_zio_head = rzio;
if (hash_lock != NULL)
mutex_exit(hash_lock);
if (*arc_flags & ARC_FLAG_WAIT)
return (zio_wait(rzio));
@ -6162,9 +6210,9 @@ arc_write_done(zio_t *zio)
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
arc_done_func_t *children_ready, arc_done_func_t *physdone,
arc_done_func_t *done, void *private, zio_priority_t priority,
boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
arc_write_done_func_t *done, void *private, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
@ -6591,9 +6639,6 @@ arc_init(void)
mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
arc_c_min = MAX(allmem / 32, arc_abs_min);
/* set max to 5/8 of all memory, or all but 1GB, whichever is more */

View File

@ -902,7 +902,8 @@ dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
}
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
@ -916,19 +917,22 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db.db_data == NULL);
if (db->db_level == 0 && db->db_freed_in_flight) {
/* we were freed in flight; disregard any error */
if (buf == NULL) {
buf = arc_alloc_buf(db->db_objset->os_spa,
db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
}
arc_release(buf, db);
bzero(buf->b_data, db->db.db_size);
arc_buf_freeze(buf);
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else if (zio == NULL || zio->io_error == 0) {
} else if (buf != NULL) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
arc_buf_destroy(buf, db);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
@ -2326,7 +2330,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
* prefetch if the next block down is our target.
*/
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
@ -2365,13 +2370,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
dbuf_rele(db, FTAG);
}
if (abuf == NULL) {
kmem_free(dpa, sizeof(*dpa));
return;
}
dpa->dpa_curlevel--;
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
if (BP_IS_HOLE(bp)) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
@ -3746,7 +3756,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
* ready callback so that we can properly handle an indirect
* block that only contains holes.
*/
arc_done_func_t *children_ready_cb = NULL;
arc_write_done_func_t *children_ready_cb = NULL;
if (db->db_level != 0)
children_ready_cb = dbuf_write_children_ready;

View File

@ -1112,14 +1112,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
void
ddt_sync(spa_t *spa, uint64_t txg)
{
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
dmu_tx_t *tx;
zio_t *rio = zio_root(spa, NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
zio_t *rio;
ASSERT(spa_syncing_txg(spa) == txg);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
rio = zio_root(spa, NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
/*
* This function may cause an immediate scan of ddt blocks (see
* the comment above dsl_scan_ddt() for details). We set the
* scan's root zio here so that we can wait for any scan IOs in
* addition to the regular ddt IOs.
*/
ASSERT3P(scn->scn_zio_root, ==, NULL);
scn->scn_zio_root = rio;
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (ddt == NULL)
@ -1129,6 +1141,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
}
(void) zio_wait(rio);
scn->scn_zio_root = NULL;
dmu_tx_commit(tx);
}

View File

@ -349,6 +349,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
#if 0
/*
* The $ORIGIN dataset (if it exists) doesn't have an associated
* objset, so there's no reason to open it. The $ORIGIN dataset
@ -359,6 +360,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT3P(ds->ds_dir, !=,
spa_get_dsl(spa)->dp_origin_snap->ds_dir);
}
#endif
os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
os->os_dsl_dataset = ds;

View File

@ -499,8 +499,9 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
prefetch_data_t *pfd = arg;
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH;
ASSERT(pfd->pd_bytes_fetched >= 0);
if (bp == NULL)
return (0);

File diff suppressed because it is too large Load Diff

View File

@ -1119,85 +1119,6 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
return (0);
}
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
*/
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT(msp->ms_allocatable == NULL);
avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
/*
* Destroy the block allocator specific components.
*/
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_allocatable, ==, rt);
ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));
avl_destroy(&msp->ms_allocatable_by_size);
}
static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_allocatable, ==, rt);
VERIFY(!msp->ms_condensing);
avl_add(&msp->ms_allocatable_by_size, rs);
}
static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_allocatable, ==, rt);
VERIFY(!msp->ms_condensing);
avl_remove(&msp->ms_allocatable_by_size, rs);
}
static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
metaslab_t *msp = arg;
ASSERT3P(rt->rt_arg, ==, msp);
ASSERT3P(msp->ms_allocatable, ==, rt);
/*
* Normally one would walk the tree freeing nodes along the way.
* Since the nodes are shared with the range trees we can avoid
* walking all nodes and just reinitialize the avl tree. The nodes
* will be freed by the range tree, so we don't want to free them here.
*/
avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
static range_tree_ops_t metaslab_rt_ops = {
metaslab_rt_create,
metaslab_rt_destroy,
metaslab_rt_add,
metaslab_rt_remove,
metaslab_rt_vacate
};
/*
* ==========================================================================
* Common allocator routines
@ -1574,7 +1495,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
metaslab_rangesize_compare, 0);
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms);

View File

@ -33,8 +33,58 @@
#include <sys/zio.h>
#include <sys/range_tree.h>
/*
* Range trees are tree-based data structures that can be used to
* track free space or generally any space allocation information.
* A range tree keeps track of individual segments and automatically
* provides facilities such as adjacent extent merging and extent
* splitting in response to range add/remove requests.
*
* A range tree starts out completely empty, with no segments in it.
* Adding an allocation via range_tree_add to the range tree can either:
* 1) create a new extent
* 2) extend an adjacent extent
* 3) merge two adjacent extents
* Conversely, removing an allocation via range_tree_remove can:
* 1) completely remove an extent
* 2) shorten an extent (if the allocation was near one of its ends)
* 3) split an extent into two extents, in effect punching a hole
*
* A range tree is also capable of 'bridging' gaps when adding
* allocations. This is useful for cases when close proximity of
* allocations is an important detail that needs to be represented
* in the range tree. See range_tree_set_gap(). The default behavior
* is not to bridge gaps (i.e. the maximum allowed gap size is 0).
*
* In order to traverse a range tree, use either the range_tree_walk()
* or range_tree_vacate() functions.
*
* To obtain more accurate information on individual segment
* operations that the range tree performs "under the hood", you can
* specify a set of callbacks by passing a range_tree_ops_t structure
* to the range_tree_create function. Any callbacks that are non-NULL
* are then called at the appropriate times.
*
* The range tree code also supports a special variant of range trees
* that can bridge small gaps between segments. This kind of tree is used
* by the dsl scanning code to group I/Os into mostly sequential chunks to
* optimize disk performance. The code here attempts to do this with as
* little memory and computational overhead as possible. One limitation of
* this implementation is that segments of range trees with gaps can only
* support removing complete segments.
*/
kmem_cache_t *range_seg_cache;
/* Generic ops for managing an AVL tree alongside a range tree */
struct range_tree_ops rt_avl_ops = {
.rtop_create = rt_avl_create,
.rtop_destroy = rt_avl_destroy,
.rtop_add = rt_avl_add,
.rtop_remove = rt_avl_remove,
.rtop_vacate = rt_avl_vacate,
};
void
range_tree_init(void)
{
@ -109,47 +159,47 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
const range_seg_t *r1 = x1;
const range_seg_t *r2 = x2;
const range_seg_t *r1 = (const range_seg_t *)x1;
const range_seg_t *r2 = (const range_seg_t *)x2;
if (r1->rs_start < r2->rs_start) {
if (r1->rs_end > r2->rs_start)
return (0);
return (-1);
}
if (r1->rs_start > r2->rs_start) {
if (r1->rs_start < r2->rs_end)
return (0);
return (1);
}
return (0);
ASSERT3U(r1->rs_start, <=, r1->rs_end);
ASSERT3U(r2->rs_start, <=, r2->rs_end);
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg)
range_tree_create_impl(range_tree_ops_t *ops, void *arg,
int (*avl_compare) (const void *, const void *), uint64_t gap)
{
range_tree_t *rt;
rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
avl_create(&rt->rt_root, range_tree_seg_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
rt->rt_ops = ops;
rt->rt_arg = arg;
rt->rt_gap = gap;
rt->rt_avl_compare = avl_compare;
if (rt->rt_ops != NULL)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
rt->rt_ops->rtop_create(rt, rt->rt_arg);
return (rt);
}
range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg)
{
return (range_tree_create_impl(ops, arg, NULL, 0));
}
void
range_tree_destroy(range_tree_t *rt)
{
VERIFY0(rt->rt_space);
if (rt->rt_ops != NULL)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
avl_destroy(&rt->rt_root);
@ -157,39 +207,99 @@ range_tree_destroy(range_tree_t *rt)
}
void
range_tree_add(void *arg, uint64_t start, uint64_t size)
range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
{
ASSERT3U(rs->rs_fill + delta, !=, 0);
ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
rs->rs_fill += delta;
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}
static void
range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs_before, *rs_after, *rs;
uint64_t end = start + size;
uint64_t end = start + size, gap = rt->rt_gap;
uint64_t bridge_size = 0;
boolean_t merge_before, merge_after;
VERIFY(size != 0);
ASSERT3U(size, !=, 0);
ASSERT3U(fill, <=, size);
rsearch.rs_start = start;
rsearch.rs_end = end;
rs = avl_find(&rt->rt_root, &rsearch, &where);
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
if (gap == 0 && rs != NULL &&
rs->rs_start <= start && rs->rs_end >= end) {
zfs_panic_recover("zfs: allocating allocated segment"
"(offset=%llu size=%llu)\n",
(longlong_t)start, (longlong_t)size);
"(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
(longlong_t)start, (longlong_t)size,
(longlong_t)rs->rs_start,
(longlong_t)rs->rs_end - rs->rs_start);
return;
}
/* Make sure we don't overlap with either of our neighbors */
VERIFY(rs == NULL);
/*
* If this is a gap-supporting range tree, it is possible that we
* are inserting into an existing segment. In this case simply
* bump the fill count and call the remove / add callbacks. If the
* new range will extend an existing segment, we remove the
* existing one, apply the new extent to it and re-insert it using
* the normal code paths.
*/
if (rs != NULL) {
ASSERT3U(gap, !=, 0);
if (rs->rs_start <= start && rs->rs_end >= end) {
range_tree_adjust_fill(rt, rs, fill);
return;
}
avl_remove(&rt->rt_root, rs);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
range_tree_stat_decr(rt, rs);
rt->rt_space -= rs->rs_end - rs->rs_start;
fill += rs->rs_fill;
start = MIN(start, rs->rs_start);
end = MAX(end, rs->rs_end);
size = end - start;
range_tree_add_impl(rt, start, size, fill);
kmem_cache_free(range_seg_cache, rs);
return;
}
ASSERT3P(rs, ==, NULL);
/*
* Determine whether or not we will have to merge with our neighbors.
* If gap != 0, we might need to merge with our neighbors even if we
* aren't directly touching.
*/
rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
merge_before = (rs_before != NULL && rs_before->rs_end == start);
merge_after = (rs_after != NULL && rs_after->rs_start == end);
merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
if (merge_before && gap != 0)
bridge_size += start - rs_before->rs_end;
if (merge_after && gap != 0)
bridge_size += rs_after->rs_start - end;
if (merge_before && merge_after) {
avl_remove(&rt->rt_root, rs_before);
if (rt->rt_ops != NULL) {
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
}
@ -197,43 +307,59 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
range_tree_stat_decr(rt, rs_before);
range_tree_stat_decr(rt, rs_after);
rs_after->rs_fill += rs_before->rs_fill + fill;
rs_after->rs_start = rs_before->rs_start;
kmem_cache_free(range_seg_cache, rs_before);
rs = rs_after;
} else if (merge_before) {
if (rt->rt_ops != NULL)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
range_tree_stat_decr(rt, rs_before);
rs_before->rs_fill += fill;
rs_before->rs_end = end;
rs = rs_before;
} else if (merge_after) {
if (rt->rt_ops != NULL)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
range_tree_stat_decr(rt, rs_after);
rs_after->rs_fill += fill;
rs_after->rs_start = start;
rs = rs_after;
} else {
rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
rs->rs_fill = fill;
rs->rs_start = start;
rs->rs_end = end;
avl_insert(&rt->rt_root, rs, where);
}
if (rt->rt_ops != NULL)
if (gap != 0)
ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
else
ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
range_tree_stat_incr(rt, rs);
rt->rt_space += size;
rt->rt_space += size + bridge_size;
}
void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
range_tree_add_impl(arg, start, size, size);
}
static void
range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
boolean_t do_fill)
{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs, *newseg;
uint64_t end = start + size;
@ -253,6 +379,34 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
(longlong_t)start, (longlong_t)size);
return;
}
/*
* Range trees with gap support must only remove complete segments
* from the tree. This allows us to maintain accurate fill accounting
* and to ensure that bridged sections are not leaked. If we need to
* remove less than the full segment, we can only adjust the fill count.
*/
if (rt->rt_gap != 0) {
if (do_fill) {
if (rs->rs_fill == size) {
start = rs->rs_start;
end = rs->rs_end;
size = end - start;
} else {
range_tree_adjust_fill(rt, rs, -size);
return;
}
} else if (rs->rs_start != start || rs->rs_end != end) {
zfs_panic_recover("zfs: freeing partial segment of "
"gap tree (offset=%llu size=%llu) of "
"(offset=%llu size=%llu)",
(longlong_t)start, (longlong_t)size,
(longlong_t)rs->rs_start,
(longlong_t)rs->rs_end - rs->rs_start);
return;
}
}
VERIFY3U(rs->rs_start, <=, start);
VERIFY3U(rs->rs_end, >=, end);
@ -261,19 +415,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
range_tree_stat_decr(rt, rs);
if (rt->rt_ops != NULL)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
if (left_over && right_over) {
newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
newseg->rs_start = end;
newseg->rs_end = rs->rs_end;
newseg->rs_fill = newseg->rs_end - newseg->rs_start;
range_tree_stat_incr(rt, newseg);
rs->rs_end = start;
avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
if (rt->rt_ops != NULL)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
} else if (left_over) {
rs->rs_end = start;
@ -286,15 +441,53 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
}
if (rs != NULL) {
/*
* The fill of the leftover segment will always be equal to
* the size, since we do not support removing partial segments
* of range trees with gaps.
*/
rs->rs_fill = rs->rs_end - rs->rs_start;
range_tree_stat_incr(rt, rs);
if (rt->rt_ops != NULL)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}
rt->rt_space -= size;
}
void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
range_tree_remove_impl(arg, start, size, B_FALSE);
}
void
range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
{
range_tree_remove_impl(rt, start, size, B_TRUE);
}
void
range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize)
{
int64_t delta = newsize - (rs->rs_end - rs->rs_start);
range_tree_stat_decr(rt, rs);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
rs->rs_start = newstart;
rs->rs_end = newstart + newsize;
range_tree_stat_incr(rt, rs);
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
rt->rt_space += delta;
}
static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
@ -309,7 +502,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
return (avl_find(&rt->rt_root, &rsearch, &where));
}
static range_seg_t *
range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
range_seg_t *rs = range_tree_find_impl(rt, start, size);
@ -373,7 +566,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
void *cookie = NULL;
if (rt->rt_ops != NULL)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
@ -395,12 +588,63 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}
range_seg_t *
range_tree_first(range_tree_t *rt)
{
return (avl_first(&rt->rt_root));
}
uint64_t
range_tree_space(range_tree_t *rt)
{
return (rt->rt_space);
}
/* Generic range tree functions for maintaining segments in an AVL tree. */
void
rt_avl_create(range_tree_t *rt, void *arg)
{
avl_tree_t *tree = arg;
avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
offsetof(range_seg_t, rs_pp_node));
}
void
rt_avl_destroy(range_tree_t *rt, void *arg)
{
avl_tree_t *tree = arg;
ASSERT0(avl_numnodes(tree));
avl_destroy(tree);
}
void
rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
avl_tree_t *tree = arg;
avl_add(tree, rs);
}
void
rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
avl_tree_t *tree = arg;
avl_remove(tree, rs);
}
void
rt_avl_vacate(range_tree_t *rt, void *arg)
{
/*
* Normally one would walk the tree freeing nodes along the way.
* Since the nodes are shared with the range trees we can avoid
* walking all nodes and just reinitialize the avl tree. The nodes
* will be freed by the range tree, so we don't want to free them here.
*/
rt_avl_create(rt, arg);
}
boolean_t
range_tree_is_empty(range_tree_t *rt)
{

View File

@ -2035,7 +2035,7 @@ spa_load_verify_done(zio_t *zio)
}
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
spa->spa_load_verify_ios--;
cv_broadcast(&spa->spa_scrub_io_cv);
mutex_exit(&spa->spa_scrub_lock);
}
@ -2082,9 +2082,9 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
size_t size = BP_GET_PSIZE(bp);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
spa->spa_load_verify_ios++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,

View File

@ -2095,6 +2095,8 @@ spa_init(int mode)
zpool_feature_init();
spa_config_load();
l2arc_start();
scan_init();
dsl_scan_global_init();
#ifndef illumos
#ifdef _KERNEL
zfs_deadman_init();
@ -2119,7 +2121,8 @@ spa_fini(void)
range_tree_fini();
unique_fini();
refcount_fini();
scan_fini();
avl_destroy(&spa_namespace_avl);
avl_destroy(&spa_spare_avl);
avl_destroy(&spa_l2cache_avl);
@ -2220,6 +2223,7 @@ spa_scan_stat_init(spa_t *spa)
spa->spa_scan_pass_scrub_pause = 0;
spa->spa_scan_pass_scrub_spent_paused = 0;
spa->spa_scan_pass_exam = 0;
spa->spa_scan_pass_issued = 0;
vdev_scan_stat_init(spa->spa_root_vdev);
}
@ -2237,18 +2241,20 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
/* data stored on disk */
ps->pss_func = scn->scn_phys.scn_func;
ps->pss_state = scn->scn_phys.scn_state;
ps->pss_start_time = scn->scn_phys.scn_start_time;
ps->pss_end_time = scn->scn_phys.scn_end_time;
ps->pss_to_examine = scn->scn_phys.scn_to_examine;
ps->pss_examined = scn->scn_phys.scn_examined;
ps->pss_to_process = scn->scn_phys.scn_to_process;
ps->pss_processed = scn->scn_phys.scn_processed;
ps->pss_errors = scn->scn_phys.scn_errors;
ps->pss_state = scn->scn_phys.scn_state;
ps->pss_examined = scn->scn_phys.scn_examined;
ps->pss_issued =
scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
/* data not stored on disk */
ps->pss_pass_start = spa->spa_scan_pass_start;
ps->pss_pass_exam = spa->spa_scan_pass_exam;
ps->pss_pass_issued = spa->spa_scan_pass_issued;
ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;

View File

@ -58,11 +58,13 @@ _NOTE(CONSTCOND) } while (0)
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
const blkptr_t *bp, arc_buf_t *buf, void *priv);
typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
/* generic arc_done_func_t's which you can use */
arc_done_func_t arc_bcopy_func;
arc_done_func_t arc_getbuf_func;
arc_read_done_func_t arc_bcopy_func;
arc_read_done_func_t arc_getbuf_func;
typedef enum arc_flags
{
@ -75,35 +77,36 @@ typedef enum arc_flags
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */
/*
* Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_hdr_buf_t. These flags should
* only be set by ARC code.
*/
ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */
ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */
ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */
/* Indicates that block was read with ASYNC priority. */
ARC_FLAG_PRIO_ASYNC_READ = 1 << 10,
ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */
ARC_FLAG_PRIO_ASYNC_READ = 1 << 11,
ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */
ARC_FLAG_BUFC_METADATA = 1 << 14,
ARC_FLAG_BUFC_METADATA = 1 << 15,
/* Flags specifying whether optional hdr struct fields are defined */
ARC_FLAG_HAS_L1HDR = 1 << 15,
ARC_FLAG_HAS_L2HDR = 1 << 16,
ARC_FLAG_HAS_L1HDR = 1 << 16,
ARC_FLAG_HAS_L2HDR = 1 << 17,
/*
* Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
* This allows the l2arc to use the blkptr's checksum to verify
* the data without having to store the checksum in the hdr.
*/
ARC_FLAG_COMPRESSED_ARC = 1 << 17,
ARC_FLAG_SHARED_DATA = 1 << 18,
ARC_FLAG_COMPRESSED_ARC = 1 << 18,
ARC_FLAG_SHARED_DATA = 1 << 19,
/*
* The arc buffer's compression mode is stored in the top 7 bits of the
@ -179,12 +182,12 @@ int arc_referenced(arc_buf_t *buf);
#endif
int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
arc_read_done_func_t *done, void *priv, zio_priority_t priority,
int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
arc_done_func_t *ready, arc_done_func_t *child_ready,
arc_done_func_t *physdone, arc_done_func_t *done,
arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
arc_write_done_func_t *physdone, arc_write_done_func_t *done,
void *priv, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb);
void arc_freed(spa_t *spa, const blkptr_t *bp);

View File

@ -76,6 +76,7 @@ typedef struct zfs_blkstat {
typedef struct zfs_all_blkstats {
zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
kmutex_t zab_lock;
} zfs_all_blkstats_t;

View File

@ -107,24 +107,58 @@ typedef enum dsl_scan_flags {
typedef struct dsl_scan {
struct dsl_pool *scn_dp;
boolean_t scn_suspending;
uint64_t scn_restart_txg;
uint64_t scn_done_txg;
uint64_t scn_sync_start_time;
zio_t *scn_zio_root;
uint64_t scn_issued_before_pass;
/* for freeing blocks */
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
boolean_t scn_async_stalled;
uint64_t scn_async_block_min_time_ms;
/* flags and stats for controlling scan state */
boolean_t scn_is_sorted; /* doing sequential scan */
boolean_t scn_clearing; /* scan is issuing sequential extents */
boolean_t scn_checkpointing; /* scan is issuing all queued extents */
boolean_t scn_suspending; /* scan is suspending until next txg */
uint64_t scn_last_checkpoint; /* time of last checkpoint */
/* for debugging / information */
uint64_t scn_visited_this_txg;
/* members for thread synchronization */
zio_t *scn_zio_root; /* root zio for waiting on IO */
taskq_t *scn_taskq; /* task queue for issuing extents */
dsl_scan_phys_t scn_phys;
/* for controlling scan prefetch, protected by spa_scrub_lock */
boolean_t scn_prefetch_stop; /* prefetch should stop */
zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */
avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */
uint64_t scn_maxinflight_bytes; /* max bytes in flight for poool */
/* per txg statistics */
uint64_t scn_visited_this_txg; /* total bps visited this txg */
uint64_t scn_holes_this_txg;
uint64_t scn_lt_min_this_txg;
uint64_t scn_gt_max_this_txg;
uint64_t scn_ddt_contained_this_txg;
uint64_t scn_objsets_visited_this_txg;
uint64_t scn_avg_seg_size_this_txg;
uint64_t scn_segs_this_txg;
uint64_t scn_avg_zio_size_this_txg;
uint64_t scn_zios_this_txg;
/* members needed for syncing scan status to disk */
dsl_scan_phys_t scn_phys; /* on disk representation of scan */
dsl_scan_phys_t scn_phys_cached;
avl_tree_t scn_queue; /* queue of datasets to scan */
uint64_t scn_bytes_pending; /* outstanding data to issue */
} dsl_scan_t;
typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
void dsl_scan_global_init(void);
void scan_init(void);
void scan_fini(void);
int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
void dsl_scan_fini(struct dsl_pool *dp);
void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
@ -143,6 +177,9 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
struct dmu_tx *tx);
boolean_t dsl_scan_active(dsl_scan_t *scn);
boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
#ifdef __cplusplus
}

View File

@ -51,6 +51,9 @@ typedef struct range_tree {
range_tree_ops_t *rt_ops;
void *rt_arg;
/* rt_avl_compare should only be set it rt_arg is an AVL tree */
uint64_t rt_gap; /* allowable inter-segment gap */
int (*rt_avl_compare)(const void *, const void *);
/*
* The rt_histogram maintains a histogram of ranges. Each bucket,
* rt_histogram[i], contains the number of ranges whose size is:
@ -64,6 +67,7 @@ typedef struct range_seg {
avl_node_t rs_pp_node; /* AVL picker-private node */
uint64_t rs_start; /* starting offset of this segment */
uint64_t rs_end; /* ending offset (non-inclusive) */
uint64_t rs_fill; /* actual fill if gap mode is on */
} range_seg_t;
struct range_tree_ops {
@ -78,9 +82,14 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
void range_tree_init(void);
void range_tree_fini(void);
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
int (*avl_compare)(const void*, const void*), uint64_t gap);
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
@ -89,10 +98,27 @@ void range_tree_stat_verify(range_tree_t *rt);
void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
range_seg_t *range_tree_first(range_tree_t *rt);
void rt_avl_create(range_tree_t *rt, void *arg);
void rt_avl_destroy(range_tree_t *rt, void *arg);
void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
void rt_avl_vacate(range_tree_t *rt, void *arg);
extern struct range_tree_ops rt_avl_ops;
void rt_avl_create(range_tree_t *rt, void *arg);
void rt_avl_destroy(range_tree_t *rt, void *arg);
void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
void rt_avl_vacate(range_tree_t *rt, void *arg);
extern struct range_tree_ops rt_avl_ops;
#ifdef __cplusplus
}

View File

@ -257,7 +257,8 @@ struct spa {
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
uint64_t spa_last_io; /* lbolt of last non-scan I/O */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
uint64_t spa_scrub_inflight; /* in-flight scrub bytes */
uint64_t spa_load_verify_ios; /* in-flight verifications IOs */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
@ -268,6 +269,7 @@ struct spa {
uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */
uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
uint64_t spa_scan_pass_issued; /* issued bytes per pass */
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
kthread_t *spa_async_thread_vd; /* thread doing vd async task */

View File

@ -71,6 +71,7 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
int scrub_done);
extern boolean_t vdev_dtl_required(vdev_t *vd);
@ -135,6 +136,7 @@ extern void vdev_queue_init(vdev_t *vd);
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);

View File

@ -71,6 +71,7 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
typedef void vdev_hold_func_t(vdev_t *vd);
typedef void vdev_rele_func_t(vdev_t *vd);
@ -86,6 +87,7 @@ typedef struct vdev_ops {
vdev_io_start_func_t *vdev_op_io_start;
vdev_io_done_func_t *vdev_op_io_done;
vdev_state_change_func_t *vdev_op_state_change;
vdev_need_resilver_func_t *vdev_op_need_resilver;
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
vdev_remap_func_t *vdev_op_remap;
@ -294,6 +296,13 @@ struct vdev {
uint64_t vdev_async_write_queue_depth;
uint64_t vdev_max_async_write_queue_depth;
/*
* Protects the vdev_scan_io_queue field itself as well as the
* structure's contents (when present).
*/
kmutex_t vdev_scan_io_queue_lock;
struct dsl_scan_io_queue *vdev_scan_io_queue;
/*
* Leaf vdev state.
*/

View File

@ -593,6 +593,8 @@ extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);
extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);

View File

@ -559,6 +559,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
}
@ -831,6 +833,18 @@ vdev_free(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
/*
* Scan queues are normally destroyed at the end of a scan. If the
* queue exists here, that implies the vdev is being removed while
* the scan is still running.
*/
if (vd->vdev_scan_io_queue != NULL) {
mutex_enter(&vd->vdev_scan_io_queue_lock);
dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
vd->vdev_scan_io_queue = NULL;
mutex_exit(&vd->vdev_scan_io_queue_lock);
}
/*
* vdev_free() implies closing the vdev first. This is simpler than
* trying to ensure complicated semantics for all callers.
@ -920,6 +934,7 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
mutex_destroy(&vd->vdev_scan_io_queue_lock);
if (vd == spa->spa_root_vdev)
spa->spa_root_vdev = NULL;
@ -996,6 +1011,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
tvd->vdev_islog = svd->vdev_islog;
svd->vdev_islog = 0;
dsl_scan_io_queue_vdev_xfer(svd, tvd);
}
static void
@ -2288,6 +2305,21 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
return (empty);
}
/*
* Returns B_TRUE if vdev determines offset needs to be resilvered.
*/
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
vd->vdev_ops->vdev_op_leaf)
return (B_TRUE);
return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
}
/*
* Returns the lowest txg in the DTL range.
*/

View File

@ -837,6 +837,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_disk_io_start,
vdev_disk_io_done,
NULL,
NULL,
vdev_disk_hold,
vdev_disk_rele,
NULL,

View File

@ -267,6 +267,7 @@ vdev_ops_t vdev_file_ops = {
vdev_file_io_start,
vdev_file_io_done,
NULL,
NULL,
vdev_file_hold,
vdev_file_rele,
NULL,
@ -286,6 +287,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_file_io_start,
vdev_file_io_done,
NULL,
NULL,
vdev_file_hold,
vdev_file_rele,
NULL,

View File

@ -1147,6 +1147,7 @@ vdev_ops_t vdev_geom_ops = {
vdev_geom_io_start,
vdev_geom_io_done,
NULL,
NULL,
vdev_geom_hold,
vdev_geom_rele,
NULL,

View File

@ -1111,6 +1111,7 @@ vdev_ops_t vdev_indirect_ops = {
NULL,
NULL,
NULL,
NULL,
vdev_indirect_remap,
VDEV_TYPE_INDIRECT, /* name of this vdev type */
B_FALSE /* leaf vdev */

View File

@ -722,6 +722,7 @@ vdev_ops_t vdev_mirror_ops = {
NULL,
NULL,
NULL,
NULL,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@ -736,6 +737,7 @@ vdev_ops_t vdev_replacing_ops = {
NULL,
NULL,
NULL,
NULL,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@ -750,6 +752,7 @@ vdev_ops_t vdev_spare_ops = {
NULL,
NULL,
NULL,
NULL,
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};

View File

@ -90,6 +90,7 @@ vdev_ops_t vdev_missing_ops = {
NULL,
NULL,
NULL,
NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@ -104,6 +105,7 @@ vdev_ops_t vdev_hole_ops = {
NULL,
NULL,
NULL,
NULL,
VDEV_TYPE_HOLE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};

View File

@ -175,7 +175,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_aggregation_limit = 1 << 20;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
@ -938,6 +938,48 @@ vdev_queue_io_done(zio_t *zio)
mutex_exit(&vq->vq_lock);
}
void
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
avl_tree_t *tree;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
if (zio->io_type == ZIO_TYPE_READ) {
if (priority != ZIO_PRIORITY_SYNC_READ &&
priority != ZIO_PRIORITY_ASYNC_READ &&
priority != ZIO_PRIORITY_SCRUB)
priority = ZIO_PRIORITY_ASYNC_READ;
} else {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
if (priority != ZIO_PRIORITY_SYNC_WRITE &&
priority != ZIO_PRIORITY_ASYNC_WRITE)
priority = ZIO_PRIORITY_ASYNC_WRITE;
}
mutex_enter(&vq->vq_lock);
/*
* If the zio is in none of the queues we can simply change
* the priority. If the zio is waiting to be submitted we must
* remove it from the queue and re-insert it with the new priority.
* Otherwise, the zio is currently active and we cannot change its
* priority.
*/
tree = vdev_queue_class_tree(vq, zio->io_priority);
if (avl_find(tree, zio, NULL) == zio) {
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
zio->io_priority = priority;
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
zio->io_priority = priority;
}
mutex_exit(&vq->vq_lock);
}
/*
* As these three methods are only used for load calculations we're not concerned
* if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex

View File

@ -2584,6 +2584,44 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
/*
* Determine if any portion of the provided block resides on a child vdev
* with a dirty DTL and therefore needs to be resilvered. The function
* assumes that at least one DTL is dirty which imples that full stripe
* width blocks must be resilvered.
*/
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
uint64_t dcols = vd->vdev_children;
uint64_t nparity = vd->vdev_nparity;
uint64_t ashift = vd->vdev_top->vdev_ashift;
/* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = offset >> ashift;
/* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = ((psize - 1) >> ashift) + 1;
/* The first column for this stripe. */
uint64_t f = b % dcols;
if (s + nparity >= dcols)
return (B_TRUE);
for (uint64_t c = 0; c < s + nparity; c++) {
uint64_t devidx = (f + c) % dcols;
vdev_t *cvd = vd->vdev_child[devidx];
/*
* dsl_scan_need_resilver() already checked vd with
* vdev_dtl_contains(). So here just check cvd with
* vdev_dtl_empty(), cheaper and a good approximation.
*/
if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
return (B_TRUE);
}
return (B_FALSE);
}
vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open,
vdev_raidz_close,
@ -2591,6 +2629,7 @@ vdev_ops_t vdev_raidz_ops = {
vdev_raidz_io_start,
vdev_raidz_io_done,
vdev_raidz_state_change,
vdev_raidz_need_resilver,
NULL,
NULL,
NULL,

View File

@ -150,6 +150,7 @@ vdev_ops_t vdev_root_ops = {
NULL,
NULL,
NULL,
NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};

View File

@ -1051,7 +1051,7 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
}
err = zap_add(os, intoobj, za.za_name,
8, 1, &value, tx);
if (err)
if (err != 0)
break;
}
zap_cursor_fini(&zc);

View File

@ -41,6 +41,7 @@
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/dsl_scan.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
@ -438,6 +439,8 @@ zio_walk_children(zio_t *pio, zio_link_t **zl)
{
list_t *cl = &pio->io_child_list;
ASSERT(MUTEX_HELD(&pio->io_lock));
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
if (*zl == NULL)
return (NULL);
@ -472,8 +475,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
zl->zl_parent = pio;
zl->zl_child = cio;
mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
mutex_enter(&cio->io_lock);
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
@ -486,8 +489,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
pio->io_child_count++;
cio->io_parent_count++;
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
}
static void
@ -496,8 +499,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
ASSERT(zl->zl_parent == pio);
ASSERT(zl->zl_child == cio);
mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
mutex_enter(&cio->io_lock);
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
@ -505,9 +508,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
pio->io_child_count--;
cio->io_parent_count--;
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
kmem_cache_free(zio_link_cache, zl);
}
@ -988,6 +990,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
metaslab_check_free(spa, bp);
arc_freed(spa, bp);
dsl_scan_freed(spa, bp);
if (zfs_trim_enabled)
stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
@ -1865,14 +1868,16 @@ zio_reexecute(zio_t *pio)
* cannot be affected by any side effects of reexecuting 'cio'.
*/
zio_link_t *zl = NULL;
mutex_enter(&pio->io_lock);
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
mutex_enter(&pio->io_lock);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock);
zio_reexecute(cio);
mutex_enter(&pio->io_lock);
}
mutex_exit(&pio->io_lock);
/*
* Now that all children have been reexecuted, execute the parent.
@ -3184,26 +3189,25 @@ zio_vdev_io_start(zio_t *zio)
}
}
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
* about non-scrubbing, top-level reads and writes with the following
* characteristics:
* - synchronous writes of user data to non-slog devices
* - any reads of user data
* When these conditions are met, adjust the timestamp of spa_last_io
* which allows the scan thread to adjust its workload accordingly.
*/
if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
vd == vd->vdev_top && !vd->vdev_islog &&
zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
zio->io_txg != spa_syncing_txg(spa)) {
uint64_t old = spa->spa_last_io;
uint64_t new = ddi_get_lbolt64();
if (old != new)
(void) atomic_cas_64(&spa->spa_last_io, old, new);
}
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
* about non-scrubbing, top-level reads and writes with the following
* characteristics:
* - synchronous writes of user data to non-slog devices
* - any reads of user data
* When these conditions are met, adjust the timestamp of spa_last_io
* which allows the scan thread to adjust its workload accordingly.
*/
if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
vd == vd->vdev_top && !vd->vdev_islog &&
zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
zio->io_txg != spa_syncing_txg(spa)) {
uint64_t old = spa->spa_last_io;
uint64_t new = ddi_get_lbolt64();
if (old != new)
(void) atomic_cas_64(&spa->spa_last_io, old, new);
}
align = 1ULL << vd->vdev_top->vdev_ashift;
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
@ -3352,6 +3356,35 @@ zio_vdev_io_done(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
/*
* This function is used to change the priority of an existing zio that is
* currently in-flight. This is used by the arc to upgrade priority in the
* event that a demand read is made for a block that is currently queued
* as a scrub or async read IO. Otherwise, the high priority read request
* would end up having to wait for the lower priority IO.
*/
void
zio_change_priority(zio_t *pio, zio_priority_t priority)
{
zio_t *cio, *cio_next;
zio_link_t *zl = NULL;
ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
vdev_queue_change_io_priority(pio, priority);
} else {
pio->io_priority = priority;
}
mutex_enter(&pio->io_lock);
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
zio_change_priority(cio, priority);
}
mutex_exit(&pio->io_lock);
}
/*
* For non-raidz ZIOs, we can just copy aside the bad data read from the
* disk, and use that to finish the checksum ereport later.

View File

@ -760,7 +760,7 @@ typedef struct pool_scan_stat {
uint64_t pss_start_time; /* scan start time */
uint64_t pss_end_time; /* scan end time */
uint64_t pss_to_examine; /* total bytes to scan */
uint64_t pss_examined; /* total examined bytes */
uint64_t pss_examined; /* total bytes located by scanner */
uint64_t pss_to_process; /* total bytes to process */
uint64_t pss_processed; /* total processed bytes */
uint64_t pss_errors; /* scan errors */
@ -771,6 +771,12 @@ typedef struct pool_scan_stat {
uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */
/* cumulative time scrub spent paused, needed for rate calculation */
uint64_t pss_pass_scrub_spent_paused;
/* Sorted scrubbing new fields */
/* Stored on disk */
uint64_t pss_issued; /* total bytes checked by scanner */
/* Not stored on disk */
uint64_t pss_pass_issued; /* issued bytes per scan pass */
} pool_scan_stat_t;
typedef struct pool_removal_stat {

View File

@ -72,6 +72,8 @@ struct proc;
#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */
#define TQ_FRONT 0x08 /* Put task at the front of the queue */
#define TASKQID_INVALID ((taskqid_t)0)
#ifdef _KERNEL
extern taskq_t *system_taskq;
@ -91,6 +93,7 @@ void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
void nulltask(void *);
void taskq_destroy(taskq_t *);
void taskq_wait(taskq_t *);
void taskq_wait_id(taskq_t *, taskqid_t);
void taskq_suspend(taskq_t *);
int taskq_suspended(taskq_t *);
void taskq_resume(taskq_t *);