This originated from ZFS On Linux, as commit d4a72f2386.
During scans (scrubs or resilvers), it sorts the blocks in each transaction
group by block offset; the result can be a significant improvement. (On my
test system, into which I had put some effort to introduce fragmentation
since setting up the pool yesterday, a scrub went from 1h2m to 33.5m with
these changes.) I've seen similar ratios on production systems.
Approved by: Alexander Motin
Obtained from: ZFS On Linux
Relnotes: Yes (improved scrub performance, with tunables)
Differential Revision: https://reviews.freebsd.org/D15562
parent 16dd3255ec
commit a73aecd5f1
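
To make the mechanism behind the commit message concrete, here is a minimal sketch of issuing scan reads in offset order. This is not the imported dsl_scan.c code: scan_extent_t, issue_sorted(), and the flat array are made-up illustrations, and the real implementation keeps per-top-level-vdev queues in gap-aware range trees rather than sorting an array per pass.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical extent record; names are illustrative, not from dsl_scan.c. */
typedef struct scan_extent {
	uint64_t se_offset;	/* physical offset on the top-level vdev */
	uint64_t se_size;	/* length of the extent in bytes */
} scan_extent_t;

/* Order extents by physical offset so reads are issued mostly sequentially. */
static int
scan_extent_compare(const void *a, const void *b)
{
	const scan_extent_t *x = a;
	const scan_extent_t *y = b;

	return ((x->se_offset > y->se_offset) - (x->se_offset < y->se_offset));
}

/*
 * Sort the blocks gathered while walking a txg's metadata and issue the
 * reads in offset order instead of discovery order.
 */
static void
issue_sorted(scan_extent_t *queue, size_t nextents)
{
	qsort(queue, nextents, sizeof (scan_extent_t), scan_extent_compare);
	for (size_t i = 0; i < nextents; i++) {
		/* A real scrub would call zio_read() here. */
		(void) printf("read offset=%ju size=%ju\n",
		    (uintmax_t)queue[i].se_offset,
		    (uintmax_t)queue[i].se_size);
	}
}

int
main(void)
{
	/* Blocks discovered out of order; issued in ascending offset order. */
	scan_extent_t queue[] = {
		{ 3 << 20, 128 << 10 },
		{ 1 << 20, 128 << 10 },
		{ 2 << 20, 128 << 10 },
	};

	issue_sorted(queue, sizeof (queue) / sizeof (queue[0]));
	return (0);
}
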
@ -2281,14 +2281,14 @@ dump_dir(objset_t *os)
object_count++;
}

ASSERT3U(object_count, ==, usedobjs);

(void) printf("\n");

if (error != ESRCH) {
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
abort();
}

ASSERT3U(object_count, ==, usedobjs);
}

static void
@ -2788,6 +2788,7 @@ zdb_blkptr_done(zio_t *zio)

mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
spa->spa_load_verify_ios--;
cv_broadcast(&spa->spa_scrub_io_cv);

if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@ -2859,9 +2860,10 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
flags |= ZIO_FLAG_SPECULATIVE;

mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight > max_inflight)
while (spa->spa_load_verify_ios > max_inflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
spa->spa_load_verify_ios++;
mutex_exit(&spa->spa_scrub_lock);

zio_nowait(zio_read(NULL, spa, bp, abd, size,
@ -1643,7 +1643,7 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &c);

if (ps && ps->pss_state == DSS_SCANNING &&
if (ps != NULL && ps->pss_state == DSS_SCANNING &&
vs->vs_scan_processed != 0 && children == 0) {
(void) printf(gettext(" (%s)"),
(ps->pss_func == POOL_SCAN_RESILVER) ?
@ -4254,11 +4254,13 @@ static void
|
||||
print_scan_status(pool_scan_stat_t *ps)
|
||||
{
|
||||
time_t start, end, pause;
|
||||
uint64_t elapsed, mins_left, hours_left;
|
||||
uint64_t pass_exam, examined, total;
|
||||
uint_t rate;
|
||||
uint64_t total_secs_left;
|
||||
uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
|
||||
uint64_t pass_scanned, scanned, pass_issued, issued, total;
|
||||
uint_t scan_rate, issue_rate;
|
||||
double fraction_done;
|
||||
char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
|
||||
char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
|
||||
char srate_buf[7], irate_buf[7];
|
||||
|
||||
(void) printf(gettext(" scan: "));
|
||||
|
||||
@ -4272,30 +4274,37 @@ print_scan_status(pool_scan_stat_t *ps)
|
||||
start = ps->pss_start_time;
|
||||
end = ps->pss_end_time;
|
||||
pause = ps->pss_pass_scrub_pause;
|
||||
|
||||
zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
|
||||
|
||||
assert(ps->pss_func == POOL_SCAN_SCRUB ||
|
||||
ps->pss_func == POOL_SCAN_RESILVER);
|
||||
/*
|
||||
* Scan is finished or canceled.
|
||||
*/
|
||||
if (ps->pss_state == DSS_FINISHED) {
|
||||
uint64_t minutes_taken = (end - start) / 60;
|
||||
char *fmt = NULL;
|
||||
|
||||
/* Scan is finished or canceled. */
|
||||
if (ps->pss_state == DSS_FINISHED) {
|
||||
total_secs_left = end - start;
|
||||
days_left = total_secs_left / 60 / 60 / 24;
|
||||
hours_left = (total_secs_left / 60 / 60) % 24;
|
||||
mins_left = (total_secs_left / 60) % 60;
|
||||
secs_left = (total_secs_left % 60);
|
||||
|
||||
if (ps->pss_func == POOL_SCAN_SCRUB) {
|
||||
fmt = gettext("scrub repaired %s in %lluh%um with "
|
||||
"%llu errors on %s");
|
||||
(void) printf(gettext("scrub repaired %s "
|
||||
"in %llu days %02llu:%02llu:%02llu "
|
||||
"with %llu errors on %s"), processed_buf,
|
||||
(u_longlong_t)days_left, (u_longlong_t)hours_left,
|
||||
(u_longlong_t)mins_left, (u_longlong_t)secs_left,
|
||||
(u_longlong_t)ps->pss_errors, ctime(&end));
|
||||
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
|
||||
fmt = gettext("resilvered %s in %lluh%um with "
|
||||
"%llu errors on %s");
|
||||
(void) printf(gettext("resilvered %s "
|
||||
"in %llu days %02llu:%02llu:%02llu "
|
||||
"with %llu errors on %s"), processed_buf,
|
||||
(u_longlong_t)days_left, (u_longlong_t)hours_left,
|
||||
(u_longlong_t)mins_left, (u_longlong_t)secs_left,
|
||||
(u_longlong_t)ps->pss_errors, ctime(&end));
|
||||
|
||||
}
|
||||
/* LINTED */
|
||||
(void) printf(fmt, processed_buf,
|
||||
(u_longlong_t)(minutes_taken / 60),
|
||||
(uint_t)(minutes_taken % 60),
|
||||
(u_longlong_t)ps->pss_errors,
|
||||
ctime((time_t *)&end));
|
||||
|
||||
return;
|
||||
} else if (ps->pss_state == DSS_CANCELED) {
|
||||
if (ps->pss_func == POOL_SCAN_SCRUB) {
|
||||
@ -4310,19 +4319,15 @@ print_scan_status(pool_scan_stat_t *ps)
|
||||
|
||||
assert(ps->pss_state == DSS_SCANNING);
|
||||
|
||||
/*
|
||||
* Scan is in progress.
|
||||
*/
|
||||
/* Scan is in progress. Resilvers can't be paused. */
|
||||
if (ps->pss_func == POOL_SCAN_SCRUB) {
|
||||
if (pause == 0) {
|
||||
(void) printf(gettext("scrub in progress since %s"),
|
||||
ctime(&start));
|
||||
} else {
|
||||
char buf[32];
|
||||
struct tm *p = localtime(&pause);
|
||||
(void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p);
|
||||
(void) printf(gettext("scrub paused since %s\n"), buf);
|
||||
(void) printf(gettext("\tscrub started on %s"),
|
||||
(void) printf(gettext("scrub paused since %s"),
|
||||
ctime(&pause));
|
||||
(void) printf(gettext("\tscrub started on %s"),
|
||||
ctime(&start));
|
||||
}
|
||||
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
|
||||
@ -4330,50 +4335,68 @@ print_scan_status(pool_scan_stat_t *ps)
|
||||
ctime(&start));
|
||||
}
|
||||
|
||||
examined = ps->pss_examined ? ps->pss_examined : 1;
|
||||
scanned = ps->pss_examined;
|
||||
pass_scanned = ps->pss_pass_exam;
|
||||
issued = ps->pss_issued;
|
||||
pass_issued = ps->pss_pass_issued;
|
||||
total = ps->pss_to_examine;
|
||||
fraction_done = (double)examined / total;
|
||||
|
||||
/* elapsed time for this pass */
|
||||
/* we are only done with a block once we have issued the IO for it */
|
||||
fraction_done = (double)issued / total;
|
||||
|
||||
/* elapsed time for this pass, rounding up to 1 if it's 0 */
|
||||
elapsed = time(NULL) - ps->pss_pass_start;
|
||||
elapsed -= ps->pss_pass_scrub_spent_paused;
|
||||
elapsed = elapsed ? elapsed : 1;
|
||||
pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
|
||||
rate = pass_exam / elapsed;
|
||||
rate = rate ? rate : 1;
|
||||
mins_left = ((total - examined) / rate) / 60;
|
||||
hours_left = mins_left / 60;
|
||||
elapsed = (elapsed != 0) ? elapsed : 1;
|
||||
|
||||
zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
|
||||
scan_rate = pass_scanned / elapsed;
|
||||
issue_rate = pass_issued / elapsed;
|
||||
total_secs_left = (issue_rate != 0) ?
|
||||
((total - issued) / issue_rate) : UINT64_MAX;
|
||||
|
||||
days_left = total_secs_left / 60 / 60 / 24;
|
||||
hours_left = (total_secs_left / 60 / 60) % 24;
|
||||
mins_left = (total_secs_left / 60) % 60;
|
||||
secs_left = (total_secs_left % 60);
|
||||
|
||||
/* format all of the numbers we will be reporting */
|
||||
zfs_nicenum(scanned, scanned_buf, sizeof (scanned_buf));
|
||||
zfs_nicenum(issued, issued_buf, sizeof (issued_buf));
|
||||
zfs_nicenum(total, total_buf, sizeof (total_buf));
|
||||
zfs_nicenum(scan_rate, srate_buf, sizeof (srate_buf));
|
||||
zfs_nicenum(issue_rate, irate_buf, sizeof (irate_buf));
|
||||
|
||||
/*
|
||||
* do not print estimated time if hours_left is more than 30 days
|
||||
* or we have a paused scrub
|
||||
*/
|
||||
/* do not print estimated time if we have a paused scrub */
|
||||
if (pause == 0) {
|
||||
zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
|
||||
(void) printf(gettext("\t%s scanned out of %s at %s/s"),
|
||||
examined_buf, total_buf, rate_buf);
|
||||
if (hours_left < (30 * 24)) {
|
||||
(void) printf(gettext(", %lluh%um to go\n"),
|
||||
(u_longlong_t)hours_left, (uint_t)(mins_left % 60));
|
||||
} else {
|
||||
(void) printf(gettext(
|
||||
", (scan is slow, no estimated time)\n"));
|
||||
}
|
||||
(void) printf(gettext("\t%s scanned at %s/s, "
|
||||
"%s issued at %s/s, %s total\n"),
|
||||
scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
|
||||
} else {
|
||||
(void) printf(gettext("\t%s scanned out of %s\n"),
|
||||
examined_buf, total_buf);
|
||||
(void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
|
||||
scanned_buf, issued_buf, total_buf);
|
||||
}
|
||||
|
||||
if (ps->pss_func == POOL_SCAN_RESILVER) {
|
||||
(void) printf(gettext(" %s resilvered, %.2f%% done\n"),
|
||||
(void) printf(gettext("\t%s resilvered, %.2f%% done"),
|
||||
processed_buf, 100 * fraction_done);
|
||||
} else if (ps->pss_func == POOL_SCAN_SCRUB) {
|
||||
(void) printf(gettext(" %s repaired, %.2f%% done\n"),
|
||||
(void) printf(gettext("\t%s repaired, %.2f%% done"),
|
||||
processed_buf, 100 * fraction_done);
|
||||
}
|
||||
|
||||
if (pause == 0) {
|
||||
if (issue_rate >= 10 * 1024 * 1024) {
|
||||
(void) printf(gettext(", %llu days "
|
||||
"%02llu:%02llu:%02llu to go\n"),
|
||||
(u_longlong_t)days_left, (u_longlong_t)hours_left,
|
||||
(u_longlong_t)mins_left, (u_longlong_t)secs_left);
|
||||
} else {
|
||||
(void) printf(gettext(", no estimated "
|
||||
"completion time\n"));
|
||||
}
|
||||
} else {
|
||||
(void) printf(gettext("\n"));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
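
The print_scan_status() change above derives the time-to-go from the issue rate rather than the old scan rate. The split of total_secs_left into days/hours/minutes/seconds can be checked with a small standalone calculation; the pool size and issue rate below are made-up sample values, not taken from this commit.

#include <stdint.h>
#include <stdio.h>

/* Mirror of the ETA split in print_scan_status(), with sample inputs. */
int
main(void)
{
	uint64_t total = 8ULL << 40;		/* 8 TiB to scan */
	uint64_t issued = 3ULL << 40;		/* 3 TiB of I/O issued so far */
	uint64_t issue_rate = 400ULL << 20;	/* 400 MiB/s */

	uint64_t total_secs_left = (issue_rate != 0) ?
	    ((total - issued) / issue_rate) : UINT64_MAX;
	uint64_t days_left = total_secs_left / 60 / 60 / 24;
	uint64_t hours_left = (total_secs_left / 60 / 60) % 24;
	uint64_t mins_left = (total_secs_left / 60) % 60;
	uint64_t secs_left = total_secs_left % 60;

	/* Prints "0 days 03:38:27 to go" for these inputs. */
	(void) printf("%ju days %02ju:%02ju:%02ju to go\n",
	    (uintmax_t)days_left, (uintmax_t)hours_left,
	    (uintmax_t)mins_left, (uintmax_t)secs_left);
	return (0);
}
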
@ -374,15 +374,15 @@ ztest_info_t ztest_info[] = {
|
||||
{ ztest_fzap, 1, &zopt_sometimes },
|
||||
{ ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
|
||||
{ ztest_spa_create_destroy, 1, &zopt_sometimes },
|
||||
{ ztest_fault_inject, 1, &zopt_sometimes },
|
||||
{ ztest_fault_inject, 1, &zopt_incessant },
|
||||
{ ztest_ddt_repair, 1, &zopt_sometimes },
|
||||
{ ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
|
||||
{ ztest_reguid, 1, &zopt_rarely },
|
||||
{ ztest_spa_rename, 1, &zopt_rarely },
|
||||
{ ztest_scrub, 1, &zopt_rarely },
|
||||
{ ztest_scrub, 1, &zopt_often },
|
||||
{ ztest_spa_upgrade, 1, &zopt_rarely },
|
||||
{ ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
|
||||
{ ztest_vdev_attach_detach, 1, &zopt_sometimes },
|
||||
{ ztest_vdev_attach_detach, 1, &zopt_incessant },
|
||||
{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
|
||||
{ ztest_vdev_add_remove, 1,
|
||||
&ztest_opts.zo_vdevtime },
|
||||
|
@ -219,7 +219,7 @@ check_status(nvlist_t *config, boolean_t isimport)
|
||||
*/
|
||||
(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
|
||||
(uint64_t **)&ps, &psc);
|
||||
if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
|
||||
if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER &&
|
||||
ps->pss_state == DSS_SCANNING)
|
||||
return (ZPOOL_STATUS_RESILVERING);
|
||||
|
||||
|
@ -408,6 +408,7 @@ typedef struct taskq_ent {
|
||||
#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
|
||||
#define TQ_FRONT 0x08 /* Queue in front */
|
||||
|
||||
#define TASKQID_INVALID ((taskqid_t)0)
|
||||
|
||||
extern taskq_t *system_taskq;
|
||||
|
||||
@ -421,6 +422,7 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
|
||||
taskq_ent_t *);
|
||||
extern void taskq_destroy(taskq_t *);
|
||||
extern void taskq_wait(taskq_t *);
|
||||
extern void taskq_wait_id(taskq_t *, taskqid_t);
|
||||
extern int taskq_member(taskq_t *, void *);
|
||||
extern void system_taskq_init(void);
|
||||
extern void system_taskq_fini(void);
|
||||
|
@ -187,6 +187,12 @@ taskq_wait(taskq_t *tq)
|
||||
mutex_exit(&tq->tq_lock);
|
||||
}
|
||||
|
||||
void
|
||||
taskq_wait_id(taskq_t *tq, taskqid_t id)
|
||||
{
|
||||
taskq_wait(tq);
|
||||
}
|
||||
|
||||
static void *
|
||||
taskq_thread(void *arg)
|
||||
{
|
||||
|
@ -173,3 +173,9 @@ taskq_wait(taskq_t *tq)
|
||||
{
|
||||
taskqueue_drain_all(tq->tq_queue);
|
||||
}
|
||||
|
||||
void
|
||||
taskq_wait_id(taskq_t *tq, taskqid_t id)
|
||||
{
|
||||
taskq_wait(tq);
|
||||
}
|
||||
|
@ -339,7 +339,8 @@ int arc_no_grow_shift = 5;
|
||||
* minimum lifespan of a prefetch block in clock ticks
|
||||
* (initialized in arc_init())
|
||||
*/
|
||||
static int arc_min_prefetch_lifespan;
|
||||
static int zfs_arc_min_prefetch_ms = 1;
|
||||
static int zfs_arc_min_prescient_prefetch_ms = 6;
|
||||
|
||||
/*
|
||||
* If this percent of memory is free, don't throttle.
|
||||
@ -779,8 +780,9 @@ typedef struct arc_stats {
|
||||
kstat_named_t arcstat_meta_limit;
|
||||
kstat_named_t arcstat_meta_max;
|
||||
kstat_named_t arcstat_meta_min;
|
||||
kstat_named_t arcstat_sync_wait_for_async;
|
||||
kstat_named_t arcstat_async_upgrade_sync;
|
||||
kstat_named_t arcstat_demand_hit_predictive_prefetch;
|
||||
kstat_named_t arcstat_demand_hit_prescient_prefetch;
|
||||
} arc_stats_t;
|
||||
|
||||
static arc_stats_t arc_stats = {
|
||||
@ -877,8 +879,9 @@ static arc_stats_t arc_stats = {
|
||||
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
|
||||
{ "arc_meta_max", KSTAT_DATA_UINT64 },
|
||||
{ "arc_meta_min", KSTAT_DATA_UINT64 },
|
||||
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
|
||||
{ "async_upgrade_sync", KSTAT_DATA_UINT64 },
|
||||
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
|
||||
{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
|
||||
};
|
||||
|
||||
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
|
||||
@ -974,22 +977,23 @@ typedef struct arc_callback arc_callback_t;
|
||||
|
||||
struct arc_callback {
|
||||
void *acb_private;
|
||||
arc_done_func_t *acb_done;
|
||||
arc_read_done_func_t *acb_done;
|
||||
arc_buf_t *acb_buf;
|
||||
boolean_t acb_compressed;
|
||||
zio_t *acb_zio_dummy;
|
||||
zio_t *acb_zio_head;
|
||||
arc_callback_t *acb_next;
|
||||
};
|
||||
|
||||
typedef struct arc_write_callback arc_write_callback_t;
|
||||
|
||||
struct arc_write_callback {
|
||||
void *awcb_private;
|
||||
arc_done_func_t *awcb_ready;
|
||||
arc_done_func_t *awcb_children_ready;
|
||||
arc_done_func_t *awcb_physdone;
|
||||
arc_done_func_t *awcb_done;
|
||||
arc_buf_t *awcb_buf;
|
||||
void *awcb_private;
|
||||
arc_write_done_func_t *awcb_ready;
|
||||
arc_write_done_func_t *awcb_children_ready;
|
||||
arc_write_done_func_t *awcb_physdone;
|
||||
arc_write_done_func_t *awcb_done;
|
||||
arc_buf_t *awcb_buf;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -1229,6 +1233,8 @@ sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
|
||||
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
|
||||
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
|
||||
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
|
||||
#define HDR_PRESCIENT_PREFETCH(hdr) \
|
||||
((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
|
||||
#define HDR_COMPRESSION_ENABLED(hdr) \
|
||||
((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
|
||||
|
||||
@ -1392,6 +1398,11 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
|
||||
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
|
||||
&ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
|
||||
|
||||
SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
|
||||
&zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
|
||||
SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
|
||||
&zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
|
||||
|
||||
/*
|
||||
* L2ARC Internals
|
||||
*/
|
||||
@ -3544,6 +3555,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
{
|
||||
arc_state_t *evicted_state, *state;
|
||||
int64_t bytes_evicted = 0;
|
||||
int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
|
||||
zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
|
||||
|
||||
ASSERT(MUTEX_HELD(hash_lock));
|
||||
ASSERT(HDR_HAS_L1HDR(hdr));
|
||||
@ -3596,8 +3609,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
/* prefetch buffers have a minimum lifespan */
|
||||
if (HDR_IO_IN_PROGRESS(hdr) ||
|
||||
((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
|
||||
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
|
||||
arc_min_prefetch_lifespan)) {
|
||||
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
|
||||
ARCSTAT_BUMP(arcstat_evict_skip);
|
||||
return (bytes_evicted);
|
||||
}
|
||||
@ -4968,13 +4980,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
* - move the buffer to the head of the list if this is
|
||||
* another prefetch (to make it less likely to be evicted).
|
||||
*/
|
||||
if (HDR_PREFETCH(hdr)) {
|
||||
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
|
||||
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
|
||||
/* link protected by hash lock */
|
||||
ASSERT(multilist_link_active(
|
||||
&hdr->b_l1hdr.b_arc_node));
|
||||
} else {
|
||||
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_PREFETCH |
|
||||
ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
ARCSTAT_BUMP(arcstat_mru_hits);
|
||||
}
|
||||
hdr->b_l1hdr.b_arc_access = now;
|
||||
@ -5005,10 +5019,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
* MFU state.
|
||||
*/
|
||||
|
||||
if (HDR_PREFETCH(hdr)) {
|
||||
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
|
||||
new_state = arc_mru;
|
||||
if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
|
||||
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
|
||||
if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_PREFETCH |
|
||||
ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
}
|
||||
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
|
||||
} else {
|
||||
new_state = arc_mfu;
|
||||
@ -5029,11 +5046,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
* If it was a prefetch, we will explicitly move it to
|
||||
* the head of the list now.
|
||||
*/
|
||||
if ((HDR_PREFETCH(hdr)) != 0) {
|
||||
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
||||
/* link protected by hash_lock */
|
||||
ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
|
||||
}
|
||||
|
||||
ARCSTAT_BUMP(arcstat_mfu_hits);
|
||||
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
|
||||
} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
|
||||
@ -5044,12 +5057,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
* MFU state.
|
||||
*/
|
||||
|
||||
if (HDR_PREFETCH(hdr)) {
|
||||
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
|
||||
/*
|
||||
* This is a prefetch access...
|
||||
* move this block back to the MRU state.
|
||||
*/
|
||||
ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
|
||||
new_state = arc_mru;
|
||||
}
|
||||
|
||||
@ -5116,23 +5128,28 @@ arc_buf_access(arc_buf_t *buf)
|
||||
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
|
||||
}
|
||||
|
||||
/* a generic arc_done_func_t which you can use */
|
||||
/* a generic arc_read_done_func_t which you can use */
|
||||
/* ARGSUSED */
|
||||
void
|
||||
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
|
||||
arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
||||
arc_buf_t *buf, void *arg)
|
||||
{
|
||||
if (zio == NULL || zio->io_error == 0)
|
||||
bcopy(buf->b_data, arg, arc_buf_size(buf));
|
||||
if (buf == NULL)
|
||||
return;
|
||||
|
||||
bcopy(buf->b_data, arg, arc_buf_size(buf));
|
||||
arc_buf_destroy(buf, arg);
|
||||
}
|
||||
|
||||
/* a generic arc_done_func_t */
|
||||
/* a generic arc_read_done_func_t */
|
||||
/* ARGSUSED */
|
||||
void
|
||||
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
|
||||
arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
||||
arc_buf_t *buf, void *arg)
|
||||
{
|
||||
arc_buf_t **bufp = arg;
|
||||
if (zio && zio->io_error) {
|
||||
arc_buf_destroy(buf, arg);
|
||||
|
||||
if (buf == NULL) {
|
||||
*bufp = NULL;
|
||||
} else {
|
||||
*bufp = buf;
|
||||
@ -5164,7 +5181,6 @@ arc_read_done(zio_t *zio)
|
||||
arc_callback_t *callback_list;
|
||||
arc_callback_t *acb;
|
||||
boolean_t freeable = B_FALSE;
|
||||
boolean_t no_zio_error = (zio->io_error == 0);
|
||||
|
||||
/*
|
||||
* The hdr was inserted into hash-table and removed from lists
|
||||
@ -5190,7 +5206,7 @@ arc_read_done(zio_t *zio)
|
||||
ASSERT3P(hash_lock, !=, NULL);
|
||||
}
|
||||
|
||||
if (no_zio_error) {
|
||||
if (zio->io_error == 0) {
|
||||
/* byteswap if necessary */
|
||||
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
|
||||
if (BP_GET_LEVEL(zio->io_bp) > 0) {
|
||||
@ -5211,7 +5227,8 @@ arc_read_done(zio_t *zio)
|
||||
callback_list = hdr->b_l1hdr.b_acb;
|
||||
ASSERT3P(callback_list, !=, NULL);
|
||||
|
||||
if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
|
||||
if (hash_lock && zio->io_error == 0 &&
|
||||
hdr->b_l1hdr.b_state == arc_anon) {
|
||||
/*
|
||||
* Only call arc_access on anonymous buffers. This is because
|
||||
* if we've issued an I/O for an evicted buffer, we've already
|
||||
@ -5232,14 +5249,21 @@ arc_read_done(zio_t *zio)
|
||||
if (!acb->acb_done)
|
||||
continue;
|
||||
|
||||
/* This is a demand read since prefetches don't use callbacks */
|
||||
callback_cnt++;
|
||||
|
||||
if (zio->io_error != 0)
|
||||
continue;
|
||||
|
||||
int error = arc_buf_alloc_impl(hdr, acb->acb_private,
|
||||
acb->acb_compressed, no_zio_error, &acb->acb_buf);
|
||||
if (no_zio_error) {
|
||||
zio->io_error = error;
|
||||
acb->acb_compressed,
|
||||
B_TRUE, &acb->acb_buf);
|
||||
if (error != 0) {
|
||||
arc_buf_destroy(acb->acb_buf, acb->acb_private);
|
||||
acb->acb_buf = NULL;
|
||||
}
|
||||
|
||||
if (zio->io_error == 0)
|
||||
zio->io_error = error;
|
||||
}
|
||||
hdr->b_l1hdr.b_acb = NULL;
|
||||
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
|
||||
@ -5252,7 +5276,7 @@ arc_read_done(zio_t *zio)
|
||||
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
|
||||
callback_list != NULL);
|
||||
|
||||
if (no_zio_error) {
|
||||
if (zio->io_error == 0) {
|
||||
arc_hdr_verify(hdr, zio->io_bp);
|
||||
} else {
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
|
||||
@ -5285,8 +5309,10 @@ arc_read_done(zio_t *zio)
|
||||
|
||||
/* execute each callback and free its structure */
|
||||
while ((acb = callback_list) != NULL) {
|
||||
if (acb->acb_done)
|
||||
acb->acb_done(zio, acb->acb_buf, acb->acb_private);
|
||||
if (acb->acb_done) {
|
||||
acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
|
||||
acb->acb_buf, acb->acb_private);
|
||||
}
|
||||
|
||||
if (acb->acb_zio_dummy != NULL) {
|
||||
acb->acb_zio_dummy->io_error = zio->io_error;
|
||||
@ -5320,7 +5346,7 @@ arc_read_done(zio_t *zio)
|
||||
* for readers of this block.
|
||||
*/
|
||||
int
|
||||
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
|
||||
void *private, zio_priority_t priority, int zio_flags,
|
||||
arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
|
||||
{
|
||||
@ -5329,7 +5355,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
|
||||
zio_t *rzio;
|
||||
uint64_t guid = spa_load_guid(spa);
|
||||
boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
|
||||
|
||||
int rc = 0;
|
||||
|
||||
ASSERT(!BP_IS_EMBEDDED(bp) ||
|
||||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
|
||||
|
||||
@ -5347,32 +5374,20 @@ top:
|
||||
*arc_flags |= ARC_FLAG_CACHED;
|
||||
|
||||
if (HDR_IO_IN_PROGRESS(hdr)) {
|
||||
zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
|
||||
|
||||
ASSERT3P(head_zio, !=, NULL);
|
||||
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
|
||||
priority == ZIO_PRIORITY_SYNC_READ) {
|
||||
/*
|
||||
* This sync read must wait for an
|
||||
* in-progress async read (e.g. a predictive
|
||||
* prefetch). Async reads are queued
|
||||
* separately at the vdev_queue layer, so
|
||||
* this is a form of priority inversion.
|
||||
* Ideally, we would "inherit" the demand
|
||||
* i/o's priority by moving the i/o from
|
||||
* the async queue to the synchronous queue,
|
||||
* but there is currently no mechanism to do
|
||||
* so. Track this so that we can evaluate
|
||||
* the magnitude of this potential performance
|
||||
* problem.
|
||||
*
|
||||
* Note that if the prefetch i/o is already
|
||||
* active (has been issued to the device),
|
||||
* the prefetch improved performance, because
|
||||
* we issued it sooner than we would have
|
||||
* without the prefetch.
|
||||
* This is a sync read that needs to wait for
|
||||
* an in-flight async read. Request that the
|
||||
* zio have its priority upgraded.
|
||||
*/
|
||||
DTRACE_PROBE1(arc__sync__wait__for__async,
|
||||
zio_change_priority(head_zio, priority);
|
||||
DTRACE_PROBE1(arc__async__upgrade__sync,
|
||||
arc_buf_hdr_t *, hdr);
|
||||
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
|
||||
ARCSTAT_BUMP(arcstat_async_upgrade_sync);
|
||||
}
|
||||
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
|
||||
arc_hdr_clear_flags(hdr,
|
||||
@ -5399,6 +5414,7 @@ top:
|
||||
spa, NULL, NULL, NULL, zio_flags);
|
||||
|
||||
ASSERT3P(acb->acb_done, !=, NULL);
|
||||
acb->acb_zio_head = head_zio;
|
||||
acb->acb_next = hdr->b_l1hdr.b_acb;
|
||||
hdr->b_l1hdr.b_acb = acb;
|
||||
mutex_exit(hash_lock);
|
||||
@ -5426,17 +5442,32 @@ top:
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_PREDICTIVE_PREFETCH);
|
||||
}
|
||||
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
|
||||
|
||||
if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
|
||||
ARCSTAT_BUMP(
|
||||
arcstat_demand_hit_prescient_prefetch);
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
}
|
||||
|
||||
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
|
||||
/* Get a buf with the desired data in it. */
|
||||
VERIFY0(arc_buf_alloc_impl(hdr, private,
|
||||
compressed_read, B_TRUE, &buf));
|
||||
rc = arc_buf_alloc_impl(hdr, private,
|
||||
compressed_read, B_TRUE, &buf);
|
||||
if (rc != 0) {
|
||||
arc_buf_destroy(buf, private);
|
||||
buf = NULL;
|
||||
}
|
||||
ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
|
||||
rc == 0 || rc != ENOENT);
|
||||
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
|
||||
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
|
||||
}
|
||||
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
|
||||
arc_access(hdr, hash_lock);
|
||||
if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
if (*arc_flags & ARC_FLAG_L2CACHE)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
|
||||
mutex_exit(hash_lock);
|
||||
@ -5446,7 +5477,7 @@ top:
|
||||
data, metadata, hits);
|
||||
|
||||
if (done)
|
||||
done(NULL, buf, private);
|
||||
done(NULL, zb, bp, buf, private);
|
||||
} else {
|
||||
uint64_t lsize = BP_GET_LSIZE(bp);
|
||||
uint64_t psize = BP_GET_PSIZE(bp);
|
||||
@ -5520,6 +5551,9 @@ top:
|
||||
|
||||
if (*arc_flags & ARC_FLAG_PREFETCH)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
|
||||
if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
|
||||
if (*arc_flags & ARC_FLAG_L2CACHE)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
|
||||
if (BP_GET_LEVEL(bp) > 0)
|
||||
@ -5549,14 +5583,17 @@ top:
|
||||
vd = NULL;
|
||||
}
|
||||
|
||||
if (priority == ZIO_PRIORITY_ASYNC_READ)
|
||||
/*
|
||||
* We count both async reads and scrub IOs as asynchronous so
|
||||
* that both can be upgraded in the event of a cache hit while
|
||||
* the read IO is still in-flight.
|
||||
*/
|
||||
if (priority == ZIO_PRIORITY_ASYNC_READ ||
|
||||
priority == ZIO_PRIORITY_SCRUB)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
|
||||
else
|
||||
arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
|
||||
|
||||
if (hash_lock != NULL)
|
||||
mutex_exit(hash_lock);
|
||||
|
||||
/*
|
||||
* At this point, we have a level 1 cache miss. Try again in
|
||||
* L2ARC if possible.
|
||||
@ -5637,6 +5674,11 @@ top:
|
||||
ZIO_FLAG_CANFAIL |
|
||||
ZIO_FLAG_DONT_PROPAGATE |
|
||||
ZIO_FLAG_DONT_RETRY, B_FALSE);
|
||||
acb->acb_zio_head = rzio;
|
||||
|
||||
if (hash_lock != NULL)
|
||||
mutex_exit(hash_lock);
|
||||
|
||||
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
|
||||
zio_t *, rzio);
|
||||
ARCSTAT_INCR(arcstat_l2_read_bytes, size);
|
||||
@ -5651,6 +5693,8 @@ top:
|
||||
return (0);
|
||||
|
||||
/* l2arc read error; goto zio_read() */
|
||||
if (hash_lock != NULL)
|
||||
mutex_enter(hash_lock);
|
||||
} else {
|
||||
DTRACE_PROBE1(l2arc__miss,
|
||||
arc_buf_hdr_t *, hdr);
|
||||
@ -5671,6 +5715,10 @@ top:
|
||||
|
||||
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
|
||||
arc_read_done, hdr, priority, zio_flags, zb);
|
||||
acb->acb_zio_head = rzio;
|
||||
|
||||
if (hash_lock != NULL)
|
||||
mutex_exit(hash_lock);
|
||||
|
||||
if (*arc_flags & ARC_FLAG_WAIT)
|
||||
return (zio_wait(rzio));
|
||||
@ -6162,9 +6210,9 @@ arc_write_done(zio_t *zio)
|
||||
|
||||
zio_t *
|
||||
arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
|
||||
boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
|
||||
arc_done_func_t *children_ready, arc_done_func_t *physdone,
|
||||
arc_done_func_t *done, void *private, zio_priority_t priority,
|
||||
boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
|
||||
arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
|
||||
arc_write_done_func_t *done, void *private, zio_priority_t priority,
|
||||
int zio_flags, const zbookmark_phys_t *zb)
|
||||
{
|
||||
arc_buf_hdr_t *hdr = buf->b_hdr;
|
||||
@ -6591,9 +6639,6 @@ arc_init(void)
|
||||
mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
/* Convert seconds to clock ticks */
|
||||
arc_min_prefetch_lifespan = 1 * hz;
|
||||
|
||||
/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
|
||||
arc_c_min = MAX(allmem / 32, arc_abs_min);
|
||||
/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
|
||||
|
@ -902,7 +902,8 @@ dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
|
||||
}
|
||||
|
||||
static void
|
||||
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
||||
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
||||
arc_buf_t *buf, void *vdb)
|
||||
{
|
||||
dmu_buf_impl_t *db = vdb;
|
||||
|
||||
@ -916,19 +917,22 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
||||
ASSERT(db->db.db_data == NULL);
|
||||
if (db->db_level == 0 && db->db_freed_in_flight) {
|
||||
/* we were freed in flight; disregard any error */
|
||||
if (buf == NULL) {
|
||||
buf = arc_alloc_buf(db->db_objset->os_spa,
|
||||
db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
|
||||
}
|
||||
arc_release(buf, db);
|
||||
bzero(buf->b_data, db->db.db_size);
|
||||
arc_buf_freeze(buf);
|
||||
db->db_freed_in_flight = FALSE;
|
||||
dbuf_set_data(db, buf);
|
||||
db->db_state = DB_CACHED;
|
||||
} else if (zio == NULL || zio->io_error == 0) {
|
||||
} else if (buf != NULL) {
|
||||
dbuf_set_data(db, buf);
|
||||
db->db_state = DB_CACHED;
|
||||
} else {
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
ASSERT3P(db->db_buf, ==, NULL);
|
||||
arc_buf_destroy(buf, db);
|
||||
db->db_state = DB_UNCACHED;
|
||||
}
|
||||
cv_broadcast(&db->db_changed);
|
||||
@ -2326,7 +2330,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
|
||||
* prefetch if the next block down is our target.
|
||||
*/
|
||||
static void
|
||||
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
|
||||
dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
||||
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
|
||||
{
|
||||
dbuf_prefetch_arg_t *dpa = private;
|
||||
|
||||
@ -2365,13 +2370,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
|
||||
dbuf_rele(db, FTAG);
|
||||
}
|
||||
|
||||
if (abuf == NULL) {
|
||||
kmem_free(dpa, sizeof (*dpa));
|
||||
return;
|
||||
}
|
||||
|
||||
dpa->dpa_curlevel--;
|
||||
|
||||
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
|
||||
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
|
||||
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
|
||||
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
|
||||
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
|
||||
if (BP_IS_HOLE(bp)) {
|
||||
kmem_free(dpa, sizeof (*dpa));
|
||||
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
|
||||
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
|
||||
@ -3746,7 +3756,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
||||
* ready callback so that we can properly handle an indirect
|
||||
* block that only contains holes.
|
||||
*/
|
||||
arc_done_func_t *children_ready_cb = NULL;
|
||||
arc_write_done_func_t *children_ready_cb = NULL;
|
||||
if (db->db_level != 0)
|
||||
children_ready_cb = dbuf_write_children_ready;
|
||||
|
||||
|
@ -1112,14 +1112,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
|
||||
void
|
||||
ddt_sync(spa_t *spa, uint64_t txg)
|
||||
{
|
||||
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
|
||||
dmu_tx_t *tx;
|
||||
zio_t *rio = zio_root(spa, NULL, NULL,
|
||||
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
|
||||
zio_t *rio;
|
||||
|
||||
ASSERT(spa_syncing_txg(spa) == txg);
|
||||
|
||||
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
||||
|
||||
rio = zio_root(spa, NULL, NULL,
|
||||
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
|
||||
|
||||
/*
|
||||
* This function may cause an immediate scan of ddt blocks (see
|
||||
* the comment above dsl_scan_ddt() for details). We set the
|
||||
* scan's root zio here so that we can wait for any scan IOs in
|
||||
* addition to the regular ddt IOs.
|
||||
*/
|
||||
ASSERT3P(scn->scn_zio_root, ==, NULL);
|
||||
scn->scn_zio_root = rio;
|
||||
|
||||
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
|
||||
ddt_t *ddt = spa->spa_ddt[c];
|
||||
if (ddt == NULL)
|
||||
@ -1129,6 +1141,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
|
||||
}
|
||||
|
||||
(void) zio_wait(rio);
|
||||
scn->scn_zio_root = NULL;
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
|
@ -349,6 +349,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
|
||||
|
||||
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* The $ORIGIN dataset (if it exists) doesn't have an associated
|
||||
* objset, so there's no reason to open it. The $ORIGIN dataset
|
||||
@ -359,6 +360,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
|
||||
ASSERT3P(ds->ds_dir, !=,
|
||||
spa_get_dsl(spa)->dp_origin_snap->ds_dir);
|
||||
}
|
||||
#endif
|
||||
|
||||
os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
|
||||
os->os_dsl_dataset = ds;
|
||||
|
@ -499,8 +499,9 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
||||
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
|
||||
{
|
||||
prefetch_data_t *pfd = arg;
|
||||
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
|
||||
|
||||
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
|
||||
ARC_FLAG_PRESCIENT_PREFETCH;
|
||||
|
||||
ASSERT(pfd->pd_bytes_fetched >= 0);
|
||||
if (bp == NULL)
|
||||
return (0);
|
||||
|
File diff suppressed because it is too large
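
The suppressed file is presumably dsl_scan.c, where the sorted scan queues live. The gap-bridging behaviour that the range_tree.c changes below enable can be illustrated with a toy merge rule: two blocks that are close enough on disk are tracked as one segment whose span covers the bridged gap, while the fill counts only real data. The seg_t type and seg_add() helper are made up for illustration and assume extents arrive in ascending offset order; the real code stores segments in an AVL-based range tree and handles arbitrary insertion order.

#include <stdint.h>
#include <stdio.h>

/* Illustrative segment: span includes bridged gaps, fill is real data only. */
typedef struct seg {
	uint64_t start, end;
	uint64_t fill;
} seg_t;

/*
 * Merge a new extent [start, start + size) into the last segment if the gap
 * between them is at most 'gap'; otherwise start a new segment.  Returns the
 * number of segments in use.
 */
static size_t
seg_add(seg_t *segs, size_t nsegs, uint64_t start, uint64_t size, uint64_t gap)
{
	uint64_t end = start + size;

	if (nsegs > 0 && start <= segs[nsegs - 1].end + gap) {
		if (end > segs[nsegs - 1].end)
			segs[nsegs - 1].end = end;
		segs[nsegs - 1].fill += size;
		return (nsegs);
	}
	segs[nsegs].start = start;
	segs[nsegs].end = end;
	segs[nsegs].fill = size;
	return (nsegs + 1);
}

int
main(void)
{
	seg_t segs[4];
	size_t n = 0;

	/* Two 128K blocks 32K apart bridge into one 288K-span segment. */
	n = seg_add(segs, n, 0, 128 << 10, 64 << 10);
	n = seg_add(segs, n, 160 << 10, 128 << 10, 64 << 10);
	(void) printf("%zu segment(s), span=%ju fill=%ju\n", n,
	    (uintmax_t)(segs[0].end - segs[0].start), (uintmax_t)segs[0].fill);
	return (0);
}
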
@ -1119,85 +1119,6 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create any block allocator specific components. The current allocators
|
||||
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
|
||||
*/
|
||||
static void
|
||||
metaslab_rt_create(range_tree_t *rt, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT(msp->ms_allocatable == NULL);
|
||||
|
||||
avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
|
||||
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
|
||||
}
|
||||
|
||||
/*
|
||||
* Destroy the block allocator specific components.
|
||||
*/
|
||||
static void
|
||||
metaslab_rt_destroy(range_tree_t *rt, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT3P(msp->ms_allocatable, ==, rt);
|
||||
ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));
|
||||
|
||||
avl_destroy(&msp->ms_allocatable_by_size);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT3P(msp->ms_allocatable, ==, rt);
|
||||
VERIFY(!msp->ms_condensing);
|
||||
avl_add(&msp->ms_allocatable_by_size, rs);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT3P(msp->ms_allocatable, ==, rt);
|
||||
VERIFY(!msp->ms_condensing);
|
||||
avl_remove(&msp->ms_allocatable_by_size, rs);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_rt_vacate(range_tree_t *rt, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT3P(msp->ms_allocatable, ==, rt);
|
||||
|
||||
/*
|
||||
* Normally one would walk the tree freeing nodes along the way.
|
||||
* Since the nodes are shared with the range trees we can avoid
|
||||
* walking all nodes and just reinitialize the avl tree. The nodes
|
||||
* will be freed by the range tree, so we don't want to free them here.
|
||||
*/
|
||||
avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
|
||||
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
|
||||
}
|
||||
|
||||
static range_tree_ops_t metaslab_rt_ops = {
|
||||
metaslab_rt_create,
|
||||
metaslab_rt_destroy,
|
||||
metaslab_rt_add,
|
||||
metaslab_rt_remove,
|
||||
metaslab_rt_vacate
|
||||
};
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
* Common allocator routines
|
||||
@ -1574,7 +1495,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
|
||||
* addition of new space; and for debugging, it ensures that we'd
|
||||
* data fault on any attempt to use this metaslab before it's ready.
|
||||
*/
|
||||
ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
|
||||
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
|
||||
metaslab_rangesize_compare, 0);
|
||||
metaslab_group_add(mg, ms);
|
||||
|
||||
metaslab_set_fragmentation(ms);
|
||||
|
@ -33,8 +33,58 @@
|
||||
#include <sys/zio.h>
|
||||
#include <sys/range_tree.h>
|
||||
|
||||
/*
|
||||
* Range trees are tree-based data structures that can be used to
|
||||
* track free space or generally any space allocation information.
|
||||
* A range tree keeps track of individual segments and automatically
|
||||
* provides facilities such as adjacent extent merging and extent
|
||||
* splitting in response to range add/remove requests.
|
||||
*
|
||||
* A range tree starts out completely empty, with no segments in it.
|
||||
* Adding an allocation via range_tree_add to the range tree can either:
|
||||
* 1) create a new extent
|
||||
* 2) extend an adjacent extent
|
||||
* 3) merge two adjacent extents
|
||||
* Conversely, removing an allocation via range_tree_remove can:
|
||||
* 1) completely remove an extent
|
||||
* 2) shorten an extent (if the allocation was near one of its ends)
|
||||
* 3) split an extent into two extents, in effect punching a hole
|
||||
*
|
||||
* A range tree is also capable of 'bridging' gaps when adding
|
||||
* allocations. This is useful for cases when close proximity of
|
||||
* allocations is an important detail that needs to be represented
|
||||
* in the range tree. See range_tree_set_gap(). The default behavior
|
||||
* is not to bridge gaps (i.e. the maximum allowed gap size is 0).
|
||||
*
|
||||
* In order to traverse a range tree, use either the range_tree_walk()
|
||||
* or range_tree_vacate() functions.
|
||||
*
|
||||
* To obtain more accurate information on individual segment
|
||||
* operations that the range tree performs "under the hood", you can
|
||||
* specify a set of callbacks by passing a range_tree_ops_t structure
|
||||
* to the range_tree_create function. Any callbacks that are non-NULL
|
||||
* are then called at the appropriate times.
|
||||
*
|
||||
* The range tree code also supports a special variant of range trees
|
||||
* that can bridge small gaps between segments. This kind of tree is used
|
||||
* by the dsl scanning code to group I/Os into mostly sequential chunks to
|
||||
* optimize disk performance. The code here attempts to do this with as
|
||||
* little memory and computational overhead as possible. One limitation of
|
||||
* this implementation is that segments of range trees with gaps can only
|
||||
* support removing complete segments.
|
||||
*/
|
||||
|
||||
kmem_cache_t *range_seg_cache;
|
||||
|
||||
/* Generic ops for managing an AVL tree alongside a range tree */
|
||||
struct range_tree_ops rt_avl_ops = {
|
||||
.rtop_create = rt_avl_create,
|
||||
.rtop_destroy = rt_avl_destroy,
|
||||
.rtop_add = rt_avl_add,
|
||||
.rtop_remove = rt_avl_remove,
|
||||
.rtop_vacate = rt_avl_vacate,
|
||||
};
|
||||
|
||||
void
|
||||
range_tree_init(void)
|
||||
{
|
||||
@ -109,47 +159,47 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
|
||||
static int
|
||||
range_tree_seg_compare(const void *x1, const void *x2)
|
||||
{
|
||||
const range_seg_t *r1 = x1;
|
||||
const range_seg_t *r2 = x2;
|
||||
const range_seg_t *r1 = (const range_seg_t *)x1;
|
||||
const range_seg_t *r2 = (const range_seg_t *)x2;
|
||||
|
||||
if (r1->rs_start < r2->rs_start) {
|
||||
if (r1->rs_end > r2->rs_start)
|
||||
return (0);
|
||||
return (-1);
|
||||
}
|
||||
if (r1->rs_start > r2->rs_start) {
|
||||
if (r1->rs_start < r2->rs_end)
|
||||
return (0);
|
||||
return (1);
|
||||
}
|
||||
return (0);
|
||||
ASSERT3U(r1->rs_start, <=, r1->rs_end);
|
||||
ASSERT3U(r2->rs_start, <=, r2->rs_end);
|
||||
|
||||
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
|
||||
}
|
||||
|
||||
range_tree_t *
|
||||
range_tree_create(range_tree_ops_t *ops, void *arg)
|
||||
range_tree_create_impl(range_tree_ops_t *ops, void *arg,
|
||||
int (*avl_compare) (const void *, const void *), uint64_t gap)
|
||||
{
|
||||
range_tree_t *rt;
|
||||
|
||||
rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
|
||||
range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
|
||||
|
||||
avl_create(&rt->rt_root, range_tree_seg_compare,
|
||||
sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
|
||||
|
||||
rt->rt_ops = ops;
|
||||
rt->rt_arg = arg;
|
||||
rt->rt_gap = gap;
|
||||
rt->rt_avl_compare = avl_compare;
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
|
||||
rt->rt_ops->rtop_create(rt, rt->rt_arg);
|
||||
|
||||
return (rt);
|
||||
}
|
||||
|
||||
range_tree_t *
|
||||
range_tree_create(range_tree_ops_t *ops, void *arg)
|
||||
{
|
||||
return (range_tree_create_impl(ops, arg, NULL, 0));
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_destroy(range_tree_t *rt)
|
||||
{
|
||||
VERIFY0(rt->rt_space);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
|
||||
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
|
||||
|
||||
avl_destroy(&rt->rt_root);
|
||||
@ -157,39 +207,99 @@ range_tree_destroy(range_tree_t *rt)
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_add(void *arg, uint64_t start, uint64_t size)
|
||||
range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
|
||||
{
|
||||
ASSERT3U(rs->rs_fill + delta, !=, 0);
|
||||
ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
|
||||
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
rs->rs_fill += delta;
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
|
||||
{
|
||||
range_tree_t *rt = arg;
|
||||
avl_index_t where;
|
||||
range_seg_t rsearch, *rs_before, *rs_after, *rs;
|
||||
uint64_t end = start + size;
|
||||
uint64_t end = start + size, gap = rt->rt_gap;
|
||||
uint64_t bridge_size = 0;
|
||||
boolean_t merge_before, merge_after;
|
||||
|
||||
VERIFY(size != 0);
|
||||
ASSERT3U(size, !=, 0);
|
||||
ASSERT3U(fill, <=, size);
|
||||
|
||||
rsearch.rs_start = start;
|
||||
rsearch.rs_end = end;
|
||||
rs = avl_find(&rt->rt_root, &rsearch, &where);
|
||||
|
||||
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
|
||||
if (gap == 0 && rs != NULL &&
|
||||
rs->rs_start <= start && rs->rs_end >= end) {
|
||||
zfs_panic_recover("zfs: allocating allocated segment"
|
||||
"(offset=%llu size=%llu)\n",
|
||||
(longlong_t)start, (longlong_t)size);
|
||||
"(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
|
||||
(longlong_t)start, (longlong_t)size,
|
||||
(longlong_t)rs->rs_start,
|
||||
(longlong_t)rs->rs_end - rs->rs_start);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Make sure we don't overlap with either of our neighbors */
|
||||
VERIFY(rs == NULL);
|
||||
/*
|
||||
* If this is a gap-supporting range tree, it is possible that we
|
||||
* are inserting into an existing segment. In this case simply
|
||||
* bump the fill count and call the remove / add callbacks. If the
|
||||
* new range will extend an existing segment, we remove the
|
||||
* existing one, apply the new extent to it and re-insert it using
|
||||
* the normal code paths.
|
||||
*/
|
||||
if (rs != NULL) {
|
||||
ASSERT3U(gap, !=, 0);
|
||||
if (rs->rs_start <= start && rs->rs_end >= end) {
|
||||
range_tree_adjust_fill(rt, rs, fill);
|
||||
return;
|
||||
}
|
||||
|
||||
avl_remove(&rt->rt_root, rs);
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
|
||||
range_tree_stat_decr(rt, rs);
|
||||
rt->rt_space -= rs->rs_end - rs->rs_start;
|
||||
|
||||
fill += rs->rs_fill;
|
||||
start = MIN(start, rs->rs_start);
|
||||
end = MAX(end, rs->rs_end);
|
||||
size = end - start;
|
||||
|
||||
range_tree_add_impl(rt, start, size, fill);
|
||||
|
||||
kmem_cache_free(range_seg_cache, rs);
|
||||
return;
|
||||
}
|
||||
|
||||
ASSERT3P(rs, ==, NULL);
|
||||
|
||||
/*
|
||||
* Determine whether or not we will have to merge with our neighbors.
|
||||
* If gap != 0, we might need to merge with our neighbors even if we
|
||||
* aren't directly touching.
|
||||
*/
|
||||
rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
|
||||
rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
|
||||
|
||||
merge_before = (rs_before != NULL && rs_before->rs_end == start);
|
||||
merge_after = (rs_after != NULL && rs_after->rs_start == end);
|
||||
merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
|
||||
merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
|
||||
|
||||
if (merge_before && gap != 0)
|
||||
bridge_size += start - rs_before->rs_end;
|
||||
if (merge_after && gap != 0)
|
||||
bridge_size += rs_after->rs_start - end;
|
||||
|
||||
if (merge_before && merge_after) {
|
||||
avl_remove(&rt->rt_root, rs_before);
|
||||
if (rt->rt_ops != NULL) {
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
|
||||
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
|
||||
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
|
||||
}
|
||||
@ -197,43 +307,59 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
|
||||
range_tree_stat_decr(rt, rs_before);
|
||||
range_tree_stat_decr(rt, rs_after);
|
||||
|
||||
rs_after->rs_fill += rs_before->rs_fill + fill;
|
||||
rs_after->rs_start = rs_before->rs_start;
|
||||
kmem_cache_free(range_seg_cache, rs_before);
|
||||
rs = rs_after;
|
||||
} else if (merge_before) {
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
|
||||
|
||||
range_tree_stat_decr(rt, rs_before);
|
||||
|
||||
rs_before->rs_fill += fill;
|
||||
rs_before->rs_end = end;
|
||||
rs = rs_before;
|
||||
} else if (merge_after) {
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
|
||||
|
||||
range_tree_stat_decr(rt, rs_after);
|
||||
|
||||
rs_after->rs_fill += fill;
|
||||
rs_after->rs_start = start;
|
||||
rs = rs_after;
|
||||
} else {
|
||||
rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
|
||||
|
||||
rs->rs_fill = fill;
|
||||
rs->rs_start = start;
|
||||
rs->rs_end = end;
|
||||
avl_insert(&rt->rt_root, rs, where);
|
||||
}
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (gap != 0)
|
||||
ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
|
||||
else
|
||||
ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
|
||||
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
|
||||
range_tree_stat_incr(rt, rs);
|
||||
rt->rt_space += size;
|
||||
rt->rt_space += size + bridge_size;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
range_tree_add(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_tree_add_impl(arg, start, size, size);
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
|
||||
boolean_t do_fill)
|
||||
{
|
||||
range_tree_t *rt = arg;
|
||||
avl_index_t where;
|
||||
range_seg_t rsearch, *rs, *newseg;
|
||||
uint64_t end = start + size;
|
||||
@ -253,6 +379,34 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
(longlong_t)start, (longlong_t)size);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Range trees with gap support must only remove complete segments
|
||||
* from the tree. This allows us to maintain accurate fill accounting
|
||||
* and to ensure that bridged sections are not leaked. If we need to
|
||||
* remove less than the full segment, we can only adjust the fill count.
|
||||
*/
|
||||
if (rt->rt_gap != 0) {
|
||||
if (do_fill) {
|
||||
if (rs->rs_fill == size) {
|
||||
start = rs->rs_start;
|
||||
end = rs->rs_end;
|
||||
size = end - start;
|
||||
} else {
|
||||
range_tree_adjust_fill(rt, rs, -size);
|
||||
return;
|
||||
}
|
||||
} else if (rs->rs_start != start || rs->rs_end != end) {
|
||||
zfs_panic_recover("zfs: freeing partial segment of "
|
||||
"gap tree (offset=%llu size=%llu) of "
|
||||
"(offset=%llu size=%llu)",
|
||||
(longlong_t)start, (longlong_t)size,
|
||||
(longlong_t)rs->rs_start,
|
||||
(longlong_t)rs->rs_end - rs->rs_start);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
VERIFY3U(rs->rs_start, <=, start);
|
||||
VERIFY3U(rs->rs_end, >=, end);
|
||||
|
||||
@ -261,19 +415,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
|
||||
range_tree_stat_decr(rt, rs);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
|
||||
if (left_over && right_over) {
|
||||
newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
|
||||
newseg->rs_start = end;
|
||||
newseg->rs_end = rs->rs_end;
|
||||
newseg->rs_fill = newseg->rs_end - newseg->rs_start;
|
||||
range_tree_stat_incr(rt, newseg);
|
||||
|
||||
rs->rs_end = start;
|
||||
|
||||
avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
|
||||
} else if (left_over) {
|
||||
rs->rs_end = start;
|
||||
@ -286,15 +441,53 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
}
|
||||
|
||||
if (rs != NULL) {
|
||||
/*
|
||||
* The fill of the leftover segment will always be equal to
|
||||
* the size, since we do not support removing partial segments
|
||||
* of range trees with gaps.
|
||||
*/
|
||||
rs->rs_fill = rs->rs_end - rs->rs_start;
|
||||
range_tree_stat_incr(rt, rs);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
}
|
||||
|
||||
rt->rt_space -= size;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_tree_remove_impl(arg, start, size, B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_tree_remove_impl(rt, start, size, B_TRUE);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
|
||||
uint64_t newstart, uint64_t newsize)
|
||||
{
|
||||
int64_t delta = newsize - (rs->rs_end - rs->rs_start);
|
||||
|
||||
range_tree_stat_decr(rt, rs);
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
|
||||
rs->rs_start = newstart;
|
||||
rs->rs_end = newstart + newsize;
|
||||
|
||||
range_tree_stat_incr(rt, rs);
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
|
||||
rt->rt_space += delta;
|
||||
}
|
||||
|
||||
static range_seg_t *
|
||||
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
{
|
||||
@ -309,7 +502,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
return (avl_find(&rt->rt_root, &rsearch, &where));
|
||||
}
|
||||
|
||||
static range_seg_t *
|
||||
range_seg_t *
|
||||
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_seg_t *rs = range_tree_find_impl(rt, start, size);
|
||||
@ -373,7 +566,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
|
||||
void *cookie = NULL;
|
||||
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
|
||||
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
|
||||
|
||||
while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
|
||||
@ -395,12 +588,63 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
|
||||
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
|
||||
}
|
||||
|
||||
range_seg_t *
|
||||
range_tree_first(range_tree_t *rt)
|
||||
{
|
||||
return (avl_first(&rt->rt_root));
|
||||
}
|
||||
|
||||
uint64_t
|
||||
range_tree_space(range_tree_t *rt)
|
||||
{
|
||||
return (rt->rt_space);
|
||||
}
|
||||
|
||||
/* Generic range tree functions for maintaining segments in an AVL tree. */
|
||||
void
|
||||
rt_avl_create(range_tree_t *rt, void *arg)
|
||||
{
|
||||
avl_tree_t *tree = arg;
|
||||
|
||||
avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
|
||||
offsetof(range_seg_t, rs_pp_node));
|
||||
}
|
||||
|
||||
void
|
||||
rt_avl_destroy(range_tree_t *rt, void *arg)
|
||||
{
|
||||
avl_tree_t *tree = arg;
|
||||
|
||||
ASSERT0(avl_numnodes(tree));
|
||||
avl_destroy(tree);
|
||||
}
|
||||
|
||||
void
|
||||
rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
avl_tree_t *tree = arg;
|
||||
avl_add(tree, rs);
|
||||
}
|
||||
|
||||
void
|
||||
rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
avl_tree_t *tree = arg;
|
||||
avl_remove(tree, rs);
|
||||
}
|
||||
|
||||
void
|
||||
rt_avl_vacate(range_tree_t *rt, void *arg)
|
||||
{
|
||||
/*
|
||||
* Normally one would walk the tree freeing nodes along the way.
|
||||
* Since the nodes are shared with the range trees we can avoid
|
||||
* walking all nodes and just reinitialize the avl tree. The nodes
|
||||
* will be freed by the range tree, so we don't want to free them here.
|
||||
*/
|
||||
rt_avl_create(rt, arg);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
range_tree_is_empty(range_tree_t *rt)
|
||||
{
|
||||
|
@ -2035,7 +2035,7 @@ spa_load_verify_done(zio_t *zio)
	}

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	spa->spa_load_verify_ios--;
	cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}
@ -2082,9 +2082,9 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
	size_t size = BP_GET_PSIZE(bp);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
	while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	spa->spa_load_verify_ios++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
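Editorial aside (not part of the diff): spa_load_verify_ios follows the standard ZFS inflight-throttle idiom, where the issuing path blocks on a condition variable while the in-flight count sits at the cap, and each completion decrements the count and broadcasts. A generic sketch of that pattern with hypothetical names (initialization of the mutex/cv omitted):

    static kmutex_t tlock;
    static kcondvar_t tcv;
    static uint64_t inflight;
    static uint64_t max_inflight = 16;    /* illustrative cap */

    static void
    throttle_start(void)
    {
        mutex_enter(&tlock);
        while (inflight >= max_inflight)    /* wait for completions */
            cv_wait(&tcv, &tlock);
        inflight++;
        mutex_exit(&tlock);
    }

    static void
    throttle_done(void)    /* called from the I/O completion path */
    {
        mutex_enter(&tlock);
        inflight--;
        cv_broadcast(&tcv);    /* wake any blocked issuers */
        mutex_exit(&tlock);
    }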
@ -2095,6 +2095,8 @@ spa_init(int mode)
	zpool_feature_init();
	spa_config_load();
	l2arc_start();
	scan_init();
	dsl_scan_global_init();
#ifndef illumos
#ifdef _KERNEL
	zfs_deadman_init();
@ -2119,7 +2121,8 @@ spa_fini(void)
	range_tree_fini();
	unique_fini();
	refcount_fini();

	scan_fini();

	avl_destroy(&spa_namespace_avl);
	avl_destroy(&spa_spare_avl);
	avl_destroy(&spa_l2cache_avl);
@ -2220,6 +2223,7 @@ spa_scan_stat_init(spa_t *spa)
	spa->spa_scan_pass_scrub_pause = 0;
	spa->spa_scan_pass_scrub_spent_paused = 0;
	spa->spa_scan_pass_exam = 0;
	spa->spa_scan_pass_issued = 0;
	vdev_scan_stat_init(spa->spa_root_vdev);
}

@ -2237,18 +2241,20 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)

	/* data stored on disk */
	ps->pss_func = scn->scn_phys.scn_func;
	ps->pss_state = scn->scn_phys.scn_state;
	ps->pss_start_time = scn->scn_phys.scn_start_time;
	ps->pss_end_time = scn->scn_phys.scn_end_time;
	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
	ps->pss_examined = scn->scn_phys.scn_examined;
	ps->pss_to_process = scn->scn_phys.scn_to_process;
	ps->pss_processed = scn->scn_phys.scn_processed;
	ps->pss_errors = scn->scn_phys.scn_errors;
	ps->pss_state = scn->scn_phys.scn_state;

	ps->pss_examined = scn->scn_phys.scn_examined;
	ps->pss_issued =
	    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
	/* data not stored on disk */
	ps->pss_pass_start = spa->spa_scan_pass_start;
	ps->pss_pass_exam = spa->spa_scan_pass_exam;
	ps->pss_pass_issued = spa->spa_scan_pass_issued;
	ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
	ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;

@ -58,11 +58,13 @@ _NOTE(CONSTCOND) } while (0)

typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
    const blkptr_t *bp, arc_buf_t *buf, void *priv);
typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);

/* generic arc_done_func_t's which you can use */
arc_done_func_t arc_bcopy_func;
arc_done_func_t arc_getbuf_func;
arc_read_done_func_t arc_bcopy_func;
arc_read_done_func_t arc_getbuf_func;

typedef enum arc_flags
{
@ -75,35 +77,36 @@ typedef enum arc_flags
	ARC_FLAG_CACHED			= 1 << 3,	/* I/O was in cache */
	ARC_FLAG_L2CACHE		= 1 << 4,	/* cache in L2ARC */
	ARC_FLAG_PREDICTIVE_PREFETCH	= 1 << 5,	/* I/O from zfetch */
	ARC_FLAG_PRESCIENT_PREFETCH	= 1 << 6,	/* long min lifespan */

	/*
	 * Private ARC flags.  These flags are private ARC only flags that
	 * will show up in b_flags in the arc_hdr_buf_t. These flags should
	 * only be set by ARC code.
	 */
	ARC_FLAG_IN_HASH_TABLE		= 1 << 6,	/* buffer is hashed */
	ARC_FLAG_IO_IN_PROGRESS		= 1 << 7,	/* I/O in progress */
	ARC_FLAG_IO_ERROR		= 1 << 8,	/* I/O failed for buf */
	ARC_FLAG_INDIRECT		= 1 << 9,	/* indirect block */
	ARC_FLAG_IN_HASH_TABLE		= 1 << 7,	/* buffer is hashed */
	ARC_FLAG_IO_IN_PROGRESS		= 1 << 8,	/* I/O in progress */
	ARC_FLAG_IO_ERROR		= 1 << 9,	/* I/O failed for buf */
	ARC_FLAG_INDIRECT		= 1 << 10,	/* indirect block */
	/* Indicates that block was read with ASYNC priority. */
	ARC_FLAG_PRIO_ASYNC_READ	= 1 << 10,
	ARC_FLAG_L2_WRITING		= 1 << 11,	/* write in progress */
	ARC_FLAG_L2_EVICTED		= 1 << 12,	/* evicted during I/O */
	ARC_FLAG_L2_WRITE_HEAD		= 1 << 13,	/* head of write list */
	ARC_FLAG_PRIO_ASYNC_READ	= 1 << 11,
	ARC_FLAG_L2_WRITING		= 1 << 12,	/* write in progress */
	ARC_FLAG_L2_EVICTED		= 1 << 13,	/* evicted during I/O */
	ARC_FLAG_L2_WRITE_HEAD		= 1 << 14,	/* head of write list */
	/* indicates that the buffer contains metadata (otherwise, data) */
	ARC_FLAG_BUFC_METADATA		= 1 << 14,
	ARC_FLAG_BUFC_METADATA		= 1 << 15,

	/* Flags specifying whether optional hdr struct fields are defined */
	ARC_FLAG_HAS_L1HDR		= 1 << 15,
	ARC_FLAG_HAS_L2HDR		= 1 << 16,
	ARC_FLAG_HAS_L1HDR		= 1 << 16,
	ARC_FLAG_HAS_L2HDR		= 1 << 17,

	/*
	 * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
	 * This allows the l2arc to use the blkptr's checksum to verify
	 * the data without having to store the checksum in the hdr.
	 */
	ARC_FLAG_COMPRESSED_ARC		= 1 << 17,
	ARC_FLAG_SHARED_DATA		= 1 << 18,
	ARC_FLAG_COMPRESSED_ARC		= 1 << 18,
	ARC_FLAG_SHARED_DATA		= 1 << 19,

	/*
	 * The arc buffer's compression mode is stored in the top 7 bits of the
@ -179,12 +182,12 @@ int arc_referenced(arc_buf_t *buf);
#endif

int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
    arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
    arc_read_done_func_t *done, void *priv, zio_priority_t priority,
    int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
    arc_done_func_t *ready, arc_done_func_t *child_ready,
    arc_done_func_t *physdone, arc_done_func_t *done,
    arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
    arc_write_done_func_t *physdone, arc_write_done_func_t *done,
    void *priv, zio_priority_t priority, int zio_flags,
    const zbookmark_phys_t *zb);
void arc_freed(spa_t *spa, const blkptr_t *bp);
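Editorial aside (not part of the diff): after this change an arc_read() completion callback receives the bookmark and block pointer directly, which lets callers such as the scan code identify the completed block without stashing that context elsewhere. A minimal sketch of a callback matching the new typedef; my_read_done is a hypothetical name:

    /* Hypothetical callback matching the new arc_read_done_func_t. */
    static void
    my_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
        arc_buf_t *buf, void *priv)
    {
        if (zio->io_error == 0 && buf != NULL) {
            /* zb and bp identify exactly which block completed */
        }
        /* release buf when finished; ownership rules are unchanged */
    }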
@ -76,6 +76,7 @@ typedef struct zfs_blkstat {

typedef struct zfs_all_blkstats {
	zfs_blkstat_t	zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
	kmutex_t	zab_lock;
} zfs_all_blkstats_t;

@ -107,24 +107,58 @@ typedef enum dsl_scan_flags {
typedef struct dsl_scan {
	struct dsl_pool *scn_dp;

	boolean_t scn_suspending;
	uint64_t scn_restart_txg;
	uint64_t scn_done_txg;
	uint64_t scn_sync_start_time;
	zio_t *scn_zio_root;
	uint64_t scn_issued_before_pass;

	/* for freeing blocks */
	boolean_t scn_is_bptree;
	boolean_t scn_async_destroying;
	boolean_t scn_async_stalled;
	uint64_t scn_async_block_min_time_ms;
	/* flags and stats for controlling scan state */
	boolean_t scn_is_sorted;	/* doing sequential scan */
	boolean_t scn_clearing;		/* scan is issuing sequential extents */
	boolean_t scn_checkpointing;	/* scan is issuing all queued extents */
	boolean_t scn_suspending;	/* scan is suspending until next txg */
	uint64_t scn_last_checkpoint;	/* time of last checkpoint */

	/* for debugging / information */
	uint64_t scn_visited_this_txg;
	/* members for thread synchronization */
	zio_t *scn_zio_root;		/* root zio for waiting on IO */
	taskq_t *scn_taskq;		/* task queue for issuing extents */

	dsl_scan_phys_t scn_phys;
	/* for controlling scan prefetch, protected by spa_scrub_lock */
	boolean_t scn_prefetch_stop;	/* prefetch should stop */
	zbookmark_phys_t scn_prefetch_bookmark;	/* prefetch start bookmark */
	avl_tree_t scn_prefetch_queue;	/* priority queue of prefetch IOs */
	uint64_t scn_maxinflight_bytes;	/* max bytes in flight for poool */

	/* per txg statistics */
	uint64_t scn_visited_this_txg;	/* total bps visited this txg */
	uint64_t scn_holes_this_txg;
	uint64_t scn_lt_min_this_txg;
	uint64_t scn_gt_max_this_txg;
	uint64_t scn_ddt_contained_this_txg;
	uint64_t scn_objsets_visited_this_txg;
	uint64_t scn_avg_seg_size_this_txg;
	uint64_t scn_segs_this_txg;
	uint64_t scn_avg_zio_size_this_txg;
	uint64_t scn_zios_this_txg;

	/* members needed for syncing scan status to disk */
	dsl_scan_phys_t scn_phys;	/* on disk representation of scan */
	dsl_scan_phys_t scn_phys_cached;
	avl_tree_t scn_queue;		/* queue of datasets to scan */
	uint64_t scn_bytes_pending;	/* outstanding data to issue */
} dsl_scan_t;

typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;

void dsl_scan_global_init(void);

void scan_init(void);
void scan_fini(void);
int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
void dsl_scan_fini(struct dsl_pool *dp);
void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
@ -143,6 +177,9 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
    struct dmu_tx *tx);
boolean_t dsl_scan_active(dsl_scan_t *scn);
boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);

#ifdef __cplusplus
}
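Editorial sketch (not from the diff): with scn_is_sorted set, a scan proceeds in two phases per pass. The traversal phase walks metadata and, instead of issuing reads immediately, records each block's offset and size into a per-vdev dsl_scan_io_queue_t; once enough data is queued, or memory pressure forces a checkpoint, the issuing phase drains each queue in offset order so reads hit the disk sequentially. A rough, syntactically valid illustration in which the four helper functions are hypothetical stand-ins for the real dsl_scan.c logic; only the dsl_scan_t fields are from the struct above:

    extern boolean_t more_metadata_to_visit(dsl_scan_t *);
    extern boolean_t queues_full_or_memory_low(dsl_scan_t *);
    extern void scan_visit_next_block(dsl_scan_t *);         /* enqueue only */
    extern void scan_issue_next_sorted_extent(dsl_scan_t *); /* sequential read */

    static void
    sorted_scan_txg(dsl_scan_t *scn)
    {
        /* Phase 1: traverse metadata, queueing extents per vdev by offset. */
        while (more_metadata_to_visit(scn) && !queues_full_or_memory_low(scn))
            scan_visit_next_block(scn);    /* grows scn_bytes_pending */

        /* Phase 2: drain the queues in LBA order until the checkpoint. */
        scn->scn_clearing = B_TRUE;
        while (scn->scn_bytes_pending > 0)
            scan_issue_next_sorted_extent(scn);
        scn->scn_clearing = B_FALSE;
    }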
@ -51,6 +51,9 @@ typedef struct range_tree {
	range_tree_ops_t *rt_ops;
	void		*rt_arg;

	/* rt_avl_compare should only be set if rt_arg is an AVL tree */
	uint64_t	rt_gap;		/* allowable inter-segment gap */
	int (*rt_avl_compare)(const void *, const void *);
	/*
	 * The rt_histogram maintains a histogram of ranges. Each bucket,
	 * rt_histogram[i], contains the number of ranges whose size is:
@ -64,6 +67,7 @@ typedef struct range_seg {
	avl_node_t	rs_pp_node;	/* AVL picker-private node */
	uint64_t	rs_start;	/* starting offset of this segment */
	uint64_t	rs_end;		/* ending offset (non-inclusive) */
	uint64_t	rs_fill;	/* actual fill if gap mode is on */
} range_seg_t;

struct range_tree_ops {
@ -78,9 +82,14 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);

void range_tree_init(void);
void range_tree_fini(void);
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
    int (*avl_compare)(const void*, const void*), uint64_t gap);
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
    uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
@ -89,10 +98,27 @@ void range_tree_stat_verify(range_tree_t *rt);

void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);

void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
range_seg_t *range_tree_first(range_tree_t *rt);

void rt_avl_create(range_tree_t *rt, void *arg);
void rt_avl_destroy(range_tree_t *rt, void *arg);
void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
void rt_avl_vacate(range_tree_t *rt, void *arg);
extern struct range_tree_ops rt_avl_ops;

void rt_avl_create(range_tree_t *rt, void *arg);
void rt_avl_destroy(range_tree_t *rt, void *arg);
void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
void rt_avl_vacate(range_tree_t *rt, void *arg);
extern struct range_tree_ops rt_avl_ops;

#ifdef __cplusplus
}
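Editorial aside (not part of the diff): rt_gap and rs_fill support "gap mode", where segments added within rt_gap bytes of each other are expected to merge into one logical segment while rs_fill records how many bytes are actually populated; this is what lets the scan coalesce nearby blocks into a single large sequential read. A hypothetical usage sketch, assuming q was created with range_tree_create_impl() using ops/comparator as in the scan code and a 1 MB gap:

    static void
    gap_merge_example(range_tree_t *q)
    {
        range_tree_add(q, 0, 128 << 10);              /* extent A at offset 0 */
        range_tree_add(q, 512 << 10, 128 << 10);      /* extent B, 384 KB away */

        range_seg_t *rs = range_tree_first(q);
        /*
         * Expected with a 1 MB gap: one merged segment covering A..B, where
         * rs->rs_end - rs->rs_start spans 640 KB while rs->rs_fill is 256 KB,
         * the bytes actually present.
         */
        (void) rs;
    }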
@ -257,7 +257,8 @@ struct spa {
	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
	uint64_t	spa_last_io;		/* lbolt of last non-scan I/O */
	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
	uint64_t	spa_scrub_inflight;	/* in-flight scrub bytes */
	uint64_t	spa_load_verify_ios;	/* in-flight verifications IOs */
	kcondvar_t	spa_scrub_io_cv;	/* scrub I/O completion */
	uint8_t		spa_scrub_active;	/* active or suspended? */
	uint8_t		spa_scrub_type;		/* type of scrub we're doing */
@ -268,6 +269,7 @@ struct spa {
	uint64_t	spa_scan_pass_scrub_pause;	/* scrub pause time */
	uint64_t	spa_scan_pass_scrub_spent_paused; /* total paused */
	uint64_t	spa_scan_pass_exam;	/* examined bytes per pass */
	uint64_t	spa_scan_pass_issued;	/* issued bytes per pass */
	kmutex_t	spa_async_lock;		/* protect async state */
	kthread_t	*spa_async_thread;	/* thread doing async task */
	kthread_t	*spa_async_thread_vd;	/* thread doing vd async task */
@ -71,6 +71,7 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
    uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
    int scrub_done);
extern boolean_t vdev_dtl_required(vdev_t *vd);
@ -135,6 +136,7 @@ extern void vdev_queue_init(vdev_t *vd);
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);

@ -71,6 +71,7 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef void	vdev_io_start_func_t(zio_t *zio);
typedef void	vdev_io_done_func_t(zio_t *zio);
typedef void	vdev_state_change_func_t(vdev_t *vd, int, int);
typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
typedef void	vdev_hold_func_t(vdev_t *vd);
typedef void	vdev_rele_func_t(vdev_t *vd);

@ -86,6 +87,7 @@ typedef struct vdev_ops {
	vdev_io_start_func_t		*vdev_op_io_start;
	vdev_io_done_func_t		*vdev_op_io_done;
	vdev_state_change_func_t	*vdev_op_state_change;
	vdev_need_resilver_func_t	*vdev_op_need_resilver;
	vdev_hold_func_t		*vdev_op_hold;
	vdev_rele_func_t		*vdev_op_rele;
	vdev_remap_func_t		*vdev_op_remap;
@ -294,6 +296,13 @@ struct vdev {
	uint64_t	vdev_async_write_queue_depth;
	uint64_t	vdev_max_async_write_queue_depth;

	/*
	 * Protects the vdev_scan_io_queue field itself as well as the
	 * structure's contents (when present).
	 */
	kmutex_t	vdev_scan_io_queue_lock;
	struct dsl_scan_io_queue *vdev_scan_io_queue;

	/*
	 * Leaf vdev state.
	 */
@ -593,6 +593,8 @@ extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);

extern void zio_change_priority(zio_t *pio, zio_priority_t priority);

extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);

@ -559,6 +559,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);

	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
	}
@ -831,6 +833,18 @@ vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * Scan queues are normally destroyed at the end of a scan. If the
	 * queue exists here, that implies the vdev is being removed while
	 * the scan is still running.
	 */
	if (vd->vdev_scan_io_queue != NULL) {
		mutex_enter(&vd->vdev_scan_io_queue_lock);
		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
		vd->vdev_scan_io_queue = NULL;
		mutex_exit(&vd->vdev_scan_io_queue_lock);
	}

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
@ -920,6 +934,7 @@ vdev_free(vdev_t *vd)
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);
	mutex_destroy(&vd->vdev_scan_io_queue_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;
@ -996,6 +1011,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;

	dsl_scan_io_queue_vdev_xfer(svd, tvd);
}

static void
@ -2288,6 +2305,21 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
	return (empty);
}

/*
 * Returns B_TRUE if vdev determines offset needs to be resilvered.
 */
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
	    vd->vdev_ops->vdev_op_leaf)
		return (B_TRUE);

	return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
}

/*
 * Returns the lowest txg in the DTL range.
 */
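Editorial aside (not part of the diff): during a resilver the scanner can now ask the vdev layer whether a block actually needs rebuilding before queuing it, rather than reissuing every block it visits. A hedged sketch of the intended call pattern; scan_block_needs_resilver is a hypothetical wrapper, only the two vdev calls are real:

    static boolean_t
    scan_block_needs_resilver(vdev_t *vd, uint64_t offset, uint64_t phys_birth,
        size_t psize)
    {
        /* Skip blocks whose birth txg falls outside the vdev's missing range. */
        if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
            return (B_FALSE);

        /* Let the vdev (e.g. raidz) decide whether this offset is affected. */
        return (vdev_dtl_need_resilver(vd, offset, psize));
    }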
@ -837,6 +837,7 @@ vdev_ops_t vdev_disk_ops = {
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	NULL,

@ -267,6 +267,7 @@ vdev_ops_t vdev_file_ops = {
	vdev_file_io_start,
	vdev_file_io_done,
	NULL,
	NULL,
	vdev_file_hold,
	vdev_file_rele,
	NULL,
@ -286,6 +287,7 @@ vdev_ops_t vdev_disk_ops = {
	vdev_file_io_start,
	vdev_file_io_done,
	NULL,
	NULL,
	vdev_file_hold,
	vdev_file_rele,
	NULL,

@ -1147,6 +1147,7 @@ vdev_ops_t vdev_geom_ops = {
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	NULL,
	vdev_geom_hold,
	vdev_geom_rele,
	NULL,

@ -1111,6 +1111,7 @@ vdev_ops_t vdev_indirect_ops = {
	NULL,
	NULL,
	NULL,
	NULL,
	vdev_indirect_remap,
	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
	B_FALSE			/* leaf vdev */

@ -722,6 +722,7 @@ vdev_ops_t vdev_mirror_ops = {
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_MIRROR,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
@ -736,6 +737,7 @@ vdev_ops_t vdev_replacing_ops = {
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_REPLACING,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
@ -750,6 +752,7 @@ vdev_ops_t vdev_spare_ops = {
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_SPARE,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

@ -90,6 +90,7 @@ vdev_ops_t vdev_missing_ops = {
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_MISSING,	/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
@ -104,6 +105,7 @@ vdev_ops_t vdev_hole_ops = {
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_HOLE,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
@ -175,7 +175,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
 * we include spans of optional I/Os to aid aggregation at the disk even when
 * they aren't able to help us aggregate at this level.
 */
int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_aggregation_limit = 1 << 20;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;

@ -938,6 +938,48 @@ vdev_queue_io_done(zio_t *zio)
	mutex_exit(&vq->vq_lock);
}

void
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	avl_tree_t *tree;

	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (priority != ZIO_PRIORITY_SYNC_READ &&
		    priority != ZIO_PRIORITY_ASYNC_READ &&
		    priority != ZIO_PRIORITY_SCRUB)
			priority = ZIO_PRIORITY_ASYNC_READ;
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		if (priority != ZIO_PRIORITY_SYNC_WRITE &&
		    priority != ZIO_PRIORITY_ASYNC_WRITE)
			priority = ZIO_PRIORITY_ASYNC_WRITE;
	}

	mutex_enter(&vq->vq_lock);

	/*
	 * If the zio is in none of the queues we can simply change
	 * the priority. If the zio is waiting to be submitted we must
	 * remove it from the queue and re-insert it with the new priority.
	 * Otherwise, the zio is currently active and we cannot change its
	 * priority.
	 */
	tree = vdev_queue_class_tree(vq, zio->io_priority);
	if (avl_find(tree, zio, NULL) == zio) {
		avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
		zio->io_priority = priority;
		avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
	} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
		zio->io_priority = priority;
	}

	mutex_exit(&vq->vq_lock);
}

/*
 * As these three methods are only used for load calculations we're not concerned
 * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex
@ -2584,6 +2584,44 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
@ -2591,6 +2629,7 @@ vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	vdev_raidz_need_resilver,
	NULL,
	NULL,
	NULL,
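Worked example (editorial, not from the diff): with dcols = 6 children, nparity = 1, and ashift = 12 (4 KB sectors), a 16 KB block at offset 0x62000 gives b = 0x62 = 98, s = 4, and f = 98 % 6 = 2, so the data plus parity touch columns 2, 3, 4, 5 and 0; only those children's DTLs determine whether the block is queued for resilver. A small standalone check of the same arithmetic (values are made up):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t dcols = 6, nparity = 1, ashift = 12;
        uint64_t offset = 0x62000, psize = 16384;

        uint64_t b = offset >> ashift;              /* starting parent sector: 98 */
        uint64_t s = ((psize - 1) >> ashift) + 1;   /* sectors in the zio: 4 */
        uint64_t f = b % dcols;                     /* first column: 2 */

        for (uint64_t c = 0; c < s + nparity; c++)
            printf("touches child vdev %llu\n",
                (unsigned long long)((f + c) % dcols));
        return (0);
    }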
@ -150,6 +150,7 @@ vdev_ops_t vdev_root_ops = {
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_ROOT,		/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

@ -1051,7 +1051,7 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
		}
		err = zap_add(os, intoobj, za.za_name,
		    8, 1, &value, tx);
		if (err)
		if (err != 0)
			break;
	}
	zap_cursor_fini(&zc);
@ -41,6 +41,7 @@
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/dsl_scan.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>

@ -438,6 +439,8 @@ zio_walk_children(zio_t *pio, zio_link_t **zl)
{
	list_t *cl = &pio->io_child_list;

	ASSERT(MUTEX_HELD(&pio->io_lock));

	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
	if (*zl == NULL)
		return (NULL);
@ -472,8 +475,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);
	mutex_enter(&cio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

@ -486,8 +489,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
	mutex_exit(&pio->io_lock);
}

static void
@ -496,8 +499,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);
	mutex_enter(&cio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);
@ -505,9 +508,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	mutex_exit(&pio->io_lock);
	kmem_cache_free(zio_link_cache, zl);
}

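Editorial aside (not part of the diff): the reordered mutex_enter/mutex_exit calls above make the parent's io_lock the outer lock and the child's the inner lock on every path, the usual way to avoid lock-order deadlocks when two locks must be held together. Generic illustration with hypothetical locks (initialization omitted):

    static kmutex_t parent_lock;
    static kmutex_t child_lock;

    static void
    link_objects(void)
    {
        mutex_enter(&parent_lock);    /* always taken first (outer) */
        mutex_enter(&child_lock);     /* always taken second (inner) */
        /* ... mutate both structures ... */
        mutex_exit(&child_lock);
        mutex_exit(&parent_lock);
    }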
@ -988,6 +990,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);
	dsl_scan_freed(spa, bp);

	if (zfs_trim_enabled)
		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
@ -1865,14 +1868,16 @@ zio_reexecute(zio_t *pio)
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	zio_link_t *zl = NULL;
	mutex_enter(&pio->io_lock);
	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio, &zl);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
		mutex_enter(&pio->io_lock);
	}
	mutex_exit(&pio->io_lock);

	/*
	 * Now that all children have been reexecuted, execute the parent.
@ -3184,26 +3189,25 @@ zio_vdev_io_start(zio_t *zio)
		}
	}

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 *	- synchronous writes of user data to non-slog devices
	 *	- any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 *	- synchronous writes of user data to non-slog devices
	 *	- any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}
	align = 1ULL << vd->vdev_top->vdev_ashift;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
@ -3352,6 +3356,35 @@ zio_vdev_io_done(zio_t *zio)
	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * This function is used to change the priority of an existing zio that is
 * currently in-flight. This is used by the arc to upgrade priority in the
 * event that a demand read is made for a block that is currently queued
 * as a scrub or async read IO. Otherwise, the high priority read request
 * would end up having to wait for the lower priority IO.
 */
void
zio_change_priority(zio_t *pio, zio_priority_t priority)
{
	zio_t *cio, *cio_next;
	zio_link_t *zl = NULL;

	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);

	if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
		vdev_queue_change_io_priority(pio, priority);
	} else {
		pio->io_priority = priority;
	}

	mutex_enter(&pio->io_lock);
	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio, &zl);
		zio_change_priority(cio, priority);
	}
	mutex_exit(&pio->io_lock);
}

/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
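Editorial sketch (not from the diff): per the comment above, the intended caller is the ARC read path; when a demand read finds a block whose I/O is already queued as a scrub or async read, it can promote the existing zio instead of waiting behind it. Hypothetical helper showing the shape of such a call site; only zio_change_priority() and ZIO_PRIORITY_SYNC_READ are real:

    /*
     * 'head_zio' is assumed to be the zio already issued for this block
     * (e.g. a prefetch or scrub read tracked by the ARC header).
     */
    static void
    promote_queued_read(zio_t *head_zio)
    {
        /* A waiter needs the data now; jump the scrub/async queue. */
        zio_change_priority(head_zio, ZIO_PRIORITY_SYNC_READ);
    }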
@ -760,7 +760,7 @@ typedef struct pool_scan_stat {
	uint64_t	pss_start_time;	/* scan start time */
	uint64_t	pss_end_time;	/* scan end time */
	uint64_t	pss_to_examine;	/* total bytes to scan */
	uint64_t	pss_examined;	/* total examined bytes */
	uint64_t	pss_examined;	/* total bytes located by scanner */
	uint64_t	pss_to_process;	/* total bytes to process */
	uint64_t	pss_processed;	/* total processed bytes */
	uint64_t	pss_errors;	/* scan errors */
@ -771,6 +771,12 @@ typedef struct pool_scan_stat {
	uint64_t	pss_pass_scrub_pause;	/* pause time of a scurb pass */
	/* cumulative time scrub spent paused, needed for rate calculation */
	uint64_t	pss_pass_scrub_spent_paused;

	/* Sorted scrubbing new fields */
	/* Stored on disk */
	uint64_t	pss_issued;	/* total bytes checked by scanner */
	/* Not stored on disk */
	uint64_t	pss_pass_issued;	/* issued bytes per scan pass */
} pool_scan_stat_t;

typedef struct pool_removal_stat {
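Editorial example (not from the diff): with both examined and issued byte counters exposed per pass, userland can report two rates, how fast the scanner is locating blocks and how fast it is actually verifying them, and base its time-remaining estimate on the latter. A hedged sketch of the arithmetic zpool status presumably performs; report_rates and its time handling are hypothetical, only the pss_* fields come from the struct above:

    /* 'now' is the current time in seconds. */
    static void
    report_rates(const pool_scan_stat_t *ps, uint64_t now)
    {
        uint64_t elapsed = now - ps->pss_pass_start -
            ps->pss_pass_scrub_spent_paused;    /* pass seconds, minus pauses */
        if (elapsed == 0)
            elapsed = 1;

        uint64_t scan_rate = ps->pss_pass_exam / elapsed;     /* bytes located/s */
        uint64_t issue_rate = ps->pss_pass_issued / elapsed;  /* bytes checked/s */
        uint64_t remaining = (ps->pss_to_examine > ps->pss_issued) ?
            ps->pss_to_examine - ps->pss_issued : 0;
        uint64_t secs_left = (issue_rate != 0) ? remaining / issue_rate : 0;

        (void) scan_rate; (void) secs_left;    /* printing elided */
    }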
@ -72,6 +72,8 @@ struct proc;
#define	TQ_NOALLOC	0x04	/* cannot allocate memory; may fail */
#define	TQ_FRONT	0x08	/* Put task at the front of the queue */

#define	TASKQID_INVALID	((taskqid_t)0)

#ifdef _KERNEL

extern taskq_t *system_taskq;
@ -91,6 +93,7 @@ void	taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
void	nulltask(void *);
void	taskq_destroy(taskq_t *);
void	taskq_wait(taskq_t *);
void	taskq_wait_id(taskq_t *, taskqid_t);
void	taskq_suspend(taskq_t *);
int	taskq_suspended(taskq_t *);
void	taskq_resume(taskq_t *);
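Editorial sketch (not in the diff): TASKQID_INVALID and taskq_wait_id() give dispatchers a portable way to detect a failed dispatch and to wait for one specific task rather than draining the whole queue. Illustrative use with a hypothetical task function; only the taskq API is from the header above:

    static void
    issue_extents_task(void *arg)
    {
        /* ... issue the queued scan I/Os for one vdev ... */
    }

    static void
    dispatch_and_wait(taskq_t *tq, void *arg)
    {
        taskqid_t id = taskq_dispatch(tq, issue_extents_task, arg, TQ_SLEEP);

        if (id == TASKQID_INVALID)
            issue_extents_task(arg);    /* dispatch failed; run inline */
        else
            taskq_wait_id(tq, id);      /* wait for this task only */
    }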