MFV r286704: 5960 zfs recv should prefetch indirect blocks

5925 zfs receive -o origin=

Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Author: Paul Dagnelie <pcd@delphix.com>

While running 'zfs recv' we noticed that every 128th 8K block required a
read. We were seeing that restore_write() was calling dmu_tx_hold_write()
and the indirect block was not cached. We should prefetch upcoming indirect
blocks to avoid having to go to disk and blocking the restore_write().

Allow an incremental send stream to be received as a clone, even if the
stream does not mark it as a clone.
This commit is contained in:
mav 2015-08-12 22:41:06 +00:00
parent cf4bfabada
commit 99cadf9eed
38 changed files with 1394 additions and 386 deletions

View File

@ -2428,6 +2428,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type; dmu_object_type_t type;
boolean_t is_metadata; boolean_t is_metadata;
if (bp == NULL)
return (0);
if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
char blkbuf[BP_SPRINTF_LEN]; char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
@ -2917,7 +2920,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
avl_index_t where; avl_index_t where;
zdb_ddt_entry_t *zdde, zdde_search; zdb_ddt_entry_t *zdde, zdde_search;
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {

View File

@ -191,11 +191,13 @@
.Nm .Nm
.Cm receive Ns | Ns Cm recv .Cm receive Ns | Ns Cm recv
.Op Fl vnFu .Op Fl vnFu
.Op Fl o Sy origin Ns = Ns Ar snapshot
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Nm .Nm
.Cm receive Ns | Ns Cm recv .Cm receive Ns | Ns Cm recv
.Op Fl vnFu .Op Fl vnFu
.Op Fl d | e .Op Fl d | e
.Op Fl o Sy origin Ns = Ns Ar snapshot
.Ar filesystem .Ar filesystem
.Nm .Nm
.Cm allow .Cm allow
@ -2705,6 +2707,7 @@ feature.
.Nm .Nm
.Cm receive Ns | Ns Cm recv .Cm receive Ns | Ns Cm recv
.Op Fl vnFu .Op Fl vnFu
.Op Fl o Sy origin Ns = Ns Ar snapshot
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Xc .Xc
.It Xo .It Xo
@ -2712,6 +2715,7 @@ feature.
.Cm receive Ns | Ns Cm recv .Cm receive Ns | Ns Cm recv
.Op Fl vnFu .Op Fl vnFu
.Op Fl d | e .Op Fl d | e
.Op Fl o Sy origin Ns = Ns Ar snapshot
.Ar filesystem .Ar filesystem
.Xc .Xc
.Pp .Pp
@ -2796,6 +2800,10 @@ receive operation.
Do not actually receive the stream. This can be useful in conjunction with the Do not actually receive the stream. This can be useful in conjunction with the
.Fl v .Fl v
option to verify the name the receive operation would use. option to verify the name the receive operation would use.
.It Fl o Sy origin Ns = Ns Ar snapshot
Forces the stream to be received as a clone of the given snapshot.
This is only valid if the stream is an incremental stream whose source
is the same as the provided origin.
.It Fl F .It Fl F
Force a rollback of the file system to the most recent snapshot before Force a rollback of the file system to the most recent snapshot before
performing the receive operation. If receiving an incremental replication performing the receive operation. If receiving an incremental replication

View File

@ -264,8 +264,9 @@ get_usage(zfs_help_t idx)
return (gettext("\tpromote <clone-filesystem>\n")); return (gettext("\tpromote <clone-filesystem>\n"));
case HELP_RECEIVE: case HELP_RECEIVE:
return (gettext("\treceive|recv [-vnFu] <filesystem|volume|" return (gettext("\treceive|recv [-vnFu] <filesystem|volume|"
"snapshot>\n" "snapshot>\n"
"\treceive|recv [-vnFu] [-d | -e] <filesystem>\n")); "\treceive|recv [-vnFu] [-o origin=<snapshot>] [-d | -e] "
"<filesystem>\n"));
case HELP_RENAME: case HELP_RENAME:
return (gettext("\trename [-f] <filesystem|volume|snapshot> " return (gettext("\trename [-f] <filesystem|volume|snapshot> "
"<filesystem|volume|snapshot>\n" "<filesystem|volume|snapshot>\n"
@ -791,7 +792,7 @@ zfs_do_create(int argc, char **argv)
nomem(); nomem();
break; break;
case 'o': case 'o':
if (parseprop(props, optarg)) if (parseprop(props, optarg) != 0)
goto error; goto error;
break; break;
case 's': case 's':
@ -3659,7 +3660,7 @@ zfs_do_snapshot(int argc, char **argv)
while ((c = getopt(argc, argv, "ro:")) != -1) { while ((c = getopt(argc, argv, "ro:")) != -1) {
switch (c) { switch (c) {
case 'o': case 'o':
if (parseprop(props, optarg)) if (parseprop(props, optarg) != 0)
return (1); return (1);
break; break;
case 'r': case 'r':
@ -3918,10 +3919,19 @@ zfs_do_receive(int argc, char **argv)
{ {
int c, err; int c, err;
recvflags_t flags = { 0 }; recvflags_t flags = { 0 };
nvlist_t *props;
nvpair_t *nvp = NULL;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
/* check options */ /* check options */
while ((c = getopt(argc, argv, ":denuvF")) != -1) { while ((c = getopt(argc, argv, ":o:denuvF")) != -1) {
switch (c) { switch (c) {
case 'o':
if (parseprop(props, optarg) != 0)
return (1);
break;
case 'd': case 'd':
flags.isprefix = B_TRUE; flags.isprefix = B_TRUE;
break; break;
@ -3966,6 +3976,13 @@ zfs_do_receive(int argc, char **argv)
usage(B_FALSE); usage(B_FALSE);
} }
while ((nvp = nvlist_next_nvpair(props, nvp))) {
if (strcmp(nvpair_name(nvp), "origin") != 0) {
(void) fprintf(stderr, gettext("invalid option"));
usage(B_FALSE);
}
}
if (isatty(STDIN_FILENO)) { if (isatty(STDIN_FILENO)) {
(void) fprintf(stderr, (void) fprintf(stderr,
gettext("Error: Backup stream can not be read " gettext("Error: Backup stream can not be read "
@ -3974,7 +3991,7 @@ zfs_do_receive(int argc, char **argv)
return (1); return (1);
} }
err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL); err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
return (err != 0); return (err != 0);
} }

View File

@ -3586,7 +3586,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
*/ */
n = ztest_random(regions) * stride + ztest_random(width); n = ztest_random(regions) * stride + ztest_random(width);
s = 1 + ztest_random(2 * width - 1); s = 1 + ztest_random(2 * width - 1);
dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
ZIO_PRIORITY_SYNC_READ);
/* /*
* Pick a random index and compute the offsets into packobj and bigobj. * Pick a random index and compute the offsets into packobj and bigobj.
@ -5705,8 +5706,10 @@ ztest_run(ztest_shared_t *zs)
* Right before closing the pool, kick off a bunch of async I/O; * Right before closing the pool, kick off a bunch of async I/O;
* spa_close() should wait for it to complete. * spa_close() should wait for it to complete.
*/ */
for (uint64_t object = 1; object < 50; object++) for (uint64_t object = 1; object < 50; object++) {
dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
ZIO_PRIORITY_SYNC_READ);
}
spa_close(spa, FTAG); spa_close(spa, FTAG);

View File

@ -668,8 +668,8 @@ typedef struct recvflags {
boolean_t nomount; boolean_t nomount;
} recvflags_t; } recvflags_t;
extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *, extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
int, avl_tree_t *); recvflags_t *, int, avl_tree_t *);
typedef enum diff_flags { typedef enum diff_flags {
ZFS_DIFF_PARSEABLE = 0x1, ZFS_DIFF_PARSEABLE = 0x1,

View File

@ -3535,7 +3535,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
} }
static int static int
zbookmark_compare(const void *a, const void *b) zbookmark_mem_compare(const void *a, const void *b)
{ {
return (memcmp(a, b, sizeof (zbookmark_phys_t))); return (memcmp(a, b, sizeof (zbookmark_phys_t)));
} }
@ -3598,7 +3598,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
zc.zc_nvlist_dst_size; zc.zc_nvlist_dst_size;
count -= zc.zc_nvlist_dst_size; count -= zc.zc_nvlist_dst_size;
qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_compare); qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);

View File

@ -64,8 +64,9 @@ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
/* We need to use something for ENODATA. */ /* We need to use something for ENODATA. */
#define ENODATA EIDRM #define ENODATA EIDRM
static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t *, static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *); recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
uint64_t *);
static const zio_cksum_t zero_cksum = { 0 }; static const zio_cksum_t zero_cksum = { 0 };
@ -2498,7 +2499,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
* zfs_receive_one() will take care of it (ie, * zfs_receive_one() will take care of it (ie,
* recv_skip() and return 0). * recv_skip() and return 0).
*/ */
error = zfs_receive_impl(hdl, destname, flags, fd, error = zfs_receive_impl(hdl, destname, NULL, flags, fd,
sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
action_handlep); action_handlep);
if (error == ENODATA) { if (error == ENODATA) {
@ -2631,9 +2632,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
*/ */
static int static int
zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
recvflags_t *flags, dmu_replay_record_t *drr, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
dmu_replay_record_t *drr_noswap, const char *sendfs, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
uint64_t *action_handlep) uint64_t *action_handlep)
{ {
zfs_cmd_t zc = { 0 }; zfs_cmd_t zc = { 0 };
@ -2798,10 +2799,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
} }
if (flags->verbose) if (flags->verbose)
(void) printf("found clone origin %s\n", zc.zc_string); (void) printf("found clone origin %s\n", zc.zc_string);
} else if (originsnap) {
(void) strncpy(zc.zc_string, originsnap, ZFS_MAXNAMELEN);
if (flags->verbose)
(void) printf("using provided clone origin %s\n",
zc.zc_string);
} }
stream_wantsnewfs = (drrb->drr_fromguid == 0 || stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
(drrb->drr_flags & DRR_FLAG_CLONE)); (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap);
if (stream_wantsnewfs) { if (stream_wantsnewfs) {
/* /*
@ -3179,9 +3185,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
} }
static int static int
zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs,
char **top_zfs, int cleanup_fd, uint64_t *action_handlep) nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
uint64_t *action_handlep)
{ {
int err; int err;
dmu_replay_record_t drr, drr_noswap; dmu_replay_record_t drr, drr_noswap;
@ -3200,6 +3207,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
"(%s) does not exist"), tosnap); "(%s) does not exist"), tosnap);
return (zfs_error(hdl, EZFS_NOENT, errbuf)); return (zfs_error(hdl, EZFS_NOENT, errbuf));
} }
if (originsnap &&
!zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs "
"(%s) does not exist"), originsnap);
return (zfs_error(hdl, EZFS_NOENT, errbuf));
}
/* read in the BEGIN record */ /* read in the BEGIN record */
if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
@ -3272,14 +3285,14 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
*cp = '\0'; *cp = '\0';
sendfs = nonpackage_sendfs; sendfs = nonpackage_sendfs;
} }
return (zfs_receive_one(hdl, infd, tosnap, flags, return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags,
&drr, &drr_noswap, sendfs, stream_nv, stream_avl, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs,
top_zfs, cleanup_fd, action_handlep)); cleanup_fd, action_handlep));
} else { } else {
assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
DMU_COMPOUNDSTREAM); DMU_COMPOUNDSTREAM);
return (zfs_receive_package(hdl, infd, tosnap, flags, return (zfs_receive_package(hdl, infd, tosnap, flags, &drr,
&drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); &zcksum, top_zfs, cleanup_fd, action_handlep));
} }
} }
@ -3290,18 +3303,24 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
* (-1 will override -2). * (-1 will override -2).
*/ */
int int
zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
int infd, avl_tree_t *stream_avl) recvflags_t *flags, int infd, avl_tree_t *stream_avl)
{ {
char *top_zfs = NULL; char *top_zfs = NULL;
int err; int err;
int cleanup_fd; int cleanup_fd;
uint64_t action_handle = 0; uint64_t action_handle = 0;
char *originsnap = NULL;
if (props) {
err = nvlist_lookup_string(props, "origin", &originsnap);
if (err && err != ENOENT)
return (err);
}
cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
VERIFY(cleanup_fd >= 0); VERIFY(cleanup_fd >= 0);
err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL, err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL,
stream_avl, &top_zfs, cleanup_fd, &action_handle); stream_avl, &top_zfs, cleanup_fd, &action_handle);
VERIFY(0 == close(cleanup_fd)); VERIFY(0 == close(cleanup_fd));

View File

@ -135,8 +135,18 @@ extern int aok;
/* /*
* DTrace SDT probes have different signatures in userland than they do in * DTrace SDT probes have different signatures in userland than they do in
* kernel. If they're being used in kernel code, re-define them out of * the kernel. If they're being used in kernel code, re-define them out of
* existence for their counterparts in libzpool. * existence for their counterparts in libzpool.
*
* Here's an example of how to use the set-error probes in userland:
* zfs$target:::set-error /arg0 == EBUSY/ {stack();}
*
* Here's an example of how to use DTRACE_PROBE probes in userland:
* If there is a probe declared as follows:
* DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn);
* Then you can use it as follows:
* zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/
* {printf("%u %p\n", arg1, arg2);}
*/ */
#ifdef DTRACE_PROBE #ifdef DTRACE_PROBE

View File

@ -22,7 +22,9 @@
# #
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
# Copyright (c) 2013 by Delphix. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
# Copyright (c) 2012 Joyent, Inc. All rights reserved.
# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved. # Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
# #
# #
@ -36,6 +38,7 @@ ZFS_COMMON_OBJS += \
blkptr.o \ blkptr.o \
bpobj.o \ bpobj.o \
bptree.o \ bptree.o \
bqueue.o \
dbuf.o \ dbuf.o \
ddt.o \ ddt.o \
ddt_zap.o \ ddt_zap.o \

View File

@ -154,7 +154,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
int err; int err;
struct bptree_args *ba = arg; struct bptree_args *ba = arg;
if (BP_IS_HOLE(bp)) if (bp == NULL || BP_IS_HOLE(bp))
return (0); return (0);
err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);

View File

@ -0,0 +1,111 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
#include <sys/bqueue.h>
#include <sys/zfs_context.h>
/*
 * Map a pointer to a queued object onto the bqueue_node_t embedded inside it.
 * The node lives q->bq_node_offset bytes past the start of the object.
 */
static inline bqueue_node_t *
obj2node(bqueue_t *q, void *data)
{
	char *base = data;

	return ((bqueue_node_t *)(base + q->bq_node_offset));
}
/*
 * Set up the blocking queue q with a maximum capacity of size units.
 *
 * Any type stored in a bqueue must embed a bqueue_node_t; node_offset is the
 * byte offset of that member from the start of the containing struct.
 *
 * Returns 0 on success.  The documented failure value is -1, although nothing
 * in this implementation can currently fail.
 */
int
bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
{
	/* Size handed to the list, and where its link sits in each object. */
	size_t obj_size = node_offset + sizeof (bqueue_node_t);
	size_t link_offset = node_offset + offsetof(bqueue_node_t, bqn_node);

	/* The queue starts out empty. */
	q->bq_maxsize = size;
	q->bq_size = 0;
	q->bq_node_offset = node_offset;

	list_create(&q->bq_list, obj_size, link_offset);
	cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);

	return (0);
}
/*
 * Tear down a blocking queue.  The queue must already be drained: this
 * asserts that no capacity is in use.  The caller is also expected to make
 * sure nobody is still blocked on either condition variable.
 */
void
bqueue_destroy(bqueue_t *q)
{
	ASSERT0(q->bq_size);

	/* Release resources in the reverse of the order bqueue_init() made them. */
	mutex_destroy(&q->bq_lock);
	cv_destroy(&q->bq_pop_cv);
	cv_destroy(&q->bq_add_cv);
	list_destroy(&q->bq_list);
}
/*
 * Add data to the tail of q, charging item_size units against the queue's
 * capacity.  If fewer than item_size units are currently free, block until
 * enough has been dequeued.
 *
 * item_size must be greater than 0 and no larger than the queue's maximum
 * size.  An item exactly as large as the queue is legal; it is admitted once
 * the queue is completely empty.
 */
void
bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
{
	ASSERT3U(item_size, >, 0);
	/*
	 * Use <= rather than <: the wait loop below lets an item in whenever
	 * bq_size + item_size <= bq_maxsize, so an item equal to the maximum
	 * size fits in an empty queue and must not trip the assertion.  Only
	 * an item larger than the maximum could never be enqueued.
	 */
	ASSERT3U(item_size, <=, q->bq_maxsize);
	mutex_enter(&q->bq_lock);
	/* Record the charge on the node so that dequeue can give it back. */
	obj2node(q, data)->bqn_size = item_size;
	while (q->bq_size + item_size > q->bq_maxsize) {
		cv_wait(&q->bq_add_cv, &q->bq_lock);
	}
	q->bq_size += item_size;
	list_insert_tail(&q->bq_list, data);
	/* Wake a consumer that may be waiting for the queue to fill. */
	cv_signal(&q->bq_pop_cv);
	mutex_exit(&q->bq_lock);
}
/*
 * Remove and return the element at the head of q.  If the queue holds
 * nothing, sleep until a producer enqueues something.  The capacity that the
 * element was charged when it was enqueued is handed back to the queue.
 */
void *
bqueue_dequeue(bqueue_t *q)
{
	void *head;

	mutex_enter(&q->bq_lock);
	while (q->bq_size == 0) {
		cv_wait(&q->bq_pop_cv, &q->bq_lock);
	}
	head = list_remove_head(&q->bq_list);
	q->bq_size -= obj2node(q, head)->bqn_size;
	mutex_exit(&q->bq_lock);

	/* The producer side is signalled only after the lock is dropped. */
	cv_signal(&q->bq_add_cv);
	return (head);
}
/*
 * Report whether the queue currently has no capacity in use.  The size is
 * read without taking bq_lock.
 */
boolean_t
bqueue_empty(bqueue_t *q)
{
	uint64_t used = q->bq_size;

	return (used == 0);
}

View File

@ -548,11 +548,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
return (abuf); return (abuf);
} }
/*
* Calculate which level n block references the data at the level 0 offset
* provided.
*/
uint64_t uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset) dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{ {
if (dn->dn_datablkshift) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
return (offset >> dn->dn_datablkshift); /*
* The level n blkid is equal to the level 0 blkid divided by
* the number of level 0s in a level n block.
*
* The level 0 blkid is offset >> datablkshift =
* offset / 2^datablkshift.
*
* The number of level 0s in a level n is the number of block
* pointers in an indirect block, raised to the power of level.
* This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
* 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
*
* Thus, the level n blkid is: offset /
* ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
* = offset / 2^(datablkshift + level *
* (indblkshift - SPA_BLKPTRSHIFT))
* = offset >> (datablkshift + level *
* (indblkshift - SPA_BLKPTRSHIFT))
*/
return (offset >> (dn->dn_datablkshift + level *
(dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
} else { } else {
ASSERT3U(offset, <, dn->dn_datablksz); ASSERT3U(offset, <, dn->dn_datablksz);
return (0); return (0);
@ -1715,6 +1739,12 @@ dbuf_clear(dmu_buf_impl_t *db)
dbuf_rele(parent, db); dbuf_rele(parent, db);
} }
/*
* Note: While bpp will always be updated if the function returns success,
* parentp will not be updated if the dnode does not have dn_dbuf filled in;
* this happens when the dnode is the meta-dnode, or a userused or groupused
* object.
*/
static int static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
dmu_buf_impl_t **parentp, blkptr_t **bpp) dmu_buf_impl_t **parentp, blkptr_t **bpp)
@ -1755,7 +1785,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
} else if (level < nlevels-1) { } else if (level < nlevels-1) {
/* this block is referenced from an indirect block */ /* this block is referenced from an indirect block */
int err = dbuf_hold_impl(dn, level+1, int err = dbuf_hold_impl(dn, level+1,
blkid >> epbs, fail_sparse, NULL, parentp); blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
if (err) if (err)
return (err); return (err);
err = dbuf_read(*parentp, NULL, err = dbuf_read(*parentp, NULL,
@ -1930,11 +1960,96 @@ dbuf_destroy(dmu_buf_impl_t *db)
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
} }
void typedef struct dbuf_prefetch_arg {
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) spa_t *dpa_spa; /* The spa to issue the prefetch in. */
zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
int dpa_curlevel; /* The current level that we're reading */
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
} dbuf_prefetch_arg_t;
/*
* Actually issue the prefetch read for the block given.
*/
static void
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
{ {
dmu_buf_impl_t *db = NULL; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
blkptr_t *bp = NULL; return;
arc_flags_t aflags =
dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
ASSERT(dpa->dpa_zio != NULL);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &dpa->dpa_zb);
}
/*
 * Called when an indirect block above our prefetch target is read in. This
 * will either read in the next indirect block down the tree or issue the actual
 * prefetch if the next block down is our target.
 *
 * zio is the read that produced the block; the code tolerates it being NULL.
 * abuf holds the contents of the indirect block at dpa_curlevel. private is
 * the dbuf_prefetch_arg_t describing the prefetch; it is freed here on every
 * path that ends the chain and handed to the next arc_read() otherwise.
 */
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
/* We must still be strictly above the target, at an indirect level. */
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
ASSERT3S(dpa->dpa_curlevel, >, 0);
/* NOTE(review): zio is presumably NULL when no I/O was needed - confirm. */
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
}
/*
 * Step down one level. From here on dpa_curlevel is the level of the
 * block pointer we are about to pick out of abuf, not the level of abuf.
 */
dpa->dpa_curlevel--;
/*
 * blkid, at the new current level, of the block on the path to the
 * target: the target's blkid shifted right by dpa_epbs bits for each
 * level that still separates us from it.
 */
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
/* Index into this indirect block's blkptr array (blkid mod 2^epbs). */
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
/*
 * NOTE(review): bp is taken from abuf->b_data and BP_IS_HOLE(bp) is
 * evaluated before zio->io_error is consulted - confirm the buffer is
 * safe to inspect when the read failed.
 */
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
/* Nothing below us to read, or the read failed: end the chain. */
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
/* bp is the target itself: issue the real prefetch and finish. */
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
kmem_free(dpa, sizeof (*dpa));
} else {
/*
 * Still above the target: read the next indirect block down with this
 * function as its callback, passing dpa along. dpa is not freed here.
 */
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
/*
 * Drop the reference on the buffer we were handed. private is passed
 * only as a pointer value here; dpa may already have been freed above.
 */
(void) arc_buf_remove_ref(abuf, private);
}
/*
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
* complete.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
arc_flags_t aflags)
{
blkptr_t bp;
int epbs, nlevels, curlevel;
uint64_t curblkid;
ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@ -1942,35 +2057,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
if (dnode_block_freed(dn, blkid)) if (dnode_block_freed(dn, blkid))
return; return;
/* dbuf_find() returns with db_mtx held */ /*
if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) { * This dnode hasn't been written to disk yet, so there's nothing to
/* * prefetch.
* This dbuf is already in the cache. We assume that */
* it is already CACHED, or else about to be either nlevels = dn->dn_phys->dn_nlevels;
* read or filled. if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
*/ return;
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
return;
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
level, blkid);
if (db != NULL) {
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
/*
* This dbuf already exists. It is either CACHED, or
* (we assume) about to be read or filled.
*/
return; return;
} }
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { /*
if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { * Find the closest ancestor (indirect block) of the target block
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; * that is present in the cache. In this indirect block, we will
arc_flags_t aflags = * find the bp that is at curlevel, curblkid.
ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; */
zbookmark_phys_t zb; curlevel = level;
curblkid = blkid;
while (curlevel < nlevels - 1) {
int parent_level = curlevel + 1;
uint64_t parent_blkid = curblkid >> epbs;
dmu_buf_impl_t *db;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, if (dbuf_hold_impl(dn, parent_level, parent_blkid,
dn->dn_object, 0, blkid); FALSE, TRUE, FTAG, &db) == 0) {
blkptr_t *bpp = db->db_buf->b_data;
(void) arc_read(NULL, dn->dn_objset->os_spa, bp = bpp[P2PHASE(curblkid, 1 << epbs)];
bp, NULL, NULL, prio, dbuf_rele(db, FTAG);
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, break;
&aflags, &zb);
} }
if (db)
dbuf_rele(db, NULL); curlevel = parent_level;
curblkid = parent_blkid;
} }
if (curlevel == nlevels - 1) {
/* No cached indirect blocks found. */
ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
bp = dn->dn_phys->dn_blkptr[curblkid];
}
if (BP_IS_HOLE(&bp))
return;
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
ZIO_FLAG_CANFAIL);
dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, level, blkid);
dpa->dpa_curlevel = curlevel;
dpa->dpa_prio = prio;
dpa->dpa_aflags = aflags;
dpa->dpa_spa = dn->dn_objset->os_spa;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
/*
* If we have the indirect just above us, no need to do the asynchronous
* prefetch chain; we'll just run the last step ourselves. If we're at
* a higher level, though, we want to issue the prefetches for all the
* indirect blocks asynchronously, so we can go on with whatever we were
* doing.
*/
if (curlevel == level) {
ASSERT3U(curblkid, ==, blkid);
dbuf_issue_final_prefetch(dpa, &bp);
kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, curlevel, curblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
&bp, dbuf_prefetch_indirect_done, dpa, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
/*
* We use pio here instead of dpa_zio since it's possible that
* dpa may have already been freed.
*/
zio_nowait(pio);
} }
/* /*
@ -1978,7 +2162,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
* Note: dn_struct_rwlock must be held. * Note: dn_struct_rwlock must be held.
*/ */
int int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp) void *tag, dmu_buf_impl_t **dbp)
{ {
dmu_buf_impl_t *db, *parent = NULL; dmu_buf_impl_t *db, *parent = NULL;
@ -1996,6 +2181,9 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
blkptr_t *bp = NULL; blkptr_t *bp = NULL;
int err; int err;
if (fail_uncached)
return (SET_ERROR(ENOENT));
ASSERT3P(parent, ==, NULL); ASSERT3P(parent, ==, NULL);
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
if (fail_sparse) { if (fail_sparse) {
@ -2012,6 +2200,11 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
db = dbuf_create(dn, level, blkid, parent, bp); db = dbuf_create(dn, level, blkid, parent, bp);
} }
if (fail_uncached && db->db_state != DB_CACHED) {
mutex_exit(&db->db_mtx);
return (SET_ERROR(ENOENT));
}
if (db->db_buf && refcount_is_zero(&db->db_holds)) { if (db->db_buf && refcount_is_zero(&db->db_holds)) {
arc_buf_add_ref(db->db_buf, db); arc_buf_add_ref(db->db_buf, db);
if (db->db_buf->b_data == NULL) { if (db->db_buf->b_data == NULL) {
@ -2067,16 +2260,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
dmu_buf_impl_t * dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{ {
dmu_buf_impl_t *db; return (dbuf_hold_level(dn, 0, blkid, tag));
int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
return (err ? NULL : db);
} }
dmu_buf_impl_t * dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{ {
dmu_buf_impl_t *db; dmu_buf_impl_t *db;
int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
return (err ? NULL : db); return (err ? NULL : db);
} }
@ -2429,8 +2620,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
if (parent == NULL) { if (parent == NULL) {
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
(void) dbuf_hold_impl(dn, db->db_level+1, parent = dbuf_hold_level(dn, db->db_level + 1,
db->db_blkid >> epbs, FALSE, db, &parent); db->db_blkid >> epbs, db);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
db->db_parent = parent; db->db_parent = parent;

View File

@ -141,7 +141,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
err = dnode_hold(os, object, FTAG, &dn); err = dnode_hold(os, object, FTAG, &dn);
if (err) if (err)
return (err); return (err);
blkid = dbuf_whichblock(dn, offset); blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag); db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
@ -424,7 +424,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, offset); blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) { for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
if (db == NULL) { if (db == NULL) {
@ -528,17 +528,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
} }
/* /*
* Issue prefetch i/os for the given blocks. * Issue prefetch i/os for the given blocks. If level is greater than 0, the
* indirect blocks prefetched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
* *
* Note: The assumption is that we *know* these blocks will be needed * Note that if the indirect blocks above the blocks being prefetched are not in
* almost immediately. Therefore, the prefetch i/os will be issued at * cache, they will be asynchronously read in.
* ZIO_PRIORITY_SYNC_READ
*
* Note: indirect blocks and other metadata will be read synchronously,
* causing this function to block if they are not already cached.
*/ */
void void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{ {
dnode_t *dn; dnode_t *dn;
uint64_t blkid; uint64_t blkid;
@ -554,8 +553,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return; return;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); blkid = dbuf_whichblock(dn, level,
dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); object * sizeof (dnode_phys_t));
dbuf_prefetch(dn, level, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
return; return;
} }
@ -570,18 +570,24 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return; return;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) { /*
int blkshift = dn->dn_datablkshift; * offset + len - 1 is the last byte we want to prefetch for, and offset
nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is the
P2ALIGN(offset, 1 << blkshift)) >> blkshift; * last block we want to prefetch, and dbuf_whichblock(dn, level,
* offset) is the first. Then the number we need to prefetch is the
* last - first + 1.
*/
if (level > 0 || dn->dn_datablkshift != 0) {
nblks = dbuf_whichblock(dn, level, offset + len - 1) -
dbuf_whichblock(dn, level, offset) + 1;
} else { } else {
nblks = (offset < dn->dn_datablksz); nblks = (offset < dn->dn_datablksz);
} }
if (nblks != 0) { if (nblks != 0) {
blkid = dbuf_whichblock(dn, offset); blkid = dbuf_whichblock(dn, level, offset);
for (int i = 0; i < nblks; i++) for (int i = 0; i < nblks; i++)
dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); dbuf_prefetch(dn, level, blkid + i, pri, 0);
} }
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
@ -1393,7 +1399,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
DB_DNODE_ENTER(dbuf); DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf); dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, offset); blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(dbuf); DB_DNODE_EXIT(dbuf);

View File

@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (issig(JUSTLOOKING) && issig(FORREAL)) if (issig(JUSTLOOKING) && issig(FORREAL))
return (SET_ERROR(EINTR)); return (SET_ERROR(EINTR));
if (zb->zb_object != DMU_META_DNODE_OBJECT) if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
return (0); return (0);
if (BP_IS_HOLE(bp)) { if (BP_IS_HOLE(bp)) {

View File

@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (0); return (0);
} }
/*
* Return (in *objectp) the next object which is allocated (or a hole)
* after *object, taking into account only objects that may have been modified
* after the specified txg.
*/
int int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{ {

File diff suppressed because it is too large Load Diff

View File

@ -158,7 +158,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* If we already visited this bp & everything below, * If we already visited this bp & everything below,
* don't bother doing it again. * don't bother doing it again.
*/ */
if (zbookmark_is_before(dnp, zb, td->td_resume)) if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
return (RESUME_SKIP_ALL); return (RESUME_SKIP_ALL);
/* /*
@ -425,6 +425,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
int j, err = 0; int j, err = 0;
zbookmark_phys_t czb; zbookmark_phys_t czb;
if (td->td_flags & TRAVERSE_PRE) {
SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
ZB_DNODE_BLKID);
err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0);
if (err != 0)
return (err);
}
for (j = 0; j < dnp->dn_nblkptr; j++) { for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
@ -432,10 +443,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
break; break;
} }
if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
} }
if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
ZB_DNODE_BLKID);
err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0);
if (err != 0)
return (err);
}
return (err); return (err);
} }
@ -448,6 +470,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT(pfd->pd_bytes_fetched >= 0); ASSERT(pfd->pd_bytes_fetched >= 0);
if (bp == NULL)
return (0);
if (pfd->pd_cancel) if (pfd->pd_cancel)
return (SET_ERROR(EINTR)); return (SET_ERROR(EINTR));

View File

@ -315,7 +315,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
dmu_buf_impl_t *db; dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); err = dbuf_hold_impl(dn, 0, start,
FALSE, FALSE, FTAG, &db);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
if (err) { if (err) {
@ -516,7 +517,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkoff = P2PHASE(blkid, epb); blkoff = P2PHASE(blkid, epb);
tochk = MIN(epb - blkoff, nblks); tochk = MIN(epb - blkoff, nblks);
err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); err = dbuf_hold_impl(dn, 1, blkid >> epbs,
FALSE, FALSE, FTAG, &dbuf);
if (err) { if (err) {
txh->txh_tx->tx_err = err; txh->txh_tx->tx_err = err;
break; break;

View File

@ -305,7 +305,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
for (i = 0; i < fetchsz; i++) { for (i = 0; i < fetchsz; i++) {
dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
ARC_FLAG_PREFETCH);
} }
return (fetchsz); return (fetchsz);

View File

@ -1116,7 +1116,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
drop_struct_lock = TRUE; drop_struct_lock = TRUE;
} }
blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
db = dbuf_hold(mdn, blk, FTAG); db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock) if (drop_struct_lock)
@ -1413,7 +1413,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
goto fail; goto fail;
/* resize the old block */ /* resize the old block */
err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0) if (err == 0)
dbuf_new_size(db, size, tx); dbuf_new_size(db, size, tx);
else if (err != ENOENT) else if (err != ENOENT)
@ -1586,8 +1586,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
ASSERT3U(blkoff + head, ==, blksz); ASSERT3U(blkoff + head, ==, blksz);
if (len < head) if (len < head)
head = len; head = len;
if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
FTAG, &db) == 0) { TRUE, FALSE, FTAG, &db) == 0) {
caddr_t data; caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */ /* don't dirty if it isn't on disk and isn't dirty */
@ -1624,8 +1624,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (tail) { if (tail) {
if (len < tail) if (len < tail)
tail = len; tail = len;
if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
TRUE, FTAG, &db) == 0) { TRUE, FALSE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */ /* don't dirty if not on disk and not dirty */
if (db->db_last_dirty || if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
@ -1854,7 +1854,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
*/ */
static int static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
int lvl, uint64_t blkfill, uint64_t txg) int lvl, uint64_t blkfill, uint64_t txg)
{ {
dmu_buf_impl_t *db = NULL; dmu_buf_impl_t *db = NULL;
void *data = NULL; void *data = NULL;
@ -1876,8 +1876,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
epb = dn->dn_phys->dn_nblkptr; epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr; data = dn->dn_phys->dn_blkptr;
} else { } else {
uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
if (error) { if (error) {
if (error != ENOENT) if (error != ENOENT)
return (error); return (error);

View File

@ -188,7 +188,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, db->db_level-1, err = dbuf_hold_impl(dn, db->db_level-1,
(db->db_blkid << epbs) + i, TRUE, FTAG, &child); (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT) if (err == ENOENT)
continue; continue;
@ -284,7 +284,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
continue; continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
i, B_TRUE, FTAG, &subdb)); i, TRUE, FALSE, FTAG, &subdb));
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr); ASSERT3P(bp, ==, subdb->db_blkptr);
@ -357,7 +357,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
continue; continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
TRUE, FTAG, &db)); TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
free_children(db, blkid, nblks, tx); free_children(db, blkid, nblks, tx);

View File

@ -540,6 +540,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
const char *snapname; const char *snapname;
uint64_t obj; uint64_t obj;
int err = 0; int err = 0;
dsl_dataset_t *ds;
err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
if (err != 0) if (err != 0)
@ -548,36 +549,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
ASSERT(dsl_pool_config_held(dp)); ASSERT(dsl_pool_config_held(dp));
obj = dsl_dir_phys(dd)->dd_head_dataset_obj; obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0) if (obj != 0)
err = dsl_dataset_hold_obj(dp, obj, tag, dsp); err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
else else
err = SET_ERROR(ENOENT); err = SET_ERROR(ENOENT);
/* we may be looking for a snapshot */ /* we may be looking for a snapshot */
if (err == 0 && snapname != NULL) { if (err == 0 && snapname != NULL) {
dsl_dataset_t *ds; dsl_dataset_t *snap_ds;
if (*snapname++ != '@') { if (*snapname++ != '@') {
dsl_dataset_rele(*dsp, tag); dsl_dataset_rele(ds, tag);
dsl_dir_rele(dd, FTAG); dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT)); return (SET_ERROR(ENOENT));
} }
dprintf("looking for snapshot '%s'\n", snapname); dprintf("looking for snapshot '%s'\n", snapname);
err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); err = dsl_dataset_snap_lookup(ds, snapname, &obj);
if (err == 0) if (err == 0)
err = dsl_dataset_hold_obj(dp, obj, tag, &ds); err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
dsl_dataset_rele(*dsp, tag); dsl_dataset_rele(ds, tag);
if (err == 0) { if (err == 0) {
mutex_enter(&ds->ds_lock); mutex_enter(&snap_ds->ds_lock);
if (ds->ds_snapname[0] == 0) if (snap_ds->ds_snapname[0] == 0)
(void) strlcpy(ds->ds_snapname, snapname, (void) strlcpy(snap_ds->ds_snapname, snapname,
sizeof (ds->ds_snapname)); sizeof (snap_ds->ds_snapname));
mutex_exit(&ds->ds_lock); mutex_exit(&snap_ds->ds_lock);
*dsp = ds; ds = snap_ds;
} }
} }
if (err == 0)
*dsp = ds;
dsl_dir_rele(dd, FTAG); dsl_dir_rele(dd, FTAG);
return (err); return (err);
} }

View File

@ -552,7 +552,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg; struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx; dmu_tx_t *tx = ka->tx;
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) { if (zb->zb_level == ZB_ZIL_LEVEL) {

View File

@ -600,7 +600,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
* If we already visited this bp & everything below (in * If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again. * a prior txg sync), don't bother doing it again.
*/ */
if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) if (zbookmark_subtree_completed(dnp, zb,
&scn->scn_phys.scn_bookmark))
return (B_TRUE); return (B_TRUE);
/* /*

View File

@ -1943,7 +1943,7 @@ static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{ {
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
/* /*
* Note: normally this routine will not be called if * Note: normally this routine will not be called if

View File

@ -80,8 +80,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
mutex_exit(sm->sm_lock); mutex_exit(sm->sm_lock);
if (end > bufsize) { if (end > bufsize) {
dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize, dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
end - bufsize); end - bufsize, ZIO_PRIORITY_SYNC_READ);
} }
mutex_enter(sm->sm_lock); mutex_enter(sm->sm_lock);

View File

@ -0,0 +1,54 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
#ifndef _BQUEUE_H
#define _BQUEUE_H
#ifdef __cplusplus
extern "C" {
#endif
#include <sys/zfs_context.h>
/*
* Queue descriptor used by the bqueue_*() routines declared below.
* NOTE(review): the per-field notes are inferred from the field names and
* types only; the implementation is not visible from this header -- confirm
* against bqueue.c before relying on them.
*/
typedef struct bqueue {
list_t bq_list; /* presumably the list of queued elements */
kmutex_t bq_lock; /* presumably protects the fields of this struct */
kcondvar_t bq_add_cv; /* presumably signalled when an element is added */
kcondvar_t bq_pop_cv; /* presumably signalled when an element is removed */
uint64_t bq_size; /* presumably the current total size of queued elements */
uint64_t bq_maxsize; /* presumably the limit on bq_size */
size_t bq_node_offset; /* presumably offset of the bqueue_node_t in an element */
} bqueue_t;
/*
* Per-element linkage for a bqueue_t.
* NOTE(review): presumably embedded in each queued object at the offset
* recorded in bq_node_offset -- confirm against bqueue.c.
*/
typedef struct bqueue_node {
list_node_t bqn_node; /* list linkage */
uint64_t bqn_size; /* presumably the size charged to the queue for this element */
} bqueue_node_t;
int bqueue_init(bqueue_t *, uint64_t, size_t);
void bqueue_destroy(bqueue_t *);
void bqueue_enqueue(bqueue_t *, void *, uint64_t);
void *bqueue_dequeue(bqueue_t *);
boolean_t bqueue_empty(bqueue_t *);
#ifdef __cplusplus
}
#endif
#endif /* _BQUEUE_H */

View File

@ -245,8 +245,7 @@ typedef struct dbuf_hash_table {
kmutex_t hash_mutexes[DBUF_MUTEXES]; kmutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t; } dbuf_hash_table_t;
uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
void dbuf_create_bonus(struct dnode *dn); void dbuf_create_bonus(struct dnode *dn);
@ -258,10 +257,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
void *tag); void *tag);
int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp); void *tag, dmu_buf_impl_t **dbp);
void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
zio_priority_t prio, arc_flags_t aflags);
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,

View File

@ -45,6 +45,7 @@
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/cred.h> #include <sys/cred.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#include <sys/zio_priority.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -748,8 +749,8 @@ extern int zfs_max_recordsize;
/* /*
* Asynchronously try to read in the data. * Asynchronously try to read in the data.
*/ */
void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len); uint64_t len, enum zio_priority pri);
typedef struct dmu_object_info { typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */ /* All sizes are in bytes unless otherwise indicated. */

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.

View File

@ -29,6 +29,7 @@
#ifndef _ZIO_H #ifndef _ZIO_H
#define _ZIO_H #define _ZIO_H
#include <sys/zio_priority.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/txg.h> #include <sys/txg.h>
@ -144,18 +145,6 @@ enum zio_compress {
#define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2 #define ZIO_FAILURE_MODE_PANIC 2
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued I/Os (e.g. ioctl) */
} zio_priority_t;
enum zio_flag { enum zio_flag {
/* /*
* Flags inherited by gang, ddt, and vdev children, * Flags inherited by gang, ddt, and vdev children,
@ -260,6 +249,7 @@ extern const char *zio_type_name[ZIO_TYPES];
* Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
* ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
* dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
* dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
* *
* Note: this structure is called a bookmark because its original purpose * Note: this structure is called a bookmark because its original purpose
* was to remember where to resume a pool-wide traverse. * was to remember where to resume a pool-wide traverse.
@ -292,6 +282,9 @@ typedef struct zbookmark_phys {
#define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_OBJECT (0ULL)
#define ZB_ZIL_LEVEL (-2LL) #define ZB_ZIL_LEVEL (-2LL)
#define ZB_DNODE_LEVEL (-3LL)
#define ZB_DNODE_BLKID (0ULL)
#define ZB_IS_ZERO(zb) \ #define ZB_IS_ZERO(zb) \
((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
(zb)->zb_level == 0 && (zb)->zb_blkid == 0) (zb)->zb_level == 0 && (zb)->zb_blkid == 0)
@ -633,8 +626,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
extern void spa_handle_ignored_writes(spa_t *spa); extern void spa_handle_ignored_writes(spa_t *spa);
/* zbookmark_phys functions */ /* zbookmark_phys functions */
boolean_t zbookmark_is_before(const struct dnode_phys *dnp, boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -44,7 +44,7 @@ typedef struct zio_checksum_info {
zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
int ci_correctable; /* number of correctable bits */ int ci_correctable; /* number of correctable bits */
int ci_eck; /* uses zio embedded checksum? */ int ci_eck; /* uses zio embedded checksum? */
int ci_dedup; /* strong enough for dedup? */ boolean_t ci_dedup; /* strong enough for dedup? */
char *ci_name; /* descriptive name */ char *ci_name; /* descriptive name */
} zio_checksum_info_t; } zio_checksum_info_t;

View File

@ -0,0 +1,41 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
#ifndef _ZIO_PRIORITY_H
#define _ZIO_PRIORITY_H
#ifdef __cplusplus
extern "C" {
#endif
/*
* I/O priority classes. The enumerators listed before
* ZIO_PRIORITY_NUM_QUEUEABLE are the queueable classes, so that enumerator's
* value equals their count; ZIO_PRIORITY_NOW follows it and marks i/os that
* are not queued.
*/
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
ZIO_PRIORITY_NUM_QUEUEABLE, /* number of queueable classes above */
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
#ifdef __cplusplus
}
#endif
#endif /* _ZIO_PRIORITY_H */

View File

@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk; tbl->zt_nextblk = newblk;
ASSERT0(tbl->zt_blks_copied); ASSERT0(tbl->zt_blks_copied);
dmu_prefetch(zap->zap_objset, zap->zap_object, dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
tbl->zt_blk << bs, tbl->zt_numblks << bs); tbl->zt_blk << bs, tbl->zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
} }
/* /*
@ -939,7 +940,8 @@ fzap_prefetch(zap_name_t *zn)
if (zap_idx_to_blk(zap, idx, &blk) != 0) if (zap_idx_to_blk(zap, idx, &blk) != 0)
return; return;
bs = FZAP_BLOCK_SHIFT(zap); bs = FZAP_BLOCK_SHIFT(zap);
dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
ZIO_PRIORITY_SYNC_READ);
} }
/* /*
@ -1310,9 +1312,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
} else { } else {
int b; int b;
dmu_prefetch(zap->zap_objset, zap->zap_object, dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
b++) { b++) {

View File

@ -22,7 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/ */
/* Portions Copyright 2010 Robert Milkowski */ /* Portions Copyright 2010 Robert Milkowski */
@ -950,7 +950,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
&sa_obj); &sa_obj);
if (error) if (error)
return (error); goto out;
} else { } else {
/* /*
* Pre SA versions file systems should never touch * Pre SA versions file systems should never touch

View File

@ -2675,7 +2675,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
/* Prefetch znode */ /* Prefetch znode */
if (prefetch) if (prefetch)
dmu_prefetch(os, objnum, 0, 0); dmu_prefetch(os, objnum, 0, 0, 0,
ZIO_PRIORITY_SYNC_READ);
skip_entry: skip_entry:
/* /*

View File

@ -94,6 +94,9 @@ extern vmem_t *zio_alloc_arena;
#define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101 #define ZIO_PIPELINE_STOP 0x101
#define BP_SPANB(indblkshift, level) \
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define COMPARE_META_LEVEL 0x80000000ul
/* /*
* The following actions directly effect the spa's sync-to-convergence logic. * The following actions directly effect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action. * The values below define the sync pass when we start performing the action.
@ -3461,37 +3464,127 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_done zio_done
}; };
/* dnp is the dnode for zb1->zb_object */
boolean_t
zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
const zbookmark_phys_t *zb2)
{
uint64_t zb1nextL0, zb2thisobj;
ASSERT(zb1->zb_objset == zb2->zb_objset);
ASSERT(zb2->zb_level == 0);
/*
* Compare two zbookmark_phys_t's to see which we would reach first in a
* pre-order traversal of the object tree.
*
* This is simple in every case aside from the meta-dnode object. For all other
* objects, we traverse them in order (object 1 before object 2, and so on).
* However, all of these objects are traversed while traversing object 0, since
* the data it points to is the list of objects. Thus, we need to convert to a
* canonical representation so we can compare meta-dnode bookmarks to
* non-meta-dnode bookmarks.
*
* We do this by calculating "equivalents" for each field of the zbookmark.
* zbookmarks outside of the meta-dnode use their own object and level, and
* calculate the level 0 equivalent (the first L0 blkid that is contained in the
* blocks this bookmark refers to) by multiplying their blkid by their span
* (the number of L0 blocks contained within one block at their level).
* zbookmarks inside the meta-dnode calculate their object equivalent
* (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
* level + 1<<31 (any value larger than a level could ever be) for their level.
* This causes them to always compare before a bookmark in their object
* equivalent, compare appropriately to bookmarks in other objects, and to
* compare appropriately to other bookmarks in the meta-dnode.
*
* dbss1/ibs1 describe the object zb1 refers to and dbss2/ibs2 the object zb2
* refers to: dbssN is the data block size in sectors (consulted only when zbN
* is a meta-dnode bookmark) and ibsN is the indirect block shift used to
* compute the span of zbN.
*
* Returns -1 if zb1 is reached before zb2, 1 if zb1 is reached after zb2, and
* 0 if the two compare equal.
*/
int
zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
{
/*
* These variables represent the "equivalent" values for the zbookmark,
* after converting zbookmarks inside the meta dnode to their
* normal-object equivalents.
*/
uint64_t zb1obj, zb2obj;
uint64_t zb1L0, zb2L0;
uint64_t zb1level, zb2level;
/* Identical bookmarks need no canonicalization. */
if (zb1->zb_object == zb2->zb_object &&
zb1->zb_level == zb2->zb_level &&
zb1->zb_blkid == zb2->zb_blkid)
return (0);
/*
* BP_SPANB calculates the span in blocks.
*/
zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
zb1L0 = 0;
zb1level = zb1->zb_level + COMPARE_META_LEVEL;
} else {
zb1obj = zb1->zb_object;
zb1level = zb1->zb_level;
}
if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
zb2L0 = 0;
zb2level = zb2->zb_level + COMPARE_META_LEVEL;
} else {
zb2obj = zb2->zb_object;
zb2level = zb2->zb_level;
}
/* Now that we have a canonical representation, do the comparison. */
if (zb1obj != zb2obj)
return (zb1obj < zb2obj ? -1 : 1);
else if (zb1L0 != zb2L0)
return (zb1L0 < zb2L0 ? -1 : 1);
/*
* Same object and same first L0 block: the bookmark with the larger
* level equivalent sorts first, matching a pre-order traversal.
*/
else if (zb1level != zb2level)
return (zb1level > zb2level ? -1 : 1);
/*
* This can (theoretically) happen if the bookmarks have the same object
* and level, but different blkids, if the block sizes are not the same.
* There is presently no way to change the indirect block sizes.
*/
return (0);
}
/*
* This function checks the following: given that last_block is the place that
* our traversal stopped last time, does that guarantee that we've visited
* every node under subtree_root? Therefore, we can't just use the raw output
* of zbookmark_compare. We have to pass in a modified version of
* subtree_root; by incrementing the block id, and then checking whether
* last_block is before or equal to that, we can tell whether or not having
* visited last_block implies that all of subtree_root's children have been
* visited.
*/
boolean_t
zbookmark_subtree_completed(const dnode_phys_t *dnp,
const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
{
zbookmark_phys_t mod_zb = *subtree_root;
mod_zb.zb_blkid++;
ASSERT(last_block->zb_level == 0);
/* The objset_phys_t isn't before anything. */ /* The objset_phys_t isn't before anything. */
if (dnp == NULL) if (dnp == NULL)
return (B_FALSE); return (B_FALSE);
zb1nextL0 = (zb1->zb_blkid + 1) << /*
((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
* data block size in sectors, because that variable is only used if
zb2thisobj = zb2->zb_object ? zb2->zb_object : * the bookmark refers to a block in the meta-dnode. Since we don't
zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); * know without examining it what object it refers to, and there's no
* harm in passing in this value in other cases, we always pass it in.
if (zb1->zb_object == DMU_META_DNODE_OBJECT) { *
uint64_t nextobj = zb1nextL0 * * We pass in 0 for the indirect block size shift because zb2 must be
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; * level 0. The indirect block size is only used to calculate the span
return (nextobj <= zb2thisobj); * of the bookmark, but since the bookmark must be level 0, the span is
} * always 1, so the math works out.
*
if (zb1->zb_object < zb2thisobj) * If you make changes to how the zbookmark_compare code works, be sure
return (B_TRUE); * to make sure that this code still works afterwards.
if (zb1->zb_object > zb2thisobj) */
return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
if (zb2->zb_object == DMU_META_DNODE_OBJECT) 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
return (B_FALSE); last_block) <= 0);
return (zb1nextL0 <= zb2->zb_blkid);
} }

View File

@ -358,7 +358,7 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zvol_extent_t *ze; zvol_extent_t *ze;
int bs = ma->ma_zv->zv_volblocksize; int bs = ma->ma_zv->zv_volblocksize;
if (BP_IS_HOLE(bp) || if (bp == NULL || BP_IS_HOLE(bp) ||
zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
return (0); return (0);