MFV r286704: 5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=

Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Author: Paul Dagnelie <pcd@delphix.com>

While running 'zfs recv' we noticed that every 128th 8K block required a
read. We were seeing that restore_write() was calling dmu_tx_hold_write()
and the indirect block was not cached. We should prefetch upcoming
indirect blocks to avoid having to go to disk and blocking the
restore_write().

Allow an incremental send stream to be received as a clone, even if the
stream does not mark it as a clone.
Parent: cf4bfabada
Commit: 99cadf9eed
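The receive-side fix boils down to asking the DMU to prefetch the level-1 (indirect) blocks that cover the data the stream is about to write. A minimal sketch of that idea against the new dmu_prefetch() signature introduced below; the function and variable names here are illustrative, since the actual dmu_send.c call site is in the suppressed file diff further down:

	/*
	 * Illustrative sketch only: prefetch the indirect (level-1) blocks
	 * that reference the range we are about to restore_write(), so that
	 * the later dmu_tx_hold_write() finds them already cached.
	 */
	static void
	recv_prefetch_indirects(objset_t *os, uint64_t object,
	    uint64_t offset, uint64_t length)
	{
		/* level = 1 selects the first level of indirect blocks */
		dmu_prefetch(os, object, 1, offset, length,
		    ZIO_PRIORITY_SYNC_READ);
	}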
@@ -2428,6 +2428,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	dmu_object_type_t type;
 	boolean_t is_metadata;
 
+	if (bp == NULL)
+		return (0);
+
 	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
@@ -2917,7 +2920,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	avl_index_t where;
 	zdb_ddt_entry_t *zdde, zdde_search;
 
-	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
@@ -191,11 +191,13 @@
 .Nm
 .Cm receive Ns | Ns Cm recv
 .Op Fl vnFu
+.Op Fl o Sy origin Ns = Ns Ar snapshot
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Nm
 .Cm receive Ns | Ns Cm recv
 .Op Fl vnFu
 .Op Fl d | e
+.Op Fl o Sy origin Ns = Ns Ar snapshot
 .Ar filesystem
 .Nm
 .Cm allow
@@ -2705,6 +2707,7 @@ feature.
 .Nm
 .Cm receive Ns | Ns Cm recv
 .Op Fl vnFu
+.Op Fl o Sy origin Ns = Ns Ar snapshot
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Xc
 .It Xo
@@ -2712,6 +2715,7 @@ feature.
 .Cm receive Ns | Ns Cm recv
 .Op Fl vnFu
 .Op Fl d | e
+.Op Fl o Sy origin Ns = Ns Ar snapshot
 .Ar filesystem
 .Xc
 .Pp
@@ -2796,6 +2800,10 @@ receive operation.
 Do not actually receive the stream. This can be useful in conjunction with the
 .Fl v
 option to verify the name the receive operation would use.
+.It Fl o Sy origin Ns = Ns Ar snapshot
+Forces the stream to be received as a clone of the given snapshot.
+This is only valid if the stream is an incremental stream whose source
+is the same as the provided origin.
 .It Fl F
 Force a rollback of the file system to the most recent snapshot before
 performing the receive operation. If receiving an incremental replication
@@ -264,8 +264,9 @@ get_usage(zfs_help_t idx)
 		return (gettext("\tpromote <clone-filesystem>\n"));
 	case HELP_RECEIVE:
 		return (gettext("\treceive|recv [-vnFu] <filesystem|volume|"
 		    "snapshot>\n"
-		    "\treceive|recv [-vnFu] [-d | -e] <filesystem>\n"));
+		    "\treceive|recv [-vnFu] [-o origin=<snapshot>] [-d | -e] "
+		    "<filesystem>\n"));
 	case HELP_RENAME:
 		return (gettext("\trename [-f] <filesystem|volume|snapshot> "
 		    "<filesystem|volume|snapshot>\n"
@@ -791,7 +792,7 @@ zfs_do_create(int argc, char **argv)
 			nomem();
 			break;
 		case 'o':
-			if (parseprop(props, optarg))
+			if (parseprop(props, optarg) != 0)
 				goto error;
 			break;
 		case 's':
@@ -3659,7 +3660,7 @@ zfs_do_snapshot(int argc, char **argv)
 	while ((c = getopt(argc, argv, "ro:")) != -1) {
 		switch (c) {
 		case 'o':
-			if (parseprop(props, optarg))
+			if (parseprop(props, optarg) != 0)
 				return (1);
 			break;
 		case 'r':
@@ -3918,10 +3919,19 @@ zfs_do_receive(int argc, char **argv)
 {
 	int c, err;
 	recvflags_t flags = { 0 };
+	nvlist_t *props;
+	nvpair_t *nvp = NULL;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":denuvF")) != -1) {
+	while ((c = getopt(argc, argv, ":o:denuvF")) != -1) {
 		switch (c) {
+		case 'o':
+			if (parseprop(props, optarg) != 0)
+				return (1);
+			break;
 		case 'd':
 			flags.isprefix = B_TRUE;
 			break;
@@ -3966,6 +3976,13 @@ zfs_do_receive(int argc, char **argv)
 		usage(B_FALSE);
 	}
 
+	while ((nvp = nvlist_next_nvpair(props, nvp))) {
+		if (strcmp(nvpair_name(nvp), "origin") != 0) {
+			(void) fprintf(stderr, gettext("invalid option"));
+			usage(B_FALSE);
+		}
+	}
+
 	if (isatty(STDIN_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Backup stream can not be read "
@@ -3974,7 +3991,7 @@ zfs_do_receive(int argc, char **argv)
 		return (1);
 	}
 
-	err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL);
+	err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
 
 	return (err != 0);
 }
@@ -3586,7 +3586,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(2 * width - 1);
-	dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
+	dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
+	    ZIO_PRIORITY_SYNC_READ);
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
@@ -5705,8 +5706,10 @@ ztest_run(ztest_shared_t *zs)
 	 * Right before closing the pool, kick off a bunch of async I/O;
 	 * spa_close() should wait for it to complete.
 	 */
-	for (uint64_t object = 1; object < 50; object++)
-		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
+	for (uint64_t object = 1; object < 50; object++) {
+		dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
 
 	spa_close(spa, FTAG);
 
@@ -668,8 +668,8 @@ typedef struct recvflags {
 	boolean_t nomount;
 } recvflags_t;
 
-extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *,
-    int, avl_tree_t *);
+extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
+    recvflags_t *, int, avl_tree_t *);
 
 typedef enum diff_flags {
 	ZFS_DIFF_PARSEABLE = 0x1,
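For callers, the signature change above means an nvlist of properties now rides along with the receive. A hedged sketch of a caller, mirroring what zfs_do_receive() does earlier in this diff (the dataset names are invented for illustration):

	nvlist_t *props;
	recvflags_t flags = { 0 };
	int err;

	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
		nomem();
	/* equivalent of 'zfs receive -o origin=pool/other@snap1 pool/newfs' */
	if (nvlist_add_string(props, "origin", "pool/other@snap1") != 0)
		nomem();
	err = zfs_receive(g_zfs, "pool/newfs", props, &flags,
	    STDIN_FILENO, NULL);
	nvlist_free(props);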
@@ -3535,7 +3535,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
 }
 
 static int
-zbookmark_compare(const void *a, const void *b)
+zbookmark_mem_compare(const void *a, const void *b)
 {
 	return (memcmp(a, b, sizeof (zbookmark_phys_t)));
 }
@@ -3598,7 +3598,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
 	    zc.zc_nvlist_dst_size;
 	count -= zc.zc_nvlist_dst_size;
 
-	qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_compare);
+	qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
 
 	verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
 
@@ -64,8 +64,9 @@ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
 /* We need to use something for ENODATA. */
 #define ENODATA EIDRM
 
-static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t *,
-    int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *);
+static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
+    recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
+    uint64_t *);
 
 static const zio_cksum_t zero_cksum = { 0 };
 
@@ -2498,7 +2499,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
 			 * zfs_receive_one() will take care of it (ie,
 			 * recv_skip() and return 0).
 			 */
-			error = zfs_receive_impl(hdl, destname, flags, fd,
+			error = zfs_receive_impl(hdl, destname, NULL, flags, fd,
 			    sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
 			    action_handlep);
 			if (error == ENODATA) {
@@ -2631,9 +2632,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
  */
 static int
 zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
-    recvflags_t *flags, dmu_replay_record_t *drr,
-    dmu_replay_record_t *drr_noswap, const char *sendfs,
-    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
+    const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
+    dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
+    avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
     uint64_t *action_handlep)
 {
 	zfs_cmd_t zc = { 0 };
@@ -2798,10 +2799,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 		}
 		if (flags->verbose)
 			(void) printf("found clone origin %s\n", zc.zc_string);
+	} else if (originsnap) {
+		(void) strncpy(zc.zc_string, originsnap, ZFS_MAXNAMELEN);
+		if (flags->verbose)
+			(void) printf("using provided clone origin %s\n",
+			    zc.zc_string);
 	}
 
 	stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
-	    (drrb->drr_flags & DRR_FLAG_CLONE));
+	    (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap);
 
 	if (stream_wantsnewfs) {
 		/*
@@ -3179,9 +3185,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 }
 
 static int
-zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
-    int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl,
-    char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
+zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
+    const char *originsnap, recvflags_t *flags, int infd, const char *sendfs,
+    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
+    uint64_t *action_handlep)
 {
 	int err;
 	dmu_replay_record_t drr, drr_noswap;
@@ -3200,6 +3207,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
 		    "(%s) does not exist"), tosnap);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
+	if (originsnap &&
+	    !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs "
+		    "(%s) does not exist"), originsnap);
+		return (zfs_error(hdl, EZFS_NOENT, errbuf));
+	}
 
 	/* read in the BEGIN record */
 	if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
@@ -3272,14 +3285,14 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
 			*cp = '\0';
 			sendfs = nonpackage_sendfs;
 		}
-		return (zfs_receive_one(hdl, infd, tosnap, flags,
-		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl,
-		    top_zfs, cleanup_fd, action_handlep));
+		return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags,
+		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs,
+		    cleanup_fd, action_handlep));
 	} else {
 		assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 		    DMU_COMPOUNDSTREAM);
-		return (zfs_receive_package(hdl, infd, tosnap, flags,
-		    &drr, &zcksum, top_zfs, cleanup_fd, action_handlep));
+		return (zfs_receive_package(hdl, infd, tosnap, flags, &drr,
+		    &zcksum, top_zfs, cleanup_fd, action_handlep));
 	}
 }
 
@@ -3290,18 +3303,24 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
  * (-1 will override -2).
  */
 int
-zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
-    int infd, avl_tree_t *stream_avl)
+zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
+    recvflags_t *flags, int infd, avl_tree_t *stream_avl)
 {
 	char *top_zfs = NULL;
 	int err;
 	int cleanup_fd;
 	uint64_t action_handle = 0;
+	char *originsnap = NULL;
+	if (props) {
+		err = nvlist_lookup_string(props, "origin", &originsnap);
+		if (err && err != ENOENT)
+			return (err);
+	}
 
 	cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
 	VERIFY(cleanup_fd >= 0);
 
-	err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL,
+	err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL,
 	    stream_avl, &top_zfs, cleanup_fd, &action_handle);
 
 	VERIFY(0 == close(cleanup_fd));
@@ -135,8 +135,18 @@ extern int aok;
 
 /*
  * DTrace SDT probes have different signatures in userland than they do in
- * kernel. If they're being used in kernel code, re-define them out of
+ * the kernel. If they're being used in kernel code, re-define them out of
  * existence for their counterparts in libzpool.
+ *
+ * Here's an example of how to use the set-error probes in userland:
+ * zfs$target:::set-error /arg0 == EBUSY/ {stack();}
+ *
+ * Here's an example of how to use DTRACE_PROBE probes in userland:
+ * If there is a probe declared as follows:
+ * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn);
+ * Then you can use it as follows:
+ * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/
+ *	{printf("%u %p\n", arg1, arg2);}
  */
 
 #ifdef DTRACE_PROBE
@@ -22,7 +22,9 @@
 #
 # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 # Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 #
 #
@@ -36,6 +38,7 @@ ZFS_COMMON_OBJS += \
 	blkptr.o \
 	bpobj.o \
 	bptree.o \
+	bqueue.o \
 	dbuf.o \
 	ddt.o \
 	ddt_zap.o \
@@ -154,7 +154,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	int err;
 	struct bptree_args *ba = arg;
 
-	if (BP_IS_HOLE(bp))
+	if (bp == NULL || BP_IS_HOLE(bp))
 		return (0);
 
 	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c (new file, 111 lines)
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/bqueue.h>
+#include <sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+	return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue The maximum capacity of the queue is set to
+ * size. Types that want to be stored in a bqueue must contain a bqueue_node_t,
+ * and offset should give its offset from the start of the struct. Return 0 on
+ * success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
+{
+	list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+	    node_offset + offsetof(bqueue_node_t, bqn_node));
+	cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+	q->bq_node_offset = node_offset;
+	q->bq_size = 0;
+	q->bq_maxsize = size;
+	return (0);
+}
+
+/*
+ * Destroy a blocking queue. This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+	ASSERT0(q->bq_size);
+	cv_destroy(&q->bq_add_cv);
+	cv_destroy(&q->bq_pop_cv);
+	mutex_destroy(&q->bq_lock);
+	list_destroy(&q->bq_list);
+}
+
+/*
+ * Add data to q, consuming size units of capacity. If there is insufficient
+ * capacity to consume size units, block until capacity exists. Asserts size is
+ * > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+	ASSERT3U(item_size, >, 0);
+	ASSERT3U(item_size, <, q->bq_maxsize);
+	mutex_enter(&q->bq_lock);
+	obj2node(q, data)->bqn_size = item_size;
+	while (q->bq_size + item_size > q->bq_maxsize) {
+		cv_wait(&q->bq_add_cv, &q->bq_lock);
+	}
+	q->bq_size += item_size;
+	list_insert_tail(&q->bq_list, data);
+	cv_signal(&q->bq_pop_cv);
+	mutex_exit(&q->bq_lock);
+}
+/*
+ * Take the first element off of q. If there are no elements on the queue, wait
+ * until one is put there. Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+	void *ret;
+	uint64_t item_size;
+	mutex_enter(&q->bq_lock);
+	while (q->bq_size == 0) {
+		cv_wait(&q->bq_pop_cv, &q->bq_lock);
+	}
+	ret = list_remove_head(&q->bq_list);
+	item_size = obj2node(q, ret)->bqn_size;
+	q->bq_size -= item_size;
+	mutex_exit(&q->bq_lock);
+	cv_signal(&q->bq_add_cv);
+	return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+	return (q->bq_size == 0);
+}
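As a usage note for the new file: a consumer embeds a bqueue_node_t in its element type and passes that member's offset to bqueue_init(); everything else is the enqueue/dequeue pair shown above. A hypothetical sketch (the element type and capacity are invented for illustration):

	typedef struct work_item {
		bqueue_node_t	wi_node;	/* linkage used by the bqueue */
		uint64_t	wi_size;	/* capacity this item consumes */
	} work_item_t;

	bqueue_t q;
	work_item_t *wi;

	(void) bqueue_init(&q, 16 << 20, offsetof(work_item_t, wi_node));
	wi = kmem_zalloc(sizeof (*wi), KM_SLEEP);
	wi->wi_size = sizeof (*wi);
	bqueue_enqueue(&q, wi, wi->wi_size);	/* blocks if over capacity */
	wi = bqueue_dequeue(&q);		/* blocks while empty */
	kmem_free(wi, sizeof (*wi));
	ASSERT(bqueue_empty(&q));
	bqueue_destroy(&q);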
@@ -548,11 +548,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 	return (abuf);
 }
 
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
 uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
 {
-	if (dn->dn_datablkshift) {
-		return (offset >> dn->dn_datablkshift);
+	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+		/*
+		 * The level n blkid is equal to the level 0 blkid divided by
+		 * the number of level 0s in a level n block.
+		 *
+		 * The level 0 blkid is offset >> datablkshift =
+		 * offset / 2^datablkshift.
+		 *
+		 * The number of level 0s in a level n is the number of block
+		 * pointers in an indirect block, raised to the power of level.
+		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+		 *
+		 * Thus, the level n blkid is: offset /
+		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
+		 * = offset / 2^(datablkshift + level *
+		 * (indblkshift - SPA_BLKPTRSHIFT))
+		 * = offset >> (datablkshift + level *
+		 * (indblkshift - SPA_BLKPTRSHIFT))
+		 */
+		return (offset >> (dn->dn_datablkshift + level *
+		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
 	} else {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
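A quick numeric check of the formula above, using illustrative (not mandated) shifts — 128K data blocks (datablkshift = 17) and 128K indirect blocks (indblkshift = 17, SPA_BLKPTRSHIFT = 7, so each indirect block holds 2^10 = 1024 block pointers):

	uint64_t offset = 300ULL << 20;		/* byte offset 300 MiB */
	uint64_t l0 = offset >> 17;		/* level-0 blkid: 2400 */
	uint64_t l1 = offset >> (17 + 1 * 10);	/* level-1 blkid: 2 */
	/* i.e. dbuf_whichblock(dn, 1, offset) == 2400 / 1024 == 2 */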
@@ -1715,6 +1739,12 @@ dbuf_clear(dmu_buf_impl_t *db)
 		dbuf_rele(parent, db);
 }
 
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
 static int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
@@ -1755,7 +1785,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err = dbuf_hold_impl(dn, level+1,
-		    blkid >> epbs, fail_sparse, NULL, parentp);
+		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
@@ -1930,11 +1960,96 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 }
 
-void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
-{
-	dmu_buf_impl_t *db = NULL;
-	blkptr_t *bp = NULL;
+typedef struct dbuf_prefetch_arg {
+	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
+	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+	int dpa_epbs;	/* Entries (blkptr_t's) Per Block Shift. */
+	int dpa_curlevel; /* The current level that we're reading */
+	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+	zio_t *dpa_zio;	/* The parent zio_t for all prefetches. */
+	arc_flags_t dpa_aflags;	/* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return;
+
+	arc_flags_t aflags =
+	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+	ASSERT(dpa->dpa_zio != NULL);
+	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+	    &aflags, &dpa->dpa_zb);
+}
+
+/*
+ * Called when an indirect block above our prefetch target is read in. This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
+static void
+dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+{
+	dbuf_prefetch_arg_t *dpa = private;
+
+	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+	ASSERT3S(dpa->dpa_curlevel, >, 0);
+	if (zio != NULL) {
+		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+		ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+	}
+
+	dpa->dpa_curlevel--;
+
+	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+		kmem_free(dpa, sizeof (*dpa));
+	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+		dbuf_issue_final_prefetch(dpa, bp);
+		kmem_free(dpa, sizeof (*dpa));
+	} else {
+		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+		zbookmark_phys_t zb;
+
+		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+		    &iter_aflags, &zb);
+	}
+	(void) arc_buf_remove_ref(abuf, private);
+}
+
+/*
+ * Issue prefetch reads for the given block on the given level. If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously. As a result, this call never blocks waiting for a read to
+ * complete.
+ */
+void
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+    arc_flags_t aflags)
+{
+	blkptr_t bp;
+	int epbs, nlevels, curlevel;
+	uint64_t curblkid;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -1942,35 +2057,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
 	if (dnode_block_freed(dn, blkid))
 		return;
 
-	/* dbuf_find() returns with db_mtx held */
-	if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) {
-		/*
-		 * This dbuf is already in the cache. We assume that
-		 * it is already CACHED, or else about to be either
-		 * read or filled.
-		 */
+	/*
+	 * This dnode hasn't been written to disk yet, so there's nothing to
+	 * prefetch.
+	 */
+	nlevels = dn->dn_phys->dn_nlevels;
+	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+		return;
+
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+		return;
+
+	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+	    level, blkid);
+	if (db != NULL) {
 		mutex_exit(&db->db_mtx);
+		/*
+		 * This dbuf already exists. It is either CACHED, or
+		 * (we assume) about to be read or filled.
+		 */
 		return;
 	}
 
-	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
-		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
-			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-			arc_flags_t aflags =
-			    ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
-			zbookmark_phys_t zb;
-
-			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
-			    dn->dn_object, 0, blkid);
-
-			(void) arc_read(NULL, dn->dn_objset->os_spa,
-			    bp, NULL, NULL, prio,
-			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-			    &aflags, &zb);
-		}
-		if (db)
-			dbuf_rele(db, NULL);
+	/*
+	 * Find the closest ancestor (indirect block) of the target block
+	 * that is present in the cache. In this indirect block, we will
+	 * find the bp that is at curlevel, curblkid.
+	 */
+	curlevel = level;
+	curblkid = blkid;
+	while (curlevel < nlevels - 1) {
+		int parent_level = curlevel + 1;
+		uint64_t parent_blkid = curblkid >> epbs;
+		dmu_buf_impl_t *db;
+
+		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+		    FALSE, TRUE, FTAG, &db) == 0) {
+			blkptr_t *bpp = db->db_buf->b_data;
+			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+			dbuf_rele(db, FTAG);
+			break;
+		}
+
+		curlevel = parent_level;
+		curblkid = parent_blkid;
 	}
+
+	if (curlevel == nlevels - 1) {
+		/* No cached indirect blocks found. */
+		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+		bp = dn->dn_phys->dn_blkptr[curblkid];
+	}
+	if (BP_IS_HOLE(&bp))
+		return;
+
+	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
+
+	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+	    ZIO_FLAG_CANFAIL);
+
+	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+	    dn->dn_object, level, blkid);
+	dpa->dpa_curlevel = curlevel;
+	dpa->dpa_prio = prio;
+	dpa->dpa_aflags = aflags;
+	dpa->dpa_spa = dn->dn_objset->os_spa;
+	dpa->dpa_epbs = epbs;
+	dpa->dpa_zio = pio;
+
+	/*
+	 * If we have the indirect just above us, no need to do the asynchronous
+	 * prefetch chain; we'll just run the last step ourselves. If we're at
+	 * a higher level, though, we want to issue the prefetches for all the
+	 * indirect blocks asynchronously, so we can go on with whatever we were
+	 * doing.
+	 */
+	if (curlevel == level) {
+		ASSERT3U(curblkid, ==, blkid);
+		dbuf_issue_final_prefetch(dpa, &bp);
+		kmem_free(dpa, sizeof (*dpa));
+	} else {
+		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+		zbookmark_phys_t zb;
+
+		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+		    dn->dn_object, curlevel, curblkid);
+		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+		    &iter_aflags, &zb);
+	}
+	/*
+	 * We use pio here instead of dpa_zio since it's possible that
+	 * dpa may have already been freed.
+	 */
+	zio_nowait(pio);
 }
 
 /*
@@ -1978,7 +2162,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
  * Note: dn_struct_rwlock must be held.
  */
 int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+    boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;
@@ -1996,6 +2181,9 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 		blkptr_t *bp = NULL;
 		int err;
 
+		if (fail_uncached)
+			return (SET_ERROR(ENOENT));
+
 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
@@ -2012,6 +2200,11 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}
 
+	if (fail_uncached && db->db_state != DB_CACHED) {
+		mutex_exit(&db->db_mtx);
+		return (SET_ERROR(ENOENT));
+	}
+
 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
 		arc_buf_add_ref(db->db_buf, db);
 		if (db->db_buf->b_data == NULL) {
@@ -2067,16 +2260,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
-	dmu_buf_impl_t *db;
-	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
-	return (err ? NULL : db);
+	return (dbuf_hold_level(dn, 0, blkid, tag));
 }
 
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
-	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
 	return (err ? NULL : db);
 }
 
@@ -2429,8 +2620,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 	if (parent == NULL) {
 		mutex_exit(&db->db_mtx);
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		(void) dbuf_hold_impl(dn, db->db_level+1,
-		    db->db_blkid >> epbs, FALSE, db, &parent);
+		parent = dbuf_hold_level(dn, db->db_level + 1,
+		    db->db_blkid >> epbs, db);
 		rw_exit(&dn->dn_struct_rwlock);
 		mutex_enter(&db->db_mtx);
 		db->db_parent = parent;
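Putting the two new pieces together, a caller sketch (hypothetical, but matching how the dmu_prefetch() changes below drive dbuf_prefetch()): hold dn_struct_rwlock, convert a byte offset to a blkid at the desired level, and fire; the call returns without waiting for any uncached indirect ancestors to be read:

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	/* prefetch the level-1 indirect block covering `offset' */
	dbuf_prefetch(dn, 1, dbuf_whichblock(dn, 1, offset),
	    ZIO_PRIORITY_SYNC_READ, 0);
	rw_exit(&dn->dn_struct_rwlock);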
@@ -141,7 +141,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
@@ -424,7 +424,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
 		if (db == NULL) {
@@ -528,17 +528,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 }
 
 /*
- * Issue prefetch i/os for the given blocks.
+ * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * indirect blocks prefeteched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
  *
- * Note: The assumption is that we *know* these blocks will be needed
- * almost immediately. Therefore, the prefetch i/os will be issued at
- * ZIO_PRIORITY_SYNC_READ
- *
- * Note: indirect blocks and other metadata will be read synchronously,
- * causing this function to block if they are not already cached.
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asychronously read in.
  */
 void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+    uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 	uint64_t blkid;
@@ -554,8 +553,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 			return;
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
-		dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
+		blkid = dbuf_whichblock(dn, level,
+		    object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, level, blkid, pri, 0);
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
@@ -570,18 +570,24 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 		return;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	if (dn->dn_datablkshift) {
-		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
-		    P2ALIGN(offset, 1 << blkshift)) >> blkshift;
+	/*
+	 * offset + len - 1 is the last byte we want to prefetch for, and offset
+	 * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
+	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
+	 * offset) is the first. Then the number we need to prefetch is the
+	 * last - first + 1.
+	 */
+	if (level > 0 || dn->dn_datablkshift != 0) {
+		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+		    dbuf_whichblock(dn, level, offset) + 1;
 	} else {
 		nblks = (offset < dn->dn_datablksz);
 	}
 
 	if (nblks != 0) {
-		blkid = dbuf_whichblock(dn, offset);
+		blkid = dbuf_whichblock(dn, level, offset);
 		for (int i = 0; i < nblks; i++)
-			dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
+			dbuf_prefetch(dn, level, blkid + i, pri, 0);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
@@ -1393,7 +1399,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
 	DB_DNODE_ENTER(dbuf);
 	dn = DB_DNODE(dbuf);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 	rw_exit(&dn->dn_struct_rwlock);
 	DB_DNODE_EXIT(dbuf);
@@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	if (issig(JUSTLOOKING) && issig(FORREAL))
 		return (SET_ERROR(EINTR));
 
-	if (zb->zb_object != DMU_META_DNODE_OBJECT)
+	if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
 		return (0);
 
 	if (BP_IS_HOLE(bp)) {
@@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
 	return (0);
 }
 
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
 int
 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 {
[File diff suppressed because it is too large]
@@ -158,7 +158,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
 		 * If we already visited this bp & everything below,
 		 * don't bother doing it again.
 		 */
-		if (zbookmark_is_before(dnp, zb, td->td_resume))
+		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
 			return (RESUME_SKIP_ALL);
 
 		/*
@@ -425,6 +425,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
 	int j, err = 0;
 	zbookmark_phys_t czb;
 
+	if (td->td_flags & TRAVERSE_PRE) {
+		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+		    ZB_DNODE_BLKID);
+		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			return (err);
+	}
+
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
@@ -432,10 +443,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
 			break;
 	}
 
-	if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
 	}
+
+	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+		    ZB_DNODE_BLKID);
+		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			return (err);
+	}
 	return (err);
 }
 
@@ -448,6 +470,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
 	ASSERT(pfd->pd_bytes_fetched >= 0);
+	if (bp == NULL)
+		return (0);
 	if (pfd->pd_cancel)
 		return (SET_ERROR(EINTR));
 
@@ -315,7 +315,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 			dmu_buf_impl_t *db;
 
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
+			err = dbuf_hold_impl(dn, 0, start,
+			    FALSE, FALSE, FTAG, &db);
 			rw_exit(&dn->dn_struct_rwlock);
 
 			if (err) {
@@ -516,7 +517,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 		blkoff = P2PHASE(blkid, epb);
 		tochk = MIN(epb - blkoff, nblks);
 
-		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+		err = dbuf_hold_impl(dn, 1, blkid >> epbs,
+		    FALSE, FALSE, FTAG, &dbuf);
 		if (err) {
 			txh->txh_tx->tx_err = err;
 			break;
@@ -305,7 +305,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
 	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
 
 	for (i = 0; i < fetchsz; i++) {
-		dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
+		dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
+		    ARC_FLAG_PREFETCH);
 	}
 
 	return (fetchsz);
@@ -1116,7 +1116,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
 		drop_struct_lock = TRUE;
 	}
 
-	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
 	db = dbuf_hold(mdn, blk, FTAG);
 	if (drop_struct_lock)
@@ -1413,7 +1413,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 		goto fail;
 
 	/* resize the old block */
-	err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
 	if (err == 0)
 		dbuf_new_size(db, size, tx);
 	else if (err != ENOENT)
@@ -1586,8 +1586,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 		ASSERT3U(blkoff + head, ==, blksz);
 		if (len < head)
 			head = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
-		    FTAG, &db) == 0) {
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+		    TRUE, FALSE, FTAG, &db) == 0) {
 			caddr_t data;
 
 			/* don't dirty if it isn't on disk and isn't dirty */
@@ -1624,8 +1624,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 	if (tail) {
 		if (len < tail)
 			tail = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-		    TRUE, FTAG, &db) == 0) {
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+		    TRUE, FALSE, FTAG, &db) == 0) {
 			/* don't dirty if not on disk and not dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
@@ -1854,7 +1854,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
-	int lvl, uint64_t blkfill, uint64_t txg)
+    int lvl, uint64_t blkfill, uint64_t txg)
 {
 	dmu_buf_impl_t *db = NULL;
 	void *data = NULL;
@@ -1876,8 +1876,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 		epb = dn->dn_phys->dn_nblkptr;
 		data = dn->dn_phys->dn_blkptr;
 	} else {
-		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
-		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
 		if (error) {
 			if (error != ENOENT)
 				return (error);
@@ -188,7 +188,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		err = dbuf_hold_impl(dn, db->db_level-1,
-		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
 		rw_exit(&dn->dn_struct_rwlock);
 		if (err == ENOENT)
 			continue;
@@ -284,7 +284,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 			continue;
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
-		    i, B_TRUE, FTAG, &subdb));
+		    i, TRUE, FALSE, FTAG, &subdb));
 		rw_exit(&dn->dn_struct_rwlock);
 		ASSERT3P(bp, ==, subdb->db_blkptr);
 
@@ -357,7 +357,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
 			continue;
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
-		    TRUE, FTAG, &db));
+		    TRUE, FALSE, FTAG, &db));
 		rw_exit(&dn->dn_struct_rwlock);
 
 		free_children(db, blkid, nblks, tx);
@@ -540,6 +540,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
 	const char *snapname;
 	uint64_t obj;
 	int err = 0;
+	dsl_dataset_t *ds;
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
 	if (err != 0)
@@ -548,36 +549,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
 	ASSERT(dsl_pool_config_held(dp));
 	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	if (obj != 0)
-		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
+		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
 	else
 		err = SET_ERROR(ENOENT);
 
 	/* we may be looking for a snapshot */
 	if (err == 0 && snapname != NULL) {
-		dsl_dataset_t *ds;
+		dsl_dataset_t *snap_ds;
 
 		if (*snapname++ != '@') {
-			dsl_dataset_rele(*dsp, tag);
+			dsl_dataset_rele(ds, tag);
 			dsl_dir_rele(dd, FTAG);
 			return (SET_ERROR(ENOENT));
 		}
 
 		dprintf("looking for snapshot '%s'\n", snapname);
-		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
 		if (err == 0)
-			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
-		dsl_dataset_rele(*dsp, tag);
+			err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
+		dsl_dataset_rele(ds, tag);
 
 		if (err == 0) {
-			mutex_enter(&ds->ds_lock);
-			if (ds->ds_snapname[0] == 0)
-				(void) strlcpy(ds->ds_snapname, snapname,
-				    sizeof (ds->ds_snapname));
-			mutex_exit(&ds->ds_lock);
-			*dsp = ds;
+			mutex_enter(&snap_ds->ds_lock);
+			if (snap_ds->ds_snapname[0] == 0)
+				(void) strlcpy(snap_ds->ds_snapname, snapname,
+				    sizeof (snap_ds->ds_snapname));
+			mutex_exit(&snap_ds->ds_lock);
+			ds = snap_ds;
 		}
 	}
+	if (err == 0)
+		*dsp = ds;
 	dsl_dir_rele(dd, FTAG);
 	return (err);
 }
@@ -552,7 +552,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	struct killarg *ka = arg;
 	dmu_tx_t *tx = ka->tx;
 
-	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (zb->zb_level == ZB_ZIL_LEVEL) {
@@ -600,7 +600,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
 	 * If we already visited this bp & everything below (in
 	 * a prior txg sync), don't bother doing it again.
 	 */
-	if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+	if (zbookmark_subtree_completed(dnp, zb,
+	    &scn->scn_phys.scn_bookmark))
 		return (B_TRUE);
 
 	/*
@@ -1943,7 +1943,7 @@ static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
-	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 	/*
 	 * Note: normally this routine will not be called if
@@ -80,8 +80,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
 
 	mutex_exit(sm->sm_lock);
 	if (end > bufsize) {
-		dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize,
-		    end - bufsize);
+		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
+		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
 	}
 	mutex_enter(sm->sm_lock);
 
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h (new file, 54 lines)
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _BQUEUE_H
+#define _BQUEUE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+typedef struct bqueue {
+	list_t bq_list;
+	kmutex_t bq_lock;
+	kcondvar_t bq_add_cv;
+	kcondvar_t bq_pop_cv;
+	uint64_t bq_size;
+	uint64_t bq_maxsize;
+	size_t bq_node_offset;
+} bqueue_t;
+
+typedef struct bqueue_node {
+	list_node_t bqn_node;
+	uint64_t bqn_size;
+} bqueue_node_t;
+
+
+int bqueue_init(bqueue_t *, uint64_t, size_t);
+void bqueue_destroy(bqueue_t *);
+void bqueue_enqueue(bqueue_t *, void *, uint64_t);
+void *bqueue_dequeue(bqueue_t *);
+boolean_t bqueue_empty(bqueue_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BQUEUE_H */
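Note: the new bqueue is a bounded, blocking FIFO; producers sleep in bqueue_enqueue() once bq_maxsize bytes are in flight, and consumers sleep in bqueue_dequeue() until a record arrives. A minimal sketch of how a caller might drive it; the record type, names, and sizes here are illustrative assumptions, not code from this change:

	/* Hypothetical record: the embedded bqueue_node_t provides the
	 * linkage, and its offset is handed to bqueue_init(). */
	struct my_record {
		bqueue_node_t	mr_node;
		uint64_t	mr_payload;
	};

	bqueue_t q;
	struct my_record *rec;

	/* Producers block once ~16MB of records are queued. */
	VERIFY0(bqueue_init(&q, 16 * 1024 * 1024,
	    offsetof(struct my_record, mr_node)));

	rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
	rec->mr_payload = 42;
	bqueue_enqueue(&q, rec, sizeof (*rec));	/* charged against bq_maxsize */

	rec = bqueue_dequeue(&q);	/* sleeps until a record is available */
	kmem_free(rec, sizeof (*rec));
	bqueue_destroy(&q);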
@@ -245,8 +245,7 @@ typedef struct dbuf_hash_table {
 	kmutex_t hash_mutexes[DBUF_MUTEXES];
 } dbuf_hash_table_t;
 
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
-
+uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
 
 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
 void dbuf_create_bonus(struct dnode *dn);
@@ -258,10 +257,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
     void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
+    boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp);
 
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
+void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+    zio_priority_t prio, arc_flags_t aflags);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
 boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
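For reference, the split booleans let callers choose the sparse and uncached failure modes independently, and dbuf_prefetch() is now level-aware. An illustrative call sequence (not from this change; dn, blkid, and the aflags value of 0 are assumptions):

	dmu_buf_impl_t *db;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	/* Fail with ENOENT on a hole, but go to disk if merely uncached. */
	if (dbuf_hold_impl(dn, 0, blkid, TRUE /* fail_sparse */,
	    FALSE /* fail_uncached */, FTAG, &db) == 0)
		dbuf_rele(db, FTAG);

	/* Start an async read of the level-1 indirect covering blkid. */
	dbuf_prefetch(dn, 1, blkid >> epbs, ZIO_PRIORITY_SYNC_READ, 0);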
@@ -45,6 +45,7 @@
 #include <sys/zfs_context.h>
 #include <sys/cred.h>
 #include <sys/fs/zfs.h>
+#include <sys/zio_priority.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -748,8 +749,8 @@ extern int zfs_max_recordsize;
 /*
  * Asynchronously try to read in the data.
  */
-void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
-    uint64_t len);
+void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+    uint64_t len, enum zio_priority pri);
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes unless otherwise indicated. */
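The widened dmu_prefetch() is what lets the receive path warm indirect blocks before restore_write() needs them: level selects which level of the object's block tree to read ahead, and pri feeds the now-shared zio_priority_t. A hedged example of the new interface (os, object, offset, and objnum are assumed names):

	/* Prefetch the level-1 indirect blocks covering the next 4MB, so a
	 * later dmu_tx_hold_write() doesn't block on a synchronous read. */
	dmu_prefetch(os, object, 1, offset, 4 * 1024 * 1024,
	    ZIO_PRIORITY_SYNC_READ);

	/* The existing "len == 0 means just the dnode block" convention,
	 * now at level 0 with an explicit priority (see the zfs_readdir
	 * hunk below). */
	dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);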
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -29,6 +29,7 @@
 #ifndef _ZIO_H
 #define _ZIO_H
 
+#include <sys/zio_priority.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
@@ -144,18 +145,6 @@ enum zio_compress {
 #define ZIO_FAILURE_MODE_CONTINUE	1
 #define ZIO_FAILURE_MODE_PANIC		2
 
-typedef enum zio_priority {
-	ZIO_PRIORITY_SYNC_READ,
-	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
-	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
-	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
-	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
-	ZIO_PRIORITY_TRIM,		/* free requests used for TRIM */
-	ZIO_PRIORITY_NUM_QUEUEABLE,
-
-	ZIO_PRIORITY_NOW		/* non-queued I/Os (e.g. ioctl) */
-} zio_priority_t;
-
 enum zio_flag {
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
@@ -260,6 +249,7 @@ extern const char *zio_type_name[ZIO_TYPES];
  * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
@@ -292,6 +282,9 @@ typedef struct zbookmark_phys {
 #define ZB_ZIL_OBJECT		(0ULL)
 #define ZB_ZIL_LEVEL		(-2LL)
 
+#define ZB_DNODE_LEVEL		(-3LL)
+#define ZB_DNODE_BLKID		(0ULL)
+
 #define ZB_IS_ZERO(zb) \
 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
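These sentinels extend the bookmark namespace so a traversal can record a visit to the dnode itself, distinct from any of its blocks. A sketch of how a traverser would build one, using the existing SET_BOOKMARK() macro from zio.h (the surrounding zb variable is an assumption):

	zbookmark_phys_t czb;

	/* Bookmark the dnode itself rather than one of its data blocks. */
	SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
	    ZB_DNODE_LEVEL, ZB_DNODE_BLKID);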
@@ -633,8 +626,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
 extern void spa_handle_ignored_writes(spa_t *spa);
 
 /* zbookmark_phys functions */
-boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
-    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
+boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
+    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
+int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
+    uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
 
 #ifdef __cplusplus
 }
@@ -44,7 +44,7 @@ typedef struct zio_checksum_info {
 	zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
 	int ci_correctable;	/* number of correctable bits */
 	int ci_eck;		/* uses zio embedded checksum? */
-	int ci_dedup;		/* strong enough for dedup? */
+	boolean_t ci_dedup;	/* strong enough for dedup? */
 	char *ci_name;		/* descriptive name */
 } zio_checksum_info_t;
 
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+#ifndef _ZIO_PRIORITY_H
+#define _ZIO_PRIORITY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum zio_priority {
+	ZIO_PRIORITY_SYNC_READ,
+	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
+	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
+	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
+	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
+	ZIO_PRIORITY_TRIM,		/* free requests used for TRIM */
+	ZIO_PRIORITY_NUM_QUEUEABLE,
+
+	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
+} zio_priority_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_PRIORITY_H */
@@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 		tbl->zt_nextblk = newblk;
 		ASSERT0(tbl->zt_blks_copied);
-		dmu_prefetch(zap->zap_objset, zap->zap_object,
-		    tbl->zt_blk << bs, tbl->zt_numblks << bs);
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/*
@@ -939,7 +940,8 @@ fzap_prefetch(zap_name_t *zn)
 	if (zap_idx_to_blk(zap, idx, &blk) != 0)
 		return;
 	bs = FZAP_BLOCK_SHIFT(zap);
-	dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+	dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+	    ZIO_PRIORITY_SYNC_READ);
 }
 
 /*
@@ -1310,9 +1312,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 	} else {
 		int b;
 
-		dmu_prefetch(zap->zap_objset, zap->zap_object,
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
 		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
-		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs);
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
 
 		for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 		    b++) {
@@ -22,7 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -950,7 +950,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 		    &sa_obj);
 		if (error)
-			return (error);
+			goto out;
 	} else {
 		/*
 		 * Pre SA versions file systems should never touch
@@ -2675,7 +2675,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
 
 		/* Prefetch znode */
 		if (prefetch)
-			dmu_prefetch(os, objnum, 0, 0);
+			dmu_prefetch(os, objnum, 0, 0, 0,
+			    ZIO_PRIORITY_SYNC_READ);
 
 	skip_entry:
 		/*
@@ -94,6 +94,9 @@ extern vmem_t *zio_alloc_arena;
 #define ZIO_PIPELINE_CONTINUE		0x100
 #define ZIO_PIPELINE_STOP		0x101
 
+#define BP_SPANB(indblkshift, level) \
+	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL	0x80000000ul
 /*
  * The following actions directly effect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
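BP_SPANB gives the number of level-0 blocks spanned by one block pointer at a given level; COMPARE_META_LEVEL is the bias zbookmark_compare() adds to levels inside the meta-dnode so they sort above any real level. A quick worked example under assumed (but typical) sizes:

	/* 16K indirect blocks (indblkshift == 14) and 128-byte block
	 * pointers (SPA_BLKPTRSHIFT == 7): 128 pointers per indirect. */
	uint64_t span1 = BP_SPANB(14, 1);	/* 1 << 7  == 128 L0 blocks */
	uint64_t span2 = BP_SPANB(14, 2);	/* 1 << 14 == 16384 L0 blocks */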
@@ -3461,37 +3464,127 @@ static zio_pipe_stage_t *zio_pipeline[] = {
 	zio_done
 };
 
-/* dnp is the dnode for zb1->zb_object */
-boolean_t
-zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
-    const zbookmark_phys_t *zb2)
-{
-	uint64_t zb1nextL0, zb2thisobj;
-
-	ASSERT(zb1->zb_objset == zb2->zb_objset);
-	ASSERT(zb2->zb_level == 0);
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects. Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (any value larger than a level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
+	/*
+	 * These variables represent the "equivalent" values for the zbookmark,
+	 * after converting zbookmarks inside the meta dnode to their
+	 * normal-object equivalents.
+	 */
+	uint64_t zb1obj, zb2obj;
+	uint64_t zb1L0, zb2L0;
+	uint64_t zb1level, zb2level;
+
+	if (zb1->zb_object == zb2->zb_object &&
+	    zb1->zb_level == zb2->zb_level &&
+	    zb1->zb_blkid == zb2->zb_blkid)
+		return (0);
+
+	/*
+	 * BP_SPANB calculates the span in blocks.
+	 */
+	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
+
+	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+		zb1L0 = 0;
+		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+	} else {
+		zb1obj = zb1->zb_object;
+		zb1level = zb1->zb_level;
+	}
+
+	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+		zb2L0 = 0;
+		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+	} else {
+		zb2obj = zb2->zb_object;
+		zb2level = zb2->zb_level;
+	}
+
+	/* Now that we have a canonical representation, do the comparison. */
+	if (zb1obj != zb2obj)
+		return (zb1obj < zb2obj ? -1 : 1);
+	else if (zb1L0 != zb2L0)
+		return (zb1L0 < zb2L0 ? -1 : 1);
+	else if (zb1level != zb2level)
+		return (zb1level > zb2level ? -1 : 1);
+	/*
+	 * This can (theoretically) happen if the bookmarks have the same object
+	 * and level, but different blkids, if the block sizes are not the same.
+	 * There is presently no way to change the indirect block sizes
+	 */
+	return (0);
+}
+
+/*
+ * This function checks the following: given that last_block is the place that
+ * our traversal stopped last time, does that guarantee that we've visited
+ * every node under subtree_root? Therefore, we can't just use the raw output
+ * of zbookmark_compare. We have to pass in a modified version of
+ * subtree_root; by incrementing the block id, and then checking whether
+ * last_block is before or equal to that, we can tell whether or not having
+ * visited last_block implies that all of subtree_root's children have been
+ * visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+	zbookmark_phys_t mod_zb = *subtree_root;
+	mod_zb.zb_blkid++;
+	ASSERT(last_block->zb_level == 0);
 
 	/* The objset_phys_t isn't before anything. */
 	if (dnp == NULL)
 		return (B_FALSE);
 
-	zb1nextL0 = (zb1->zb_blkid + 1) <<
-	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-	zb2thisobj = zb2->zb_object ? zb2->zb_object :
-	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t nextobj = zb1nextL0 *
-		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-		return (nextobj <= zb2thisobj);
-	}
-
-	if (zb1->zb_object < zb2thisobj)
-		return (B_TRUE);
-	if (zb1->zb_object > zb2thisobj)
-		return (B_FALSE);
-	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-		return (B_FALSE);
-	return (zb1nextL0 <= zb2->zb_blkid);
+	/*
+	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+	 * data block size in sectors, because that variable is only used if
+	 * the bookmark refers to a block in the meta-dnode. Since we don't
+	 * know without examining it what object it refers to, and there's no
+	 * harm in passing in this value in other cases, we always pass it in.
+	 *
+	 * We pass in 0 for the indirect block size shift because zb2 must be
+	 * level 0. The indirect block size is only used to calculate the span
+	 * of the bookmark, but since the bookmark must be level 0, the span is
+	 * always 1, so the math works out.
+	 *
+	 * If you make changes to how the zbookmark_compare code works, be sure
+	 * to make sure that this code still works afterwards.
+	 */
+	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+	    last_block) <= 0);
 }
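To make the new comparator concrete, a hedged example using the existing SET_BOOKMARK() macro; the block sizes are assumptions (8K data blocks expressed as 16 512-byte sectors, 16K indirect blocks):

	zbookmark_phys_t zb1, zb2;
	uint16_t dbss = 16;	/* 8K data blocks, in 512-byte sectors */
	uint8_t ibs = 14;	/* 16K indirect blocks */

	SET_BOOKMARK(&zb1, 1, 5, 1, 0);	/* L1 block covering L0 blkids 0-127 */
	SET_BOOKMARK(&zb2, 1, 5, 0, 64);	/* an L0 block underneath it */

	/* Pre-order visits the indirect before its children: returns -1. */
	VERIFY3S(zbookmark_compare(dbss, ibs, dbss, ibs, &zb1, &zb2), ==, -1);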
@@ -358,7 +358,7 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	zvol_extent_t *ze;
 	int bs = ma->ma_zv->zv_volblocksize;
 
-	if (BP_IS_HOLE(bp) ||
+	if (bp == NULL || BP_IS_HOLE(bp) ||
 	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
 		return (0);
 