Merge ^/head r305301 through r305345.

Dimitry Andric 2016-09-03 13:57:47 +00:00
commit 86ea529132
67 changed files with 4089 additions and 2434 deletions

View File

@ -68,6 +68,9 @@ ${PACKAGE}FILES+= dot1.0
${PACKAGE}FILES+= dot2.0
${PACKAGE}FILES+= dot3.0
${PACKAGE}FILES+= dot4.0
${PACKAGE}FILES+= echo1.0
${PACKAGE}FILES+= echo2.0
${PACKAGE}FILES+= echo3.0
${PACKAGE}FILES+= eval1.0
${PACKAGE}FILES+= eval2.0
${PACKAGE}FILES+= eval3.0

View File

@ -0,0 +1,6 @@
# $FreeBSD$
# Not specified by POSIX.
[ "`echo -n a b; echo c d; echo e f`" = "a bc d
e f" ]

View File

@ -0,0 +1,7 @@
# $FreeBSD$
# Not specified by POSIX.
a=`echo -e '\a\b\e\f\n\r\t\v\\\\\0041\c'; echo .`
b=`printf '\a\b\033\f\n\r\t\v\\\\!.'`
[ "$a" = "$b" ]

View File

@ -0,0 +1,5 @@
# $FreeBSD$
# Not specified by POSIX.
[ "`echo -e 'a\cb' c; echo d`" = "ad" ]

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -117,7 +117,7 @@ static void
usage(void)
{
(void) fprintf(stderr,
"Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"Usage: %s [-CumMdibcsDvhLXFPAG] [-t txg] [-e [-p path...]] "
"[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n"
" %s [-divPA] [-e -p path...] [-U config] dataset "
"[object...]\n"
@ -178,12 +178,23 @@ usage(void)
(void) fprintf(stderr, " -I <number of inflight I/Os> -- "
"specify the maximum number of "
"checksumming I/Os [default is 200]\n");
(void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before "
"exiting\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
"to make only that option verbose\n");
(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
exit(1);
}
static void
dump_debug_buffer()
{
if (dump_opt['G']) {
(void) printf("\n");
zfs_dbgmsg_print("zdb");
}
}
/*
* Called for usage errors that are discovered after a call to spa_open(),
* dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
@ -200,6 +211,8 @@ fatal(const char *fmt, ...)
va_end(ap);
(void) fprintf(stderr, "\n");
dump_debug_buffer();
exit(1);
}
@ -1289,7 +1302,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
}
if (!err)
ASSERT3U(fill, ==, BP_GET_FILL(bp));
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
return (err);
@ -3103,8 +3116,10 @@ dump_zpool(spa_t *spa)
if (dump_opt['h'])
dump_history(spa);
if (rc != 0)
if (rc != 0) {
dump_debug_buffer();
exit(rc);
}
}
#define ZDB_FLAG_CHECKSUM 0x0001
@ -3575,7 +3590,7 @@ main(int argc, char **argv)
spa_config_path = spa_config_path_env;
while ((c = getopt(argc, argv,
"bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) {
"bcdhilmMI:suCDRSAFLXx:evp:t:U:PG")) != -1) {
switch (c) {
case 'b':
case 'c':
@ -3591,6 +3606,7 @@ main(int argc, char **argv)
case 'M':
case 'R':
case 'S':
case 'G':
dump_opt[c]++;
dump_all = 0;
break;
@ -3826,6 +3842,8 @@ main(int argc, char **argv)
fuid_table_destroy();
sa_loaded = B_FALSE;
dump_debug_buffer();
libzfs_fini(g_zfs);
kernel_fini();
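
Taken together, the zdb hunks above add a -G option: the zfs_dbgmsg debug buffer is printed before normal exit, from fatal(), and when dump_zpool() fails. A hypothetical debugging run (pool name assumed) would look like:

zdb -G tank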

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
@ -189,6 +189,7 @@ extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@ -4792,7 +4793,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
char path0[MAXPATHLEN];
char pathrand[MAXPATHLEN];
size_t fsize;
int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int bshift = SPA_MAXBLOCKSHIFT + 2;
int iters = 1000;
int maxfaults;
int mirror_save;
@ -4953,11 +4954,58 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
fsize = lseek(fd, 0, SEEK_END);
while (--iters != 0) {
/*
* The offset must be chosen carefully to ensure that
* we do not inject a given logical block with errors
* on two different leaf devices, because ZFS can not
* tolerate that (if maxfaults==1).
*
* We divide each leaf into chunks of size
* (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk
* there is a series of ranges to which we can inject errors.
* Each range can accept errors on only a single leaf vdev.
* The error injection ranges are separated by ranges in
* which we will not inject errors on any device (DMZs).
* Each DMZ must be large enough such that a single block
* can not straddle it, so that a single block can not be
* a target in two different injection ranges (on different
* leaf vdevs).
*
* For example, with 3 leaves, each chunk looks like:
* 0 to 32M: injection range for leaf 0
* 32M to 64M: DMZ - no injection allowed
* 64M to 96M: injection range for leaf 1
* 96M to 128M: DMZ - no injection allowed
* 128M to 160M: injection range for leaf 2
* 160M to 192M: DMZ - no injection allowed
*/
offset = ztest_random(fsize / (leaves << bshift)) *
(leaves << bshift) + (leaf << bshift) +
(ztest_random(1ULL << (bshift - 1)) & -8ULL);
if (offset >= fsize)
continue;
/*
* Only allow damage to the labels at one end of the vdev.
*
* If all labels are damaged, the device will be totally
* inaccessible, which will result in loss of data,
* because we also damage (parts of) the other side of
* the mirror/raidz.
*
* Additionally, we will always have both an even and an
* odd label, so that we can handle crashes in the
* middle of vdev_config_sync().
*/
if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
continue;
/*
* The two end labels are stored at the "end" of the disk, but
* the end of the disk (vdev_psize) is aligned to
* sizeof (vdev_label_t).
*/
uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
if ((leaf & 1) == 1 &&
offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
continue;
VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
@ -5021,9 +5069,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
return;
}
dmu_objset_stats_t dds;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
dmu_objset_fast_stat(os, &dds);
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
object = od[0].od_object;
blocksize = od[0].od_blocksize;
pattern = zs->zs_guid ^ dmu_objset_fsid_guid(os);
pattern = zs->zs_guid ^ dds.dds_guid;
ASSERT(object != 0);
@ -5355,6 +5408,12 @@ ztest_resume_thread(void *arg)
if (spa_suspended(spa))
ztest_resume(spa);
(void) poll(NULL, 0, 100);
/*
* Periodically change the zfs_compressed_arc_enabled setting.
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);
}
return (NULL);
}
@ -5620,9 +5679,13 @@ ztest_run(ztest_shared_t *zs)
metaslab_preload_limit = ztest_random(20) + 1;
ztest_spa = spa;
dmu_objset_stats_t dds;
VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
DMU_OST_ANY, B_TRUE, FTAG, &os));
zs->zs_guid = dmu_objset_fsid_guid(os);
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
dmu_objset_fast_stat(os, &dds);
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
zs->zs_guid = dds.dds_guid;
dmu_objset_disown(os, FTAG);
spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
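
As a minimal standalone sketch (not part of the commit), the chunk layout described in the injection comment above can be reproduced assuming SPA_MAXBLOCKSHIFT == 24 (a 16MB maximum block), so bshift = 26, with three leaves:

#include <stdio.h>
#include <stdint.h>

int main(void) {
    const int bshift = 24 + 2;                   /* SPA_MAXBLOCKSHIFT + 2 (assumed) */
    const uint64_t leaves = 3;
    const uint64_t range = 1ULL << (bshift - 1); /* 32MB injection range */
    const uint64_t slot = 1ULL << bshift;        /* 64MB per-leaf slot */

    for (uint64_t leaf = 0; leaf < leaves; leaf++) {
        uint64_t start = leaf << bshift;
        printf("%3lluM to %3lluM: injection range for leaf %llu\n",
            (unsigned long long)(start >> 20),
            (unsigned long long)((start + range) >> 20),
            (unsigned long long)leaf);
        printf("%3lluM to %3lluM: DMZ - no injection allowed\n",
            (unsigned long long)((start + range) >> 20),
            (unsigned long long)((start + slot) >> 20));
    }
    return (0);
}

This prints the same 0-32M / 32-64M / ... table as the three-leaf example in the comment.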

View File

@ -102,6 +102,8 @@ read_client_conf(void)
[top_level_config.requested_option_count++] = DHO_HOST_NAME;
top_level_config.requested_options
[top_level_config.requested_option_count++] = DHO_DOMAIN_SEARCH;
top_level_config.requested_options
[top_level_config.requested_option_count++] = DHO_INTERFACE_MTU;
if ((cfile = fopen(path_dhclient_conf, "r")) != NULL) {
do {

View File

@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
#include "privsep.h"
#include <sys/capsicum.h>
#include <sys/endian.h>
#include <net80211/ieee80211_freebsd.h>
@ -132,6 +133,9 @@ int fork_privchld(int, int);
((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
#define ADVANCE(x, n) (x += ROUNDUP((n)->sa_len))
/* Minimum MTU is 68 as per RFC791, p. 24 */
#define MIN_MTU 68
static time_t scripttime;
int
@ -798,9 +802,20 @@ dhcpack(struct packet *packet)
void
bind_lease(struct interface_info *ip)
{
struct option_data *opt;
/* Remember the medium. */
ip->client->new->medium = ip->client->medium;
opt = &ip->client->new->options[DHO_INTERFACE_MTU];
if (opt->len == sizeof(u_int16_t)) {
u_int16_t mtu = be16dec(opt->data);
if (mtu < MIN_MTU)
warning("mtu size %u < %d: ignored", (unsigned)mtu, MIN_MTU);
else
interface_set_mtu_unpriv(privfd, mtu);
}
/* Write out the new lease. */
write_client_lease(ip, ip->client->new, 0);
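
A small self-contained sketch of the option check above: the interface-MTU option (DHO_INTERFACE_MTU, option code 26 per RFC 2132) carries a two-octet value in network byte order, and values below the RFC 791 floor of 68 are ignored. The buffer contents here are hypothetical:

#include <stdio.h>
#include <stdint.h>

#define MIN_MTU 68	/* RFC 791 minimum */

int main(void) {
    /* hypothetical option payload: 0x05dc == 1500, big-endian */
    uint8_t data[2] = { 0x05, 0xdc };
    uint16_t mtu = (uint16_t)((data[0] << 8) | data[1]);	/* what be16dec() does */

    if (mtu < MIN_MTU)
        printf("mtu size %u < %d: ignored\n", mtu, MIN_MTU);
    else
        printf("would set interface MTU to %u\n", mtu);
    return (0);
}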

View File

@ -319,6 +319,8 @@ void cancel_timeout(void (*)(void *), void *);
void add_protocol(char *, int, void (*)(struct protocol *), void *);
void remove_protocol(struct protocol *);
int interface_link_status(char *);
void interface_set_mtu_unpriv(int, u_int16_t);
void interface_set_mtu_priv(char *, u_int16_t);
/* hash.c */
struct hash_table *new_hash(void);

View File

@ -43,6 +43,7 @@
__FBSDID("$FreeBSD$");
#include "dhcpd.h"
#include "privsep.h"
#include <sys/ioctl.h>
@ -501,3 +502,46 @@ interface_link_status(char *ifname)
}
return (1);
}
void
interface_set_mtu_unpriv(int privfd, u_int16_t mtu)
{
struct imsg_hdr hdr;
struct buf *buf;
int errs = 0;
hdr.code = IMSG_SET_INTERFACE_MTU;
hdr.len = sizeof(hdr) +
sizeof(u_int16_t);
if ((buf = buf_open(hdr.len)) == NULL)
error("buf_open: %m");
errs += buf_add(buf, &hdr, sizeof(hdr));
errs += buf_add(buf, &mtu, sizeof(mtu));
if (errs)
error("buf_add: %m");
if (buf_close(privfd, buf) == -1)
error("buf_close: %m");
}
void
interface_set_mtu_priv(char *ifname, u_int16_t mtu)
{
struct ifreq ifr;
int sock;
if ((sock = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
error("Can't create socket");
memset(&ifr, 0, sizeof(ifr));
strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
ifr.ifr_mtu = mtu;
if (ioctl(sock, SIOCSIFMTU, &ifr) == -1)
warning("SIOCSIFMTU failed (%d): %s", mtu,
strerror(errno));
close(sock);
}

View File

@ -111,6 +111,7 @@ dispatch_imsg(struct interface_info *ifi, int fd)
struct client_lease lease;
int ret, i, optlen;
struct buf *buf;
u_int16_t mtu;
buf_read(fd, &hdr, sizeof(hdr));
@ -235,6 +236,13 @@ dispatch_imsg(struct interface_info *ifi, int fd)
case IMSG_SEND_PACKET:
send_packet_priv(ifi, &hdr, fd);
break;
case IMSG_SET_INTERFACE_MTU:
if (hdr.len < sizeof(hdr) + sizeof(u_int16_t))
error("corrupted message received");
buf_read(fd, &mtu, sizeof(u_int16_t));
interface_set_mtu_priv(ifi->name, mtu);
break;
default:
error("received unknown message, code %d", hdr.code);
}

View File

@ -36,7 +36,8 @@ enum imsg_code {
IMSG_SCRIPT_WRITE_PARAMS,
IMSG_SCRIPT_GO,
IMSG_SCRIPT_GO_RET,
IMSG_SEND_PACKET
IMSG_SEND_PACKET,
IMSG_SET_INTERFACE_MTU,
};
struct imsg_hdr {

View File

@ -5178,6 +5178,14 @@ pmap_is_referenced(vm_page_t m)
* XXX: The exact number of bits to check and clear is a matter that
* should be tested and standardized at some point in the future for
* optimal aging of shared pages.
*
* As an optimization, update the page's dirty field if a modified bit is
* found while counting reference bits. This opportunistic update can be
* performed at low cost and can eliminate the need for some future calls
* to pmap_is_modified(). However, since this function stops after
* finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
* dirty pages. Those dirty pages will only be detected by a future call
* to pmap_is_modified().
*/
int
pmap_ts_referenced(vm_page_t m)
@ -5186,7 +5194,7 @@ pmap_ts_referenced(vm_page_t m)
pv_entry_t pv, pvf;
pmap_t pmap;
pt1_entry_t *pte1p, opte1;
pt2_entry_t *pte2p;
pt2_entry_t *pte2p, opte2;
vm_paddr_t pa;
int rtval = 0;
@ -5205,6 +5213,14 @@ pmap_ts_referenced(vm_page_t m)
PMAP_LOCK(pmap);
pte1p = pmap_pte1(pmap, pv->pv_va);
opte1 = pte1_load(pte1p);
if (pte1_is_dirty(opte1)) {
/*
* Although "opte1" is mapping a 1MB page, because
* this function is called at a 4KB page granularity,
* we only update the 4KB page under test.
*/
vm_page_dirty(m);
}
if ((opte1 & PTE1_A) != 0) {
/*
* Since this reference bit is shared by 256 4KB pages,
@ -5253,7 +5269,10 @@ pmap_ts_referenced(vm_page_t m)
("%s: not found a link in page %p's pv list", __func__, m));
pte2p = pmap_pte2_quick(pmap, pv->pv_va);
if ((pte2_load(pte2p) & PTE2_A) != 0) {
opte2 = pte2_load(pte2p);
if (pte2_is_dirty(opte2))
vm_page_dirty(m);
if ((opte2 & PTE2_A) != 0) {
pte2_clear_bit(pte2p, PTE2_A);
pmap_tlb_flush(pmap, pv->pv_va);
rtval++;
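
A toy model of the early-termination behavior the comment above warns about: dirty bits are harvested only for PTEs the scan actually visits, and the scan stops once PMAP_TS_REFERENCED_MAX reference bits are found, so a later dirty PTE goes unnoticed until pmap_is_modified(). The bit positions and the cap value are assumptions for illustration:

#include <stdio.h>
#include <stdint.h>

#define PTE_A			(1u << 4)	/* accessed bit (assumed layout) */
#define PTE_D			(1u << 5)	/* dirty bit (assumed layout) */
#define PMAP_TS_REFERENCED_MAX	5		/* assumed cap */

int main(void) {
    uint32_t ptes[] = { PTE_A | PTE_D, PTE_A, PTE_A, PTE_A, PTE_A,
        PTE_A | PTE_D };	/* last dirty PTE lies past the cap */
    int rtval = 0, harvested = 0;

    for (size_t i = 0; i < sizeof(ptes) / sizeof(ptes[0]); i++) {
        if (ptes[i] & PTE_D)
            harvested++;		/* stands in for vm_page_dirty(m) */
        if (ptes[i] & PTE_A) {
            ptes[i] &= ~PTE_A;	/* clear + TLB flush in the real code */
            if (++rtval >= PMAP_TS_REFERENCED_MAX)
                break;
        }
    }
    printf("referenced=%d dirty-harvested=%d (one dirty PTE missed)\n",
        rtval, harvested);
    return (0);
}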

View File

@ -55,7 +55,6 @@ SECTIONS
{
*(.data)
*(.gnu.linkonce.d*)
CONSTRUCTORS
}
.data1 : { *(.data1) }
.got1 : { *(.got1) }

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@ -130,6 +130,26 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
{ zfs_acl_byteswap, "acl" }
};
int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp)
{
uint64_t blkid;
dmu_buf_impl_t *db;
blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
if (db == NULL) {
*dbp = NULL;
return (SET_ERROR(EIO));
}
*dbp = &db->db;
return (0);
}
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp)
@ -157,6 +177,29 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
return (err);
}
int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
if (err != 0) {
dbuf_rele(db, tag);
*dbp = NULL;
}
}
return (err);
}
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
@ -1407,7 +1450,7 @@ void
dmu_return_arcbuf(arc_buf_t *buf)
{
arc_return_buf(buf, FTAG);
VERIFY(arc_buf_remove_ref(buf, FTAG));
arc_buf_destroy(buf, FTAG);
}
/*
@ -1763,8 +1806,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
zio_nowait(arc_write(pio, os->os_spa, txg,
bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
NULL, NULL, dmu_sync_done, dsa,
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);
@ -2137,11 +2179,11 @@ dmu_init(void)
xuio_stat_init();
dmu_objset_init();
dnode_init();
dbuf_init();
zfetch_init();
zio_compress_init();
l2arc_init();
arc_init();
dbuf_init();
}
void

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/dmu.h>
@ -169,7 +169,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (err)
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
if (err)
return (err);
/* Don't care about the data blocks */

View File

@ -316,8 +316,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_FLAG_L2CACHE;
if (DMU_OS_IS_L2COMPRESSIBLE(os))
aflags |= ARC_FLAG_L2COMPRESS;
dprintf_bp(os->os_rootbp, "reading %s", "");
err = arc_read(NULL, spa, os->os_rootbp,
@ -334,14 +332,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
arc_buf_t *buf = arc_buf_alloc(spa,
arc_buf_t *buf = arc_alloc_buf(spa,
sizeof (objset_phys_t), &os->os_phys_buf,
ARC_BUFC_METADATA);
bzero(buf->b_data, sizeof (objset_phys_t));
bcopy(os->os_phys_buf->b_data, buf->b_data,
arc_buf_size(os->os_phys_buf));
(void) arc_buf_remove_ref(os->os_phys_buf,
&os->os_phys_buf);
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
os->os_phys_buf = buf;
}
@ -350,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
os->os_phys_buf = arc_buf_alloc(spa, size,
os->os_phys_buf = arc_alloc_buf(spa, size,
&os->os_phys_buf, ARC_BUFC_METADATA);
os->os_phys = os->os_phys_buf->b_data;
bzero(os->os_phys, size);
@ -428,8 +425,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
&os->os_phys_buf));
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
kmem_free(os, sizeof (objset_t));
return (err);
}
@ -731,7 +727,7 @@ dmu_objset_evict_done(objset_t *os)
}
zil_free(os->os_zil);
VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
/*
* This is a barrier to prevent the objset from going away in
@ -1128,7 +1124,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
zio = arc_write(pio, os->os_spa, tx->tx_txg,
os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
DMU_OS_IS_L2COMPRESSIBLE(os),
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

View File

@ -160,11 +160,16 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
fletcher_4_incremental_native(dsp->dsa_drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
&dsp->dsa_zc);
if (dsp->dsa_drr->drr_type != DRR_BEGIN) {
if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
dsp->dsa_sent_begin = B_TRUE;
} else {
ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
drr_checksum.drr_checksum));
dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
}
if (dsp->dsa_drr->drr_type == DRR_END) {
dsp->dsa_sent_end = B_TRUE;
}
fletcher_4_incremental_native(&dsp->dsa_drr->
drr_u.drr_checksum.drr_checksum,
sizeof (zio_cksum_t), &dsp->dsa_zc);
@ -634,7 +639,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
if (err != 0)
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
} else if (type == DMU_OT_SA) {
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
@ -646,7 +651,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
return (SET_ERROR(EIO));
err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
} else if (backup_do_embed(dsa, bp)) {
/* it's an embedded level-0 block of a regular object */
int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
@ -670,7 +675,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
&aflags, zb) != 0) {
if (zfs_send_corrupt_data) {
/* Send a block filled with 0x"zfs badd bloc" */
abuf = arc_buf_alloc(spa, blksz, &abuf,
abuf = arc_alloc_buf(spa, blksz, &abuf,
ARC_BUFC_DATA);
uint64_t *ptr;
for (ptr = abuf->b_data;
@ -700,7 +705,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
err = dump_write(dsa, type, zb->zb_object,
offset, blksz, bp, abuf->b_data);
}
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
}
ASSERT(err == 0 || err == EINTR);
@ -912,6 +917,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
list_remove(&to_ds->ds_sendstreams, dsp);
mutex_exit(&to_ds->ds_sendstream_lock);
VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
kmem_free(drr, sizeof (dmu_replay_record_t));
kmem_free(dsp, sizeof (dmu_sendarg_t));
@ -3106,6 +3113,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_phys(origin_head)->ds_flags &=
~DS_FLAG_INCONSISTENT;
drc->drc_newsnapobj =
dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
dsl_dataset_rele(origin_head, FTAG);
dsl_destroy_head_sync_impl(drc->drc_ds, tx);
@ -3141,8 +3151,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TONAME, tx);
}
drc->drc_newsnapobj =
dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
}
drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
/*
* Release the hold from dmu_recv_begin. This must be done before
* we return to open context, so that when we free the dataset's dnode,
@ -3184,8 +3195,6 @@ static int dmu_recv_end_modified_blocks = 3;
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
int error;
#ifdef _KERNEL
/*
* We will be destroying the ds; make sure its origin is unmounted if
@ -3196,23 +3205,30 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc)
zfs_destroy_unmount_origin(name);
#endif
error = dsl_sync_task(drc->drc_tofs,
return (dsl_sync_task(drc->drc_tofs,
dmu_recv_end_check, dmu_recv_end_sync, drc,
dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
if (error != 0)
dmu_recv_cleanup_ds(drc);
return (error);
dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}
static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
return (dsl_sync_task(drc->drc_tofs,
dmu_recv_end_check, dmu_recv_end_sync, drc,
dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}
int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
int error;
error = dsl_sync_task(drc->drc_tofs,
dmu_recv_end_check, dmu_recv_end_sync, drc,
dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
drc->drc_owner = owner;
if (drc->drc_newfs)
error = dmu_recv_new_end(drc);
else
error = dmu_recv_existing_end(drc);
if (error != 0) {
dmu_recv_cleanup_ds(drc);
@ -3224,17 +3240,6 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc)
return (error);
}
int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
drc->drc_owner = owner;
if (drc->drc_newfs)
return (dmu_recv_new_end(drc));
else
return (dmu_recv_existing_end(drc));
}
/*
* Return TRUE if this objset is currently being received into.
*/

View File

@ -380,7 +380,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
}
if (buf)
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
post:
if (err == 0 && (td->td_flags & TRAVERSE_POST))
@ -595,7 +595,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
osp = buf->b_data;
traverse_zil(&td, &osp->os_zil_header);
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
if (!(flags & TRAVERSE_PREFETCH_DATA) ||

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -808,15 +808,14 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
* access the name in this fat-zap so that we'll check
* for i/o errors to the leaf blocks, etc.
*/
err = zap_lookup(dn->dn_objset, dn->dn_object, name,
8, 0, NULL);
err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
if (err == EIO) {
tx->tx_err = err;
return;
}
}
err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
err = zap_count_write_by_dnode(dn, name, add,
&txh->txh_space_towrite, &txh->txh_space_tooverwrite);
/*

View File

@ -512,7 +512,7 @@ dnode_destroy(dnode_t *dn)
}
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
}
dn->dn_zio = NULL;

View File

@ -413,7 +413,7 @@ dnode_evict_dbufs(dnode_t *dn)
avl_insert_here(&dn->dn_dbufs, &db_marker, db,
AVL_BEFORE);
dbuf_clear(db);
dbuf_destroy(db);
db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
avl_remove(&dn->dn_dbufs, &db_marker);
@ -435,7 +435,7 @@ dnode_evict_bonus(dnode_t *dn)
if (dn->dn_bonus != NULL) {
if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
} else {
dn->dn_bonus->db_pending_evict = TRUE;

View File

@ -1060,19 +1060,6 @@ dsl_dataset_get_blkptr(dsl_dataset_t *ds)
return (&dsl_dataset_phys(ds)->ds_bp);
}
void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
/* If it's the meta-objset, set dp_meta_rootbp */
if (ds == NULL) {
tx->tx_pool->dp_meta_rootbp = *bp;
} else {
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_bp = *bp;
}
}
spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{

View File

@ -679,7 +679,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
dsl_scan_visitbp(cbp, &czb, dnp,
ds, scn, ostype, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
arc_flags_t flags = ARC_FLAG_WAIT;
dnode_phys_t *cdnp;
@ -705,7 +705,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
cdnp, zb->zb_blkid * epb + i, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
@ -737,7 +737,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
&osp->os_userused_dnode,
DMU_USERUSED_OBJECT, tx);
}
(void) arc_buf_remove_ref(buf, &buf);
arc_buf_destroy(buf, &buf);
}
return (0);

View File

@ -38,17 +38,8 @@
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
/*
* Allow allocations to switch to gang blocks quickly. We do this to
* avoid having to load lots of space_maps in a given txg. There are,
* however, some cases where we want to avoid "fast" ganging and instead
* we want to do an exhaustive search of all metaslabs on this device.
* Currently we don't allow any gang, slog, or dump device related allocations
* to "fast" gang.
*/
#define CAN_FASTGANG(flags) \
(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
METASLAB_GANG_AVOID)))
#define GANG_ALLOCATION(flags) \
((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
@ -256,6 +247,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
refcount_create_tracked(&mc->mc_alloc_slots);
return (mc);
}
@ -269,6 +262,8 @@ metaslab_class_destroy(metaslab_class_t *mc)
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
refcount_destroy(&mc->mc_alloc_slots);
mutex_destroy(&mc->mc_lock);
kmem_free(mc, sizeof (metaslab_class_t));
}
@ -468,7 +463,13 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
continue;
}
space += tvd->vdev_max_asize - tvd->vdev_asize;
/*
* Calculate if we have enough space to add additional
* metaslabs. We report the expandable space in terms
* of the metaslab size since that's the unit of expansion.
*/
space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
1ULL << tvd->vdev_ms_shift);
}
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (space);
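
A minimal sketch of the rounding the new comment describes, with assumed numbers: with 1GB metaslabs, 1.5GB of raw headroom reports only 1GB of expandable space. P2ALIGN here matches the usual sysmacros.h definition:

#include <stdio.h>
#include <stdint.h>

#define P2ALIGN(x, align)	((x) & -(align))	/* align down, power of 2 */

int main(void) {
    uint64_t ms_shift = 30;			/* 1GB metaslabs (assumed) */
    uint64_t headroom = 1536ULL << 20;	/* 1.5GB of raw headroom */

    printf("expandable space reported: %llu MB\n",
        (unsigned long long)(P2ALIGN(headroom, 1ULL << ms_shift) >> 20));
    return (0);
}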
@ -506,9 +507,10 @@ metaslab_compare(const void *x1, const void *x2)
/*
* Update the allocatable flag and the metaslab group's capacity.
* The allocatable flag is set to true if the capacity is below
* the zfs_mg_noalloc_threshold. If a metaslab group transitions
* from allocatable to non-allocatable or vice versa then the metaslab
* group's class is updated to reflect the transition.
* the zfs_mg_noalloc_threshold or has a fragmentation value that is
* greater than zfs_mg_fragmentation_threshold. If a metaslab group
* transitions from allocatable to non-allocatable or vice versa then the
* metaslab group's class is updated to reflect the transition.
*/
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
@ -517,22 +519,45 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
metaslab_class_t *mc = mg->mg_class;
vdev_stat_t *vs = &vd->vdev_stat;
boolean_t was_allocatable;
boolean_t was_initialized;
ASSERT(vd == vd->vdev_top);
mutex_enter(&mg->mg_lock);
was_allocatable = mg->mg_allocatable;
was_initialized = mg->mg_initialized;
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
(vs->vs_space + 1);
mutex_enter(&mc->mc_lock);
/*
* If the metaslab group was just added then it won't
* have any space until we finish syncing out this txg.
* At that point we will consider it initialized and available
* for allocations. We also don't consider non-activated
* metaslab groups (e.g. vdevs that are in the middle of being removed)
* to be initialized, because they can't be used for allocation.
*/
mg->mg_initialized = metaslab_group_initialized(mg);
if (!was_initialized && mg->mg_initialized) {
mc->mc_groups++;
} else if (was_initialized && !mg->mg_initialized) {
ASSERT3U(mc->mc_groups, >, 0);
mc->mc_groups--;
}
if (mg->mg_initialized)
mg->mg_no_free_space = B_FALSE;
/*
* A metaslab group is considered allocatable if it has plenty
* of free space or is not heavily fragmented. We only take
* fragmentation into account if the metaslab group has a valid
* fragmentation metric (i.e. a value between 0 and 100).
*/
mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
mg->mg_allocatable = (mg->mg_activation_count > 0 &&
mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
@ -555,6 +580,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
mc->mc_alloc_groups--;
else if (!was_allocatable && mg->mg_allocatable)
mc->mc_alloc_groups++;
mutex_exit(&mc->mc_lock);
mutex_exit(&mg->mg_lock);
}
@ -571,6 +597,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
mg->mg_vd = vd;
mg->mg_class = mc;
mg->mg_activation_count = 0;
mg->mg_initialized = B_FALSE;
mg->mg_no_free_space = B_TRUE;
refcount_create_tracked(&mg->mg_alloc_queue_depth);
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
@ -593,6 +622,7 @@ metaslab_group_destroy(metaslab_group_t *mg)
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
refcount_destroy(&mg->mg_alloc_queue_depth);
kmem_free(mg, sizeof (metaslab_group_t));
}
@ -664,6 +694,15 @@ metaslab_group_passivate(metaslab_group_t *mg)
metaslab_class_minblocksize_update(mc);
}
boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
vdev_stat_t *vs = &vd->vdev_stat;
return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
@ -833,30 +872,97 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
* group should avoid allocations if its free capacity is less than the
* zfs_mg_noalloc_threshold or its fragmentation metric is greater than
* zfs_mg_fragmentation_threshold and there is at least one metaslab group
* that can still handle allocations.
* that can still handle allocations. If the allocation throttle is enabled
* then we skip allocations to devices that have reached their maximum
* allocation queue depth unless the selected metaslab group is the only
* eligible group remaining.
*/
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg)
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
uint64_t psize)
{
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
/*
* We use two key metrics to determine if a metaslab group is
* considered allocatable -- free space and fragmentation. If
* the free space is greater than the free space threshold and
* the fragmentation is less than the fragmentation threshold then
* consider the group allocatable. There are two cases when we will
* not consider these key metrics. The first is if the group is
* associated with a slog device and the second is if all groups
* in this metaslab class have already been considered ineligible
* We can only consider skipping this metaslab group if it's
* in the normal metaslab class and there are other metaslab
* groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
return (B_TRUE);
/*
* If the metaslab group's mg_allocatable flag is set (see comments
* in metaslab_group_alloc_update() for more information) and
* the allocation throttle is disabled then allow allocations to this
* device. However, if the allocation throttle is enabled then
* check if we have reached our allocation limit (mg_alloc_queue_depth)
* to determine if we should allow allocations to this metaslab group.
* If all metaslab groups are no longer considered allocatable
* (mc_alloc_groups == 0) or we're trying to allocate the smallest
* gang block size then we allow allocations on this metaslab group
* regardless of the mg_allocatable or throttle settings.
*/
if (mg->mg_allocatable) {
metaslab_group_t *mgp;
int64_t qdepth;
uint64_t qmax = mg->mg_max_alloc_queue_depth;
if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
/*
* If this metaslab group does not have any free space, then
* there is no point in looking further.
*/
if (mg->mg_no_free_space)
return (B_FALSE);
qdepth = refcount_count(&mg->mg_alloc_queue_depth);
/*
* If this metaslab group is below its qmax or it's
* the only allocatable metaslab group, then attempt
* to allocate from it.
*/
if (qdepth < qmax || mc->mc_alloc_groups == 1)
return (B_TRUE);
ASSERT3U(mc->mc_alloc_groups, >, 1);
/*
* Since this metaslab group is at or over its qmax, we
* need to determine if there are metaslab groups after this
* one that might be able to handle this allocation. This is
* racy since we can't hold the locks for all metaslab
* groups at the same time when we make this check.
*/
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
qmax = mgp->mg_max_alloc_queue_depth;
qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
/*
* If there is another metaslab group that
* might be able to handle the allocation, then
* we return false so that we skip this group.
*/
if (qdepth < qmax && !mgp->mg_no_free_space)
return (B_FALSE);
}
/*
* We didn't find another group to handle the allocation
* so we can't skip this metaslab group even though
* we are at or over our qmax.
*/
return (B_TRUE);
} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
return (B_TRUE);
}
return (B_FALSE);
}
/*
@ -2124,8 +2230,57 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
return (0);
}
/*
* ==========================================================================
* Metaslab block operations
* ==========================================================================
*/
static void
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
flags & METASLAB_DONT_THROTTLE)
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
}
void
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
flags & METASLAB_DONT_THROTTLE)
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
}
void
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
{
#ifdef ZFS_DEBUG
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
for (int d = 0; d < ndvas; d++) {
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
}
#endif
}
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
spa_t *spa = mg->mg_vd->vdev_spa;
@ -2152,10 +2307,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
if (msp->ms_weight < asize) {
spa_dbgmsg(spa, "%s: failed to meet weight "
"requirement: vdev %llu, txg %llu, mg %p, "
"msp %p, psize %llu, asize %llu, "
"msp %p, asize %llu, "
"weight %llu", spa_name(spa),
mg->mg_vd->vdev_id, txg,
mg, msp, psize, asize, msp->ms_weight);
mg, msp, asize, msp->ms_weight);
mutex_exit(&mg->mg_lock);
return (-1ULL);
}
@ -2237,7 +2392,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
msp->ms_access_txg = txg + metaslab_unload_delay;
mutex_exit(&msp->ms_lock);
return (offset);
}
@ -2254,7 +2408,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
int all_zero;
int zio_lock = B_FALSE;
boolean_t allocatable;
uint64_t offset = -1ULL;
uint64_t asize;
uint64_t distance;
@ -2324,7 +2477,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
all_zero = B_TRUE;
do {
ASSERT(mg->mg_activation_count == 1);
vd = mg->mg_vd;
/*
@ -2340,24 +2492,23 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* Determine if the selected metaslab group is eligible
* for allocations. If we're ganging or have requested
* an allocation for the smallest gang block size
* then we don't want to avoid allocating to this
* metaslab group. If we're in this condition we should
* try to allocate from any device possible so that we
* don't inadvertently return ENOSPC and suspend the pool
* for allocations. If we're ganging then don't allow
* this metaslab group to skip allocations since that would
* inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
if (allocatable && CAN_FASTGANG(flags) &&
psize > SPA_GANGBLOCKSIZE)
allocatable = metaslab_group_allocatable(mg);
if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
allocatable = metaslab_group_allocatable(mg, rotor,
psize);
}
if (!allocatable)
goto next;
ASSERT(mg->mg_initialized);
/*
* Avoid writing single-copy data to a failing vdev
* unless the user instructs us that it is okay.
* Avoid writing single-copy data to a failing vdev.
*/
if ((vd->vdev_stat.vs_write_errors > 0 ||
vd->vdev_state < VDEV_STATE_HEALTHY) &&
@ -2377,8 +2528,32 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
dva, d);
uint64_t offset = metaslab_group_alloc(mg, asize, txg,
distance, dva, d);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
mg->mg_failed_allocations++;
if (asize == SPA_GANGBLOCKSIZE) {
/*
* This metaslab group was unable to allocate
* the minimum gang block size so it must be
* out of space. We must notify the allocation
* throttle to start skipping allocation
* attempts to this metaslab group until more
* space becomes available.
*
* Note: this failure cannot be caused by the
* allocation throttle since the allocation
* throttle is only responsible for skipping
* devices and not failing block allocations.
*/
mg->mg_no_free_space = B_TRUE;
}
}
mg->mg_allocations++;
mutex_exit(&mg->mg_lock);
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
@ -2559,9 +2734,57 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
return (0);
}
/*
* Reserve some allocation slots. The reservation system must be called
* before we call into the allocator. If there aren't any available slots
* then the I/O will be throttled until an I/O completes and its slots are
* freed up. The function returns true if it was successful in placing
* the reservation.
*/
boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
int flags)
{
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
if (reserved_slots < mc->mc_alloc_max_slots)
available_slots = mc->mc_alloc_max_slots - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++) {
reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
}
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
}
mutex_exit(&mc->mc_lock);
return (slot_reserved);
}
void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
{
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++) {
(void) refcount_remove(&mc->mc_alloc_slots, zio);
}
mutex_exit(&mc->mc_lock);
}
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
@ -2587,11 +2810,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
metaslab_group_alloc_decrement(spa,
DVA_GET_VDEV(&dva[d]), zio, flags);
bzero(&dva[d], sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (error);
} else {
/*
* Update the metaslab group's queue depth
* based on the newly allocated dva.
*/
metaslab_group_alloc_increment(spa,
DVA_GET_VDEV(&dva[d]), zio, flags);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
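
A toy model of the slot accounting in metaslab_class_throttle_reserve() above, with made-up numbers: a request (one slot per DVA; the loop above reserves them individually) is admitted if enough unreserved slots remain, and gang allocations are always admitted so a gang header cannot deadlock behind the throttle:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

int main(void) {
    uint64_t max_slots = 4;	/* mc_alloc_max_slots (assumed) */
    uint64_t reserved = 3;	/* refcount_count(&mc->mc_alloc_slots) */
    int slots = 2;		/* slots wanted, one per DVA */
    bool gang = false;		/* GANG_ALLOCATION(flags) */

    uint64_t available = (reserved < max_slots) ? max_slots - reserved : 0;
    if ((uint64_t)slots <= available || gang)
        printf("reserved %d slots\n", slots);
    else
        printf("throttled: %d wanted, %llu available\n",
            slots, (unsigned long long)available);
    return (0);
}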

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -72,6 +72,13 @@ refcount_create(refcount_t *rc)
rc->rc_tracked = reference_tracking_enable;
}
void
refcount_create_tracked(refcount_t *rc)
{
refcount_create(rc);
rc->rc_tracked = B_TRUE;
}
void
refcount_create_untracked(refcount_t *rc)
{
@ -231,4 +238,84 @@ refcount_transfer(refcount_t *dst, refcount_t *src)
list_destroy(&removed);
}
void
refcount_transfer_ownership(refcount_t *rc, void *current_holder,
void *new_holder)
{
reference_t *ref;
boolean_t found = B_FALSE;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return;
}
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == current_holder) {
ref->ref_holder = new_holder;
found = B_TRUE;
break;
}
}
ASSERT(found);
mutex_exit(&rc->rc_mtx);
}
/*
* If tracking is enabled, return true if a reference exists that matches
* the "holder" tag. If tracking is disabled, then return true if a reference
* might be held.
*/
boolean_t
refcount_held(refcount_t *rc, void *holder)
{
reference_t *ref;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (rc->rc_count > 0);
}
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
}
mutex_exit(&rc->rc_mtx);
return (B_FALSE);
}
/*
* If tracking is enabled, return true if a reference does not exist that
* matches the "holder" tag. If tracking is disabled, always return true
* since the reference might not be held.
*/
boolean_t
refcount_not_held(refcount_t *rc, void *holder)
{
reference_t *ref;
mutex_enter(&rc->rc_mtx);
if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
mutex_exit(&rc->rc_mtx);
return (B_FALSE);
}
}
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
#endif /* ZFS_DEBUG */

View File

@ -1332,7 +1332,6 @@ spa_unload(spa_t *spa)
ddt_unload(spa);
/*
* Drop and purge level 2 cache
*/
@ -3720,6 +3719,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_load_state = SPA_LOAD_CREATE;
/*
* Create "The Godfather" zio to hold all async IOs
@ -3905,6 +3905,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
*/
spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
spa->spa_load_state = SPA_LOAD_NONE;
mutex_exit(&spa_namespace_lock);
@ -5615,7 +5616,7 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
nvlist_t *dev_to_remove)
nvlist_t *dev_to_remove)
{
nvlist_t **newdev = NULL;
@ -6830,6 +6831,8 @@ spa_sync(spa_t *spa, uint64_t txg)
vdev_t *vd;
dmu_tx_t *tx;
int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;
VERIFY(spa_writeable(spa));
@ -6841,6 +6844,10 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);
/*
* If there are any pending vdev state changes, convert them
* into config changes that go out with this transaction group.
@ -6899,6 +6906,38 @@ spa_sync(spa_t *spa, uint64_t txg)
}
}
/*
* Set the top-level vdev's max queue depth. Evaluate each
* top-level's async write queue depth in case it changed.
* The max queue depth will not change in the middle of syncing
* out this txg.
*/
uint64_t queue_depth_total = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
!metaslab_group_initialized(mg))
continue;
/*
* It is safe to do a lock-free check here because only async
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
mg->mg_max_alloc_queue_depth = max_queue_depth;
queue_depth_total += mg->mg_max_alloc_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
ASSERT0(refcount_count(&mc->mc_alloc_slots));
mc->mc_alloc_max_slots = queue_depth_total;
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
ASSERT3U(mc->mc_alloc_max_slots, <=,
max_queue_depth * rvd->vdev_children);
/*
* Iterate to convergence.
*/
@ -7056,6 +7095,10 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);
mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);
/*
* Update usable space statistics.
*/
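
A quick sketch of the queue-depth budget computed at the top of spa_sync(), using what I believe are the defaults (zfs_vdev_async_write_max_active = 10, zfs_vdev_queue_depth_pct = 1000 — both assumptions) and a hypothetical four-vdev pool:

#include <stdio.h>
#include <stdint.h>

int main(void) {
    uint32_t zfs_vdev_async_write_max_active = 10;	/* assumed default */
    uint32_t zfs_vdev_queue_depth_pct = 1000;		/* assumed default */
    uint32_t children = 4;				/* hypothetical top-level vdev count */

    uint32_t max_queue_depth =
        zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100;
    printf("per-vdev max alloc queue depth: %u\n", max_queue_depth);
    printf("mc_alloc_max_slots for the pool: %u\n",
        max_queue_depth * children);
    return (0);
}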

View File

@ -657,6 +657,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@ -713,6 +714,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_active_count++;
}
avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
/*
* Every pool starts with the default cachefile
*/
@ -791,6 +795,7 @@ spa_remove(spa_t *spa)
kmem_free(dp, sizeof (spa_config_dirent_t));
}
avl_destroy(&spa->spa_alloc_tree);
list_destroy(&spa->spa_config_list);
nvlist_free(spa->spa_label_features);
@ -824,6 +829,7 @@ spa_remove(spa_t *spa)
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);

View File

@ -43,51 +43,83 @@ extern "C" {
*/
#define ARC_EVICT_ALL -1ULL
#define HDR_SET_LSIZE(hdr, x) do { \
ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
(hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
_NOTE(CONSTCOND) } while (0)
#define HDR_SET_PSIZE(hdr, x) do { \
ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
(hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
_NOTE(CONSTCOND) } while (0)
#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT)
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
typedef int arc_evict_func_t(void *priv);
/* generic arc_done_func_t's which you can use */
arc_done_func_t arc_bcopy_func;
arc_done_func_t arc_getbuf_func;
extern int zfs_arc_num_sublists_per_state;
typedef enum arc_flags
{
/*
* Public flags that can be passed into the ARC by external consumers.
*/
ARC_FLAG_NONE = 1 << 0, /* No flags set */
ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */
ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */
ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */
ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */
ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */
ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */
ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
/*
* Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_hdr_buf_t. These flags should
* only be set by ARC code.
*/
ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */
ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */
ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */
ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */
/* Indicates that block was read with ASYNC priority. */
ARC_FLAG_PRIO_ASYNC_READ = 1 << 14,
ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */
ARC_FLAG_PRIO_ASYNC_READ = 1 << 10,
ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */
ARC_FLAG_BUFC_METADATA = 1 << 18,
ARC_FLAG_BUFC_METADATA = 1 << 14,
/* Flags specifying whether optional hdr struct fields are defined */
ARC_FLAG_HAS_L1HDR = 1 << 19,
ARC_FLAG_HAS_L2HDR = 1 << 20,
ARC_FLAG_HAS_L1HDR = 1 << 15,
ARC_FLAG_HAS_L2HDR = 1 << 16,
/*
* Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
* This allows the l2arc to use the blkptr's checksum to verify
* the data without having to store the checksum in the hdr.
*/
ARC_FLAG_COMPRESSED_ARC = 1 << 17,
ARC_FLAG_SHARED_DATA = 1 << 18,
/*
* The arc buffer's compression mode is stored in the top 7 bits of the
* flags field, so these dummy flags are included so that MDB can
* interpret the enum properly.
*/
ARC_FLAG_COMPRESS_0 = 1 << 24,
ARC_FLAG_COMPRESS_1 = 1 << 25,
ARC_FLAG_COMPRESS_2 = 1 << 26,
ARC_FLAG_COMPRESS_3 = 1 << 27,
ARC_FLAG_COMPRESS_4 = 1 << 28,
ARC_FLAG_COMPRESS_5 = 1 << 29,
ARC_FLAG_COMPRESS_6 = 1 << 30
} arc_flags_t;
struct arc_buf {
@ -95,11 +127,10 @@ struct arc_buf {
arc_buf_t *b_next;
kmutex_t b_evict_lock;
void *b_data;
arc_evict_func_t *b_efunc;
void *b_private;
};
typedef enum arc_buf_contents {
ARC_BUFC_INVALID, /* invalid type */
ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
@ -119,19 +150,17 @@ typedef enum arc_space_type {
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
void arc_buf_destroy(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
void arc_buf_thaw(arc_buf_t *buf);
boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
#ifdef ZFS_DEBUG
int arc_referenced(arc_buf_t *buf);
#endif
@ -140,21 +169,18 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
const zio_prop_t *zp,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
arc_done_func_t *ready, arc_done_func_t *child_ready,
arc_done_func_t *physdone, arc_done_func_t *done,
void *priv, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb);
void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
boolean_t arc_clear_callback(arc_buf_t *buf);
void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
uint64_t arc_max_bytes(void);
void arc_init(void);
void arc_fini(void);
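
The dummy ARC_FLAG_COMPRESS_* values in the enum above exist so MDB can decode the compression function stored in bits 24-30 of the flags word; a sketch of that packing (shift and mask inferred from the dummies, so treat both as assumptions):

#include <stdio.h>
#include <stdint.h>

#define ARC_FLAG_COMPRESS_SHIFT	24
#define ARC_FLAG_COMPRESS_MASK	(0x7fu << ARC_FLAG_COMPRESS_SHIFT)

static uint32_t
set_compress(uint32_t flags, uint32_t comp)
{
    /* replace the 7-bit compression field, leave other flags intact */
    return ((flags & ~ARC_FLAG_COMPRESS_MASK) |
        (comp << ARC_FLAG_COMPRESS_SHIFT));
}

int main(void) {
    uint32_t flags = set_compress(1u << 4, 3);	/* hypothetical values */
    printf("compress=%u other-flags=0x%x\n",
        (flags & ARC_FLAG_COMPRESS_MASK) >> ARC_FLAG_COMPRESS_SHIFT,
        flags & ~ARC_FLAG_COMPRESS_MASK);
    return (0);
}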

View File

@ -36,6 +36,7 @@
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/zrlock.h>
#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
@ -228,6 +229,11 @@ typedef struct dmu_buf_impl {
*/
avl_node_t db_link;
/*
* Link in dbuf_cache.
*/
multilist_node_t db_cache_link;
/* Data which is unique to data (leaf) blocks: */
/* User callback information. */
@ -305,8 +311,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_destroy(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#define DBUF_IS_L2COMPRESSIBLE(_db) \
((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \
(dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE))
#ifdef ZFS_DEBUG
/*

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2013 DEY Storage Systems, Inc.
@ -78,6 +78,7 @@ struct file;
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
typedef struct dsl_dir dsl_dir_t;
typedef struct dnode dnode_t;
typedef enum dmu_object_byteswap {
DMU_BSWAP_UINT8,
@ -419,7 +420,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
#define WP_DMU_SYNC 0x2
#define WP_SPILL 0x4
void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
@ -445,7 +446,7 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
*/
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
@ -465,6 +466,8 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **, int flags);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags);
/*
* Add a reference to a dmu buffer that has already been held via
@ -617,6 +620,10 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
*/
void *dmu_buf_get_user(dmu_buf_t *db);
objset_t *dmu_buf_get_objset(dmu_buf_t *db);
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
void dmu_buf_dnode_exit(dmu_buf_t *db);
/* Block until any in-progress dmu buf user evictions complete. */
void dmu_buf_user_evict_wait(void);
@ -799,7 +806,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
*/
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
/* Like dmu_object_info, but faster if you have a held dnode in hand. */
void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
/*

View File

@ -301,6 +301,8 @@ typedef struct dmu_sendarg {
uint64_t dsa_last_data_offset;
uint64_t dsa_resume_object;
uint64_t dsa_resume_offset;
boolean_t dsa_sent_begin;
boolean_t dsa_sent_end;
} dmu_sendarg_t;
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@ -149,7 +149,7 @@ typedef struct dnode_phys {
blkptr_t dn_spill;
} dnode_phys_t;
typedef struct dnode {
struct dnode {
/*
* Protects the structure of the dnode, including the number of levels
* of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
@ -247,7 +247,7 @@ typedef struct dnode {
/* holds prefetch structure */
struct zfetch dn_zfetch;
} dnode_t;
};
/*
* Adds a level of indirection between the dbuf and the dnode to avoid

View File

@ -272,7 +272,6 @@ int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
minor_t cleanup_minor, const char *htag);
blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@ -55,14 +55,15 @@ void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_GANG_AVOID 0x8
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int);
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
@ -73,6 +74,9 @@ int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
@ -86,10 +90,13 @@ metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
boolean_t metaslab_group_initialized(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
#ifdef __cplusplus
}

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@ -59,11 +59,42 @@ extern "C" {
* to use a block allocator that best suits that class.
*/
struct metaslab_class {
kmutex_t mc_lock;
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;
/*
* Track the number of metaslab groups that have been initialized
* and can accept allocations. An initialized metaslab group is
* one that has been completely added to the config (i.e. we have
* updated the MOS config and the space has been added to the pool).
*/
uint64_t mc_groups;
/*
* Toggle to enable/disable the allocation throttle.
*/
boolean_t mc_alloc_throttle_enabled;
/*
* The allocation throttle works on a reservation system. Whenever
* an asynchronous zio wants to perform an allocation it must
* first reserve the number of blocks that it wants to allocate.
* If there aren't sufficient slots available for the pending zio
* then that I/O is throttled until more slots free up. The current
* number of reserved allocations is maintained by the mc_alloc_slots
* refcount. The mc_alloc_max_slots value determines the maximum
* number of allocations that the system allows. Gang blocks are
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
uint64_t mc_alloc_max_slots;
refcount_t mc_alloc_slots;
uint64_t mc_alloc_groups; /* # of allocatable groups */
uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
@ -86,6 +117,15 @@ struct metaslab_group {
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
/*
* A metaslab group is considered to be initialized only after
* we have updated the MOS config and added the space to the pool.
* We only allow allocation attempts to a metaslab group if it
* has been initialized.
*/
boolean_t mg_initialized;
uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
@ -94,6 +134,27 @@ struct metaslab_group {
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;
/*
* Each metaslab group can handle mg_max_alloc_queue_depth allocations
* which are tracked by mg_alloc_queue_depth. It's possible for a
* metaslab group to handle more allocations than its max. This
* can occur when gang blocks are required or when other groups
* are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
refcount_t mg_alloc_queue_depth;
/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
* of space, its share of work must be distributed to other
* groups.
*/
boolean_t mg_no_free_space;
uint64_t mg_allocations;
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
@ -64,6 +64,7 @@ typedef struct refcount {
void refcount_create(refcount_t *rc);
void refcount_create_untracked(refcount_t *rc);
void refcount_create_tracked(refcount_t *rc);
void refcount_destroy(refcount_t *rc);
void refcount_destroy_many(refcount_t *rc, uint64_t number);
int refcount_is_zero(refcount_t *rc);
@ -73,6 +74,9 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag);
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_transfer_ownership(refcount_t *, void *, void *);
boolean_t refcount_held(refcount_t *, void *);
boolean_t refcount_not_held(refcount_t *, void *);
void refcount_sysinit(void);
void refcount_fini(void);
@ -85,6 +89,7 @@ typedef struct refcount {
#define refcount_create(rc) ((rc)->rc_count = 0)
#define refcount_create_untracked(rc) ((rc)->rc_count = 0)
#define refcount_create_tracked(rc) ((rc)->rc_count = 0)
#define refcount_destroy(rc) ((rc)->rc_count = 0)
#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
#define refcount_is_zero(rc) ((rc)->rc_count == 0)
@ -100,6 +105,9 @@ typedef struct refcount {
atomic_add_64(&(src)->rc_count, -__tmp); \
atomic_add_64(&(dst)->rc_count, __tmp); \
}
#define refcount_transfer_ownership(rc, current_holder, new_holder)
#define refcount_held(rc, holder) ((rc)->rc_count > 0)
#define refcount_not_held(rc, holder) (B_TRUE)
#define refcount_sysinit()
#define refcount_fini()

View File

@ -149,6 +149,8 @@ _NOTE(CONSTCOND) } while (0)
#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
#define SPA_COMPRESSBITS 7
/*
* All SPA data is represented by 128-bit data virtual addresses (DVAs).
* The members of the dva_t should be considered opaque outside the SPA.
@ -391,8 +393,10 @@ _NOTE(CONSTCOND) } while (0)
16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
#define BP_GET_COMPRESS(bp) \
BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
#define BP_SET_COMPRESS(bp, x) \
BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)

View File

@ -165,6 +165,8 @@ struct spa {
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
kmutex_t spa_alloc_lock;
avl_tree_t spa_alloc_tree;
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */

View File

@ -53,6 +53,9 @@ typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
extern int zfs_vdev_queue_depth_pct;
extern uint32_t zfs_vdev_async_write_max_active;
/*
* Virtual device operations
*/
@ -190,9 +193,20 @@ struct vdev {
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
boolean_t vdev_ishole; /* is a hole in the namespace */
boolean_t vdev_ishole; /* is a hole in the namespace */
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
/*
* The queue depth parameters determine how many async writes are
* still pending (i.e. allocated but not yet issued to disk) per
* top-level vdev (vdev_async_write_queue_depth) and the maximum allowed
* (vdev_max_async_write_queue_depth). These values only apply to
* top-level vdevs.
*/
uint64_t vdev_async_write_queue_depth;
uint64_t vdev_max_async_write_queue_depth;
/*
* Leaf vdev state.
*/

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZAP_H
@ -216,8 +216,14 @@ int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints);
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp);
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
int zap_count_write_by_dnode(dnode_t *dn, const char *name,
int add, refcount_t *towrite, refcount_t *tooverwrite);
/*

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -197,8 +197,8 @@ typedef struct zap_name {
boolean_t zap_match(zap_name_t *zn, const char *matchname);
int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
void zap_unlockdir(zap_t *zap);
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp);
void zap_unlockdir(zap_t *zap, void *tag);
void zap_evict(void *dbu);
zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn);
@ -217,9 +217,10 @@ void fzap_prefetch(zap_name_t *zn);
int fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
refcount_t *tooverwrite);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
const void *val, void *tag, dmu_tx_t *tx);
int fzap_update(zap_name_t *zn,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
int integer_size, uint64_t num_integers, const void *val,
void *tag, dmu_tx_t *tx);
int fzap_length(zap_name_t *zn,
uint64_t *integer_size, uint64_t *num_integers);
int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
@ -229,7 +230,7 @@ void zap_put_leaf(struct zap_leaf *l);
int fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx);
const void *val, uint32_t cd, void *tag, dmu_tx_t *tx);
void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);

View File

@ -175,6 +175,7 @@ enum zio_flag {
ZIO_FLAG_DONT_CACHE = 1 << 11,
ZIO_FLAG_NODATA = 1 << 12,
ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
ZIO_FLAG_IO_ALLOCATING = 1 << 14,
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
@ -182,27 +183,27 @@ enum zio_flag {
/*
* Flags inherited by vdev children.
*/
ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
ZIO_FLAG_PROBE = 1 << 15,
ZIO_FLAG_TRYHARD = 1 << 16,
ZIO_FLAG_OPTIONAL = 1 << 17,
ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
ZIO_FLAG_PROBE = 1 << 16,
ZIO_FLAG_TRYHARD = 1 << 17,
ZIO_FLAG_OPTIONAL = 1 << 18,
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
/*
* Flags not inherited by any children.
*/
ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
ZIO_FLAG_IO_BYPASS = 1 << 20,
ZIO_FLAG_IO_REWRITE = 1 << 21,
ZIO_FLAG_RAW = 1 << 22,
ZIO_FLAG_GANG_CHILD = 1 << 23,
ZIO_FLAG_DDT_CHILD = 1 << 24,
ZIO_FLAG_GODFATHER = 1 << 25,
ZIO_FLAG_NOPWRITE = 1 << 26,
ZIO_FLAG_REEXECUTED = 1 << 27,
ZIO_FLAG_DELEGATED = 1 << 28,
ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
ZIO_FLAG_IO_BYPASS = 1 << 21,
ZIO_FLAG_IO_REWRITE = 1 << 22,
ZIO_FLAG_RAW = 1 << 23,
ZIO_FLAG_GANG_CHILD = 1 << 24,
ZIO_FLAG_DDT_CHILD = 1 << 25,
ZIO_FLAG_GODFATHER = 1 << 26,
ZIO_FLAG_NOPWRITE = 1 << 27,
ZIO_FLAG_REEXECUTED = 1 << 28,
ZIO_FLAG_DELEGATED = 1 << 29,
};
#define ZIO_FLAG_MUSTSUCCEED 0
@ -243,6 +244,7 @@ enum zio_wait_type {
typedef void zio_done_func_t(zio_t *zio);
extern boolean_t zio_dva_throttle_enabled;
extern const char *zio_type_name[ZIO_TYPES];
/*
@ -430,7 +432,6 @@ struct zio {
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
zio_link_t *io_walk_link;
zio_t *io_logical;
zio_transform_t *io_transform_stack;
@ -456,9 +457,11 @@ struct zio {
uint64_t io_offset;
hrtime_t io_timestamp;
hrtime_t io_queued_timestamp;
hrtime_t io_target_timestamp;
avl_node_t io_queue_node;
avl_node_t io_offset_node;
avl_node_t io_alloc_node;
/* Internal pipeline state */
enum zio_flag io_flags;
@ -467,6 +470,7 @@ struct zio {
enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline;
enum zio_stage io_pipeline_trace;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
@ -492,6 +496,8 @@ struct zio {
list_node_t io_trim_link;
};
extern int zio_timestamp_compare(const void *, const void *);
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
zio_done_func_t *done, void *priv, enum zio_flag flags);
@ -554,8 +560,8 @@ extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
extern zio_t *zio_walk_parents(zio_t *cio);
extern zio_t *zio_walk_children(zio_t *pio);
extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
@ -564,6 +570,10 @@ extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
uint64_t bufsize, zio_transform_func_t *transform);
extern void zio_pop_transforms(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,

View File

@ -99,8 +99,12 @@ extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
#endif
extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
extern void zio_checksum_templates_free(spa_t *spa);

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _ZIO_IMPL_H
@ -108,35 +108,37 @@ enum zio_stage {
ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */
ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */
ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */
ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
ZIO_STAGE_READY = 1 << 16, /* RWFCI */
ZIO_STAGE_READY = 1 << 18, /* RWFCI */
ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RWF-I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RWF-- */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RWF-I */
ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
ZIO_STAGE_DONE = 1 << 21 /* RWFCI */
ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
};
#define ZIO_INTERLOCK_STAGES \
@ -187,22 +189,27 @@ enum zio_stage {
#define ZIO_REWRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_WRITE_BP_INIT)
#define ZIO_WRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
ZIO_STAGE_WRITE_BP_INIT | \
ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_CHILD_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_VDEV_IO_STAGES | \
ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_ISSUE_ASYNC | \
ZIO_STAGE_WRITE_BP_INIT | \
ZIO_STAGE_ISSUE_ASYNC | \
ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_CHECKSUM_GENERATE | \
ZIO_STAGE_DDT_WRITE)

View File

@ -441,6 +441,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
&vd->vdev_dtl_lock);
@ -770,6 +771,7 @@ vdev_free(vdev_t *vd)
}
mutex_exit(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
@ -1086,7 +1088,8 @@ vdev_probe_done(zio_t *zio)
vd->vdev_probe_zio = NULL;
mutex_exit(&vd->vdev_probe_lock);
while ((pio = zio_walk_parents(zio)) != NULL)
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
if (!vdev_accessible(vd, pio))
pio->io_error = SET_ERROR(ENXIO);
@ -2857,7 +2860,8 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
!vd->vdev_cant_write && !vd->vdev_ishole);
!vd->vdev_cant_write && !vd->vdev_ishole &&
vd->vdev_mg->mg_initialized);
}
boolean_t
@ -2885,6 +2889,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *tvd = vd->vdev_top;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
@ -2895,8 +2900,15 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
if (vd->vdev_max_asize != 0)
vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
/*
* Report expandable space on top-level, non-auxiliary devices only.
* The expandable space is reported in terms of metaslab-sized units
* since that determines how much space the pool can expand.
*/
if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize,
1ULL << tvd->vdev_ms_shift);
}
vs->vs_configured_ashift = vd->vdev_top != NULL
? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
vs->vs_logical_ashift = vd->vdev_logical_ashift;
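
A worked example of the rounding above, assuming the usual P2ALIGN semantics
(round down to a multiple of a power of two); the metaslab shift value here
is hypothetical:

#include <assert.h>
#include <stdint.h>

/* Assumed semantics: round x down to a multiple of power-of-two 'align'. */
#define P2ALIGN(x, align)	((x) & -(align))

int
main(void)
{
	uint64_t ms_shift = 29;			/* 512M metaslabs */
	uint64_t raw = 1300000000ULL;		/* max_asize - asize */

	/* 1,300,000,000 bytes rounds down to 2 whole metaslabs. */
	assert(P2ALIGN(raw, 1ULL << ms_shift) == 2ULL << ms_shift);
	return (0);
}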

View File

@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -247,7 +247,8 @@ vdev_cache_fill(zio_t *fio)
* any reads that were queued up before the missed update are still
* valid, so we can satisfy them from this line before we evict it.
*/
while ((pio = zio_walk_parents(fio)) != NULL)
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(fio, &zl)) != NULL)
vdev_cache_hit(vc, ve, pio);
if (fio->io_error || ve->ve_missed_update)

View File

@ -241,34 +241,6 @@ vdev_disk_rele(vdev_t *vd)
}
}
static uint64_t
vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
{
ASSERT(vd->vdev_wholedisk);
vdev_disk_t *dvd = vd->vdev_tsd;
dk_efi_t dk_ioc;
efi_gpt_t *efi;
uint64_t avail_space = 0;
int efisize = EFI_LABEL_SIZE * 2;
dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
dk_ioc.dki_lba = 1;
dk_ioc.dki_length = efisize;
dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
efi = dk_ioc.dki_data;
if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
FKIOCTL, kcred, NULL) == 0) {
uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
if (capacity > efi_altern_lba)
avail_space = (capacity - efi_altern_lba) * blksz;
}
kmem_free(dk_ioc.dki_data, efisize);
return (avail_space);
}
/*
* We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
* even a fallback to DKIOCGMEDIAINFO fails.
@ -559,10 +531,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
* Adjust max_psize upward accordingly since we know
* we own the whole disk now.
*/
*max_psize += vdev_disk_get_space(vd, capacity, blksz);
zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
"max_psize %llu", vd->vdev_path, *psize,
*max_psize);
*max_psize = capacity * blksz;
}
/*

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -293,9 +293,10 @@ vdev_mirror_scrub_done(zio_t *zio)
if (zio->io_error == 0) {
zio_t *pio;
zio_link_t *zl = NULL;
mutex_enter(&zio->io_lock);
while ((pio = zio_walk_parents(zio)) != NULL) {
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
bcopy(zio->io_data, pio->io_data, pio->io_size);

View File

@ -34,6 +34,7 @@
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/dsl_pool.h>
#include <sys/metaslab_impl.h>
/*
* ZFS I/O Scheduler
@ -175,6 +176,23 @@ int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
/*
* Define the queue depth percentage for each top-level vdev. This
* percentage is used in conjunction with zfs_vdev_async_write_max_active
* to determine how many allocations a specific top-level vdev should
* handle. Once the queue depth reaches
* zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100,
* the allocator will stop allocating blocks on that top-level device.
* The default kernel setting is 1000% which will yield 100 allocations per
* device. For userland testing, the default setting is 300% which equates
* to 30 allocations per device.
*/
#ifdef _KERNEL
int zfs_vdev_queue_depth_pct = 1000;
#else
int zfs_vdev_queue_depth_pct = 300;
#endif
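
To make the arithmetic concrete: with the in-kernel default of 1000% and an
implied zfs_vdev_async_write_max_active default of 10 (inferred from the
100-allocations figure in the comment above), the per-vdev limit works out
as in this sketch:

	/*
	 * 1000 * 10 / 100 == 100 queued async writes per top-level vdev
	 * in the kernel; 300 * 10 / 100 == 30 for userland testing.
	 */
	uint64_t max_queue_depth =
	    (uint64_t)zfs_vdev_queue_depth_pct *
	    zfs_vdev_async_write_max_active / 100;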
#ifdef __FreeBSD__
#ifdef _KERNEL
SYSCTL_DECL(_vfs_zfs_vdev);
@ -233,6 +251,9 @@ SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
&zfs_vdev_write_gap_limit, 0,
"Acceptable gap between two writes being aggregated");
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
&zfs_vdev_queue_depth_pct, 0,
"Queue depth percentage for each top-level");
static int
sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
@ -390,6 +411,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
avl_tree_t *qtt;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
@ -411,6 +433,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
avl_tree_t *qtt;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
@ -480,7 +503,8 @@ vdev_queue_agg_io_done(zio_t *aio)
{
if (aio->io_type == ZIO_TYPE_READ) {
zio_t *pio;
while ((pio = zio_walk_parents(aio)) != NULL) {
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
bcopy((char *)aio->io_data + (pio->io_offset -
aio->io_offset), pio->io_data, pio->io_size);
}

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@ -270,6 +270,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
uint64_t blk, off;
int err;
dmu_buf_t *db;
dnode_t *dn;
int bs = FZAP_BLOCK_SHIFT(zap);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
@ -277,8 +278,15 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
blk = idx >> (bs-3);
off = idx & ((1<<(bs-3))-1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
/*
* Note: this is equivalent to dmu_buf_hold(), but we use
* _dnode_enter / _by_dnode because it's faster: it avoids having
* to hold the dnode.
*/
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
err = dmu_buf_hold_by_dnode(dn,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
@ -292,9 +300,11 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
*/
blk = (idx*2) >> (bs-3);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
err = dmu_buf_hold_by_dnode(dn,
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err == 0)
dmu_buf_rele(db, FTAG);
}
@ -505,8 +515,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
err = dmu_buf_hold_by_dnode(dn,
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err)
return (err);
@ -596,7 +608,8 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
}
static int
zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
{
zap_t *zap = zn->zn_zap;
uint64_t hash = zn->zn_hash;
@ -618,9 +631,9 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
uint64_t object = zap->zap_object;
zap_put_leaf(l);
zap_unlockdir(zap);
zap_unlockdir(zap, tag);
err = zap_lockdir(os, object, tx, RW_WRITER,
FALSE, FALSE, &zn->zn_zap);
FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
if (err)
return (err);
@ -683,7 +696,8 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
}
static void
zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
void *tag, dmu_tx_t *tx)
{
zap_t *zap = zn->zn_zap;
int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
@ -703,9 +717,9 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
objset_t *os = zap->zap_objset;
uint64_t zapobj = zap->zap_object;
zap_unlockdir(zap);
zap_unlockdir(zap, tag);
err = zap_lockdir(os, zapobj, tx,
RW_WRITER, FALSE, FALSE, &zn->zn_zap);
RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
if (err)
return;
@ -795,7 +809,7 @@ fzap_lookup(zap_name_t *zn,
int
fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx)
const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err;
@ -824,7 +838,7 @@ fzap_add_cd(zap_name_t *zn,
if (err == 0) {
zap_increment_num_entries(zap, 1, tx);
} else if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tx, &l);
err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
@ -832,26 +846,27 @@ fzap_add_cd(zap_name_t *zn,
out:
if (zap != NULL)
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
return (err);
}
int
fzap_add(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
const void *val, void *tag, dmu_tx_t *tx)
{
int err = fzap_check(zn, integer_size, num_integers);
if (err != 0)
return (err);
return (fzap_add_cd(zn, integer_size, num_integers,
val, ZAP_NEED_CD, tx));
val, ZAP_NEED_CD, tag, tx));
}
int
fzap_update(zap_name_t *zn,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
int integer_size, uint64_t num_integers, const void *val,
void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err, create;
@ -881,14 +896,14 @@ fzap_update(zap_name_t *zn,
}
if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tx, &l);
err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
}
if (zap != NULL)
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
return (err);
}

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -43,7 +43,8 @@
extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
static int mzap_upgrade(zap_t **zapp,
void *tag, dmu_tx_t *tx, zap_flags_t flags);
uint64_t
zap_getflags(zap_t *zap)
@ -468,21 +469,19 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
return (winner);
}
int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
static int
zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
zap_t *zap;
dmu_buf_t *db;
krw_t lt;
int err;
ASSERT0(db->db_offset);
objset_t *os = dmu_buf_get_objset(db);
uint64_t obj = db->db_object;
*zapp = NULL;
err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
if (err)
return (err);
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
@ -499,7 +498,6 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
* mzap_open() didn't like what it saw on-disk.
* Check for corruption!
*/
dmu_buf_rele(db, NULL);
return (SET_ERROR(EIO));
}
}
@ -538,10 +536,12 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
dprintf("upgrading obj %llu: num_entries=%u\n",
obj, zap->zap_m.zap_num_entries);
*zapp = zap;
return (mzap_upgrade(zapp, tx, 0));
int err = mzap_upgrade(zapp, tag, tx, 0);
if (err != 0)
rw_exit(&zap->zap_rwlock);
return (err);
}
err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
ASSERT0(err);
VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
zap->zap_m.zap_num_chunks =
db->db_size / MZAP_ENT_LEN - 1;
}
@ -550,15 +550,49 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
return (0);
}
static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
dmu_buf_t *db;
int err;
err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
return (err);
}
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0) {
dmu_buf_rele(db, tag);
}
return (err);
}
int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
dmu_buf_t *db;
int err;
err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0)
dmu_buf_rele(db, tag);
return (err);
}
void
zap_unlockdir(zap_t *zap)
zap_unlockdir(zap_t *zap, void *tag)
{
rw_exit(&zap->zap_rwlock);
dmu_buf_rele(zap->zap_dbuf, NULL);
dmu_buf_rele(zap->zap_dbuf, tag);
}
static int
mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
mzap_phys_t *mzp;
int i, sz, nchunks;
@ -596,7 +630,8 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
dprintf("adding %s=%llu\n",
mze->mze_name, mze->mze_value);
zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
tag, tx);
zap = zn->zn_zap; /* fzap_add_cd() may change zap */
zap_name_free(zn);
if (err)
@ -635,9 +670,9 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
zap_t *zap;
/* Only fat zap supports flags; upgrade immediately. */
VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
B_FALSE, B_FALSE, &zap));
VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
zap_unlockdir(zap);
B_FALSE, B_FALSE, FTAG, &zap));
VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags));
zap_unlockdir(zap, FTAG);
}
}
@ -732,7 +767,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
zap_t *zap;
int err;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
if (!zap->zap_ismicro) {
@ -740,7 +775,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
} else {
*count = zap->zap_m.zap_num_entries;
}
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -797,25 +832,19 @@ zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
num_integers, buf, MT_EXACT, NULL, 0, NULL));
}
int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
static int
zap_lookup_impl(zap_t *zap, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
zap_t *zap;
int err;
int err = 0;
mzap_ent_t *mze;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, mt);
if (zn == NULL) {
zap_unlockdir(zap);
if (zn == NULL)
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_lookup(zn, integer_size, num_integers, buf,
@ -842,7 +871,51 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
}
}
zap_name_free(zn);
zap_unlockdir(zap);
return (err);
}
int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
zap_t *zap;
int err;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_impl(zap, name, integer_size,
num_integers, buf, mt, realname, rn_len, ncp);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf)
{
return (zap_lookup_norm_by_dnode(dn, name, integer_size,
num_integers, buf, MT_EXACT, NULL, 0, NULL));
}
int
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
zap_t *zap;
int err;
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_impl(zap, name, integer_size,
num_integers, buf, mt, realname, rn_len, ncp);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -854,18 +927,18 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
fzap_prefetch(zn);
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -877,19 +950,19 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_lookup(zn, integer_size, num_integers, buf,
NULL, 0, NULL);
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -912,12 +985,12 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
mzap_ent_t *mze;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
@ -934,7 +1007,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
}
}
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -946,17 +1019,17 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_length(zn, integer_size, num_integers);
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1015,22 +1088,24 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key,
const uint64_t *intval = val;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, key, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_add(zn, integer_size, num_integers, val, tx);
err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(key) >= MZAP_NAME_LEN) {
err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = fzap_add(zn, integer_size, num_integers, val, tx);
err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
if (err == 0) {
err = fzap_add(zn, integer_size, num_integers, val,
FTAG, tx);
}
zap = zn->zn_zap; /* fzap_add() may change zap */
} else {
mze = mze_find(zn);
@ -1043,7 +1118,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key,
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1056,19 +1131,19 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_add(zn, integer_size, num_integers, val, tx);
err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1092,25 +1167,27 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_update(zn, integer_size, num_integers, val, tx);
err = fzap_update(zn, integer_size, num_integers, val,
FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(name) >= MZAP_NAME_LEN) {
dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
zapobj, integer_size, num_integers, name);
err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
if (err == 0) {
err = fzap_update(zn, integer_size, num_integers,
val, tx);
val, FTAG, tx);
}
zap = zn->zn_zap; /* fzap_update() may change zap */
} else {
mze = mze_find(zn);
@ -1124,7 +1201,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1137,19 +1214,19 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_name_t *zn;
int err;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_update(zn, integer_size, num_integers, val, tx);
err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1168,12 +1245,12 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
mzap_ent_t *mze;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, mt);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
@ -1190,7 +1267,7 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
}
}
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1202,17 +1279,17 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_remove(zn, tx);
zap_name_free(zn);
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}
@ -1244,7 +1321,7 @@ zap_cursor_fini(zap_cursor_t *zc)
{
if (zc->zc_zap) {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
zap_unlockdir(zc->zc_zap);
zap_unlockdir(zc->zc_zap, NULL);
zc->zc_zap = NULL;
}
if (zc->zc_leaf) {
@ -1291,7 +1368,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
if (zc->zc_zap == NULL) {
int hb;
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, &zc->zc_zap);
RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
if (err)
return (err);
@ -1358,7 +1435,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
if (zc->zc_zap == NULL) {
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, &zc->zc_zap);
RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap);
if (err)
return (err);
} else {
@ -1395,7 +1472,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
int err;
zap_t *zap;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
@ -1408,12 +1485,12 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
} else {
fzap_get_stats(zap, zs);
}
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (0);
}
int
zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
refcount_t *towrite, refcount_t *tooverwrite)
{
zap_t *zap;
@ -1427,7 +1504,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* - 2 blocks for possibly split leaves,
* - 2 grown ptrtbl blocks
*
* This also accomodates the case where an add operation to a fairly
* This also accommodates the case where an add operation to a fairly
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
@ -1440,10 +1517,11 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* We lock the zap with adding == FALSE. Because, if we pass
* the actual value of add, it could trigger a mzap_upgrade().
* At present we are just evaluating the possibility of this operation
* and hence we donot want to trigger an upgrade.
* and hence we do not want to trigger an upgrade.
*/
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
if (err)
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
FTAG, &zap);
if (err != 0)
return (err);
if (!zap->zap_ismicro) {
@ -1489,6 +1567,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
}
}
zap_unlockdir(zap);
zap_unlockdir(zap, FTAG);
return (err);
}

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -253,7 +254,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
}
}
VERIFY(arc_buf_remove_ref(abuf, &abuf));
arc_buf_destroy(abuf, &abuf);
}
return (error);
@ -290,7 +291,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
if (error == 0) {
if (wbuf != NULL)
bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
(void) arc_buf_remove_ref(abuf, &abuf);
arc_buf_destroy(abuf, &abuf);
}
return (error);

View File

@ -41,6 +41,7 @@
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/metaslab_impl.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@ -78,6 +79,10 @@ const char *zio_type_name[ZIO_TYPES] = {
"zio_ioctl"
};
boolean_t zio_dva_throttle_enabled = B_TRUE;
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN,
&zio_dva_throttle_enabled, 0, "");
/*
* ==========================================================================
* I/O kmem caches
@ -136,6 +141,8 @@ int zio_buf_debug_limit = 0;
#endif
#endif
static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
void
zio_init(void)
{
@ -329,7 +336,7 @@ zio_data_buf_free(void *buf, size_t size)
* Push and pop I/O transform buffers
* ==========================================================================
*/
static void
void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
@ -347,7 +354,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zio->io_size = size;
}
static void
void
zio_pop_transforms(zio_t *zio)
{
zio_transform_t *zt;
@ -396,52 +403,39 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
* I/O parent/child relationships and pipeline interlocks
* ==========================================================================
*/
/*
* NOTE - Callers to zio_walk_parents() and zio_walk_children must
* continue calling these functions until they return NULL.
* Otherwise, the next caller will pick up the list walk in
* some indeterminate state. (Otherwise every caller would
* have to pass in a cookie to keep the state represented by
* io_walk_link, which gets annoying.)
*/
zio_t *
zio_walk_parents(zio_t *cio)
zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
zio_link_t *zl = cio->io_walk_link;
list_t *pl = &cio->io_parent_list;
zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
cio->io_walk_link = zl;
if (zl == NULL)
*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
if (*zl == NULL)
return (NULL);
ASSERT(zl->zl_child == cio);
return (zl->zl_parent);
ASSERT((*zl)->zl_child == cio);
return ((*zl)->zl_parent);
}
zio_t *
zio_walk_children(zio_t *pio)
zio_walk_children(zio_t *pio, zio_link_t **zl)
{
zio_link_t *zl = pio->io_walk_link;
list_t *cl = &pio->io_child_list;
zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
pio->io_walk_link = zl;
if (zl == NULL)
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
if (*zl == NULL)
return (NULL);
ASSERT(zl->zl_parent == pio);
return (zl->zl_child);
ASSERT((*zl)->zl_parent == pio);
return ((*zl)->zl_child);
}
zio_t *
zio_unique_parent(zio_t *cio)
{
zio_t *pio = zio_walk_parents(cio);
zio_link_t *zl = NULL;
zio_t *pio = zio_walk_parents(cio, &zl);
VERIFY(zio_walk_parents(cio) == NULL);
VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
return (pio);
}
@ -510,6 +504,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
ASSERT(zio->io_stall == NULL);
if (*countp != 0) {
zio->io_stage >>= 1;
ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
zio->io_stall = countp;
waiting = B_TRUE;
}
@ -533,9 +528,18 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
zio_taskq_type_t type =
pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
ZIO_TASKQ_INTERRUPT;
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
zio_execute(pio);
/*
* Dispatch the parent zio in its own taskq so that
* the child can continue to make progress. This also
* prevents overflowing the stack when we have deeply nested
* parent-child relationships.
*/
zio_taskq_dispatch(pio, type, B_FALSE);
} else {
mutex_exit(&pio->io_lock);
}
@ -548,6 +552,30 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c)
zio->io_error = zio->io_child_error[c];
}
int
zio_timestamp_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
if (z1->io_queued_timestamp < z2->io_queued_timestamp)
return (-1);
if (z1->io_queued_timestamp > z2->io_queued_timestamp)
return (1);
if (z1->io_offset < z2->io_offset)
return (-1);
if (z1->io_offset > z2->io_offset)
return (1);
if (z1 < z2)
return (-1);
if (z1 > z2)
return (1);
return (0);
}
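
This comparator orders zios by queue time, then offset, then address, so no
two distinct zios ever compare equal — a requirement for AVL insertion. A
hedged sketch of how the spa_alloc_tree added earlier in this change could
be created with it (the exact call site is not shown in this hunk):

	#include <stddef.h>	/* offsetof */

	avl_tree_t alloc_tree;

	avl_create(&alloc_tree, zio_timestamp_compare,
	    sizeof (zio_t), offsetof(zio_t, io_alloc_node));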
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
@ -616,6 +644,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
zio->io_pipeline_trace = ZIO_STAGE_OPEN;
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
@ -813,7 +842,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
zio_t *zio;
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
return (zio);
@ -934,6 +963,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
ASSERT0(zio->io_queued_timestamp);
return (zio);
}
@ -1022,8 +1052,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
@ -1058,9 +1088,30 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
if (flags & ZIO_FLAG_IO_REPAIR)
flags &= ~ZIO_FLAG_SPECULATIVE;
/*
* If we're creating a child I/O that is not associated with a
* top-level vdev, then the child zio is not an allocating I/O.
* If this is a retried I/O then we ignore it since we will
* have already processed the original allocating I/O.
*/
if (flags & ZIO_FLAG_IO_ALLOCATING &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
metaslab_class_t *mc = spa_normal_class(pio->io_spa);
ASSERT(mc->mc_alloc_throttle_enabled);
ASSERT(type == ZIO_TYPE_WRITE);
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
pio->io_child_type == ZIO_CHILD_GANG);
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
zio->io_physdone = pio->io_physdone;
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
@ -1166,6 +1217,65 @@ zio_read_bp_init(zio_t *zio)
static int
zio_write_bp_init(zio_t *zio)
{
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
if (zio->io_bp_override) {
blkptr_t *bp = zio->io_bp;
zio_prop_t *zp = &zio->io_prop;
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (BP_IS_EMBEDDED(bp))
return (ZIO_PIPELINE_CONTINUE);
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
* has already occurred.
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(!zp->zp_nopwrite);
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
BP_SET_DEDUP(bp, 1);
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
}
/*
* We were unable to handle this as an override bp; treat
* it as a regular write I/O.
*/
zio->io_bp_override = NULL;
*bp = zio->io_bp_orig;
zio->io_pipeline = zio->io_orig_pipeline;
}
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_compress(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_prop_t *zp = &zio->io_prop;
@ -1197,44 +1307,7 @@ zio_write_bp_init(zio_t *zio)
}
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
if (zio->io_bp_override) {
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (BP_IS_EMBEDDED(bp))
return (ZIO_PIPELINE_CONTINUE);
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
* has already occurred.
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(!zp->zp_nopwrite);
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
BP_SET_DEDUP(bp, 1);
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
}
zio->io_bp_override = NULL;
BP_ZERO(bp);
}
ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
/*
@ -1303,6 +1376,14 @@ zio_write_bp_init(zio_t *zio)
psize, lsize, NULL);
}
}
/*
* We were unable to handle this as an override bp; treat
* it as a regular write I/O.
*/
zio->io_bp_override = NULL;
*bp = zio->io_bp_orig;
zio->io_pipeline = zio->io_orig_pipeline;
}
/*
@ -1355,7 +1436,6 @@ zio_write_bp_init(zio_t *zio)
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
}
}
return (ZIO_PIPELINE_CONTINUE);
}
@ -1532,6 +1612,8 @@ zio_execute(zio_t *zio)
{
zio->io_executor = curthread;
ASSERT3U(zio->io_queued_timestamp, >, 0);
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
@ -1565,6 +1647,7 @@ zio_execute(zio_t *zio)
}
zio->io_stage = stage;
zio->io_pipeline_trace |= zio->io_stage;
rv = zio_pipeline[highbit64(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
@ -1588,6 +1671,8 @@ zio_wait(zio_t *zio)
ASSERT(zio->io_executor == NULL);
zio->io_waiter = curthread;
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
zio_execute(zio);
@ -1619,6 +1704,8 @@ zio_nowait(zio_t *zio)
zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
}
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
zio_execute(zio);
}
@ -1643,6 +1730,7 @@ zio_reexecute(zio_t *pio)
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_pipeline_trace = 0;
pio->io_error = 0;
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
@ -1659,8 +1747,9 @@ zio_reexecute(zio_t *pio)
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio);
zio_link_t *zl = NULL;
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
mutex_enter(&pio->io_lock);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
@ -1673,8 +1762,10 @@ zio_reexecute(zio_t *pio)
* We don't reexecute "The Godfather" I/O here as it's the
* responsibility of the caller to wait on him.
*/
if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
pio->io_queued_timestamp = gethrtime();
zio_execute(pio);
}
}
void
@ -2068,6 +2159,7 @@ static int
zio_write_gang_block(zio_t *pio)
{
spa_t *spa = pio->io_spa;
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = pio->io_bp;
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
@ -2081,10 +2173,43 @@ zio_write_gang_block(zio_t *pio)
zio_prop_t zp;
int error;
error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
flags |= METASLAB_ASYNC_ALLOC;
VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
/*
* The logical zio has already placed a reservation for
* 'copies' allocation slots but gang blocks may require
* additional copies. These additional copies
* (i.e. gbh_copies - copies) are guaranteed to succeed
* since metaslab_class_throttle_reserve() always allows
* additional reservations for gang blocks.
*/
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
pio, flags));
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
/*
* If we failed to allocate the gang block header then
* we remove any additional allocation reservations that
* we placed here. The original reservation will
* be removed when the logical I/O goes to the ready
* stage.
*/
metaslab_class_throttle_unreserve(mc,
gbh_copies - copies, pio);
}
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
}
@ -2123,11 +2248,25 @@ zio_write_gang_block(zio_t *pio)
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
zio_write_gang_member_ready, NULL, NULL, NULL,
&gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark));
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
/*
* Gang children won't throttle but we should
* account for their work, so reserve an allocation
* slot for them here.
*/
VERIFY(metaslab_class_throttle_reserve(mc,
zp.zp_copies, cio, flags));
}
zio_nowait(cio);
}
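/*
 * The reservation bookkeeping described in the two comments above,
 * reduced to a toy ledger (hypothetical numbers, not ZFS code): the
 * logical zio reserved 'copies' slots in the throttle, the gang header
 * tops that up to gbh_copies, and each gang child adds its own
 * zp_copies so the outstanding work is fully accounted for.  The
 * unwind here simply balances the ledger; in ZFS it happens piecemeal
 * as the child and logical I/Os complete.
 */
#include <assert.h>

static int slots;	/* stands in for the mc_alloc_slots refcount */

static void
reserve(int n)
{
	slots += n;
}

static void
unreserve(int n)
{
	slots -= n;
	assert(slots >= 0);
}

int
main(void)
{
	int copies = 2, gbh_copies = 3, children = 3;

	reserve(copies);			/* logical zio, at the throttle */
	reserve(gbh_copies - copies);		/* gang header top-up */
	for (int g = 0; g < children; g++)
		reserve(copies);		/* one per gang child */

	for (int g = 0; g < children; g++)
		unreserve(copies);		/* children complete */
	unreserve(gbh_copies);			/* header top-up + original */
	assert(slots == 0);
	return (0);
}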
/*
@ -2356,7 +2495,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
bcmp(abuf->b_data, zio->io_orig_data,
zio->io_orig_size) != 0)
error = SET_ERROR(EEXIST);
VERIFY(arc_buf_remove_ref(abuf, &abuf));
arc_buf_destroy(abuf, &abuf);
}
ddt_enter(ddt);
@ -2385,7 +2524,8 @@ zio_ddt_child_write_ready(zio_t *zio)
ddt_phys_fill(ddp, zio->io_bp);
while ((pio = zio_walk_parents(zio)) != NULL)
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
ddt_exit(ddt);
@ -2406,7 +2546,8 @@ zio_ddt_child_write_done(zio_t *zio)
dde->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) {
while (zio_walk_parents(zio) != NULL)
zio_link_t *zl = NULL;
while (zio_walk_parents(zio, &zl) != NULL)
ddt_phys_addref(ddp);
} else {
ddt_phys_clear(ddp);
@ -2584,6 +2725,97 @@ zio_ddt_free(zio_t *zio)
* Allocate and free blocks
* ==========================================================================
*/
static zio_t *
zio_io_to_allocate(spa_t *spa)
{
zio_t *zio;
ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
zio = avl_first(&spa->spa_alloc_tree);
if (zio == NULL)
return (NULL);
ASSERT(IO_IS_ALLOCATING(zio));
/*
* Try to place a reservation for this zio. If we're unable to
* reserve then we throttle.
*/
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
zio->io_prop.zp_copies, zio, 0)) {
return (NULL);
}
avl_remove(&spa->spa_alloc_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
}
static int
zio_dva_throttle(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_t *nio;
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
!spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
zio->io_flags & ZIO_FLAG_NODATA) {
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
mutex_enter(&spa->spa_alloc_lock);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
avl_add(&spa->spa_alloc_tree, zio);
nio = zio_io_to_allocate(zio->io_spa);
mutex_exit(&spa->spa_alloc_lock);
if (nio == zio)
return (ZIO_PIPELINE_CONTINUE);
if (nio != NULL) {
ASSERT3U(nio->io_queued_timestamp, <=,
zio->io_queued_timestamp);
ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
/*
* We are passing control to a new zio so make sure that
* it is processed by a different thread. We do this to
* avoid stack overflows that can occur when parents are
* throttled and children are making progress. We allow
* it to go to the head of the taskq since it's already
* been waiting.
*/
zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
}
return (ZIO_PIPELINE_STOP);
}
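/*
 * A minimal sketch of the pattern zio_dva_throttle() implements with
 * spa_alloc_tree: a writer either reserves a slot and proceeds, or
 * parks in queue order; whoever releases a slot hands it to the oldest
 * waiter.  The names and fixed slot count below are invented for
 * illustration; ZFS uses an AVL tree and a refcount rather than a
 * list and a counter.
 */
#include <assert.h>
#include <stddef.h>

#define MAX_SLOTS 2

struct waiter {
	unsigned long ts;	/* stands in for io_queued_timestamp */
	struct waiter *next;
};

static int slots_used;
static struct waiter *head;	/* sorted by ts, oldest first */

/* Returns NULL if the caller may proceed; otherwise the caller parked. */
static struct waiter *
throttle_enter(struct waiter *w)
{
	struct waiter **pp;

	if (slots_used < MAX_SLOTS) {	/* the reservation succeeded */
		slots_used++;
		return (NULL);
	}
	for (pp = &head; *pp != NULL && (*pp)->ts <= w->ts; pp = &(*pp)->next)
		;
	w->next = *pp;
	*pp = w;
	return (w);
}

/* Mirrors zio_allocate_dispatch(): free a slot, wake the oldest waiter. */
static struct waiter *
throttle_exit(void)
{
	struct waiter *next = head;

	slots_used--;
	if (next != NULL) {
		head = next->next;
		slots_used++;
		return (next);	/* would be handed to a taskq */
	}
	return (NULL);
}

int
main(void)
{
	struct waiter a = { 1, NULL }, b = { 2, NULL }, c = { 3, NULL };

	assert(throttle_enter(&a) == NULL);	/* takes slot 1 */
	assert(throttle_enter(&b) == NULL);	/* takes slot 2 */
	assert(throttle_enter(&c) == &c);	/* throttled */
	assert(throttle_exit() == &c);		/* a finishes; c inherits */
	return (0);
}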
void
zio_allocate_dispatch(spa_t *spa)
{
zio_t *zio;
mutex_enter(&spa->spa_alloc_lock);
zio = zio_io_to_allocate(spa);
mutex_exit(&spa->spa_alloc_lock);
if (zio == NULL)
return;
ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
ASSERT0(zio->io_error);
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
}
static int
zio_dva_allocate(zio_t *zio)
{
@ -2604,18 +2836,20 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
/*
* The dump device does not support gang blocks so allocation on
* behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
* the "fast" gang feature.
*/
flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
METASLAB_GANG_CHILD : 0;
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
if (zio->io_flags & ZIO_FLAG_NODATA) {
flags |= METASLAB_DONT_THROTTLE;
}
if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
flags |= METASLAB_GANG_CHILD;
}
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
flags |= METASLAB_ASYNC_ALLOC;
}
if (error) {
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio);
if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
@ -2680,21 +2914,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
ASSERT(txg > spa_syncing_txg(spa));
/*
* ZIL blocks are always contiguous (i.e. not gang blocks) so we
* set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
* when allocating them.
*/
if (use_slog) {
error = metaslab_alloc(spa, spa_log_class(spa), size,
new_bp, 1, txg, old_bp,
METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
}
if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp,
METASLAB_HINTBP_AVOID);
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
}
if (error == 0) {
@ -2770,6 +2997,8 @@ zio_vdev_io_start(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
ASSERT3P(zio->io_logical, !=, zio);
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
@ -3188,6 +3417,7 @@ zio_ready(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
@ -3205,12 +3435,26 @@ zio_ready(zio_t *zio)
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
if (zio->io_error)
if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
/*
* We were unable to allocate anything, unreserve and
* issue the next I/O to allocate.
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
zio->io_prop.zp_copies, zio);
zio_allocate_dispatch(zio->io_spa);
}
}
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_READY] = 1;
pio = zio_walk_parents(zio);
pio = zio_walk_parents(zio, &zl);
mutex_exit(&zio->io_lock);
/*
@ -3221,7 +3465,7 @@ zio_ready(zio_t *zio)
* all parents must wait for us to be done before they can be done.
*/
for (; pio != NULL; pio = pio_next) {
pio_next = zio_walk_parents(zio);
pio_next = zio_walk_parents(zio, &zl);
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
@ -3241,6 +3485,66 @@ zio_ready(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Update the allocation throttle accounting.
*/
static void
zio_dva_throttle_done(zio_t *zio)
{
zio_t *lio = zio->io_logical;
zio_t *pio = zio_unique_parent(zio);
vdev_t *vd = zio->io_vd;
int flags = METASLAB_ASYNC_ALLOC;
ASSERT3P(zio->io_bp, !=, NULL);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
ASSERT(vd != NULL);
ASSERT3P(vd, ==, vd->vdev_top);
ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
/*
* Parents of gang children can have two flavors -- ones that
* allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
* and ones that allocated the constituent blocks. The allocation
* throttle needs to know the allocating parent zio so we must find
* it here.
*/
if (pio->io_child_type == ZIO_CHILD_GANG) {
/*
* If our parent is a rewrite gang child then our grandparent
* would have been the one that performed the allocation.
*/
if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
pio = zio_unique_parent(pio);
flags |= METASLAB_GANG_CHILD;
}
ASSERT(IO_IS_ALLOCATING(pio));
ASSERT3P(zio, !=, zio->io_logical);
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
mutex_enter(&pio->io_lock);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
mutex_exit(&pio->io_lock);
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
1, pio);
/*
* Call into the pipeline to see if there is more work that
* needs to be done. If there is work to be done it will be
* dispatched to another taskq thread.
*/
zio_allocate_dispatch(zio->io_spa);
}
static int
zio_done(zio_t *zio)
{
@ -3250,6 +3554,8 @@ zio_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
metaslab_class_t *mc = spa_normal_class(spa);
zio_link_t *zl = NULL;
/*
* If our children haven't all completed,
@ -3261,6 +3567,30 @@ zio_done(zio_t *zio)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
/*
* If the allocation throttle is enabled, then update the accounting.
* We only track child I/Os that are part of an allocating async
* write. We must do this since the allocation is performed
* by the logical I/O but the actual write is done by child I/Os.
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
ASSERT(mc->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
/*
* If the allocation throttle is enabled, verify that
* we have decremented the refcounts for every I/O that was throttled.
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(bp != NULL);
metaslab_group_alloc_verify(spa, zio->io_bp, zio);
VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
}
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
@ -3430,13 +3760,15 @@ zio_done(zio_t *zio)
* trouble (e.g. suspended). This allows "The Godfather"
* I/O to return status without blocking.
*/
for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
zio_link_t *zl = zio->io_walk_link;
pio_next = zio_walk_parents(zio);
zl = NULL;
for (pio = zio_walk_parents(zio, &zl); pio != NULL;
pio = pio_next) {
zio_link_t *remove_zl = zl;
pio_next = zio_walk_parents(zio, &zl);
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
zio_remove_child(pio, zio, zl);
zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
}
@ -3500,10 +3832,11 @@ zio_done(zio_t *zio)
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
zio_link_t *zl = zio->io_walk_link;
pio_next = zio_walk_parents(zio);
zio_remove_child(pio, zio, zl);
zl = NULL;
for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
zio_link_t *remove_zl = zl;
pio_next = zio_walk_parents(zio, &zl);
zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
@ -3527,9 +3860,10 @@ zio_done(zio_t *zio)
static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
zio_read_bp_init,
zio_write_bp_init,
zio_free_bp_init,
zio_issue_async,
zio_write_bp_init,
zio_write_compress,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
@ -3538,6 +3872,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
zio_dva_throttle,
zio_dva_allocate,
zio_dva_free,
zio_dva_claim,
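/*
 * A minimal sketch of the dispatch idiom behind zio_pipeline[]: each
 * stage is a one-hot bit, io_pipeline is the mask of stages a given
 * I/O will visit, and the executor advances to the next set bit and
 * indexes the function table by bit position, as zio_execute() does
 * with highbit64(stage) - 1.  Stage names are invented.
 */
#include <stdio.h>

enum stage {
	ST_OPEN  = 1 << 0,
	ST_WRITE = 1 << 1,
	ST_CKSUM = 1 << 2,
	ST_DONE  = 1 << 3
};

static int
do_write(void)
{
	printf("write stage\n");
	return (0);
}

static int
do_cksum(void)
{
	printf("checksum stage\n");
	return (0);
}

static int
do_done(void)
{
	printf("done stage\n");
	return (0);
}

static int (*pipeline[])(void) = {
	NULL,		/* ST_OPEN has no handler */
	do_write,
	do_cksum,
	do_done
};

static int
highbit(unsigned int v)		/* stands in for highbit64() */
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	unsigned int mask = ST_WRITE | ST_DONE;	/* this I/O skips ST_CKSUM */
	unsigned int stage = ST_OPEN;

	while (stage < ST_DONE) {
		do {
			stage <<= 1;
		} while ((stage & mask) == 0);	/* skip stages not in mask */
		(void) pipeline[highbit(stage) - 1]();
	}
	return (0);
}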

View File

@ -297,20 +297,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
}
int
zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
{
blkptr_t *bp = zio->io_bp;
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int byteswap;
int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
void *data = zio->io_data;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum, verifier;
spa_t *spa = zio->io_spa;
zio_cksum_t actual_cksum, expected_cksum;
int byteswap;
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (SET_ERROR(EINVAL));
@ -319,6 +311,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
zio_cksum_t verifier;
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
@ -358,35 +351,54 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
spa->spa_cksum_tmpls[checksum], &actual_cksum);
eck->zec_cksum = expected_cksum;
if (byteswap)
if (byteswap) {
byteswap_uint64_array(&expected_cksum,
sizeof (zio_cksum_t));
}
} else {
ASSERT(!BP_IS_GANG(bp));
byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum;
ci->ci_func[byteswap](data, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
info->zbc_checksum_name = ci->ci_name;
info->zbc_byteswapped = byteswap;
info->zbc_injected = 0;
info->zbc_has_cksum = 1;
if (info != NULL) {
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
info->zbc_checksum_name = ci->ci_name;
info->zbc_byteswapped = byteswap;
info->zbc_injected = 0;
info->zbc_has_cksum = 1;
}
if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
return (SET_ERROR(ECKSUM));
if (zio_injection_enabled && !zio->io_error &&
return (0);
}
int
zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
{
blkptr_t *bp = zio->io_bp;
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int error;
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
void *data = zio->io_data;
spa_t *spa = zio->io_spa;
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
offset, info);
if (error != 0 && zio_injection_enabled && !zio->io_error &&
(error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
info->zbc_injected = 1;
return (error);
}
return (0);
return (error);
}
/*

View File

@ -903,7 +903,8 @@ typedef enum {
SPA_LOAD_IMPORT, /* import in progress */
SPA_LOAD_TRYIMPORT, /* tryimport in progress */
SPA_LOAD_RECOVER, /* recovery requested */
SPA_LOAD_ERROR /* load failed */
SPA_LOAD_ERROR, /* load failed */
SPA_LOAD_CREATE /* creation in progress */
} spa_load_state_t;
/*

View File

@ -103,8 +103,16 @@ __FBSDID("$FreeBSD$");
#include <dev/drm2/drm_os_freebsd.h>
#define __OS_HAS_AGP (defined(CONFIG_AGP) || (defined(CONFIG_AGP_MODULE) && defined(MODULE)))
#define __OS_HAS_MTRR (defined(CONFIG_MTRR))
#if defined(CONFIG_AGP) || (defined(CONFIG_AGP_MODULE) && defined(MODULE))
#define __OS_HAS_AGP 1
#else
#define __OS_HAS_AGP 0
#endif
#if defined(CONFIG_MTRR)
#define __OS_HAS_MTRR 1
#else
#define __OS_HAS_MTRR 0
#endif
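/*
 * The change above in isolation: the old one-liners expanded defined()
 * out of a macro, and C leaves the behavior undefined when "defined"
 * is produced by macro expansion inside an #if (C11 6.10.1p4); GCC and
 * Clang warn about it, and other preprocessors may evaluate it
 * differently.  A standalone sketch of the portable replacement, with
 * an invented FOO in place of the CONFIG_* symbols:
 */
#include <stdio.h>

/* #define HAS_FOO (defined(FOO))     <-- the fragile form */
#if defined(FOO)			/* the portable form */
#define HAS_FOO 1
#else
#define HAS_FOO 0
#endif

int
main(void)
{
#if HAS_FOO
	printf("FOO is configured\n");
#else
	printf("FOO is not configured\n");
#endif
	return (0);
}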
struct drm_file;
struct drm_device;

View File

@ -57,13 +57,13 @@ __FBSDID("$FreeBSD$");
* Wait for the peripheral up to 40ms
*/
static int
do_1284_wait(device_t bus, char mask, char status)
do_1284_wait(device_t bus, uint8_t mask, uint8_t status)
{
return (ppb_poll_bus(bus, 4, mask, status, PPB_NOINTR | PPB_POLL));
}
static int
do_peripheral_wait(device_t bus, char mask, char status)
do_peripheral_wait(device_t bus, uint8_t mask, uint8_t status)
{
return (ppb_poll_bus(bus, 100, mask, status, PPB_NOINTR | PPB_POLL));
}

View File

@ -54,11 +54,11 @@ MODULE_VERSION(ppbus, 1);
*/
int
ppb_poll_bus(device_t bus, int max,
char mask, char status, int how)
uint8_t mask, uint8_t status, int how)
{
struct ppb_data *ppb = DEVTOSOFTC(bus);
int i, j, error;
char r;
uint8_t r;
ppb_assert_locked(bus);
@ -186,7 +186,7 @@ ppb_ecp_sync(device_t bus)
int
ppb_get_status(device_t bus, struct ppb_status *status)
{
register char r;
uint8_t r;
ppb_assert_locked(bus);

View File

@ -263,7 +263,7 @@ extern void _ppb_assert_locked(device_t, const char *, int);
extern void ppb_init_callout(device_t, struct callout *, int);
extern int ppb_sleep(device_t, void *, int, const char *, int);
extern int ppb_get_status(device_t, struct ppb_status *);
extern int ppb_poll_bus(device_t, int, char, char, int);
extern int ppb_poll_bus(device_t, int, uint8_t, uint8_t, int);
extern int ppb_reset_epp_timeout(device_t);
extern int ppb_ecp_sync(device_t);
extern int ppb_get_epp_protocol(device_t);
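/*
 * Why these parameters move from plain char to uint8_t, as a
 * standalone sketch: status registers use bit 7, and on targets where
 * plain char is signed (x86, for example) a register value with that
 * bit set sign-extends under integer promotion, so a comparison
 * against an int constant such as 0x80 silently fails.  Hypothetical
 * demo, not ppbus code.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	char sc = (char)0x80;	/* -128 where char is signed */
	uint8_t u8 = 0x80;	/* always 128 */

	/* sc promotes to int -128, but 0x80 is int 128. */
	printf("char:    r == 0x80 -> %s\n", sc == 0x80 ? "true" : "false");
	/* u8 promotes to int 128 and matches as expected. */
	printf("uint8_t: r == 0x80 -> %s\n", u8 == 0x80 ? "true" : "false");
	return (0);
}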

View File

@ -367,7 +367,7 @@ mpc85xx_map_dcsr(void)
err = fdt_get_range(node, 0, &b, &s);
if (err != 0)
return (err);
return (0);
law_enable(OCP85XX_TGTIF_DCSR, b, 0x400000);
return pmap_early_io_map(b, 0x400000);

View File

@ -663,7 +663,7 @@ static void
cpu_idle_booke(sbintime_t sbt)
{
#ifdef E500
#ifdef BOOKE_E500
platform_cpu_idle(PCPU_GET(cpuid));
#endif
}

tools/tools/crypto/cryptorun.sh Executable file
View File

@ -0,0 +1,32 @@
#!/bin/sh
#
# A simple test runner for cryptotest
#
# Although cryptotest itself has a -z mode to test all algorithms at
# a variety of sizes, this script allows us to be more selective.
# Thread counts and buffer sizes step up in powers of two, starting
# from 1 thread and a 256-byte buffer.
#
# e.g. cryptorun.sh aes 4 512
#
# Test aes with 1, 2 and 4 processes, and at sizes of 256 and 512 bytes.
#
# $FreeBSD$
#
threads=1
size=256
iterations=1000000
crypto="/tank/users/gnn/Repos/svn/FreeBSD.HEAD/tools/tools/crypto/cryptotest"
max_threads=$2
max_size=$3
while [ "$threads" -le "$max_threads" ]; do
echo "Testing with $threads processes."
while [ "$size" -le "$max_size" ]; do
$crypto -t $threads -a $1 $iterations $size
size=$(($size * 2))
done
size=256
threads=$(($threads * 2))
done

View File

@ -84,6 +84,7 @@
*/
#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/sysctl.h>
@ -96,6 +97,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
#include <crypto/cryptodev.h>
@ -130,9 +132,6 @@ struct alg {
{ "aes", 0, 16, 16, 16, CRYPTO_AES_CBC},
{ "aes192", 0, 16, 24, 24, CRYPTO_AES_CBC},
{ "aes256", 0, 16, 32, 32, CRYPTO_AES_CBC},
#ifdef notdef
{ "arc4", 0, 8, 1, 32, CRYPTO_ARC4 },
#endif
{ "md5", 1, 8, 16, 16, CRYPTO_MD5_HMAC },
{ "sha1", 1, 8, 20, 20, CRYPTO_SHA1_HMAC },
{ "sha256", 1, 8, 32, 32, CRYPTO_SHA2_256_HMAC },
@ -146,8 +145,8 @@ usage(const char* cmd)
printf("usage: %s [-czsbv] [-d dev] [-a algorithm] [count] [size ...]\n",
cmd);
printf("where algorithm is one of:\n");
printf(" des 3des (default) blowfish cast skipjack rij\n");
printf(" aes aes192 aes256 arc4\n");
printf(" null des 3des (default) blowfish cast skipjack rij\n");
printf(" aes aes192 aes256 md5 sha1 sha256 sha384 sha512\n");
printf("count is the number of encrypt/decrypt ops to do\n");
printf("size is the number of bytes of text to encrypt+decrypt\n");
printf("\n");
@ -158,6 +157,7 @@ usage(const char* cmd)
printf("-v be verbose\n");
printf("-b mark operations for batching\n");
printf("-p profile kernel crypto operation (must be root)\n");
printf("-t n for n threads and run tests concurrently\n");
exit(-1);
}
@ -469,6 +469,11 @@ runtests(struct alg *alg, int count, int size, u_long cmd, int threads, int prof
if (threads > 1) {
for (i = 0; i < threads; i++)
if (fork() == 0) {
cpuset_t mask;
CPU_ZERO(&mask);
CPU_SET(i, &mask);
cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID,
-1, sizeof(mask), &mask);
runtest(alg, count, size, cmd, &tvp[i]);
exit(0);
}
@ -483,17 +488,10 @@ runtests(struct alg *alg, int count, int size, u_long cmd, int threads, int prof
if (t) {
int nops = alg->ishash ? count : 2*count;
#if 0
t /= threads;
printf("%6.3lf sec, %7d %6s crypts, %7d bytes, %8.0lf byte/sec, %7.1lf Mb/sec\n",
t, nops, alg->name, size, (double)nops*size / t,
(double)nops*size / t * 8 / 1024 / 1024);
#else
nops *= threads;
printf("%8.3lf sec, %7d %6s crypts, %7d bytes, %8.0lf byte/sec, %7.1lf Mb/sec\n",
t, nops, alg->name, size, (double)nops*size / t,
(double)nops*size / t * 8 / 1024 / 1024);
#endif
}
#ifdef __FreeBSD__
if (profile) {
@ -581,6 +579,9 @@ main(int argc, char **argv)
}
argc--, argv++;
}
if (maxthreads > CPU_SETSIZE)
errx(EX_USAGE, "Too many threads, %d, choose fewer.", maxthreads);
if (nsizes == 0) {
if (alg)
sizes[nsizes++] = alg->blocksize;

View File

@ -16,6 +16,7 @@ DIRDEPS = \
lib/libopenbsd \
lib/librpcsvc \
lib/libutil \
usr.bin/yacc.host \
.include <dirdeps.mk>