diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index be2f2cdd029f..65a4858b95d9 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -1544,7 +1544,7 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) if (zil_replaying(zd->zd_zilog, tx)) return; - if (lr->lr_length > ZIL_MAX_LOG_DATA) + if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) write_state = WR_INDIRECT; itx = zil_itx_create(TX_WRITE, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h index 040dfaa29a94..a27a9547ac43 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -452,6 +452,9 @@ extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); +extern uint64_t zil_max_copied_data(zilog_t *zilog); +extern uint64_t zil_max_log_data(zilog_t *zilog); + extern int zil_replay_disable; #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h index 5717c94a0c0c..a19ba970574f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -208,6 +208,13 @@ struct zilog { uint_t zl_prev_rotor; /* rotor for zl_prev[] */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ + /* + * Max block size for this ZIL. Note that this can not be changed + * while the ZIL is in use because consumers (ZPL/zvol) need to take + * this into account when deciding between WR_COPIED and WR_NEED_COPY + * (see zil_max_copied_data()). + */ + uint64_t zl_max_block_size; }; typedef struct zil_bp_node { @@ -215,28 +222,6 @@ typedef struct zil_bp_node { avl_node_t zn_node; } zil_bp_node_t; -/* - * Maximum amount of write data that can be put into single log block. - */ -#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \ - sizeof (lr_write_t)) -#define ZIL_MAX_COPIED_DATA \ - ((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t)) - -/* - * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). - */ -#define ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8) - -/* - * Maximum amount of write data for WR_COPIED. Fall back to WR_NEED_COPY - * as more space efficient if we can't fit at least two log records into - * maximum sized log block. - */ -#define ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \ - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t)) - #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c index b1f98071a42a..c00c60a25ebb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -493,7 +493,14 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t wr_state = write_state; ssize_t len = resid; - if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA) + /* + * A WR_COPIED record must fit entirely in one log block. + * Large writes can use WR_NEED_COPY, which the ZIL will + * split into multiple records across several log blocks + * if necessary. + */ + if (wr_state == WR_COPIED && + resid > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(off, blocksize), resid); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index d8b7498a71a8..a2b9f9bbeaa0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -1364,6 +1364,15 @@ struct { { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; +/* + * Maximum block size used by the ZIL. This is picked up when the ZIL is + * initialized. Otherwise this should not be used directly; see + * zl_max_block_size instead. + */ +int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; +SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_maxblocksize, CTLFLAG_RWTUN, + &zil_maxblocksize, 0, "Limit in bytes of ZIL log block size"); + /* * Start a log block write and advance to the next log block. * Calls are serialized. @@ -1440,7 +1449,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; - zil_blksz = zil_block_buckets[i].blksz; + zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); @@ -1497,13 +1506,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) return (nlwb); } +/* + * Maximum amount of write data that can be put into single log block. + */ +uint64_t +zil_max_log_data(zilog_t *zilog) +{ + return (zilog->zl_max_block_size - + sizeof (zil_chain_t) - sizeof (lr_write_t)); +} + +/* + * Maximum amount of log space we agree to waste to reduce number of + * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). + */ +static inline uint64_t +zil_max_waste_space(zilog_t *zilog) +{ + return (zil_max_log_data(zilog) / 8); +} + +/* + * Maximum amount of write data for WR_COPIED. For correctness, consumers + * must fall back to WR_NEED_COPY if we can't fit the entire record into one + * maximum sized log block, because each WR_COPIED record must fit in a + * single log block. For space efficiency, we want to fit two records into a + * max-sized log block. + */ +uint64_t +zil_max_copied_data(zilog_t *zilog) +{ + return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - + sizeof (lr_write_t)); +} + static lwb_t * zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) { lr_t *lrcb, *lrc; lr_write_t *lrwb, *lrw; char *lr_buf; - uint64_t dlen, dnow, lwb_sp, reclen, txg; + uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); @@ -1552,15 +1595,27 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + max_log_data = zil_max_log_data(zilog); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && - lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || - lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { + lwb_sp < zil_max_waste_space(zilog) && + (dlen % max_log_data == 0 || + lwb_sp < reclen + dlen % max_log_data))) { lwb = zil_lwb_write_issue(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); ASSERT(LWB_EMPTY(lwb)); lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + + /* + * There must be enough space in the new, empty log block to + * hold reclen. For WR_COPIED, we need to fit the whole + * record in one block, and reclen is the header size + the + * data size. For WR_NEED_COPY, we can create multiple + * records, splitting the data into multiple blocks, so we + * only need to fit one word of data per block; in this case + * reclen is just the header size (no data). + */ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } @@ -2992,6 +3047,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; + zilog->zl_max_block_size = zil_maxblocksize; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 822e795720e4..2397247a2244 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -1438,7 +1438,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, itx_wr_state_t wr_state = write_state; ssize_t len = resid; - if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA) + if (wr_state == WR_COPIED && resid > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(off, blocksize), resid);