MFV r318947: 7578 Fix/improve some aspects of ZIL writing.

FreeBSD note: this commit removes the small differences between what mav
committed to FreeBSD in r308782 and what ended up being committed to
illumos after all review comments were addressed.

illumos/illumos-gate@c5ee46810f

https://www.illumos.org/issues/7578
  After some ZIL changes 6 years ago zil_slog_limit got partially broken
  because zl_itx_list_sz was not updated when async itx'es were upgraded
  to sync.  Because of other changes made around the same time,
  zl_itx_list_sz is not really required to implement the functionality,
  so this patch removes some unneeded broken code and variables.

  The original idea of zil_slog_limit was to reduce the chance of SLOG
  abuse by a single heavy logger, which increased latency for other (more
  latency-critical) loggers, by pushing the heavy log out into the main
  pool instead of the SLOG.  Besides the huge latency increase for heavy
  writers, this implementation caused all data to be written twice, since
  the log records were explicitly prepared for the SLOG.  Since we now
  have an I/O scheduler, I've found it can be much more efficient to
  reduce the priority of heavy-logger SLOG writes from
  ZIO_PRIORITY_SYNC_WRITE to ZIO_PRIORITY_ASYNC_WRITE, while still
  leaving them on the SLOG.

  The existing ZIL implementation had a problem with space efficiency
  when it had to write large chunks of data into log blocks of limited
  size.  In some cases efficiency dropped to almost as low as 50%.  For a
  ZIL stored on spinning rust that also cut log write speed in half,
  since the head had to uselessly fly over allocated but unwritten areas.
  This change improves the situation by offloading the problematic
  operations from the z*_log_write() functions to zil_lwb_commit(), which
  knows the real state of log block allocation and can split large
  requests into pieces much more efficiently.  As a side effect it also
  removes one of the two data copy operations done by the ZIL code in the
  WR_COPIED case.

  While there, untangle and unify the code of the z*_log_write()
  functions.  Also, zfs_log_write(), like zvol_log_write(), can now
  handle writes crossing a block boundary, which may also improve
  efficiency if the ZPL is made to do that.

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Alexander Motin <mav@FreeBSD.org>

MFC after:	3 weeks
Andriy Gapon 2017-06-22 16:52:22 +00:00
commit e70097b50f
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=320237
2 changed files with 41 additions and 20 deletions


@@ -139,11 +139,28 @@ typedef struct zil_bp_node {
avl_node_t zn_node;
} zil_bp_node_t;
/*
* Maximum amount of write data that can be put into single log block.
*/
#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))
#define ZIL_MAX_COPIED_DATA \
((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
/*
* Maximum amount of log space we agree to waste to reduce number of
* WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
*/
#define ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8)
/*
* Maximum amount of write data for WR_COPIED. Fall back to WR_NEED_COPY
* as more space efficient if we can't fit at least two log records into
* maximum sized log block.
*/
#define ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \
sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
#ifdef __cplusplus
}
#endif
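
A rough worked example of these limits, assuming SPA_OLD_MAXBLOCKSIZE is
128 KB (131072 bytes) and treating the zil_chain_t and lr_write_t headers
as a few hundred bytes (their exact sizes are not restated here):

  ZIL_MAX_LOG_DATA    ~= 131072 - sizeof (zil_chain_t) - sizeof (lr_write_t),
                         i.e. a bit under 128 KB of payload per maximum-sized
                         log block;
  ZIL_MAX_WASTE_SPACE  = ZIL_MAX_LOG_DATA / 8, roughly 16 KB, which is the
                         "~12%" (1/8 = 12.5%) mentioned in the comment;
  ZIL_MAX_COPIED_DATA ~= (131072 - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t),
                         roughly 64 KB, so two such WR_COPIED records can
                         still share one maximum-sized log block.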


@@ -90,12 +90,12 @@ SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0
/*
* Limit SLOG write size per commit executed with synchronous priority.
* Any writes above that executed with lower (asynchronous) priority to
* limit potential SLOG device abuse by single active ZIL writer.
* Any writes above that will be executed with lower (asynchronous) priority
* to limit potential SLOG device abuse by single active ZIL writer.
*/
uint64_t zil_slog_limit = 768 * 1024;
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
&zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
uint64_t zil_slog_bulk = 768 * 1024;
SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN,
&zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority");
static kmem_cache_t *zil_lwb_cache;
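
As a usage note (not part of the diff): SYSCTL_QUAD() with CTLFLAG_RWTUN
exposes the renamed limit as the vfs.zfs.zil_slog_bulk sysctl, so it should
be settable both as a loader tunable and at runtime, e.g.
"sysctl vfs.zfs.zil_slog_bulk=786432" to restore the default of 768 KB.
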
@@ -923,7 +923,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
if (lwb->lwb_zio == NULL) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
BP_GET_LSIZE(&lwb->lwb_blk));
if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
@@ -1068,36 +1068,38 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolean_t last)
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
lr_t *lrcb, *lrc;
lr_write_t *lrwb, *lrw;
char *lr_buf;
uint64_t txg = lrc->lrc_txg;
uint64_t reclen = lrc->lrc_reclen;
uint64_t dlen = 0;
uint64_t dnow, lwb_sp;
uint64_t dlen, dnow, lwb_sp, reclen, txg;
if (lwb == NULL)
return (NULL);
ASSERT(lwb->lwb_buf != NULL);
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
lrc = &itx->itx_lr; /* Common log record inside itx. */
lrw = (lr_write_t *)lrc; /* Write log record inside itx. */
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
} else {
dlen = 0;
}
reclen = lrc->lrc_reclen;
zilog->zl_cur_used += (reclen + dlen);
txg = lrc->lrc_txg;
zil_lwb_write_init(zilog, lwb);
cont:
/*
* If this record won't fit in the current log block, start a new one.
* For WR_NEED_COPY optimize layout for minimal number of chunks, but
* try to keep wasted space withing reasonable range (12%).
* For WR_NEED_COPY optimize layout for minimal number of chunks.
*/
lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
lwb = zil_lwb_write_start(zilog, lwb, B_FALSE);
if (lwb == NULL)
@@ -1105,14 +1107,14 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
zil_lwb_write_init(zilog, lwb);
ASSERT(LWB_EMPTY(lwb));
lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
}
dnow = MIN(dlen, lwb_sp - reclen);
lr_buf = lwb->lwb_buf + lwb->lwb_nused;
bcopy(lrc, lr_buf, reclen);
lrcb = (lr_t *)lr_buf;
lrwb = (lr_write_t *)lrcb;
lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
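
To make the chunking logic above easier to follow, here is a small
userland model of how an oversized WR_NEED_COPY record is now split
across log blocks.  The record header size, the payload size and the
single free-space counter standing in for lwb allocation are made up for
illustration; only the shape of the "start a new log block?" test is
taken from the patch:

/* Userland model of the WR_NEED_COPY splitting; not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define	MAX_LOG_DATA	(128 * 1024)		/* stand-in for ZIL_MAX_LOG_DATA */
#define	MAX_WASTE	(MAX_LOG_DATA / 8)	/* stand-in for ZIL_MAX_WASTE_SPACE */
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t reclen = 192;		/* made-up log record header size */
	uint64_t dlen = 300 * 1024;	/* payload left to log */
	uint64_t lwb_sp = 24 * 1024;	/* free space in the current log block */
	int newblocks = 0;

	while (dlen > 0) {
		/* Same shape as the new "start a new log block?" test. */
		if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
		    lwb_sp < MAX_WASTE && (dlen % MAX_LOG_DATA == 0 ||
		    lwb_sp < reclen + dlen % MAX_LOG_DATA))) {
			lwb_sp = MAX_LOG_DATA;	/* pretend a fresh block was allocated */
			newblocks++;
		}
		uint64_t dnow = MIN(dlen, lwb_sp - reclen);	/* as in zil_lwb_commit() */
		printf("wrote %ju-byte chunk into a block with %ju bytes free\n",
		    (uintmax_t)dnow, (uintmax_t)lwb_sp);
		lwb_sp -= reclen + dnow;
		dlen -= dnow;
	}
	printf("fresh log blocks needed: %d\n", newblocks);
	return (0);
}

Each chunk carries its own copy of the record header (reclen), which is
why the model subtracts reclen per chunk; with a large remaining block
(free space >= MAX_WASTE) the record is split in place, otherwise the
leftover space is abandoned and a new block is started.
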
@@ -1328,6 +1330,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
* this itxg. Save the itxs for release below.
* This should be rare.
*/
zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
"txg %llu", itxg->itxg_txg);
clean = itxg->itxg_itxs;
}
itxg->itxg_txg = txg;
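
One practical note on the added zfs_dbgmsg(): it only records that the
expected itx cleanup for an old txg did not happen (which should be rare).
On FreeBSD such debug messages are normally readable from the
kstat.zfs.misc.dbgmsg sysctl, assuming ZFS debug messages are enabled.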