3ec3bc2167
Reviewed by: Steve Gonczi <steve.gonczi@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Background information: This assertion about tx_space_* verifies that we are not dirtying more stuff than we thought we would. We “need” to know how much we will dirty so that we can check if we should fail this transaction with ENOSPC/EDQUOT, in dmu_tx_assign(). While the transaction is open (i.e. between dmu_tx_assign() and dmu_tx_commit() — typically less than a millisecond), we call dbuf_dirty() on the exact blocks that will be modified. Once this happens, the temporary accounting in tx_space_* is unnecessary, because we know exactly what blocks are newly dirtied; we call dnode_willuse_space() to track this more exact accounting. The fundamental problem causing this bug is that dmu_tx_hold_*() relies on the current state in the DMU (e.g. dn_nlevels) to predict how much will be dirtied by this transaction, but this state can change before we actually perform the transaction (i.e. call dbuf_dirty()). This bug will be fixed by removing the assertion that the tx_space_* accounting is perfectly accurate (i.e. we never dirty more than was predicted by dmu_tx_hold_*()). By removing the requirement that this accounting be perfectly accurate, we can also vastly simplify it, e.g. removing most of the logic in dmu_tx_count_*(). The new tx space accounting will be very approximate, and may be more or less than what is actually dirtied. It will still be used to determine if this transaction will put us over quota. Transactions that are marked by dmu_tx_mark_netfree() will be excepted from this check. We won’t make an attempt to determine how much space will be freed by the transaction — this was rarely accurate enough to determine if a transaction should be permitted when we are over quota, which is why dmu_tx_mark_netfree() was introduced in 2014. 
We also won’t attempt to give “credit” when overwriting existing blocks, if those blocks may be freed. This allows us to remove the do_free_accounting logic in dbuf_dirty() and associated routines. This logic attempted to predict what will be on disk when this txg syncs, to know if the overwritten block will be freed (i.e. exists, and has no snapshots). OpenZFS-issue: https://www.illumos.org/issues/7793 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3704e0a Upstream bugs: DLPX-32883a Closes #5804 Porting notes: - DNODE_SIZE replaced with DNODE_MIN_SIZE in dmu_tx_count_dnode(). Using the default dnode size would be slightly better. - DEBUG_DMU_TX wrappers and configure option removed. - Resolved _by_dnode() conflicts; these changes have not yet been applied to OpenZFS.
129 lines
4.1 KiB
C
129 lines
4.1 KiB
C
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
|
|
|
|
#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
|
|
|
|
/*
 * Reset and then set the TRACE_SYSTEM / TRACE_SYSTEM_VAR macros before
 * <linux/tracepoint.h> is included and the event classes are declared.
 * NOTE(review): presumably TRACE_SYSTEM is the event group name and
 * TRACE_SYSTEM_VAR a per-header unique identifier - confirm against the
 * kernel tracepoint documentation.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM zfs

#undef TRACE_SYSTEM_VAR
#define TRACE_SYSTEM_VAR zfs_dmu
|
|
|
|
#if !defined(_TRACE_DMU_H) || defined(TRACE_HEADER_MULTI_READ)
|
|
#define _TRACE_DMU_H
|
|
|
|
#include <linux/tracepoint.h>
|
|
#include <sys/types.h>
|
|
|
|
/*
|
|
* Generic support for three argument tracepoints of the form:
|
|
*
|
|
* DTRACE_PROBE3(...,
|
|
* dmu_tx_t *, ...,
|
|
* uint64_t, ...,
|
|
* uint64_t, ...);
|
|
*/
|
|
/* BEGIN CSTYLED */
|
|
DECLARE_EVENT_CLASS(zfs_delay_mintime_class,
|
|
TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time),
|
|
TP_ARGS(tx, dirty, min_tx_time),
|
|
TP_STRUCT__entry(
|
|
__field(uint64_t, tx_txg)
|
|
__field(uint64_t, tx_lastsnap_txg)
|
|
__field(uint64_t, tx_lasttried_txg)
|
|
__field(boolean_t, tx_anyobj)
|
|
__field(boolean_t, tx_waited)
|
|
__field(hrtime_t, tx_start)
|
|
__field(boolean_t, tx_wait_dirty)
|
|
__field(int, tx_err)
|
|
__field(uint64_t, min_tx_time)
|
|
__field(uint64_t, dirty)
|
|
),
|
|
TP_fast_assign(
|
|
__entry->tx_txg = tx->tx_txg;
|
|
__entry->tx_lastsnap_txg = tx->tx_lastsnap_txg;
|
|
__entry->tx_lasttried_txg = tx->tx_lasttried_txg;
|
|
__entry->tx_anyobj = tx->tx_anyobj;
|
|
__entry->tx_waited = tx->tx_waited;
|
|
__entry->tx_start = tx->tx_start;
|
|
__entry->tx_wait_dirty = tx->tx_wait_dirty;
|
|
__entry->tx_err = tx->tx_err;
|
|
__entry->dirty = dirty;
|
|
__entry->min_tx_time = min_tx_time;
|
|
),
|
|
TP_printk("tx { txg %llu lastsnap_txg %llu tx_lasttried_txg %llu "
|
|
"anyobj %d waited %d start %llu wait_dirty %d err %i "
|
|
"} dirty %llu min_tx_time %llu",
|
|
__entry->tx_txg, __entry->tx_lastsnap_txg,
|
|
__entry->tx_lasttried_txg, __entry->tx_anyobj, __entry->tx_waited,
|
|
__entry->tx_start, __entry->tx_wait_dirty, __entry->tx_err,
|
|
__entry->dirty, __entry->min_tx_time)
|
|
);
|
|
/* END CSTYLED */
|
|
|
|
/* BEGIN CSTYLED */
|
|
#define DEFINE_DELAY_MINTIME_EVENT(name) \
|
|
DEFINE_EVENT(zfs_delay_mintime_class, name, \
|
|
TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time), \
|
|
TP_ARGS(tx, dirty, min_tx_time))
|
|
/* END CSTYLED */
|
|
DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime);
|
|
|
|
/* BEGIN CSTYLED */
|
|
DECLARE_EVENT_CLASS(zfs_free_long_range_class,
|
|
TP_PROTO(uint64_t long_free_dirty_all_txgs, uint64_t chunk_len, \
|
|
uint64_t txg),
|
|
TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg),
|
|
TP_STRUCT__entry(
|
|
__field(uint64_t, long_free_dirty_all_txgs)
|
|
__field(uint64_t, chunk_len)
|
|
__field(uint64_t, txg)
|
|
),
|
|
TP_fast_assign(
|
|
__entry->long_free_dirty_all_txgs = long_free_dirty_all_txgs;
|
|
__entry->chunk_len = chunk_len;
|
|
__entry->txg = txg;
|
|
),
|
|
TP_printk("long_free_dirty_all_txgs %llu chunk_len %llu txg %llu",
|
|
__entry->long_free_dirty_all_txgs,
|
|
__entry->chunk_len, __entry->txg)
|
|
);
|
|
/* END CSTYLED */
|
|
|
|
/* BEGIN CSTYLED */
|
|
#define DEFINE_FREE_LONG_RANGE_EVENT(name) \
|
|
DEFINE_EVENT(zfs_free_long_range_class, name, \
|
|
TP_PROTO(uint64_t long_free_dirty_all_txgs, \
|
|
uint64_t chunk_len, uint64_t txg), \
|
|
TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg))
|
|
/* END CSTYLED */
|
|
DEFINE_FREE_LONG_RANGE_EVENT(zfs_free__long__range);
|
|
|
|
#endif /* _TRACE_DMU_H */
|
|
|
|
/*
 * Redefine TRACE_INCLUDE_PATH and TRACE_INCLUDE_FILE, then pull in
 * <trace/define_trace.h>.
 * NOTE(review): presumably these two macros tell define_trace.h where to
 * re-include this header (sys/trace_dmu.h) - confirm against the kernel
 * tracepoint documentation.
 */
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH sys
#define TRACE_INCLUDE_FILE trace_dmu
#include <trace/define_trace.h>
|
|
|
|
#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */
|