0735ecb334
PROBLEM ======= When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible for either `ERESTART` or `EIO` to be returned. If `ERESTART` is returned, this will cause an assertion to fail directly in `zil_lwb_write_issue`, where the code assumes the return value is `EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the SPA is suspended when `dmu_tx_assign` is called, and most often occurs when running `zloop`. If `EIO` is returned, this can cause assertions to fail elsewhere in the ZIL code. For example, `zil_commit_waiter_timeout` contains the following logic: `lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);` `ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);` In this case, if `dmu_tx_assign` returned `EIO` from within `zil_lwb_write_issue`, the `lwb` variable passed in will not be issued to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and this assertion will fail. `zil_commit_waiter_timeout` assumes that after it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and doesn't handle the case where this is not true; i.e. it doesn't handle the case where `dmu_tx_assign` returns `EIO`. SOLUTION ======== This change modifies the `dmu_tx_assign` function such that `txg_how` is a bitmask, rather than a value of the `txg_how_t` enum type. Now, the previous `TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics. Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was automatically invoked. This was not ideal when using `TXG_WAITED` within `zil_lwb_write_issue`, leading to the problem described above. Rather, we want to achieve the semantics of `TXG_WAIT`, while also preventing the `tx` from being penalized via the dirty delay throttling. With this change, `zil_lwb_write_issue` can achieve the semantics that it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to `dmu_tx_assign`. 
Further, consumers of `dmu_tx_assign` wishing to achieve the old `TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`. Authored by: Prakash Surya <prakash.surya@delphix.com> Approved by: Robert Mustacchi <rm@joyent.com> Reviewed by: Matt Ahrens <mahrens@delphix.com> Reviewed by: Andriy Gapon <avg@FreeBSD.org> Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Porting Notes: - Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE` OpenZFS-issue: https://www.illumos.org/issues/8997 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9 Closes #7084
177 lines
4.4 KiB
C
177 lines
4.4 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
/*
|
|
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
|
|
* Use is subject to license terms.
|
|
*/
|
|
/*
|
|
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
#ifndef _SYS_DMU_TX_H
|
|
#define _SYS_DMU_TX_H
|
|
|
|
#include <sys/inttypes.h>
|
|
#include <sys/dmu.h>
|
|
#include <sys/txg.h>
|
|
#include <sys/refcount.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*
 * Forward declarations of struct tags, so they can be named through
 * pointers without pulling in their full definitions.
 */
struct dmu_buf_impl;
struct dmu_tx_hold;
struct dnode_link;
struct dsl_pool;
struct dnode;
struct dsl_dir;
|
|
|
|
struct dmu_tx {
|
|
/*
|
|
* No synchronization is needed because a tx can only be handled
|
|
* by one thread.
|
|
*/
|
|
list_t tx_holds; /* list of dmu_tx_hold_t */
|
|
objset_t *tx_objset;
|
|
struct dsl_dir *tx_dir;
|
|
struct dsl_pool *tx_pool;
|
|
uint64_t tx_txg;
|
|
uint64_t tx_lastsnap_txg;
|
|
uint64_t tx_lasttried_txg;
|
|
txg_handle_t tx_txgh;
|
|
void *tx_tempreserve_cookie;
|
|
struct dmu_tx_hold *tx_needassign_txh;
|
|
|
|
/* list of dmu_tx_callback_t on this dmu_tx */
|
|
list_t tx_callbacks;
|
|
|
|
/* placeholder for syncing context, doesn't need specific holds */
|
|
boolean_t tx_anyobj;
|
|
|
|
/* transaction is marked as being a "net free" of space */
|
|
boolean_t tx_netfree;
|
|
|
|
/* time this transaction was created */
|
|
hrtime_t tx_start;
|
|
|
|
/* need to wait for sufficient dirty space */
|
|
boolean_t tx_wait_dirty;
|
|
|
|
/* has this transaction already been delayed? */
|
|
boolean_t tx_dirty_delayed;
|
|
|
|
int tx_err;
|
|
};
|
|
|
|
/*
 * Kinds of hold that can be recorded on a transaction; this is the type of
 * the txh_type member of dmu_tx_hold_t.
 *
 * THT_NUMTYPES is not a hold type itself: as the last enumerator it equals
 * the number of hold types listed before it.  New types must therefore be
 * added ahead of THT_NUMTYPES.
 */
enum dmu_tx_hold_type {
	THT_NEWOBJECT,
	THT_WRITE,
	THT_BONUS,
	THT_FREE,
	THT_ZAP,
	THT_SPACE,
	THT_SPILL,
	THT_NUMTYPES
};
|
|
|
|
typedef struct dmu_tx_hold {
|
|
dmu_tx_t *txh_tx;
|
|
list_node_t txh_node;
|
|
struct dnode *txh_dnode;
|
|
refcount_t txh_space_towrite;
|
|
refcount_t txh_memory_tohold;
|
|
enum dmu_tx_hold_type txh_type;
|
|
uint64_t txh_arg1;
|
|
uint64_t txh_arg2;
|
|
} dmu_tx_hold_t;
|
|
|
|
typedef struct dmu_tx_callback {
|
|
list_node_t dcb_node; /* linked to tx_callbacks list */
|
|
dmu_tx_callback_func_t *dcb_func; /* caller function pointer */
|
|
void *dcb_data; /* caller private data */
|
|
} dmu_tx_callback_t;
|
|
|
|
/*
|
|
* Used for dmu tx kstat.
|
|
*/
|
|
typedef struct dmu_tx_stats {
|
|
kstat_named_t dmu_tx_assigned;
|
|
kstat_named_t dmu_tx_delay;
|
|
kstat_named_t dmu_tx_error;
|
|
kstat_named_t dmu_tx_suspended;
|
|
kstat_named_t dmu_tx_group;
|
|
kstat_named_t dmu_tx_memory_reserve;
|
|
kstat_named_t dmu_tx_memory_reclaim;
|
|
kstat_named_t dmu_tx_dirty_throttle;
|
|
kstat_named_t dmu_tx_dirty_delay;
|
|
kstat_named_t dmu_tx_dirty_over_max;
|
|
kstat_named_t dmu_tx_quota;
|
|
} dmu_tx_stats_t;
|
|
|
|
extern dmu_tx_stats_t dmu_tx_stats;
|
|
|
|
/*
 * Helpers for updating the DMU tx kstat counters held in dmu_tx_stats.
 *
 * DMU_TX_STAT_INCR(stat, val) adds val to the ui64 value of the named
 * counter via atomic_add_64(); DMU_TX_STAT_BUMP(stat) adds 1.  `stat` is the
 * bare member name within dmu_tx_stats: it is substituted into a member
 * access rather than evaluated as an expression.
 *
 * The expansions carry no trailing semicolon; the caller supplies it.  That
 * keeps `if (c) DMU_TX_STAT_BUMP(x); else ...` well-formed, which an
 * expansion ending in ';' would break.
 */
#define	DMU_TX_STAT_INCR(stat, val) \
	atomic_add_64(&dmu_tx_stats.stat.value.ui64, (val))
#define	DMU_TX_STAT_BUMP(stat) \
	DMU_TX_STAT_INCR(stat, 1)
|
|
|
|
/*
|
|
* These routines are defined in dmu.h, and are called by the user.
|
|
*/
|
|
dmu_tx_t *dmu_tx_create(objset_t *dd);
|
|
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
|
|
void dmu_tx_commit(dmu_tx_t *tx);
|
|
void dmu_tx_abort(dmu_tx_t *tx);
|
|
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
|
|
struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx);
|
|
void dmu_tx_wait(dmu_tx_t *tx);
|
|
|
|
/*
|
|
* These routines are defined in dmu_spa.h, and are called by the SPA.
|
|
*/
|
|
extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
|
|
|
|
/*
|
|
* These routines are only called by the DMU.
|
|
*/
|
|
dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
|
|
int dmu_tx_is_syncing(dmu_tx_t *tx);
|
|
int dmu_tx_private_ok(dmu_tx_t *tx);
|
|
void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn);
|
|
void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
|
|
void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
|
|
|
|
/*
 * Debug-only hook: when ZFS_DEBUG is defined this expands to a call to
 * dmu_tx_dirty_buf(tx, db); otherwise it expands to nothing, so its
 * arguments are not evaluated at all.  Do not pass expressions with side
 * effects.
 */
#ifdef ZFS_DEBUG
#define	DMU_TX_DIRTY_BUF(tx, db)	dmu_tx_dirty_buf(tx, db)
#else
#define	DMU_TX_DIRTY_BUF(tx, db)
#endif
|
|
|
|
/*
 * Module-level setup and teardown entry points for the DMU tx code.
 * NOTE(review): presumably these create and destroy the dmu_tx kstats -
 * confirm against the definitions.
 */
void dmu_tx_init(void);
void dmu_tx_fini(void);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* _SYS_DMU_TX_H */
|