e48afbc4eb
txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes Creating a fragmented pool in a DCenter VM and continuously writing to it with multiple instances of randwritecomp, we get the following output from txg.d: 0ms 311MB in 4114ms (95% p1) 75MB/s 544MB (76%) 336us 153ms 0ms 0ms 8MB in 51ms ( 0% p1) 163MB/s 474MB (66%) 129us 34ms 0ms 0ms 366MB in 4454ms (93% p1) 82MB/s 572MB (79%) 498us 20ms 0ms 0ms 406MB in 5212ms (95% p1) 77MB/s 591MB (82%) 661us 37ms 0ms 0ms 340MB in 5110ms (94% p1) 66MB/s 622MB (86%) 1048us 41ms 1ms 0ms 3MB in 61ms ( 0% p1) 51MB/s 419MB (58%) 33us 0ms 0ms 0ms 361MB in 3555ms (88% p1) 101MB/s 542MB (75%) 335us 40ms 0ms 0ms 356MB in 4592ms (92% p1) 77MB/s 561MB (78%) 430us 89ms 1ms 0ms 11MB in 129ms (13% p1) 90MB/s 507MB (70%) 222us 15ms 0ms 0ms 281MB in 2520ms (89% p1) 111MB/s 542MB (75%) 334us 42ms 0ms 0ms 383MB in 3666ms (91% p1) 104MB/s 557MB (77%) 411us 133ms 0ms 0ms 404MB in 5757ms (94% p1) 70MB/s 635MB (88%) 1274us 123ms 2ms 4ms 367MB in 4172ms (89% p1) 88MB/s 556MB (77%) 401us 51ms 0ms 0ms 42MB in 470ms (44% p1) 90MB/s 557MB (77%) 412us 43ms 0ms 0ms 261MB in 2273ms (88% p1) 114MB/s 556MB (77%) 407us 27ms 0ms 0ms 394MB in 3646ms (85% p1) 108MB/s 552MB (77%) 393us 304ms 0ms 0ms 275MB in 2416ms (89% p1) 113MB/s 510MB (71%) 200us 53ms 0ms 0ms 9MB in 53ms ( 0% p1) 169MB/s 483MB (67%) 140us 100ms 1ms The TXGs that are getting synced and don't have lots of changes are pushed by txg_kick() which basically forces the current open txg to get to the quiesced state: if (tx->tx_syncing_txg == 0 && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; cv_broadcast(&tx->tx_quiesce_more_cv); } The problem is that the above code doesn't check if we are currently quiescing anything (only if a quiesce or a sync has been requested, ..etc) so the following scenario can happen: 1] We have an open txg A that had enough dirty data (more than zfs_dirty_data_sync) and it was pushed to the quiesced state, and opened a new txg B. No txg is currently being synced. 2] Immediately after the opening of B, txg_kick() was run by some other write (and because of A's dirty data) and saw that we are not currently syncing any txg and no one has requested quiescing so it requests one by bumping tx_quiesce_txg_waiting and broadcasts the quiesce thread. 3] The quiesce thread just passed txg A to be synced and sees that a quiescing request has been sent to it so it immediately grabs B without letting it gather enough data, putting it in a quiesced state and opening a new txg C. In this scenario txg B, is an example of how the entries of interest show up in the txg.d output. Ideally we would like txg_kick() to get triggered only when we are sure that we are not syncing AND not quiescing any txg. This way we can kick an open TXG to the quiescing state when we are sure that there is nothing going on and we would benefit from the different states running concurrently. Authored by: Serapheim Dimitropoulos <serapheim@delphix.com> Reviewed by: Matt Ahrens <matt@delphix.com> Reviewed by: Brad Lewis <brad.lewis@delphix.com> Reviewed by: Andriy Gapon <avg@FreeBSD.org> Approved by: Dan McDonald <danmcd@joyent.com> Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://illumos.org/issues/9464 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/1cd7635b Closes #7587
126 lines
4.8 KiB
C
126 lines
4.8 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
|
* Use is subject to license terms.
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
#ifndef _SYS_TXG_IMPL_H
|
|
#define _SYS_TXG_IMPL_H
|
|
|
|
#include <sys/spa.h>
|
|
#include <sys/txg.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*
|
|
* The tx_cpu structure is a per-cpu structure that is used to track
|
|
* the number of active transaction holds (tc_count). As transactions
|
|
* are assigned into a transaction group the appropriate tc_count is
|
|
* incremented to indicate that there are pending changes that have yet
|
|
* to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement
|
|
* the tc_count. A transaction group is not considered quiesced until all
|
|
* tx_cpu structures have reached a tc_count of zero.
|
|
*
|
|
* This structure is a per-cpu structure by design. Updates to this structure
|
|
* are frequent and concurrent. Having a single structure would result in
|
|
* heavy lock contention so a per-cpu design was implemented. With the fanned
|
|
* out mutex design, consumers only need to lock the mutex associated with
|
|
* thread's cpu.
|
|
*
|
|
* The tx_cpu contains two locks, the tc_lock and tc_open_lock.
|
|
* The tc_lock is used to protect all members of the tx_cpu structure with
|
|
* the exception of the tc_open_lock. This lock should only be held for a
|
|
* short period of time, typically when updating the value of tc_count.
|
|
*
|
|
* The tc_open_lock protects the tx_open_txg member of the tx_state structure.
|
|
* This lock is used to ensure that transactions are only assigned into
|
|
* the current open transaction group. In order to move the current open
|
|
* transaction group to the quiesce phase, the txg_quiesce thread must
|
|
* grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
|
|
* The tc_open_lock is held until the transaction is assigned into the
|
|
* transaction group. Typically, this is a short operation but if throttling
|
|
* is occurring it may be held for longer periods of time.
|
|
*/
|
|
struct tx_cpu {
|
|
kmutex_t tc_open_lock; /* protects tx_open_txg */
|
|
kmutex_t tc_lock; /* protects the rest of this struct */
|
|
kcondvar_t tc_cv[TXG_SIZE];
|
|
uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
|
|
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
|
|
char tc_pad[8]; /* pad to fill 3 cache lines */
|
|
};
|
|
|
|
/*
|
|
* The tx_state structure maintains the state information about the different
|
|
* stages of the pool's transcation groups. A per pool tx_state structure
|
|
* is used to track this information. The tx_state structure also points to
|
|
* an array of tx_cpu structures (described above). Although the tx_sync_lock
|
|
* is used to protect the members of this structure, it is not used to
|
|
* protect the tx_open_txg. Instead a special lock in the tx_cpu structure
|
|
* is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
|
|
* Any thread wishing to update tx_open_txg must grab the tc_open_lock on
|
|
* every cpu (see txg_quiesce()).
|
|
*/
|
|
typedef struct tx_state {
|
|
tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
|
|
kmutex_t tx_sync_lock; /* protects the rest of this struct */
|
|
|
|
uint64_t tx_open_txg; /* currently open txg id */
|
|
uint64_t tx_quiescing_txg; /* currently quiescing txg id */
|
|
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
|
|
uint64_t tx_syncing_txg; /* currently syncing txg id */
|
|
uint64_t tx_synced_txg; /* last synced txg id */
|
|
|
|
hrtime_t tx_open_time; /* start time of tx_open_txg */
|
|
|
|
uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
|
|
uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
|
|
|
|
kcondvar_t tx_sync_more_cv;
|
|
kcondvar_t tx_sync_done_cv;
|
|
kcondvar_t tx_quiesce_more_cv;
|
|
kcondvar_t tx_quiesce_done_cv;
|
|
kcondvar_t tx_timeout_cv;
|
|
kcondvar_t tx_exit_cv; /* wait for all threads to exit */
|
|
|
|
uint8_t tx_threads; /* number of threads */
|
|
uint8_t tx_exiting; /* set when we're exiting */
|
|
|
|
kthread_t *tx_sync_thread;
|
|
kthread_t *tx_quiesce_thread;
|
|
|
|
taskq_t *tx_commit_cb_taskq; /* commit callback taskq */
|
|
} tx_state_t;
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* _SYS_TXG_IMPL_H */
|