0dc2f70c5c
Device removal allocates a new location for each allocated segment on the disk that's being removed. Each allocation results in one entry in the mapping table, which maps from old location + length to new location. When a fragmented disk is removed, this can result in a large number of mapping entries, and thus a large amount of memory consumed by the mapping table. In the worst real-world cases, we've seen around 1GB of RAM per 1TB of storage removed. We can improve on this situation by allocating larger segments, which span across both allocated and free regions of the device being removed. By including free regions in the allocation (and thus mapping), we reduce the number of mapping entries. For example, if we have a 4K allocation followed by 1K free and then 4K allocated, we would allocate 4+1+4 = 9KB, and then move the entire region (including allocated and free parts). In this case we used one mapping where previously we would have used two, but often the ratio is much higher (up to 20:1 in real-world use). We then need to mark the regions that were free on the removing device as free in the new locations, and also obsolete in the mapping entry. This method preserves the fragmentation of the removing device, rather than consolidating its allocated space into a small number of chunks where possible. But it results in drastic reduction of memory used by the mapping table - around 20x in the most-fragmented cases. In the most fragmented real-world cases, this reduces memory used by the mapping from ~1GB to ~50MB of RAM per 1TB of storage removed. Less fragmented cases will typically also see around 50-100MB of RAM per 1TB of storage. 
Porting notes:
* Add the following as module parameters:
  * zfs_condense_indirect_vdevs_enable
  * zfs_condense_max_obsolete_bytes
* Document the following module parameters:
  * zfs_condense_indirect_vdevs_enable
  * zfs_condense_max_obsolete_bytes
  * zfs_condense_min_mapping_bytes

Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9486
OpenZFS-commit: https://github.com/ahrens/illumos/commit/07152e142e44c
External-issue: DLPX-57962
Closes #7536
97 lines
2.6 KiB
C
97 lines
2.6 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* This file and its contents are supplied under the terms of the
|
|
* Common Development and Distribution License ("CDDL"), version 1.0.
|
|
* You may only use this file in accordance with the terms of version
|
|
* 1.0 of the CDDL.
|
|
*
|
|
* A full copy of the text of the CDDL should have accompanied this
|
|
* source. A copy of the CDDL is also available via the Internet at
|
|
* http://www.illumos.org/license/CDDL.
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2014, 2015 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
#ifndef _SYS_VDEV_REMOVAL_H
|
|
#define _SYS_VDEV_REMOVAL_H
|
|
|
|
#include <sys/spa.h>
|
|
#include <sys/bpobj.h>
|
|
#include <sys/vdev_indirect_mapping.h>
|
|
#include <sys/vdev_indirect_births.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
typedef struct spa_vdev_removal {
|
|
uint64_t svr_vdev_id;
|
|
uint64_t svr_max_offset_to_sync[TXG_SIZE];
|
|
/* Thread performing a vdev removal. */
|
|
kthread_t *svr_thread;
|
|
/* Segments left to copy from the current metaslab. */
|
|
range_tree_t *svr_allocd_segs;
|
|
kmutex_t svr_lock;
|
|
kcondvar_t svr_cv;
|
|
boolean_t svr_thread_exit;
|
|
|
|
/*
|
|
* New mappings to write out each txg.
|
|
*/
|
|
list_t svr_new_segments[TXG_SIZE];
|
|
|
|
/*
|
|
* Ranges that were freed while a mapping was in flight. This is
|
|
* a subset of the ranges covered by vdev_im_new_segments.
|
|
*/
|
|
range_tree_t *svr_frees[TXG_SIZE];
|
|
|
|
/*
|
|
* Number of bytes which we have finished our work for
|
|
* in each txg. This could be data copied (which will be part of
|
|
* the mappings in vdev_im_new_segments), or data freed before
|
|
* we got around to copying it.
|
|
*/
|
|
uint64_t svr_bytes_done[TXG_SIZE];
|
|
|
|
/* List of leaf zap objects to be unlinked */
|
|
nvlist_t *svr_zaplist;
|
|
} spa_vdev_removal_t;
|
|
|
|
typedef struct spa_condensing_indirect {
|
|
/*
|
|
* New mappings to write out each txg.
|
|
*/
|
|
list_t sci_new_mapping_entries[TXG_SIZE];
|
|
|
|
vdev_indirect_mapping_t *sci_new_mapping;
|
|
} spa_condensing_indirect_t;
|
|
|
|
extern int spa_remove_init(spa_t *);
|
|
extern void spa_restart_removal(spa_t *);
|
|
extern int spa_condense_init(spa_t *);
|
|
extern void spa_condense_fini(spa_t *);
|
|
extern void spa_start_indirect_condensing_thread(spa_t *);
|
|
extern void spa_vdev_condense_suspend(spa_t *);
|
|
extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
|
|
extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);
|
|
extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
|
|
extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
|
|
extern void spa_vdev_remove_suspend(spa_t *);
|
|
extern int spa_vdev_remove_cancel(spa_t *);
|
|
extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
|
|
|
|
extern int vdev_removal_max_span;
|
|
extern int zfs_remove_max_segment;
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* _SYS_VDEV_REMOVAL_H */
|