/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
#include "spdk/ftl.h"
#include "spdk/util.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "spdk/crc32.h"

#include "ftl_core.h"
#include "ftl_band.h"
#include "ftl_io.h"

struct ftl_restore_band {
	struct ftl_restore *parent;
	/* Associated band */
	struct ftl_band *band;
	/* Status of retrieving this band's metadata */
	enum ftl_md_status md_status;
	/* Padded queue link */
	STAILQ_ENTRY(ftl_restore_band) stailq;
};

struct ftl_nv_cache_restore;

/* Describes single phase to be restored from non-volatile cache */
struct ftl_nv_cache_range {
	struct ftl_nv_cache_restore *parent;
	/* Start offset */
	uint64_t start_addr;
	/* Last block's address */
	uint64_t last_addr;
	/*
	 * Number of blocks (can be smaller than the difference between the last
	 * and the starting block due to range overlap)
	 */
	uint64_t num_blocks;
	/* Number of blocks already recovered */
	uint64_t num_recovered;
	/* Current address during recovery */
	uint64_t current_addr;
	/* Phase of the range */
	unsigned int phase;
	/* Indicates whether the data from this range needs to be recovered */
	bool recovery;
};

struct ftl_nv_cache_block {
	struct ftl_nv_cache_restore *parent;
	/* Data buffer */
	void *buf;
	/* Metadata buffer */
	void *md_buf;
	/* Block offset within the cache */
	uint64_t offset;
};

#define FTL_NV_CACHE_RESTORE_DEPTH 128
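/*
 * Context of the non-volatile cache scan/recovery: the per-phase ranges discovered during
 * the scan plus the block buffers used for the in-flight reads.
 */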
struct ftl_nv_cache_restore {
	struct ftl_nv_cache *nv_cache;
	/* IO channel to use */
	struct spdk_io_channel *ioch;
	/*
	 * Non-volatile cache ranges. The ranges can overlap, as we have no
	 * control over the order of completions. The phase of the range is the
	 * index within the table. The range with index 0 marks blocks that were
	 * never written.
	 */
	struct ftl_nv_cache_range range[FTL_NV_CACHE_PHASE_COUNT];
	/* Non-volatile cache buffers */
	struct ftl_nv_cache_block block[FTL_NV_CACHE_RESTORE_DEPTH];
	/* Current address */
	uint64_t current_addr;
	/* Number of outstanding requests */
	size_t num_outstanding;
	/* Recovery/scan status */
	int status;
	/* Current phase of the recovery */
	unsigned int phase;
};

struct ftl_restore {
	struct spdk_ftl_dev *dev;
	/* Completion callback (called for each phase of the restoration) */
	ftl_restore_fn cb;
	/* Number of inflight IOs */
	unsigned int num_ios;
	/* Current band number (index in the below bands array) */
	unsigned int current;
	/* Array of bands */
	struct ftl_restore_band *bands;
	/* Queue of bands to be padded (due to unsafe shutdown) */
	STAILQ_HEAD(, ftl_restore_band) pad_bands;
	/* Status of the padding */
	int pad_status;
	/* Metadata buffer */
	void *md_buf;
	/* LBA map buffer */
	void *lba_map;
	/* Indicates we're in the final phase of the restoration */
	bool final_phase;
	/* Non-volatile cache recovery */
	struct ftl_nv_cache_restore nv_cache;
};

static int ftl_restore_tail_md(struct ftl_restore_band *rband);
static void ftl_pad_chunk_cb(struct ftl_io *io, void *arg, int status);
static void ftl_restore_pad_band(struct ftl_restore_band *rband);

static void
ftl_restore_free(struct ftl_restore *restore)
{
	unsigned int i;

	if (!restore) {
		return;
	}

	for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) {
		spdk_dma_free(restore->nv_cache.block[i].buf);
	}

	spdk_dma_free(restore->md_buf);
	free(restore->bands);
	free(restore);
}

static struct ftl_restore *
ftl_restore_init(struct spdk_ftl_dev *dev, ftl_restore_fn cb)
{
	struct ftl_restore *restore;
	struct ftl_restore_band *rband;
	size_t i;

	restore = calloc(1, sizeof(*restore));
	if (!restore) {
		goto error;
	}

	restore->dev = dev;
	restore->cb = cb;
	restore->final_phase = false;

	restore->bands = calloc(ftl_dev_num_bands(dev), sizeof(*restore->bands));
	if (!restore->bands) {
		goto error;
	}

	STAILQ_INIT(&restore->pad_bands);

	for (i = 0; i < ftl_dev_num_bands(dev); ++i) {
		rband = &restore->bands[i];
		rband->band = &dev->bands[i];
		rband->parent = restore;
		rband->md_status = FTL_MD_NO_MD;
	}

	/* Allocate buffer capable of holding head mds of all bands */
	restore->md_buf = spdk_dma_zmalloc(ftl_dev_num_bands(dev) * ftl_head_md_num_lbks(dev) *
					   FTL_BLOCK_SIZE, 0, NULL);
	if (!restore->md_buf) {
		goto error;
	}

	return restore;
error:
	ftl_restore_free(restore);
	return NULL;
}
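/*
 * Completes the current phase of the restore. On error the user callback receives a NULL
 * context; the restore structure is freed on error or after the final phase.
 */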
static void
ftl_restore_complete(struct ftl_restore *restore, int status)
{
	struct ftl_restore *ctx = status ? NULL : restore;
	bool final_phase = restore->final_phase;

	restore->cb(restore->dev, ctx, status);
	if (status || final_phase) {
		ftl_restore_free(restore);
	}
}

static int
ftl_band_cmp(const void *lband, const void *rband)
{
	uint64_t lseq = ((struct ftl_restore_band *)lband)->band->seq;
	uint64_t rseq = ((struct ftl_restore_band *)rband)->band->seq;

	if (lseq < rseq) {
		return -1;
	} else {
		return 1;
	}
}

static int
ftl_restore_check_seq(const struct ftl_restore *restore)
{
	const struct spdk_ftl_dev *dev = restore->dev;
	const struct ftl_restore_band *rband;
	const struct ftl_band *next_band;
	size_t i;

	for (i = 0; i < ftl_dev_num_bands(dev); ++i) {
		rband = &restore->bands[i];
		if (rband->md_status != FTL_MD_SUCCESS) {
			continue;
		}

		next_band = LIST_NEXT(rband->band, list_entry);
		if (next_band && rband->band->seq == next_band->seq) {
			return -1;
		}
	}

	return 0;
}

static bool
ftl_restore_head_valid(struct spdk_ftl_dev *dev, struct ftl_restore *restore, size_t *num_valid)
{
	struct ftl_restore_band *rband;
	size_t i;

	for (i = 0; i < ftl_dev_num_bands(dev); ++i) {
		rband = &restore->bands[i];

		if (rband->md_status != FTL_MD_SUCCESS &&
		    rband->md_status != FTL_MD_NO_MD &&
		    rband->md_status != FTL_MD_IO_FAILURE) {
			SPDK_ERRLOG("Inconsistent head metadata found on band %u\n",
				    rband->band->id);
			return false;
		}

		if (rband->md_status == FTL_MD_SUCCESS) {
			(*num_valid)++;
		}
	}

	return true;
}

static void
ftl_restore_head_complete(struct ftl_restore *restore)
{
	struct spdk_ftl_dev *dev = restore->dev;
	size_t num_valid = 0;
	int status = -EIO;

	if (!ftl_restore_head_valid(dev, restore, &num_valid)) {
		goto out;
	}

	if (num_valid == 0) {
		SPDK_ERRLOG("Couldn't find any valid bands\n");
		goto out;
	}

	/* Sort bands in sequence number ascending order */
	qsort(restore->bands, ftl_dev_num_bands(dev), sizeof(struct ftl_restore_band),
	      ftl_band_cmp);

	if (ftl_restore_check_seq(restore)) {
		SPDK_ERRLOG("Band sequence consistency failed\n");
		goto out;
	}

	dev->num_lbas = dev->global_md.num_lbas;
	status = 0;
out:
	ftl_restore_complete(restore, status);
}

static void
ftl_restore_head_cb(struct ftl_io *io, void *ctx, int status)
{
	struct ftl_restore_band *rband = ctx;
	struct ftl_restore *restore = rband->parent;
	unsigned int num_ios;

	rband->md_status = status;
	num_ios = __atomic_fetch_sub(&restore->num_ios, 1, __ATOMIC_SEQ_CST);
	assert(num_ios > 0);

	if (num_ios == 1) {
		ftl_restore_head_complete(restore);
	}
}

static int
ftl_restore_head_md(struct ftl_restore *restore)
{
	struct spdk_ftl_dev *dev = restore->dev;
	struct ftl_restore_band *rband;
	struct ftl_lba_map *lba_map;
	unsigned int num_failed = 0, num_ios;
	size_t i;

	restore->num_ios = ftl_dev_num_bands(dev);

	for (i = 0; i < ftl_dev_num_bands(dev); ++i) {
		rband = &restore->bands[i];
		lba_map = &rband->band->lba_map;

		lba_map->dma_buf = restore->md_buf + i * ftl_head_md_num_lbks(dev) * FTL_BLOCK_SIZE;

		if (ftl_band_read_head_md(rband->band, ftl_restore_head_cb, rband)) {
			if (spdk_likely(rband->band->num_chunks)) {
				SPDK_ERRLOG("Failed to read metadata on band %zu\n", i);

				rband->md_status = FTL_MD_INVALID_CRC;

				/* If the first IO fails, don't bother sending anything else */
				if (i == 0) {
					ftl_restore_free(restore);
					return -EIO;
				}
			}

			num_failed++;
		}
	}

	if (spdk_unlikely(num_failed > 0)) {
		num_ios = __atomic_fetch_sub(&restore->num_ios, num_failed, __ATOMIC_SEQ_CST);
		if (num_ios == num_failed) {
			ftl_restore_free(restore);
			return -EIO;
		}
	}

	return 0;
}
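/*
 * First phase of the restore: read the head metadata of every band so that valid bands can
 * be identified and ordered by their sequence numbers.
 */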
int
ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb)
{
	struct ftl_restore *restore;

	restore = ftl_restore_init(dev, cb);
	if (!restore) {
		return -ENOMEM;
	}

	return ftl_restore_head_md(restore);
}

static int
ftl_restore_l2p(struct ftl_band *band)
{
	struct spdk_ftl_dev *dev = band->dev;
	struct ftl_ppa ppa;
	uint64_t lba;
	size_t i;

	for (i = 0; i < ftl_num_band_lbks(band->dev); ++i) {
		if (!spdk_bit_array_get(band->lba_map.vld, i)) {
			continue;
		}

		lba = band->lba_map.map[i];
		if (lba >= dev->num_lbas) {
			return -1;
		}

		ppa = ftl_l2p_get(dev, lba);
		if (!ftl_ppa_invalid(ppa)) {
			ftl_invalidate_addr(dev, ppa);
		}

		ppa = ftl_band_ppa_from_lbkoff(band, i);

		ftl_band_set_addr(band, lba, ppa);
		ftl_l2p_set(dev, lba, ppa);
	}

	return 0;
}

static struct ftl_restore_band *
ftl_restore_next_band(struct ftl_restore *restore)
{
	struct ftl_restore_band *rband;

	for (; restore->current < ftl_dev_num_bands(restore->dev); ++restore->current) {
		rband = &restore->bands[restore->current];

		if (spdk_likely(rband->band->num_chunks) &&
		    rband->md_status == FTL_MD_SUCCESS) {
			restore->current++;
			return rband;
		}
	}

	return NULL;
}

static void
ftl_nv_cache_restore_complete(struct ftl_nv_cache_restore *restore, int status)
{
	struct ftl_restore *ftl_restore = SPDK_CONTAINEROF(restore, struct ftl_restore, nv_cache);

	restore->status = restore->status ? : status;
	if (restore->num_outstanding == 0) {
		ftl_restore_complete(ftl_restore, restore->status);
	}
}

static void ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

static void
ftl_nv_cache_restore_done(struct ftl_nv_cache_restore *restore, uint64_t current_addr)
{
	struct ftl_nv_cache *nv_cache = restore->nv_cache;

	pthread_spin_lock(&nv_cache->lock);
	nv_cache->current_addr = current_addr;
	nv_cache->ready = true;
	pthread_spin_unlock(&nv_cache->lock);

	SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Enabling non-volatile cache (phase: %u, addr: %"
		      PRIu64")\n", nv_cache->phase, current_addr);

	ftl_nv_cache_restore_complete(restore, 0);
}

static void
ftl_nv_cache_write_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ftl_nv_cache_restore *restore = cb_arg;

	spdk_bdev_free_io(bdev_io);
	if (spdk_unlikely(!success)) {
		SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n");
		ftl_nv_cache_restore_complete(restore, -EIO);
		return;
	}

	ftl_nv_cache_restore_done(restore, FTL_NV_CACHE_DATA_OFFSET);
}

static void
ftl_nv_cache_scrub_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ftl_nv_cache_restore *restore = cb_arg;
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
	int rc;

	spdk_bdev_free_io(bdev_io);
	if (spdk_unlikely(!success)) {
		SPDK_ERRLOG("Scrubbing non-volatile cache failed\n");
		ftl_nv_cache_restore_complete(restore, -EIO);
		return;
	}

	nv_cache->phase = 1;
	rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_write_header_cb, restore);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("Unable to write the non-volatile cache metadata header: %s\n",
			    spdk_strerror(-rc));
		ftl_nv_cache_restore_complete(restore, -EIO);
	}
}

static void
ftl_nv_cache_scrub_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ftl_nv_cache_restore *restore = cb_arg;
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
	int rc;

	spdk_bdev_free_io(bdev_io);
	if (spdk_unlikely(!success)) {
		SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n");
		ftl_nv_cache_restore_complete(restore, -EIO);
		return;
	}

	rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n", spdk_strerror(-rc));
		ftl_nv_cache_restore_complete(restore, rc);
	}
}
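/*
 * Scrub path: after the active bands are flushed, the header is rewritten with phase 0
 * (scrub in progress), the cache data is scrubbed and the header is written again with
 * phase 1, at which point the cache is ready to be used.
 */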
static void
ftl_nv_cache_band_flush_cb(void *ctx, int status)
{
	struct ftl_nv_cache_restore *restore = ctx;
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
	int rc;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Flushing active bands failed: %s\n", spdk_strerror(-status));
		ftl_nv_cache_restore_complete(restore, status);
		return;
	}

	/*
	 * Use phase 0 to indicate that the cache is being scrubbed. If the power is lost during
	 * this process, we'll know it needs to be resumed.
	 */
	nv_cache->phase = 0;
	rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_scrub_header_cb, restore);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n",
			    spdk_strerror(-rc));
		ftl_nv_cache_restore_complete(restore, rc);
	}
}
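/*
 * Called when all marked ranges have been recovered. If the scanned ranges look inconsistent
 * (more than two phases present or the two newest ranges overlap), the cache is scrubbed;
 * otherwise the cache's current address is set right past the latest range.
 */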
static void
ftl_nv_cache_recovery_done(struct ftl_nv_cache_restore *restore)
{
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
	struct ftl_nv_cache_range *range_prev, *range_current;
	struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache);
	struct spdk_bdev *bdev;
	uint64_t current_addr;
	int rc;

	range_prev = &restore->range[ftl_nv_cache_prev_phase(nv_cache->phase)];
	range_current = &restore->range[nv_cache->phase];
	bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);

	/*
	 * If there are more than two ranges or the ranges overlap, scrub the non-volatile cache to
	 * make sure that any subsequent power loss will find the cache in usable state
	 */
	if ((range_prev->num_blocks + range_current->num_blocks < nv_cache->num_data_blocks) ||
	    (range_prev->start_addr < range_current->last_addr &&
	     range_current->start_addr < range_prev->last_addr)) {
		SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache inconsistency detected\n");

		rc = ftl_flush_active_bands(dev, ftl_nv_cache_band_flush_cb, restore);
		if (spdk_unlikely(rc != 0)) {
			SPDK_ERRLOG("Unable to flush active bands: %s\n", spdk_strerror(-rc));
			ftl_nv_cache_restore_complete(restore, rc);
		}

		return;
	}

	/* The latest phase is the one written in the header (set in nv_cache->phase) */
	current_addr = range_current->last_addr + 1;

	/*
	 * The first range might be empty (only the header was written) or the range might
	 * end at the last available address, in which case set current address to the
	 * beginning of the device.
	 */
	if (range_current->num_blocks == 0 || current_addr >= spdk_bdev_get_num_blocks(bdev)) {
		current_addr = FTL_NV_CACHE_DATA_OFFSET;
	}

	ftl_nv_cache_restore_done(restore, current_addr);
}

static void
ftl_nv_cache_recover_block(struct ftl_nv_cache_block *block)
{
	struct ftl_nv_cache_restore *restore = block->parent;
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
	struct ftl_nv_cache_range *range = &restore->range[restore->phase];
	int rc;

	assert(range->current_addr <= range->last_addr);

	restore->num_outstanding++;
	block->offset = range->current_addr++;
	rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch,
					   block->buf, block->md_buf, block->offset, 1,
					   ftl_nv_cache_block_read_cb, block);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n",
			    block->offset, spdk_strerror(-rc));
		restore->num_outstanding--;
		ftl_nv_cache_restore_complete(restore, rc);
	}
}

static void
ftl_nv_cache_recover_range(struct ftl_nv_cache_restore *restore)
{
	struct ftl_nv_cache_range *range;
	unsigned int phase = restore->phase;

	do {
		/* Find first range with non-zero number of blocks that is marked for recovery */
		range = &restore->range[phase];
		if (range->recovery && range->num_recovered < range->num_blocks) {
			break;
		}

		phase = ftl_nv_cache_next_phase(phase);
	} while (phase != restore->phase);

	/* There are no ranges to be recovered, we're done */
	if (range->num_recovered == range->num_blocks || !range->recovery) {
		SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache recovery done\n");
		ftl_nv_cache_recovery_done(restore);
		return;
	}

	range->current_addr = range->start_addr;
	restore->phase = phase;

	SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Recovering range %u %"PRIu64"-%"PRIu64" (%"PRIu64")\n",
		      phase, range->start_addr, range->last_addr, range->num_blocks);

	ftl_nv_cache_recover_block(&restore->block[0]);
}

static void
ftl_nv_cache_write_cb(struct ftl_io *io, void *cb_arg, int status)
{
	struct ftl_nv_cache_block *block = cb_arg;
	struct ftl_nv_cache_restore *restore = block->parent;
	struct ftl_nv_cache_range *range = &restore->range[restore->phase];

	restore->num_outstanding--;
	if (status != 0) {
		SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n",
			    block->offset, spdk_strerror(-status));
		ftl_nv_cache_restore_complete(restore, -ENOMEM);
		return;
	}

	range->num_recovered++;
	if (range->current_addr <= range->last_addr) {
		ftl_nv_cache_recover_block(block);
	} else if (restore->num_outstanding == 0) {
		assert(range->num_recovered == range->num_blocks);
		ftl_nv_cache_recover_range(restore);
	}
}

static struct ftl_io *
ftl_nv_cache_alloc_io(struct ftl_nv_cache_block *block, uint64_t lba)
{
	struct ftl_restore *restore = SPDK_CONTAINEROF(block->parent, struct ftl_restore, nv_cache);
	struct ftl_io_init_opts opts = {
		.dev		= restore->dev,
		.io		= NULL,
		.flags		= FTL_IO_BYPASS_CACHE,
		.type		= FTL_IO_WRITE,
		.lbk_cnt	= 1,
		.cb_fn		= ftl_nv_cache_write_cb,
		.cb_ctx		= block,
		.data		= block->buf,
	};
	struct ftl_io *io;

	io = ftl_io_init_internal(&opts);
	if (spdk_unlikely(!io)) {
		return NULL;
	}

	io->lba.single = lba;
	return io;
}
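/*
 * Completion of a single block read during recovery. Blocks belonging to the phase currently
 * being recovered are rewritten through the FTL (with FTL_IO_BYPASS_CACHE set), all others
 * are skipped.
 */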
static void
ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ftl_nv_cache_block *block = cb_arg;
	struct ftl_nv_cache_restore *restore = block->parent;
	struct ftl_nv_cache_range *range = &restore->range[restore->phase];
	struct ftl_io *io;
	unsigned int phase;
	uint64_t lba;

	spdk_bdev_free_io(bdev_io);
	restore->num_outstanding--;

	if (!success) {
		SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64"\n",
			    block->offset);
		ftl_nv_cache_restore_complete(restore, -EIO);
		return;
	}

	ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase);
	if (spdk_unlikely(phase != restore->phase)) {
		if (range->current_addr < range->last_addr) {
			ftl_nv_cache_recover_block(block);
		} else if (restore->num_outstanding == 0) {
			ftl_nv_cache_recover_range(restore);
		}

		return;
	}

	io = ftl_nv_cache_alloc_io(block, lba);
	if (spdk_unlikely(!io)) {
		SPDK_ERRLOG("Failed to allocate ftl_io during non-volatile cache recovery\n");
		ftl_nv_cache_restore_complete(restore, -ENOMEM);
		return;
	}

	restore->num_outstanding++;
	ftl_io_write(io);
}

/*
 * Since we have no control over the order in which the requests complete in regards to their
 * submission, the cache can be in either of the following states:
 *  - [1 1 1 1 1 1 1 1 1 1]: simplest case, whole cache contains single phase (although it should
 *    be very rare),
 *  - [1 1 1 1 3 3 3 3 3 3]: two phases, changing somewhere in the middle with no overlap. This is
 *    the state left by clean shutdown,
 *  - [1 1 1 1 3 1 3 3 3 3]: similar to the above, but this time the two ranges overlap. This
 *    happens when completions are reordered during unsafe shutdown,
 *  - [2 1 2 1 1 1 1 3 1 3]: three different phases, each one of which can overlap with
 *    previous/next one. The data from the oldest phase doesn't need to be recovered, as it was
 *    already being overwritten, which means it's already on the main storage.
 */
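/* Called once the initial scan of the whole cache is complete - selects the ranges to recover */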
static void
ftl_nv_cache_scan_done(struct ftl_nv_cache_restore *restore)
{
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
#if defined(DEBUG)
	struct ftl_nv_cache_range *range;
	uint64_t i, num_blocks = 0;

	for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) {
		range = &restore->range[i];
		SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Range %"PRIu64": %"PRIu64"-%"PRIu64" (%" PRIu64
			      ")\n", i, range->start_addr, range->last_addr, range->num_blocks);
		num_blocks += range->num_blocks;
	}
	assert(num_blocks == nv_cache->num_data_blocks);
#endif
	restore->phase = ftl_nv_cache_prev_phase(nv_cache->phase);

	/*
	 * Only the latest two phases need to be recovered. The third one, even if present,
	 * already has to be stored on the main storage, as it's already started to be
	 * overwritten (only present here because of reordering of requests' completions).
	 */
	restore->range[nv_cache->phase].recovery = true;
	restore->range[restore->phase].recovery = true;

	ftl_nv_cache_recover_range(restore);
}

static int ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block);

static void
ftl_nv_cache_scan_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ftl_nv_cache_block *block = cb_arg;
	struct ftl_nv_cache_restore *restore = block->parent;
	struct ftl_nv_cache_range *range;
	struct spdk_bdev *bdev;
	unsigned int phase;
	uint64_t lba;

	restore->num_outstanding--;
	bdev = spdk_bdev_desc_get_bdev(restore->nv_cache->bdev_desc);
	spdk_bdev_free_io(bdev_io);

	if (!success) {
		SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64"\n",
			    block->offset);
		ftl_nv_cache_restore_complete(restore, -EIO);
		return;
	}

	/* If we've already hit an error, don't bother with scanning anything else */
	if (spdk_unlikely(restore->status != 0)) {
		ftl_nv_cache_restore_complete(restore, restore->status);
		return;
	}

	ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase);
	range = &restore->range[phase];
	range->num_blocks++;

	if (range->start_addr == FTL_LBA_INVALID || range->start_addr > block->offset) {
		range->start_addr = block->offset;
	}

	if (range->last_addr == FTL_LBA_INVALID || range->last_addr < block->offset) {
		range->last_addr = block->offset;
	}

	/* All the blocks have been read; once they all complete, we're finished */
	if (restore->current_addr == spdk_bdev_get_num_blocks(bdev)) {
		if (restore->num_outstanding == 0) {
			ftl_nv_cache_scan_done(restore);
		}

		return;
	}

	ftl_nv_cache_scan_block(block);
}

static int
ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block)
{
	struct ftl_nv_cache_restore *restore = block->parent;
	struct ftl_nv_cache *nv_cache = restore->nv_cache;
	int rc;

	restore->num_outstanding++;
	block->offset = restore->current_addr++;
	rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch,
					   block->buf, block->md_buf, block->offset, 1,
					   ftl_nv_cache_scan_cb, block);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64" (%s)\n",
			    block->offset, spdk_strerror(-rc));
		restore->num_outstanding--;
		ftl_nv_cache_restore_complete(restore, rc);
		return rc;
	}

	return 0;
}

static void
ftl_nv_cache_clean_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ftl_nv_cache_restore *restore = cb_arg;

	spdk_bdev_free_io(bdev_io);
	if (spdk_unlikely(!success)) {
		SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n");
		ftl_nv_cache_restore_complete(restore, -EIO);
		return;
	}

	ftl_nv_cache_restore_done(restore, restore->current_addr);
}
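/* Verifies the cache metadata header: checksum, version, size, UUID, phase and current address */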
static bool
ftl_nv_cache_header_valid(struct spdk_ftl_dev *dev, const struct ftl_nv_cache_header *hdr)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc);
	uint32_t checksum;

	checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0);
	if (checksum != hdr->checksum) {
		SPDK_ERRLOG("Invalid header checksum (found: %"PRIu32", expected: %"PRIu32")\n",
			    checksum, hdr->checksum);
		return false;
	}

	if (hdr->version != FTL_NV_CACHE_HEADER_VERSION) {
		SPDK_ERRLOG("Invalid header version (found: %"PRIu32", expected: %"PRIu32")\n",
			    hdr->version, FTL_NV_CACHE_HEADER_VERSION);
		return false;
	}

	if (hdr->size != spdk_bdev_get_num_blocks(bdev)) {
		SPDK_ERRLOG("Unexpected size of the non-volatile cache bdev (%"PRIu64", expected: %"
			    PRIu64")\n", hdr->size, spdk_bdev_get_num_blocks(bdev));
		return false;
	}

	if (spdk_uuid_compare(&hdr->uuid, &dev->uuid)) {
		SPDK_ERRLOG("Invalid device UUID\n");
		return false;
	}

	if (!ftl_nv_cache_phase_is_valid(hdr->phase) && hdr->phase != 0) {
		return false;
	}

	if ((hdr->current_addr >= spdk_bdev_get_num_blocks(bdev) ||
	     hdr->current_addr < FTL_NV_CACHE_DATA_OFFSET) &&
	    (hdr->current_addr != FTL_LBA_INVALID)) {
		SPDK_ERRLOG("Unexpected value of non-volatile cache's current address: %"PRIu64"\n",
			    hdr->current_addr);
		return false;
	}

	return true;
}

static void
ftl_nv_cache_read_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct ftl_restore *restore = cb_arg;
	struct spdk_ftl_dev *dev = restore->dev;
	struct ftl_nv_cache *nv_cache = &dev->nv_cache;
	struct ftl_nv_cache_header *hdr;
	struct iovec *iov = NULL;
	int iov_cnt = 0, i, rc;

	if (!success) {
		SPDK_ERRLOG("Unable to read non-volatile cache metadata header\n");
		ftl_restore_complete(restore, -ENOTRECOVERABLE);
		goto out;
	}

	spdk_bdev_io_get_iovec(bdev_io, &iov, &iov_cnt);
	assert(iov != NULL);
	hdr = iov[0].iov_base;

	if (!ftl_nv_cache_header_valid(dev, hdr)) {
		ftl_restore_complete(restore, -ENOTRECOVERABLE);
		goto out;
	}

	/* Remember the latest phase */
	nv_cache->phase = hdr->phase;

	/*
	 * If the phase equals zero, we lost power during recovery. We need to finish it up
	 * by scrubbing the device once again.
	 */
	if (hdr->phase == 0) {
		SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Detected phase 0, restarting scrub\n");
		rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore);
		if (spdk_unlikely(rc != 0)) {
			SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n",
				    spdk_strerror(-rc));
			ftl_restore_complete(restore, -ENOTRECOVERABLE);
		}

		goto out;
	}

	/*
	 * Valid current_addr means that the shutdown was clean, so we just need to overwrite the
	 * header to make sure that any power loss occurring before the cache is wrapped won't be
	 * mistaken for a clean shutdown.
	 */
	if (hdr->current_addr != FTL_LBA_INVALID) {
		restore->nv_cache.current_addr = hdr->current_addr;

		rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_clean_header_cb,
					       &restore->nv_cache);
		if (spdk_unlikely(rc != 0)) {
			SPDK_ERRLOG("Failed to overwrite the non-volatile cache header: %s\n",
				    spdk_strerror(-rc));
			ftl_restore_complete(restore, -ENOTRECOVERABLE);
		}

		goto out;
	}

	/* Otherwise the shutdown was unexpected, so we need to recover the data from the cache */
	restore->nv_cache.current_addr = FTL_NV_CACHE_DATA_OFFSET;

	for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) {
		if (ftl_nv_cache_scan_block(&restore->nv_cache.block[i])) {
			break;
		}
	}
out:
	spdk_bdev_free_io(bdev_io);
}
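/*
 * Entry point of the non-volatile cache restore. Allocates the block buffers, resets the
 * per-phase ranges and reads the metadata header, which determines whether the cache needs
 * to be scrubbed, cleanly reopened or fully recovered.
 */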
void
ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb)
{
	struct spdk_ftl_dev *dev = restore->dev;
	struct spdk_bdev *bdev;
	struct ftl_nv_cache *nv_cache = &dev->nv_cache;
	struct ftl_io_channel *ioch;
	struct ftl_nv_cache_restore *nvc_restore = &restore->nv_cache;
	struct ftl_nv_cache_block *block;
	size_t alignment;
	int rc, i;

	ioch = spdk_io_channel_get_ctx(dev->ioch);
	bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
	alignment = spdk_max(spdk_bdev_get_buf_align(bdev), sizeof(uint64_t));

	nvc_restore->nv_cache = nv_cache;
	nvc_restore->ioch = ioch->cache_ioch;

	restore->final_phase = true;
	restore->cb = cb;

	for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) {
		block = &nvc_restore->block[i];
		block->parent = nvc_restore;
		block->buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev) +
					      spdk_bdev_get_md_size(bdev),
					      alignment, NULL);
		if (!block->buf) {
			/* The memory will be freed in ftl_restore_free */
			SPDK_ERRLOG("Unable to allocate memory\n");
			ftl_restore_complete(restore, -ENOMEM);
			return;
		}

		block->md_buf = (char *)block->buf + spdk_bdev_get_block_size(bdev);
	}

	for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) {
		nvc_restore->range[i].parent = nvc_restore;
		nvc_restore->range[i].start_addr = FTL_LBA_INVALID;
		nvc_restore->range[i].last_addr = FTL_LBA_INVALID;
		nvc_restore->range[i].num_blocks = 0;
		nvc_restore->range[i].recovery = false;
		nvc_restore->range[i].phase = i;
	}

	rc = spdk_bdev_read_blocks(nv_cache->bdev_desc, ioch->cache_ioch, nv_cache->dma_buf,
				   0, FTL_NV_CACHE_DATA_OFFSET, ftl_nv_cache_read_header_cb,
				   restore);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("Failed to read non-volatile cache metadata header: %s\n",
			    spdk_strerror(-rc));
		ftl_restore_complete(restore, rc);
	}
}

static bool
ftl_pad_chunk_pad_finish(struct ftl_restore_band *rband, bool direct_access)
{
	struct ftl_restore *restore = rband->parent;
	struct ftl_restore_band *next_band;
	size_t i, num_pad_chunks = 0;

	if (spdk_unlikely(restore->pad_status && !restore->num_ios)) {
		if (direct_access) {
			/* In case of any errors found we want to clear direct access. */
			/* Direct access bands have their own allocated md, which would be lost */
			/* on restore complete otherwise. */
			rband->band->state = FTL_BAND_STATE_CLOSED;
			ftl_band_set_direct_access(rband->band, false);
		}
		ftl_restore_complete(restore, restore->pad_status);
		return true;
	}

	for (i = 0; i < rband->band->num_chunks; ++i) {
		if (rband->band->chunk_buf[i].state != FTL_CHUNK_STATE_CLOSED) {
			num_pad_chunks++;
		}
	}

	/* Finished all chunks in a band, check if all bands are done */
	if (num_pad_chunks == 0) {
		if (direct_access) {
			rband->band->state = FTL_BAND_STATE_CLOSED;
			ftl_band_set_direct_access(rband->band, false);
		}

		next_band = STAILQ_NEXT(rband, stailq);
		if (!next_band) {
			ftl_restore_complete(restore, restore->pad_status);
			return true;
		} else {
			/* Start off padding in the next band */
			ftl_restore_pad_band(next_band);
			return true;
		}
	}

	return false;
}

static struct ftl_io *
ftl_restore_init_pad_io(struct ftl_restore_band *rband, void *buffer, struct ftl_ppa ppa)
{
	struct ftl_band *band = rband->band;
	struct spdk_ftl_dev *dev = band->dev;
	int flags = FTL_IO_PAD | FTL_IO_INTERNAL | FTL_IO_PPA_MODE | FTL_IO_MD |
		    FTL_IO_DIRECT_ACCESS;
	struct ftl_io_init_opts opts = {
		.dev		= dev,
		.io		= NULL,
		.rwb_batch	= NULL,
		.band		= band,
		.size		= sizeof(struct ftl_io),
		.flags		= flags,
		.type		= FTL_IO_WRITE,
		.lbk_cnt	= dev->xfer_size,
		.cb_fn		= ftl_pad_chunk_cb,
		.cb_ctx		= rband,
		.data		= buffer,
		.parent		= NULL,
	};
	struct ftl_io *io;

	io = ftl_io_init_internal(&opts);
	if (spdk_unlikely(!io)) {
		return NULL;
	}

	io->ppa = ppa;
	rband->parent->num_ios++;

	return io;
}

static void
ftl_pad_chunk_cb(struct ftl_io *io, void *arg, int status)
{
	struct ftl_restore_band *rband = arg;
	struct ftl_restore *restore = rband->parent;
	struct ftl_band *band = io->band;
	struct ftl_chunk *chunk;
	struct ftl_io *new_io;

	restore->num_ios--;
	/* TODO check for next unit error vs early close error */
	if (status) {
		restore->pad_status = status;
		goto end;
	}

	if (io->ppa.lbk + io->lbk_cnt == band->dev->geo.clba) {
		chunk = ftl_band_chunk_from_ppa(band, io->ppa);
		chunk->state = FTL_CHUNK_STATE_CLOSED;
	} else {
		struct ftl_ppa ppa = io->ppa;
		ppa.lbk += io->lbk_cnt;
		new_io = ftl_restore_init_pad_io(rband, io->iov[0].iov_base, ppa);
		if (spdk_unlikely(!new_io)) {
			restore->pad_status = -ENOMEM;
			goto end;
		}

		ftl_io_write(new_io);
		return;
	}

end:
	spdk_dma_free(io->iov[0].iov_base);
	ftl_pad_chunk_pad_finish(rband, true);
}
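/*
 * Pads all open chunks of a band with zeroed data so that they can be closed. Used when the
 * tail metadata couldn't be read after an unsafe shutdown and allow_open_bands is set.
 */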
static void
ftl_restore_pad_band(struct ftl_restore_band *rband)
{
	struct spdk_ocssd_chunk_information_entry info;
	struct ftl_restore *restore = rband->parent;
	struct ftl_band *band = rband->band;
	struct spdk_ftl_dev *dev = band->dev;
	void *buffer = NULL;
	struct ftl_io *io;
	struct ftl_ppa ppa;
	size_t i;
	int rc = 0;

	/* Check if some chunks are not closed */
	if (ftl_pad_chunk_pad_finish(rband, false)) {
		/*
		 * If we're here, end meta wasn't recognized, but the whole band is written.
		 * Assume the band was padded and ignore it.
		 */
		return;
	}

	band->state = FTL_BAND_STATE_OPEN;
	rc = ftl_band_set_direct_access(band, true);
	if (rc) {
		ftl_restore_complete(restore, rc);
		return;
	}

	for (i = 0; i < band->num_chunks; ++i) {
		if (band->chunk_buf[i].state == FTL_CHUNK_STATE_CLOSED) {
			continue;
		}

		rc = ftl_retrieve_chunk_info(dev, band->chunk_buf[i].start_ppa, &info, 1);
		if (spdk_unlikely(rc)) {
			goto error;
		}

		ppa = band->chunk_buf[i].start_ppa;
		ppa.lbk = info.wp;

		buffer = spdk_dma_zmalloc(FTL_BLOCK_SIZE * dev->xfer_size, 0, NULL);
		if (spdk_unlikely(!buffer)) {
			rc = -ENOMEM;
			goto error;
		}

		io = ftl_restore_init_pad_io(rband, buffer, ppa);
		if (spdk_unlikely(!io)) {
			rc = -ENOMEM;
			spdk_dma_free(buffer);
			goto error;
		}

		ftl_io_write(io);
	}

	return;

error:
	restore->pad_status = rc;
	ftl_pad_chunk_pad_finish(rband, true);
}

static void
ftl_restore_pad_open_bands(void *ctx)
{
	struct ftl_restore *restore = ctx;

	ftl_restore_pad_band(STAILQ_FIRST(&restore->pad_bands));
}

static void
ftl_restore_tail_md_cb(struct ftl_io *io, void *ctx, int status)
{
	struct ftl_restore_band *rband = ctx;
	struct ftl_restore *restore = rband->parent;
	struct spdk_ftl_dev *dev = restore->dev;

	if (status) {
		if (!dev->conf.allow_open_bands) {
			SPDK_ERRLOG("%s while restoring tail md in band %u.\n",
				    spdk_strerror(-status), rband->band->id);
			ftl_band_release_lba_map(rband->band);
			ftl_restore_complete(restore, status);
			return;
		} else {
			SPDK_ERRLOG("%s while restoring tail md. Will attempt to pad band %u.\n",
				    spdk_strerror(-status), rband->band->id);
			STAILQ_INSERT_TAIL(&restore->pad_bands, rband, stailq);
		}
	}

	if (!status && ftl_restore_l2p(rband->band)) {
		ftl_band_release_lba_map(rband->band);
		ftl_restore_complete(restore, -ENOTRECOVERABLE);
		return;
	}

	ftl_band_release_lba_map(rband->band);

	rband = ftl_restore_next_band(restore);
	if (!rband) {
		if (!STAILQ_EMPTY(&restore->pad_bands)) {
			spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_pad_open_bands,
					     restore);
		} else {
			ftl_restore_complete(restore, 0);
		}

		return;
	}

	ftl_restore_tail_md(rband);
}

static int
ftl_restore_tail_md(struct ftl_restore_band *rband)
{
	struct ftl_restore *restore = rband->parent;
	struct ftl_band *band = rband->band;

	if (ftl_band_alloc_lba_map(band)) {
		SPDK_ERRLOG("Failed to allocate lba map\n");
		ftl_restore_complete(restore, -ENOMEM);
		return -ENOMEM;
	}

	if (ftl_band_read_tail_md(band, band->tail_md_ppa, ftl_restore_tail_md_cb, rband)) {
		SPDK_ERRLOG("Failed to send tail metadata read\n");
		ftl_restore_complete(restore, -EIO);
		return -EIO;
	}

	return 0;
}

int
ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb)
{
	struct spdk_ftl_dev *dev = restore->dev;
	struct ftl_restore_band *rband;

	restore->current = 0;
	restore->cb = cb;
	restore->final_phase = dev->nv_cache.bdev_desc == NULL;

	/* If restore_device is called, there must be at least one valid band */
	rband = ftl_restore_next_band(restore);
	assert(rband);
	return ftl_restore_tail_md(rband);
}