ac2fffa4b7
Uses of mallocarray(9). The use of mallocarray(9) has rocketed the required swap to build FreeBSD. This is likely caused by the allocation size attributes which put extra pressure on the compiler. Given that most of these checks are superfluous we have to choose better where to use mallocarray(9). We still have more uses of mallocarray(9) but hopefully this is enough to bring swap usage to a reasonable level. Reported by: wosch PR: 225197
585 lines
14 KiB
C
585 lines
14 KiB
C
/*-
|
|
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
|
*
|
|
* Copyright (C) 2012-2013 Intel Corporation
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/bio.h>
|
|
#include <sys/bus.h>
|
|
#include <sys/conf.h>
|
|
#include <sys/disk.h>
|
|
#include <sys/fcntl.h>
|
|
#include <sys/ioccom.h>
|
|
#include <sys/malloc.h>
|
|
#include <sys/module.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/systm.h>
|
|
|
|
#include <dev/pci/pcivar.h>
|
|
|
|
#include <geom/geom.h>
|
|
|
|
#include "nvme_private.h"
|
|
|
|
static void nvme_bio_child_inbed(struct bio *parent, int bio_error);
|
|
static void nvme_bio_child_done(void *arg,
|
|
const struct nvme_completion *cpl);
|
|
static uint32_t nvme_get_num_segments(uint64_t addr, uint64_t size,
|
|
uint32_t alignment);
|
|
static void nvme_free_child_bios(int num_bios,
|
|
struct bio **child_bios);
|
|
static struct bio ** nvme_allocate_child_bios(int num_bios);
|
|
static struct bio ** nvme_construct_child_bios(struct bio *bp,
|
|
uint32_t alignment,
|
|
int *num_bios);
|
|
static int nvme_ns_split_bio(struct nvme_namespace *ns,
|
|
struct bio *bp,
|
|
uint32_t alignment);
|
|
|
|
static int
|
|
nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
|
|
struct thread *td)
|
|
{
|
|
struct nvme_namespace *ns;
|
|
struct nvme_controller *ctrlr;
|
|
struct nvme_pt_command *pt;
|
|
|
|
ns = cdev->si_drv1;
|
|
ctrlr = ns->ctrlr;
|
|
|
|
switch (cmd) {
|
|
case NVME_IO_TEST:
|
|
case NVME_BIO_TEST:
|
|
nvme_ns_test(ns, cmd, arg);
|
|
break;
|
|
case NVME_PASSTHROUGH_CMD:
|
|
pt = (struct nvme_pt_command *)arg;
|
|
return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, ns->id,
|
|
1 /* is_user_buffer */, 0 /* is_admin_cmd */));
|
|
case DIOCGMEDIASIZE:
|
|
*(off_t *)arg = (off_t)nvme_ns_get_size(ns);
|
|
break;
|
|
case DIOCGSECTORSIZE:
|
|
*(u_int *)arg = nvme_ns_get_sector_size(ns);
|
|
break;
|
|
default:
|
|
return (ENOTTY);
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
nvme_ns_open(struct cdev *dev __unused, int flags, int fmt __unused,
|
|
struct thread *td)
|
|
{
|
|
int error = 0;
|
|
|
|
if (flags & FWRITE)
|
|
error = securelevel_gt(td->td_ucred, 0);
|
|
|
|
return (error);
|
|
}
|
|
|
|
static int
|
|
nvme_ns_close(struct cdev *dev __unused, int flags, int fmt __unused,
|
|
struct thread *td)
|
|
{
|
|
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
nvme_ns_strategy_done(void *arg, const struct nvme_completion *cpl)
|
|
{
|
|
struct bio *bp = arg;
|
|
|
|
/*
|
|
* TODO: add more extensive translation of NVMe status codes
|
|
* to different bio error codes (i.e. EIO, EINVAL, etc.)
|
|
*/
|
|
if (nvme_completion_is_error(cpl)) {
|
|
bp->bio_error = EIO;
|
|
bp->bio_flags |= BIO_ERROR;
|
|
bp->bio_resid = bp->bio_bcount;
|
|
} else
|
|
bp->bio_resid = 0;
|
|
|
|
biodone(bp);
|
|
}
|
|
|
|
static void
|
|
nvme_ns_strategy(struct bio *bp)
|
|
{
|
|
struct nvme_namespace *ns;
|
|
int err;
|
|
|
|
ns = bp->bio_dev->si_drv1;
|
|
err = nvme_ns_bio_process(ns, bp, nvme_ns_strategy_done);
|
|
|
|
if (err) {
|
|
bp->bio_error = err;
|
|
bp->bio_flags |= BIO_ERROR;
|
|
bp->bio_resid = bp->bio_bcount;
|
|
biodone(bp);
|
|
}
|
|
|
|
}
|
|
|
|
static struct cdevsw nvme_ns_cdevsw = {
|
|
.d_version = D_VERSION,
|
|
.d_flags = D_DISK,
|
|
.d_read = physread,
|
|
.d_write = physwrite,
|
|
.d_open = nvme_ns_open,
|
|
.d_close = nvme_ns_close,
|
|
.d_strategy = nvme_ns_strategy,
|
|
.d_ioctl = nvme_ns_ioctl
|
|
};
|
|
|
|
uint32_t
|
|
nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns)
|
|
{
|
|
return ns->ctrlr->max_xfer_size;
|
|
}
|
|
|
|
uint32_t
|
|
nvme_ns_get_sector_size(struct nvme_namespace *ns)
|
|
{
|
|
return (1 << ns->data.lbaf[ns->data.flbas.format].lbads);
|
|
}
|
|
|
|
uint64_t
|
|
nvme_ns_get_num_sectors(struct nvme_namespace *ns)
|
|
{
|
|
return (ns->data.nsze);
|
|
}
|
|
|
|
uint64_t
|
|
nvme_ns_get_size(struct nvme_namespace *ns)
|
|
{
|
|
return (nvme_ns_get_num_sectors(ns) * nvme_ns_get_sector_size(ns));
|
|
}
|
|
|
|
uint32_t
|
|
nvme_ns_get_flags(struct nvme_namespace *ns)
|
|
{
|
|
return (ns->flags);
|
|
}
|
|
|
|
const char *
|
|
nvme_ns_get_serial_number(struct nvme_namespace *ns)
|
|
{
|
|
return ((const char *)ns->ctrlr->cdata.sn);
|
|
}
|
|
|
|
const char *
|
|
nvme_ns_get_model_number(struct nvme_namespace *ns)
|
|
{
|
|
return ((const char *)ns->ctrlr->cdata.mn);
|
|
}
|
|
|
|
const struct nvme_namespace_data *
|
|
nvme_ns_get_data(struct nvme_namespace *ns)
|
|
{
|
|
|
|
return (&ns->data);
|
|
}
|
|
|
|
uint32_t
|
|
nvme_ns_get_stripesize(struct nvme_namespace *ns)
|
|
{
|
|
|
|
return (ns->stripesize);
|
|
}
|
|
|
|
static void
|
|
nvme_ns_bio_done(void *arg, const struct nvme_completion *status)
|
|
{
|
|
struct bio *bp = arg;
|
|
nvme_cb_fn_t bp_cb_fn;
|
|
|
|
bp_cb_fn = bp->bio_driver1;
|
|
|
|
if (bp->bio_driver2)
|
|
free(bp->bio_driver2, M_NVME);
|
|
|
|
if (nvme_completion_is_error(status)) {
|
|
bp->bio_flags |= BIO_ERROR;
|
|
if (bp->bio_error == 0)
|
|
bp->bio_error = EIO;
|
|
}
|
|
|
|
if ((bp->bio_flags & BIO_ERROR) == 0)
|
|
bp->bio_resid = 0;
|
|
else
|
|
bp->bio_resid = bp->bio_bcount;
|
|
|
|
bp_cb_fn(bp, status);
|
|
}
|
|
|
|
static void
|
|
nvme_bio_child_inbed(struct bio *parent, int bio_error)
|
|
{
|
|
struct nvme_completion parent_cpl;
|
|
int children, inbed;
|
|
|
|
if (bio_error != 0) {
|
|
parent->bio_flags |= BIO_ERROR;
|
|
parent->bio_error = bio_error;
|
|
}
|
|
|
|
/*
|
|
* atomic_fetchadd will return value before adding 1, so we still
|
|
* must add 1 to get the updated inbed number. Save bio_children
|
|
* before incrementing to guard against race conditions when
|
|
* two children bios complete on different queues.
|
|
*/
|
|
children = atomic_load_acq_int(&parent->bio_children);
|
|
inbed = atomic_fetchadd_int(&parent->bio_inbed, 1) + 1;
|
|
if (inbed == children) {
|
|
bzero(&parent_cpl, sizeof(parent_cpl));
|
|
if (parent->bio_flags & BIO_ERROR)
|
|
parent_cpl.status.sc = NVME_SC_DATA_TRANSFER_ERROR;
|
|
nvme_ns_bio_done(parent, &parent_cpl);
|
|
}
|
|
}
|
|
|
|
static void
|
|
nvme_bio_child_done(void *arg, const struct nvme_completion *cpl)
|
|
{
|
|
struct bio *child = arg;
|
|
struct bio *parent;
|
|
int bio_error;
|
|
|
|
parent = child->bio_parent;
|
|
g_destroy_bio(child);
|
|
bio_error = nvme_completion_is_error(cpl) ? EIO : 0;
|
|
nvme_bio_child_inbed(parent, bio_error);
|
|
}
|
|
|
|
static uint32_t
|
|
nvme_get_num_segments(uint64_t addr, uint64_t size, uint32_t align)
|
|
{
|
|
uint32_t num_segs, offset, remainder;
|
|
|
|
if (align == 0)
|
|
return (1);
|
|
|
|
KASSERT((align & (align - 1)) == 0, ("alignment not power of 2\n"));
|
|
|
|
num_segs = size / align;
|
|
remainder = size & (align - 1);
|
|
offset = addr & (align - 1);
|
|
if (remainder > 0 || offset > 0)
|
|
num_segs += 1 + (remainder + offset - 1) / align;
|
|
return (num_segs);
|
|
}
|
|
|
|
static void
|
|
nvme_free_child_bios(int num_bios, struct bio **child_bios)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < num_bios; i++) {
|
|
if (child_bios[i] != NULL)
|
|
g_destroy_bio(child_bios[i]);
|
|
}
|
|
|
|
free(child_bios, M_NVME);
|
|
}
|
|
|
|
static struct bio **
|
|
nvme_allocate_child_bios(int num_bios)
|
|
{
|
|
struct bio **child_bios;
|
|
int err = 0, i;
|
|
|
|
child_bios = malloc(num_bios * sizeof(struct bio *), M_NVME, M_NOWAIT);
|
|
if (child_bios == NULL)
|
|
return (NULL);
|
|
|
|
for (i = 0; i < num_bios; i++) {
|
|
child_bios[i] = g_new_bio();
|
|
if (child_bios[i] == NULL)
|
|
err = ENOMEM;
|
|
}
|
|
|
|
if (err == ENOMEM) {
|
|
nvme_free_child_bios(num_bios, child_bios);
|
|
return (NULL);
|
|
}
|
|
|
|
return (child_bios);
|
|
}
|
|
|
|
static struct bio **
|
|
nvme_construct_child_bios(struct bio *bp, uint32_t alignment, int *num_bios)
|
|
{
|
|
struct bio **child_bios;
|
|
struct bio *child;
|
|
uint64_t cur_offset;
|
|
caddr_t data;
|
|
uint32_t rem_bcount;
|
|
int i;
|
|
#ifdef NVME_UNMAPPED_BIO_SUPPORT
|
|
struct vm_page **ma;
|
|
uint32_t ma_offset;
|
|
#endif
|
|
|
|
*num_bios = nvme_get_num_segments(bp->bio_offset, bp->bio_bcount,
|
|
alignment);
|
|
child_bios = nvme_allocate_child_bios(*num_bios);
|
|
if (child_bios == NULL)
|
|
return (NULL);
|
|
|
|
bp->bio_children = *num_bios;
|
|
bp->bio_inbed = 0;
|
|
cur_offset = bp->bio_offset;
|
|
rem_bcount = bp->bio_bcount;
|
|
data = bp->bio_data;
|
|
#ifdef NVME_UNMAPPED_BIO_SUPPORT
|
|
ma_offset = bp->bio_ma_offset;
|
|
ma = bp->bio_ma;
|
|
#endif
|
|
|
|
for (i = 0; i < *num_bios; i++) {
|
|
child = child_bios[i];
|
|
child->bio_parent = bp;
|
|
child->bio_cmd = bp->bio_cmd;
|
|
child->bio_offset = cur_offset;
|
|
child->bio_bcount = min(rem_bcount,
|
|
alignment - (cur_offset & (alignment - 1)));
|
|
child->bio_flags = bp->bio_flags;
|
|
#ifdef NVME_UNMAPPED_BIO_SUPPORT
|
|
if (bp->bio_flags & BIO_UNMAPPED) {
|
|
child->bio_ma_offset = ma_offset;
|
|
child->bio_ma = ma;
|
|
child->bio_ma_n =
|
|
nvme_get_num_segments(child->bio_ma_offset,
|
|
child->bio_bcount, PAGE_SIZE);
|
|
ma_offset = (ma_offset + child->bio_bcount) &
|
|
PAGE_MASK;
|
|
ma += child->bio_ma_n;
|
|
if (ma_offset != 0)
|
|
ma -= 1;
|
|
} else
|
|
#endif
|
|
{
|
|
child->bio_data = data;
|
|
data += child->bio_bcount;
|
|
}
|
|
cur_offset += child->bio_bcount;
|
|
rem_bcount -= child->bio_bcount;
|
|
}
|
|
|
|
return (child_bios);
|
|
}
|
|
|
|
static int
|
|
nvme_ns_split_bio(struct nvme_namespace *ns, struct bio *bp,
|
|
uint32_t alignment)
|
|
{
|
|
struct bio *child;
|
|
struct bio **child_bios;
|
|
int err, i, num_bios;
|
|
|
|
child_bios = nvme_construct_child_bios(bp, alignment, &num_bios);
|
|
if (child_bios == NULL)
|
|
return (ENOMEM);
|
|
|
|
for (i = 0; i < num_bios; i++) {
|
|
child = child_bios[i];
|
|
err = nvme_ns_bio_process(ns, child, nvme_bio_child_done);
|
|
if (err != 0) {
|
|
nvme_bio_child_inbed(bp, err);
|
|
g_destroy_bio(child);
|
|
}
|
|
}
|
|
|
|
free(child_bios, M_NVME);
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
|
|
nvme_cb_fn_t cb_fn)
|
|
{
|
|
struct nvme_dsm_range *dsm_range;
|
|
uint32_t num_bios;
|
|
int err;
|
|
|
|
bp->bio_driver1 = cb_fn;
|
|
|
|
if (ns->stripesize > 0 &&
|
|
(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
|
|
num_bios = nvme_get_num_segments(bp->bio_offset,
|
|
bp->bio_bcount, ns->stripesize);
|
|
if (num_bios > 1)
|
|
return (nvme_ns_split_bio(ns, bp, ns->stripesize));
|
|
}
|
|
|
|
switch (bp->bio_cmd) {
|
|
case BIO_READ:
|
|
err = nvme_ns_cmd_read_bio(ns, bp, nvme_ns_bio_done, bp);
|
|
break;
|
|
case BIO_WRITE:
|
|
err = nvme_ns_cmd_write_bio(ns, bp, nvme_ns_bio_done, bp);
|
|
break;
|
|
case BIO_FLUSH:
|
|
err = nvme_ns_cmd_flush(ns, nvme_ns_bio_done, bp);
|
|
break;
|
|
case BIO_DELETE:
|
|
dsm_range =
|
|
malloc(sizeof(struct nvme_dsm_range), M_NVME,
|
|
M_ZERO | M_WAITOK);
|
|
dsm_range->length =
|
|
bp->bio_bcount/nvme_ns_get_sector_size(ns);
|
|
dsm_range->starting_lba =
|
|
bp->bio_offset/nvme_ns_get_sector_size(ns);
|
|
bp->bio_driver2 = dsm_range;
|
|
err = nvme_ns_cmd_deallocate(ns, dsm_range, 1,
|
|
nvme_ns_bio_done, bp);
|
|
if (err != 0)
|
|
free(dsm_range, M_NVME);
|
|
break;
|
|
default:
|
|
err = EIO;
|
|
break;
|
|
}
|
|
|
|
return (err);
|
|
}
|
|
|
|
int
|
|
nvme_ns_construct(struct nvme_namespace *ns, uint32_t id,
|
|
struct nvme_controller *ctrlr)
|
|
{
|
|
struct nvme_completion_poll_status status;
|
|
int unit;
|
|
|
|
ns->ctrlr = ctrlr;
|
|
ns->id = id;
|
|
ns->stripesize = 0;
|
|
|
|
if (pci_get_devid(ctrlr->dev) == 0x09538086 && ctrlr->cdata.vs[3] != 0)
|
|
ns->stripesize =
|
|
(1 << ctrlr->cdata.vs[3]) * ctrlr->min_page_size;
|
|
|
|
/*
|
|
* Namespaces are reconstructed after a controller reset, so check
|
|
* to make sure we only call mtx_init once on each mtx.
|
|
*
|
|
* TODO: Move this somewhere where it gets called at controller
|
|
* construction time, which is not invoked as part of each
|
|
* controller reset.
|
|
*/
|
|
if (!mtx_initialized(&ns->lock))
|
|
mtx_init(&ns->lock, "nvme ns lock", NULL, MTX_DEF);
|
|
|
|
status.done = FALSE;
|
|
nvme_ctrlr_cmd_identify_namespace(ctrlr, id, &ns->data,
|
|
nvme_completion_poll_cb, &status);
|
|
while (status.done == FALSE)
|
|
DELAY(5);
|
|
if (nvme_completion_is_error(&status.cpl)) {
|
|
nvme_printf(ctrlr, "nvme_identify_namespace failed\n");
|
|
return (ENXIO);
|
|
}
|
|
|
|
/*
|
|
* If the size of is zero, chances are this isn't a valid
|
|
* namespace (eg one that's not been configured yet). The
|
|
* standard says the entire id will be zeros, so this is a
|
|
* cheap way to test for that.
|
|
*/
|
|
if (ns->data.nsze == 0)
|
|
return (ENXIO);
|
|
|
|
/*
|
|
* Note: format is a 0-based value, so > is appropriate here,
|
|
* not >=.
|
|
*/
|
|
if (ns->data.flbas.format > ns->data.nlbaf) {
|
|
printf("lba format %d exceeds number supported (%d)\n",
|
|
ns->data.flbas.format, ns->data.nlbaf+1);
|
|
return (ENXIO);
|
|
}
|
|
|
|
if (ctrlr->cdata.oncs.dsm)
|
|
ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;
|
|
|
|
if (ctrlr->cdata.vwc.present)
|
|
ns->flags |= NVME_NS_FLUSH_SUPPORTED;
|
|
|
|
/*
|
|
* cdev may have already been created, if we are reconstructing the
|
|
* namespace after a controller-level reset.
|
|
*/
|
|
if (ns->cdev != NULL)
|
|
return (0);
|
|
|
|
/*
|
|
* Namespace IDs start at 1, so we need to subtract 1 to create a
|
|
* correct unit number.
|
|
*/
|
|
unit = device_get_unit(ctrlr->dev) * NVME_MAX_NAMESPACES + ns->id - 1;
|
|
|
|
/*
|
|
* MAKEDEV_ETERNAL was added in r210923, for cdevs that will never
|
|
* be destroyed. This avoids refcounting on the cdev object.
|
|
* That should be OK case here, as long as we're not supporting PCIe
|
|
* surprise removal nor namespace deletion.
|
|
*/
|
|
#ifdef MAKEDEV_ETERNAL_KLD
|
|
ns->cdev = make_dev_credf(MAKEDEV_ETERNAL_KLD, &nvme_ns_cdevsw, unit,
|
|
NULL, UID_ROOT, GID_WHEEL, 0600, "nvme%dns%d",
|
|
device_get_unit(ctrlr->dev), ns->id);
|
|
#else
|
|
ns->cdev = make_dev_credf(0, &nvme_ns_cdevsw, unit,
|
|
NULL, UID_ROOT, GID_WHEEL, 0600, "nvme%dns%d",
|
|
device_get_unit(ctrlr->dev), ns->id);
|
|
#endif
|
|
#ifdef NVME_UNMAPPED_BIO_SUPPORT
|
|
ns->cdev->si_flags |= SI_UNMAPPED;
|
|
#endif
|
|
|
|
if (ns->cdev != NULL)
|
|
ns->cdev->si_drv1 = ns;
|
|
|
|
return (0);
|
|
}
|
|
|
|
void nvme_ns_destruct(struct nvme_namespace *ns)
|
|
{
|
|
|
|
if (ns->cdev != NULL)
|
|
destroy_dev(ns->cdev);
|
|
}
|