freebsd-nq/sys/dev/pci/pci_iov.c
Konstantin Belousov 94f5c1cc71 pci_iov: When pci_iov_detach(9) is called, destroy VF children
instead of bailing out with EBUSY if there are any.

If the driver module is unloaded, or the device is just forcibly detached
from the driver, there is otherwise no way for the driver to unload
correctly, especially if there are resources dedicated to the VFs which
prevent tearing down other resources.

Reviewed by:	jhb
Sponsored by:	Mellanox Technologies / NVidia Networking
MFC after:	1 week
Differential revision:	https://reviews.freebsd.org/D27615
2020-12-18 03:46:50 +00:00

1086 lines
25 KiB
C

/*-
* Copyright (c) 2013-2015 Sandvine Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_bus.h"
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/iov.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pciio.h>
#include <sys/queue.h>
#include <sys/rman.h>
#include <sys/sysctl.h>
#include <machine/bus.h>
#include <machine/stdarg.h>
#include <sys/nv.h>
#include <sys/iov_schema.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_iov.h>
#include <dev/pci/pci_private.h>
#include <dev/pci/pci_iov_private.h>
#include <dev/pci/schema_private.h>
#include "pcib_if.h"
/* Malloc type for the SR-IOV state and config buffers allocated in this file. */
static MALLOC_DEFINE(M_SRIOV, "sr_iov", "PCI SR-IOV allocations");
/* ioctl entry point referenced by the character device switch below. */
static d_ioctl_t pci_iov_ioctl;
/*
 * Character device switch used for the device node that
 * pci_iov_attach_method() creates with make_dev(); ioctl is the only
 * entry point it provides.
 */
static struct cdevsw iov_cdevsw = {
.d_version = D_VERSION,
.d_name = "iov",
.d_ioctl = pci_iov_ioctl
};
/* The hw.pci sysctl node that the knob below is attached to. */
SYSCTL_DECL(_hw_pci);
/*
* The maximum amount of memory we will allocate for user configuration of an
* SR-IOV device. 1MB ought to be enough for anyone, but leave this
* configurable just in case.
*
* pci_iov_parse_config() rejects any configuration whose length exceeds this
* value with EMSGSIZE before allocating a buffer for it.
*/
static u_long pci_iov_max_config = 1024 * 1024;
SYSCTL_ULONG(_hw_pci, OID_AUTO, iov_max_config, CTLFLAG_RWTUN,
&pci_iov_max_config, 0, "Maximum allowed size of SR-IOV configuration.");
/*
 * Config-space accessors for registers inside a PF's SR-IOV capability.
 *
 * d is a struct pci_devinfo pointer for the PF, r is the register offset
 * relative to the start of the capability (cfg.iov->iov_pos), w is the access
 * width in bytes and v is the value to write.
 *
 * Every argument is parenthesized so that a compound expression passed as the
 * register offset (e.g. a conditional) is added to iov_pos as a whole instead
 * of being split apart by operator precedence.
 */
#define	IOV_READ(d, r, w) \
	pci_read_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + (r), (w))
#define	IOV_WRITE(d, r, v, w) \
	pci_write_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + (r), (v), (w))
/* Forward declarations for static helpers that are used before they are defined. */
static nvlist_t *pci_iov_build_schema(nvlist_t **pf_schema,
nvlist_t **vf_schema);
static void pci_iov_build_pf_schema(nvlist_t *schema,
nvlist_t **driver_schema);
static void pci_iov_build_vf_schema(nvlist_t *schema,
nvlist_t **driver_schema);
static int pci_iov_delete_iov_children(struct pci_devinfo *dinfo);
static nvlist_t *pci_iov_get_pf_subsystem_schema(void);
static nvlist_t *pci_iov_get_vf_subsystem_schema(void);
/*
 * Build the device node name from the printf-style format and arguments and
 * forward the attach request to the parent bus via PCI_IOV_ATTACH().
 *
 * The name is formatted into a NAME_MAX + 1 byte buffer; vsnprintf() truncates
 * anything longer.  Returns whatever PCI_IOV_ATTACH() returns.
 */
int
pci_iov_attach_name(device_t dev, struct nvlist *pf_schema,
    struct nvlist *vf_schema, const char *fmt, ...)
{
	char cdev_name[NAME_MAX + 1];
	va_list args;
	device_t parent;

	va_start(args, fmt);
	vsnprintf(cdev_name, sizeof(cdev_name), fmt, args);
	va_end(args);

	parent = device_get_parent(dev);
	return (PCI_IOV_ATTACH(parent, dev, pf_schema, vf_schema, cdev_name));
}
/*
 * Register dev as an SR-IOV capable PF: locate its SR-IOV extended capability,
 * merge the driver schemas with the subsystem schemas, and create the
 * "iov/<name>" control device.
 *
 * Ownership of pf_schema and vf_schema passes to this function: on success
 * they become part of the combined schema stored in the PF's iov state, and
 * on every failure path they are destroyed here.  The early failures (no
 * SR-IOV capability, unsupported capability version) used to return without
 * destroying them, unlike all the later failure paths.
 *
 * Returns 0 on success; the pci_find_extcap() error if the capability is
 * missing; ENXIO for a capability version other than 1; EBUSY if the device
 * already has SR-IOV state attached; ENOMEM if the schema or device node
 * cannot be created; or the pci_iov_validate_schema() error.
 */
int
pci_iov_attach_method(device_t bus, device_t dev, nvlist_t *pf_schema,
    nvlist_t *vf_schema, const char *name)
{
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	nvlist_t *schema;
	uint32_t version;
	int error;
	int iov_pos;

	dinfo = device_get_ivars(dev);
	schema = NULL;

	error = pci_find_extcap(dev, PCIZ_SRIOV, &iov_pos);
	if (error != 0)
		goto free_schemas;

	version = pci_read_config(dev, iov_pos, 4);
	if (PCI_EXTCAP_VER(version) != 1) {
		if (bootverbose)
			device_printf(dev,
			    "Unsupported version of SR-IOV (%d) detected\n",
			    PCI_EXTCAP_VER(version));
		error = ENXIO;
		goto free_schemas;
	}

	/* Allocate before taking Giant; M_WAITOK may sleep. */
	iov = malloc(sizeof(*dinfo->cfg.iov), M_SRIOV, M_WAITOK | M_ZERO);

	mtx_lock(&Giant);
	if (dinfo->cfg.iov != NULL) {
		error = EBUSY;
		goto cleanup;
	}
	iov->iov_pos = iov_pos;

	/*
	 * pci_iov_build_schema() takes the driver schemas and NULLs our
	 * pointers, so the nvlist_destroy() calls below become no-ops once
	 * it has run.
	 */
	schema = pci_iov_build_schema(&pf_schema, &vf_schema);
	if (schema == NULL) {
		error = ENOMEM;
		goto cleanup;
	}

	error = pci_iov_validate_schema(schema);
	if (error != 0)
		goto cleanup;
	iov->iov_schema = schema;

	iov->iov_cdev = make_dev(&iov_cdevsw, device_get_unit(dev),
	    UID_ROOT, GID_WHEEL, 0600, "iov/%s", name);
	if (iov->iov_cdev == NULL) {
		error = ENOMEM;
		goto cleanup;
	}

	dinfo->cfg.iov = iov;
	iov->iov_cdev->si_drv1 = dinfo;
	mtx_unlock(&Giant);

	return (0);

cleanup:
	nvlist_destroy(schema);
	free(iov, M_SRIOV);
	mtx_unlock(&Giant);
free_schemas:
	nvlist_destroy(pf_schema);
	nvlist_destroy(vf_schema);
	return (error);
}
/*
 * Tear down the SR-IOV state attached to dev.
 *
 * Returns 0 if there is nothing attached or teardown succeeded, EBUSY if a
 * configuration operation is in progress, or the error reported while
 * destroying the VF children.  On success the control device is destroyed
 * and the schema and iov state are freed.  Runs entirely under Giant.
 */
int
pci_iov_detach_method(device_t bus, device_t dev)
{
	struct pci_devinfo *pf_info;
	struct pcicfg_iov *pf_iov;
	int rc;

	rc = 0;
	mtx_lock(&Giant);
	pf_info = device_get_ivars(dev);
	pf_iov = pf_info->cfg.iov;

	/* Nothing was ever attached: report success. */
	if (pf_iov == NULL)
		goto done;

	if ((pf_iov->iov_flags & IOV_BUSY) != 0) {
		rc = EBUSY;
		goto done;
	}

	rc = pci_iov_delete_iov_children(pf_info);
	if (rc != 0)
		goto done;

	pf_info->cfg.iov = NULL;
	if (pf_iov->iov_cdev) {
		destroy_dev(pf_iov->iov_cdev);
		pf_iov->iov_cdev = NULL;
	}
	nvlist_destroy(pf_iov->iov_schema);
	free(pf_iov, M_SRIOV);

done:
	mtx_unlock(&Giant);
	return (rc);
}
/*
 * Combine the driver-supplied PF and VF schemas with the subsystem schemas
 * into a single top-level schema.
 *
 * The driver schemas are always consumed: *pf and *vf are set to NULL before
 * anything else happens, and on failure whatever has not been moved into the
 * combined schema is destroyed here.  Returns the new schema, or NULL if the
 * root node could not be allocated or the result is in the error state.
 */
static nvlist_t *
pci_iov_build_schema(nvlist_t **pf, nvlist_t **vf)
{
	nvlist_t *root, *pf_drv, *vf_drv;

	/* Take ownership from the caller up front. */
	pf_drv = *pf;
	*pf = NULL;
	vf_drv = *vf;
	*vf = NULL;

	root = pci_iov_schema_alloc_node();
	if (root != NULL) {
		pci_iov_build_pf_schema(root, &pf_drv);
		pci_iov_build_vf_schema(root, &vf_drv);
		if (nvlist_error(root) == 0)
			return (root);
	}

	nvlist_destroy(root);
	nvlist_destroy(pf_drv);
	nvlist_destroy(vf_drv);
	return (NULL);
}
/*
 * Add the PF section to the combined schema: a node holding the driver's PF
 * schema and the subsystem's PF schema, stored under PF_CONFIG_NAME.
 *
 * On success *driver_schema is set to NULL because it has been moved into the
 * new node.  If the node cannot be allocated, schema is put in the ENOMEM
 * error state and *driver_schema is left for the caller to dispose of.
 */
static void
pci_iov_build_pf_schema(nvlist_t *schema, nvlist_t **driver_schema)
{
	nvlist_t *node, *subsys;

	node = pci_iov_schema_alloc_node();
	if (node != NULL) {
		subsys = pci_iov_get_pf_subsystem_schema();

		/*
		 * A NULL *driver_schema or subsys is not checked for here:
		 * nvlist_move_nvlist() will put the schema in the error state
		 * and SR-IOV initialization will fail later on.
		 */
		nvlist_move_nvlist(node, DRIVER_CONFIG_NAME, *driver_schema);
		nvlist_move_nvlist(node, IOV_CONFIG_NAME, subsys);
		nvlist_move_nvlist(schema, PF_CONFIG_NAME, node);
		*driver_schema = NULL;
	} else
		nvlist_set_error(schema, ENOMEM);
}
/*
 * Add the VF section to the combined schema: a node holding the driver's VF
 * schema and the subsystem's VF schema, stored under VF_SCHEMA_NAME.
 *
 * On success *driver_schema is set to NULL because it has been moved into the
 * new node.  If the node cannot be allocated, schema is put in the ENOMEM
 * error state and *driver_schema is left for the caller to dispose of.
 */
static void
pci_iov_build_vf_schema(nvlist_t *schema, nvlist_t **driver_schema)
{
	nvlist_t *node, *subsys;

	node = pci_iov_schema_alloc_node();
	if (node != NULL) {
		subsys = pci_iov_get_vf_subsystem_schema();

		/*
		 * A NULL *driver_schema or subsys is not checked for here:
		 * nvlist_move_nvlist() will put the schema in the error state
		 * and SR-IOV initialization will fail later on.
		 */
		nvlist_move_nvlist(node, DRIVER_CONFIG_NAME, *driver_schema);
		nvlist_move_nvlist(node, IOV_CONFIG_NAME, subsys);
		nvlist_move_nvlist(schema, VF_SCHEMA_NAME, node);
		*driver_schema = NULL;
	} else
		nvlist_set_error(schema, ENOMEM);
}
/*
 * Build the subsystem's own PF schema: a required uint16 "num_vfs" and a
 * required string "device".  Returns NULL if the node cannot be allocated.
 */
static nvlist_t *
pci_iov_get_pf_subsystem_schema(void)
{
	nvlist_t *node;

	node = pci_iov_schema_alloc_node();
	if (node != NULL) {
		pci_iov_schema_add_uint16(node, "num_vfs",
		    IOV_SCHEMA_REQUIRED, -1);
		pci_iov_schema_add_string(node, "device",
		    IOV_SCHEMA_REQUIRED, NULL);
	}
	return (node);
}
/*
 * Build the subsystem's own VF schema: a boolean "passthrough" that has a
 * default value of 0.  Returns NULL if the node cannot be allocated.
 */
static nvlist_t *
pci_iov_get_vf_subsystem_schema(void)
{
	nvlist_t *node;

	node = pci_iov_schema_alloc_node();
	if (node != NULL)
		pci_iov_schema_add_bool(node, "passthrough",
		    IOV_SCHEMA_HASDEFAULT, 0);
	return (node);
}
/*
 * Allocate the PF resource that backs VF BAR number 'bar' and add its address
 * range to the PF's VF resource manager.
 *
 * bar_shift is log2 of the size of one VF's BAR.  The resource is requested
 * through pci_alloc_multi_resource() with a count of iov_num_vfs, and is
 * recorded in iov->iov_bar[bar] together with the per-VF size and shift.
 *
 * Returns ENXIO if the resource cannot be allocated, otherwise the result of
 * rman_manage_region().  If rman_manage_region() fails, the resource stays
 * recorded in iov->iov_bar[bar]; this function does not release it.
 */
static int
pci_iov_alloc_bar(struct pci_devinfo *dinfo, int bar, pci_addr_t bar_shift)
{
	struct resource *res;
	struct pcicfg_iov *iov;
	device_t dev, bus;
	rman_res_t start, end;
	pci_addr_t bar_size;
	int rid;

	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	bus = device_get_parent(dev);
	rid = iov->iov_pos + PCIR_SRIOV_BAR(bar);

	/*
	 * Do the shift in pci_addr_t: a plain int constant would overflow
	 * (undefined behaviour) once bar_shift reaches the width of int.
	 */
	bar_size = (pci_addr_t)1 << bar_shift;

	res = pci_alloc_multi_resource(bus, dev, SYS_RES_MEMORY, &rid, 0,
	    ~0, 1, iov->iov_num_vfs, RF_ACTIVE);
	if (res == NULL)
		return (ENXIO);

	iov->iov_bar[bar].res = res;
	iov->iov_bar[bar].bar_size = bar_size;
	iov->iov_bar[bar].bar_shift = bar_shift;

	start = rman_get_start(res);
	end = rman_get_end(res);
	return (rman_manage_region(&iov->rman, start, end));
}
/*
 * Register the BARs of one VF.  For every PF-side VF BAR that has a resource,
 * the VF's slice starts at the resource start plus the VF index times the
 * per-VF BAR size, and is recorded on the VF device with pci_add_bar().
 *
 * iov is the PF's SR-IOV state; dinfo describes the VF.
 */
static void
pci_iov_add_bars(struct pcicfg_iov *iov, struct pci_devinfo *dinfo)
{
	struct pci_iov_bar *vf_bar;
	uint64_t slice_base;
	int idx;

	for (idx = 0; idx <= PCIR_MAX_BAR_0; idx++) {
		vf_bar = &iov->iov_bar[idx];
		if (vf_bar->res == NULL)
			continue;

		slice_base = rman_get_start(vf_bar->res) +
		    dinfo->cfg.vf.index * vf_bar->bar_size;
		pci_add_bar(dinfo->cfg.dev, PCIR_BAR(idx), slice_base,
		    vf_bar->bar_shift);
	}
}
/*
 * Copy the packed configuration described by arg in from userland, unpack it
 * and validate it against the PF's schema.
 *
 * On success the unpacked nvlist is stored in *ret and 0 is returned; the
 * caller then owns it.  Errors: EMSGSIZE if the configuration is larger than
 * pci_iov_max_config, the copyin() error, EINVAL if unpacking fails, the
 * schema validation error, or the nvlist's own error state.  On any error
 * *ret is left untouched and nothing allocated here is leaked.
 */
static int
pci_iov_parse_config(struct pcicfg_iov *iov, struct pci_iov_arg *arg,
    nvlist_t **ret)
{
	void *buf;
	nvlist_t *parsed;
	int error;

	if (arg->len > pci_iov_max_config)
		return (EMSGSIZE);

	parsed = NULL;
	buf = malloc(arg->len, M_SRIOV, M_WAITOK);

	error = copyin(arg->config, buf, arg->len);
	if (error == 0) {
		parsed = nvlist_unpack(buf, arg->len, NV_FLAG_IGNORE_CASE);
		if (parsed == NULL)
			error = EINVAL;
	}
	if (error == 0)
		error = pci_iov_schema_validate_config(iov->iov_schema,
		    parsed);
	if (error == 0)
		error = nvlist_error(parsed);
	if (error == 0) {
		/* Hand the nvlist over; keep the destroy below from freeing it. */
		*ret = parsed;
		parsed = NULL;
	}

	nvlist_destroy(parsed);
	free(buf, M_SRIOV);
	return (error);
}
/*
 * Set the ARI_EN bit in the lowest-numbered PCI function with the SR-IOV
 * capability.  This bit is only writeable on the lowest-numbered PF but
 * affects all PFs on the device.
 *
 * Returns 0 without touching anything when ARI is not enabled on the parent
 * bridge, the device_get_children() error if the children cannot be listed,
 * and ENXIO if the bit does not read back as set.
 */
static int
pci_iov_set_ari(device_t bus)
{
	device_t *children, candidate, lowest;
	int idx, nchildren, error;
	int func, cap_off, lowest_func, lowest_pos;
	uint16_t ctl;

	/* If ARI is disabled on the downstream port there is nothing to do. */
	if (!PCIB_ARI_ENABLED(device_get_parent(bus)))
		return (0);

	error = device_get_children(bus, &children, &nchildren);
	if (error != 0)
		return (error);

	lowest = NULL;
	for (idx = 0; idx < nchildren; idx++) {
		candidate = children[idx];
		if (pci_find_extcap(candidate, PCIZ_SRIOV, &cap_off) != 0)
			continue;

		func = pci_get_function(candidate);
		if (lowest != NULL && func >= lowest_func)
			continue;

		lowest = candidate;
		lowest_func = func;
		lowest_pos = cap_off;
	}
	free(children, M_TEMP);

	/*
	 * If we called this function some device must have the SR-IOV
	 * capability.
	 */
	KASSERT(lowest != NULL,
	    ("Could not find child of %s with SR-IOV capability",
	    device_get_nameunit(bus)));

	ctl = pci_read_config(lowest, lowest_pos + PCIR_SRIOV_CTL, 2);
	ctl |= PCIM_SRIOV_ARI_EN;
	pci_write_config(lowest, lowest_pos + PCIR_SRIOV_CTL, ctl, 2);

	/* Read the register back to confirm that the bit actually stuck. */
	ctl = pci_read_config(lowest, lowest_pos + PCIR_SRIOV_CTL, 2);
	if ((ctl & PCIM_SRIOV_ARI_EN) == 0) {
		device_printf(lowest, "failed to enable ARI\n");
		return (ENXIO);
	}
	return (0);
}
/*
 * Program the SR-IOV System Page Size register to match the system page size.
 *
 * The register value is a single bit whose position is PAGE_SHIFT minus
 * PCI_SRIOV_BASE_PAGE_SHIFT, clamped to bit 0 when the system page is smaller
 * than the smallest SR-IOV page.  Returns ENXIO if that bit is not set in the
 * Supported Page Sizes register, 0 after writing the register otherwise.
 */
static int
pci_iov_config_page_size(struct pci_devinfo *dinfo)
{
	uint32_t supported, wanted;
	int bit;

	supported = IOV_READ(dinfo, PCIR_SRIOV_PAGE_CAP, 4);

	/*
	 * If the system page size is less than the smallest SR-IOV page size
	 * then round up to the smallest SR-IOV page size.
	 */
	bit = 0;
	if (PAGE_SHIFT >= PCI_SRIOV_BASE_PAGE_SHIFT)
		bit = PAGE_SHIFT - PCI_SRIOV_BASE_PAGE_SHIFT;
	wanted = (1 << bit);

	/* Check that the device supports the system page size. */
	if ((wanted & supported) == 0)
		return (ENXIO);

	IOV_WRITE(dinfo, PCIR_SRIOV_PAGE_SIZE, wanted, 4);
	return (0);
}
/*
 * Invoke the PF driver's PCI_IOV_INIT method, passing it the driver-specific
 * part of the PF section of config.  Returns the method's result.
 */
static int
pci_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *config)
{
	const nvlist_t *pf_section;

	pf_section = nvlist_get_nvlist(config, PF_CONFIG_NAME);
	return (PCI_IOV_INIT(dev, num_vfs,
	    nvlist_get_nvlist(pf_section, DRIVER_CONFIG_NAME)));
}
/*
 * Initialize the resource manager from which VF memory BARs are handed out.
 * It covers the range 0..~0 and is described as "<pf nameunit> VF I/O memory".
 *
 * On success IOV_RMAN_INITED is set in iov_flags so that teardown code knows
 * rman_fini() is required.  Returns the rman_init() result.
 */
static int
pci_iov_init_rman(device_t pf, struct pcicfg_iov *iov)
{
	struct rman *rm;
	int error;

	rm = &iov->rman;
	snprintf(iov->rman_name, sizeof(iov->rman_name), "%s VF I/O memory",
	    device_get_nameunit(pf));
	rm->rm_start = 0;
	rm->rm_end = ~0;
	rm->rm_type = RMAN_ARRAY;
	rm->rm_descr = iov->rman_name;

	error = rman_init(rm);
	if (error == 0)
		iov->iov_flags |= IOV_RMAN_INITED;
	return (error);
}
static int
pci_iov_alloc_bar_ea(struct pci_devinfo *dinfo, int bar)
{
struct pcicfg_iov *iov;
rman_res_t start, end;
struct resource *res;
struct resource_list *rl;
struct resource_list_entry *rle;
rl = &dinfo->resources;
iov = dinfo->cfg.iov;
rle = resource_list_find(rl, SYS_RES_MEMORY,
iov->iov_pos + PCIR_SRIOV_BAR(bar));
if (rle == NULL)
rle = resource_list_find(rl, SYS_RES_IOPORT,
iov->iov_pos + PCIR_SRIOV_BAR(bar));
if (rle == NULL)
return (ENXIO);
res = rle->res;
iov->iov_bar[bar].res = res;
iov->iov_bar[bar].bar_size = rman_get_size(res) / iov->iov_num_vfs;
iov->iov_bar[bar].bar_shift = pci_mapsize(iov->iov_bar[bar].bar_size);
start = rman_get_start(res);
end = rman_get_end(res);
return (rman_manage_region(&iov->rman, start, end));
}
static int
pci_iov_setup_bars(struct pci_devinfo *dinfo)
{
device_t dev;
struct pcicfg_iov *iov;
pci_addr_t bar_value, testval;
int i, last_64, error;
iov = dinfo->cfg.iov;
dev = dinfo->cfg.dev;
last_64 = 0;
pci_add_resources_ea(device_get_parent(dev), dev, 1);
for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
/* First, try to use BARs allocated with EA */
error = pci_iov_alloc_bar_ea(dinfo, i);
if (error == 0)
continue;
/* Allocate legacy-BAR only if EA is not enabled */
if (pci_ea_is_enabled(dev, iov->iov_pos + PCIR_SRIOV_BAR(i)))
continue;
/*
* If a PCI BAR is a 64-bit wide BAR, then it spans two
* consecutive registers. Therefore if the last BAR that
* we looked at was a 64-bit BAR, we need to skip this
* register as it's the second half of the last BAR.
*/
if (!last_64) {
pci_read_bar(dev,
iov->iov_pos + PCIR_SRIOV_BAR(i),
&bar_value, &testval, &last_64);
if (testval != 0) {
error = pci_iov_alloc_bar(dinfo, i,
pci_mapsize(testval));
if (error != 0)
return (error);
}
} else
last_64 = 0;
}
return (0);
}
/*
 * Create a child device for each configured VF and attach drivers to them.
 *
 * VF i gets routing ID first_rid + i * rid_stride, the PF's vendor ID and the
 * VF device ID read from the SR-IOV capability.  Its configuration is taken
 * from the VF_PREFIX"<i>" entry of config.  Enumeration stops at the first VF
 * whose child device cannot be created; a VF rejected by the PF driver's
 * PCI_IOV_ADD_VF method is deleted again and enumeration continues.
 */
static void
pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const nvlist_t *config,
    uint16_t first_rid, uint16_t rid_stride)
{
	char vf_name[VF_MAX_NAME];
	const nvlist_t *vf_cfg, *drv_cfg, *iov_cfg;
	device_t bus, pf, child;
	struct pcicfg_iov *iov;
	struct pci_devinfo *child_info;
	uint16_t vendor, vf_devid, rid;
	int idx;

	iov = dinfo->cfg.iov;
	pf = dinfo->cfg.dev;
	bus = device_get_parent(pf);

	vendor = pci_get_vendor(pf);
	vf_devid = IOV_READ(dinfo, PCIR_SRIOV_VF_DID, 2);

	rid = first_rid;
	for (idx = 0; idx < iov->iov_num_vfs; idx++, rid += rid_stride) {
		snprintf(vf_name, sizeof(vf_name), VF_PREFIX"%d", idx);
		vf_cfg = nvlist_get_nvlist(config, vf_name);
		iov_cfg = nvlist_get_nvlist(vf_cfg, IOV_CONFIG_NAME);
		drv_cfg = nvlist_get_nvlist(vf_cfg, DRIVER_CONFIG_NAME);

		child = PCI_CREATE_IOV_CHILD(bus, pf, rid, vendor, vf_devid);
		if (child == NULL)
			break;

		/*
		 * If we are creating passthrough devices then force the ppt
		 * driver to attach to prevent a VF driver from claiming the
		 * VFs.
		 */
		if (nvlist_get_bool(iov_cfg, "passthrough"))
			device_set_devclass_fixed(child, "ppt");

		/* Link the VF back to the PF's state and give it its BARs. */
		child_info = device_get_ivars(child);
		child_info->cfg.iov = iov;
		child_info->cfg.vf.index = idx;
		pci_iov_add_bars(iov, child_info);

		if (PCI_IOV_ADD_VF(pf, idx, drv_cfg) != 0) {
			device_printf(pf, "Failed to add VF %d\n", idx);
			device_delete_child(bus, child);
		}
	}

	bus_generic_attach(bus);
}
/*
 * IOV_CONFIG ioctl handler: parse the user's configuration, initialize the PF
 * driver, program the SR-IOV capability, allocate the VF BARs, enable the VFs
 * and create their child devices.
 *
 * Returns 0 on success.  Returns EBUSY if another operation is in progress or
 * VFs are already configured, EINVAL if more VFs are requested than
 * TotalVFs, ENOSPC if the VFs would need a different bus number than the PF,
 * or the error of whichever setup step failed.  On failure the PF driver is
 * uninitialized again (if it had been initialized), VF BAR resources and the
 * VF rman are released, and iov_num_vfs is reset to 0.
 *
 * IOV_BUSY is held for the duration of the operation and cleared on exit.
 */
static int
pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
{
	device_t bus, dev;
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	nvlist_t *config;
	int i, error;
	uint16_t rid_off, rid_stride;
	uint16_t first_rid, last_rid;
	uint16_t iov_ctl;
	uint16_t num_vfs, total_vfs;
	int iov_inited;

	mtx_lock(&Giant);
	dinfo = cdev->si_drv1;
	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	bus = device_get_parent(dev);
	iov_inited = 0;
	config = NULL;

	if ((iov->iov_flags & IOV_BUSY) || iov->iov_num_vfs != 0) {
		mtx_unlock(&Giant);
		return (EBUSY);
	}
	iov->iov_flags |= IOV_BUSY;

	error = pci_iov_parse_config(iov, arg, &config);
	if (error != 0)
		goto out;

	num_vfs = pci_iov_config_get_num_vfs(config);
	total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2);
	if (num_vfs > total_vfs) {
		error = EINVAL;
		goto out;
	}

	error = pci_iov_config_page_size(dinfo);
	if (error != 0)
		goto out;

	error = pci_iov_set_ari(bus);
	if (error != 0)
		goto out;

	error = pci_iov_init(dev, num_vfs, config);
	if (error != 0)
		goto out;
	/* From here on the error path must call PCI_IOV_UNINIT(). */
	iov_inited = 1;

	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, num_vfs, 2);

	/* VF offset and stride are read after NumVFs has been programmed. */
	rid_off = IOV_READ(dinfo, PCIR_SRIOV_VF_OFF, 2);
	rid_stride = IOV_READ(dinfo, PCIR_SRIOV_VF_STRIDE, 2);

	first_rid = pci_get_rid(dev) + rid_off;
	last_rid = first_rid + (num_vfs - 1) * rid_stride;

	/* We don't yet support allocating extra bus numbers for VFs. */
	if (pci_get_bus(dev) != PCI_RID2BUS(last_rid)) {
		error = ENOSPC;
		goto out;
	}

	/* Keep the VFs disabled while their BARs are being set up. */
	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
	iov_ctl &= ~(PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE);
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);

	error = pci_iov_init_rman(dev, iov);
	if (error != 0)
		goto out;

	iov->iov_num_vfs = num_vfs;

	error = pci_iov_setup_bars(dinfo);
	if (error != 0)
		goto out;

	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
	iov_ctl |= PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE;
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);

	/*
	 * Per specification, we must wait 100ms before accessing VFs.
	 * howmany(hz, 10) is the number of ticks in 100ms, rounded up;
	 * roundup(hz, 10) would instead yield about hz ticks, i.e. a full
	 * second.  One extra tick is added so that a sleep which starts in
	 * the middle of a tick still lasts for at least 100ms.
	 */
	pause("iov", howmany(hz, 10) + 1);
	pci_iov_enumerate_vfs(dinfo, config, first_rid, rid_stride);

	nvlist_destroy(config);
	iov->iov_flags &= ~IOV_BUSY;
	mtx_unlock(&Giant);

	return (0);
out:
	if (iov_inited)
		PCI_IOV_UNINIT(dev);

	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
		if (iov->iov_bar[i].res != NULL) {
			pci_release_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i),
			    iov->iov_bar[i].res);
			pci_delete_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i));
			iov->iov_bar[i].res = NULL;
		}
	}

	if (iov->iov_flags & IOV_RMAN_INITED) {
		rman_fini(&iov->rman);
		iov->iov_flags &= ~IOV_RMAN_INITED;
	}

	nvlist_destroy(config);
	iov->iov_num_vfs = 0;
	iov->iov_flags &= ~IOV_BUSY;
	mtx_unlock(&Giant);
	return (error);
}
/*
 * Write the saved SR-IOV page size, the current VF count and the saved
 * control register value back into the PF's SR-IOV capability, in that order.
 */
void
pci_iov_cfg_restore(device_t dev, struct pci_devinfo *dinfo)
{
	const struct pcicfg_iov *saved;

	saved = dinfo->cfg.iov;

	IOV_WRITE(dinfo, PCIR_SRIOV_PAGE_SIZE, saved->iov_page_size, 4);
	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, saved->iov_num_vfs, 2);
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, saved->iov_ctl, 2);
}
/*
 * Record the SR-IOV page size and control registers of the PF in its iov
 * state so that pci_iov_cfg_restore() can write them back later.
 */
void
pci_iov_cfg_save(device_t dev, struct pci_devinfo *dinfo)
{
	struct pcicfg_iov *state;

	state = dinfo->cfg.iov;

	state->iov_page_size = IOV_READ(dinfo, PCIR_SRIOV_PAGE_SIZE, 4);
	state->iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
}
/*
 * Return true if child is a VF of the given PF: it must carry the PCICFG_VF
 * flag and its iov pointer must be the PF's SR-IOV state.
 */
static int
pci_iov_is_child_vf(struct pcicfg_iov *pf, device_t child)
{
	const struct pci_devinfo *info;

	info = device_get_ivars(child);
	return ((info->cfg.flags & PCICFG_VF) != 0 && info->cfg.iov == pf);
}
static int
pci_iov_delete_iov_children(struct pci_devinfo *dinfo)
{
device_t bus, dev, vf, *devlist;
struct pcicfg_iov *iov;
int i, error, devcount;
uint32_t iov_ctl;
mtx_assert(&Giant, MA_OWNED);
iov = dinfo->cfg.iov;
dev = dinfo->cfg.dev;
bus = device_get_parent(dev);
devlist = NULL;
iov->iov_flags |= IOV_BUSY;
error = device_get_children(bus, &devlist, &devcount);
if (error != 0)
goto out;
for (i = 0; i < devcount; i++) {
vf = devlist[i];
if (!pci_iov_is_child_vf(iov, vf))
continue;
error = device_detach(vf);
if (error != 0) {
device_printf(dev,
"Could not disable SR-IOV: failed to detach VF %s\n",
device_get_nameunit(vf));
goto out;
}
}
for (i = 0; i < devcount; i++) {
vf = devlist[i];
if (pci_iov_is_child_vf(iov, vf))
device_delete_child(bus, vf);
}
PCI_IOV_UNINIT(dev);
iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
iov_ctl &= ~(PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE);
IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);
IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, 0, 2);
iov->iov_num_vfs = 0;
for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
if (iov->iov_bar[i].res != NULL) {
pci_release_resource(bus, dev, SYS_RES_MEMORY,
iov->iov_pos + PCIR_SRIOV_BAR(i),
iov->iov_bar[i].res);
pci_delete_resource(bus, dev, SYS_RES_MEMORY,
iov->iov_pos + PCIR_SRIOV_BAR(i));
iov->iov_bar[i].res = NULL;
}
}
if (iov->iov_flags & IOV_RMAN_INITED) {
rman_fini(&iov->rman);
iov->iov_flags &= ~IOV_RMAN_INITED;
}
error = 0;
out:
free(devlist, M_TEMP);
iov->iov_flags &= ~IOV_BUSY;
return (error);
}
/*
 * IOV_DELETE ioctl handler.  Returns EBUSY if another operation is in
 * progress, ECHILD if no VFs are configured, and otherwise the result of
 * destroying the VFs.  Runs under Giant.
 */
static int
pci_iov_delete(struct cdev *cdev)
{
	struct pci_devinfo *pf_info;
	struct pcicfg_iov *pf_iov;
	int rc;

	mtx_lock(&Giant);
	pf_info = cdev->si_drv1;
	pf_iov = pf_info->cfg.iov;

	if ((pf_iov->iov_flags & IOV_BUSY) != 0)
		rc = EBUSY;
	else if (pf_iov->iov_num_vfs == 0)
		rc = ECHILD;
	else
		rc = pci_iov_delete_iov_children(pf_info);

	mtx_unlock(&Giant);
	return (rc);
}
/*
 * IOV_GET_SCHEMA ioctl handler: pack the PF's schema and copy it out to the
 * user buffer described by output.
 *
 * output->len is always updated to the packed size.  If the user's buffer is
 * too small, 0 is returned and output->error is set to EMSGSIZE instead, so
 * that the updated length still reaches userland.  Returns ENOMEM if packing
 * fails, or the copyout() error.
 */
static int
pci_iov_get_schema_ioctl(struct cdev *cdev, struct pci_iov_schema *output)
{
	struct pci_devinfo *pf_info;
	void *blob;
	size_t user_len, blob_len;
	int rc;

	/* Only the packing needs Giant; the copyout happens unlocked. */
	mtx_lock(&Giant);
	pf_info = cdev->si_drv1;
	blob = nvlist_pack(pf_info->cfg.iov->iov_schema, &blob_len);
	mtx_unlock(&Giant);

	if (blob == NULL)
		return (ENOMEM);

	user_len = output->len;
	output->len = blob_len;

	rc = 0;
	if (blob_len > user_len) {
		/*
		 * If we return an error then the ioctl code won't copyout
		 * output back to userland, so we flag the error in the struct
		 * instead.
		 */
		output->error = EMSGSIZE;
	} else {
		rc = copyout(blob, output->schema, blob_len);
		if (rc == 0)
			output->error = 0;
	}

	free(blob, M_NVLIST);
	return (rc);
}
/*
 * ioctl entry point of the iov control device.  Dispatches IOV_CONFIG,
 * IOV_DELETE and IOV_GET_SCHEMA to their handlers; any other command yields
 * EINVAL.
 */
static int
pci_iov_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{

	if (cmd == IOV_CONFIG)
		return (pci_iov_config(dev, (struct pci_iov_arg *)data));
	if (cmd == IOV_DELETE)
		return (pci_iov_delete(dev));
	if (cmd == IOV_GET_SCHEMA)
		return (pci_iov_get_schema_ioctl(dev,
		    (struct pci_iov_schema *)data));
	return (EINVAL);
}
/*
 * Allocate a memory resource for a VF child from the PF's VF resource manager.
 *
 * The BAR identified by *rid must already be known for the child
 * (pci_find_bar()).  Its range is clamped to [start, end], reserved from the
 * rman, added to the child's resource list and, if RF_ACTIVE is requested,
 * activated.  Only count == 1 is supported.
 *
 * Returns the resource, or NULL if the BAR is unknown, the constraints cannot
 * be met, or reservation, list insertion or activation fails.  Everything
 * acquired before a failure is released again.
 */
struct resource *
pci_vf_alloc_mem_resource(device_t dev, device_t child, int *rid,
    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	struct pci_map *map;
	struct resource *res;
	struct resource_list_entry *rle;
	rman_res_t bar_start, bar_end;
	pci_addr_t bar_length;
	int error;

	dinfo = device_get_ivars(child);
	iov = dinfo->cfg.iov;

	map = pci_find_bar(child, *rid);
	if (map == NULL)
		return (NULL);

	/*
	 * Do the shift in pci_addr_t: a plain int constant would overflow
	 * (undefined behaviour) once pm_size reaches the width of int.
	 */
	bar_length = (pci_addr_t)1 << map->pm_size;
	bar_start = map->pm_value;
	bar_end = bar_start + bar_length - 1;

	/* Make sure that the resource fits the constraints. */
	if (bar_start >= end || bar_end <= bar_start || count != 1)
		return (NULL);

	/* Clamp the resource to the constraints if necessary. */
	if (bar_start < start)
		bar_start = start;
	if (bar_end > end)
		bar_end = end;
	bar_length = bar_end - bar_start + 1;

	res = rman_reserve_resource(&iov->rman, bar_start, bar_end,
	    bar_length, flags, child);
	if (res == NULL)
		return (NULL);

	rle = resource_list_add(&dinfo->resources, SYS_RES_MEMORY, *rid,
	    bar_start, bar_end, 1);
	if (rle == NULL) {
		rman_release_resource(res);
		return (NULL);
	}

	rman_set_rid(res, *rid);

	if (flags & RF_ACTIVE) {
		error = bus_activate_resource(child, SYS_RES_MEMORY, *rid, res);
		if (error != 0) {
			resource_list_delete(&dinfo->resources, SYS_RES_MEMORY,
			    *rid);
			rman_release_resource(res);
			return (NULL);
		}
	}
	/* Only publish the resource in the list entry once it is usable. */
	rle->res = res;

	return (res);
}
/*
 * Release a VF memory resource: deactivate it if it is active, drop the
 * matching entry from the child's resource list, and release it back to the
 * resource manager.
 *
 * Returns the deactivation error (leaving the resource untouched) if
 * deactivation fails, otherwise the rman_release_resource() result.
 */
int
pci_vf_release_mem_resource(device_t dev, device_t child, int rid,
    struct resource *r)
{
	struct pci_devinfo *vf_info;
	struct resource_list_entry *entry;
	int rc;

	vf_info = device_get_ivars(child);

	if ((rman_get_flags(r) & RF_ACTIVE) != 0) {
		rc = bus_deactivate_resource(child, SYS_RES_MEMORY, rid, r);
		if (rc != 0)
			return (rc);
	}

	entry = resource_list_find(&vf_info->resources, SYS_RES_MEMORY, rid);
	if (entry != NULL) {
		/* Clear the entry's resource pointer before deleting it. */
		entry->res = NULL;
		resource_list_delete(&vf_info->resources, SYS_RES_MEMORY, rid);
	}

	return (rman_release_resource(r));
}