Pass SR-IOV configuration to kernel using an nvlist

Pass all SR-IOV configuration to the kernel using an nvlist.  The
main benefit that this offers is flexibility.  It allows a driver
to accept any number of parameters of any type supported by the
SR-IOV configuration infrastructure without having to make any
changes outside of the driver.

It also offers the user very fine-grained control over the
configuration of the VFs -- if they want, they can have different
configuration applied to every VF.

Differential Revision:	https://reviews.freebsd.org/D82
Reviewed by:		jhb
MFC after: 		1 month
Sponsored by:		Sandvine Inc.
This commit is contained in:
rstone 2015-03-01 00:40:57 +00:00
parent f0a0142dd0
commit c476927dd3
3 changed files with 193 additions and 30 deletions

View File

@ -217,6 +217,7 @@ METHOD int iov_detach {
METHOD int init_iov {
device_t dev;
uint16_t num_vfs;
const struct nvlist *config;
};
METHOD void uninit_iov {
@ -226,6 +227,7 @@ METHOD void uninit_iov {
METHOD int add_vf {
device_t dev;
uint16_t vfnum;
const struct nvlist *config;
};
METHOD device_t create_iov_child {

View File

@ -70,6 +70,18 @@ static struct cdevsw iov_cdevsw = {
.d_ioctl = pci_iov_ioctl
};
SYSCTL_DECL(_hw_pci);
/*
* The maximum amount of memory we will allocate for user configuration of an
* SR-IOV device. 1MB ought to be enough for anyone, but leave this
* configurable just in case.
*/
static u_long pci_iov_max_config = 1024 * 1024;
SYSCTL_ULONG(_hw_pci, OID_AUTO, iov_max_config, CTLFLAG_RWTUN,
&pci_iov_max_config, 0, "Maximum allowed size of SR-IOV configuration.");
#define IOV_READ(d, r, w) \
pci_read_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + r, w)
@ -348,6 +360,51 @@ pci_iov_add_bars(struct pcicfg_iov *iov, struct pci_devinfo *dinfo)
}
}
/*
 * Fetch the packed configuration described by arg, unpack it into an nvlist
 * and validate it against the schema stored in iov->iov_schema.
 *
 * Returns 0 and stores the unpacked nvlist in *ret on success; the caller
 * then owns that nvlist and must destroy it.  On failure an errno value is
 * returned and *ret is left untouched:
 *   EMSGSIZE - arg->len exceeds pci_iov_max_config
 *   EINVAL   - the buffer could not be unpacked into an nvlist
 *   other    - whatever copyin(), schema validation or nvlist_error() report
 */
static int
pci_iov_parse_config(struct pcicfg_iov *iov, struct pci_iov_arg *arg,
    nvlist_t **ret)
{
	nvlist_t *nvl = NULL;
	void *buf = NULL;
	int error;

	/* Bound the allocation before trusting the caller-supplied length. */
	if (arg->len > pci_iov_max_config) {
		error = EMSGSIZE;
		goto done;
	}

	buf = malloc(arg->len, M_SRIOV, M_WAITOK);
	if ((error = copyin(arg->config, buf, arg->len)) != 0)
		goto done;

	if ((nvl = nvlist_unpack(buf, arg->len)) == NULL) {
		error = EINVAL;
		goto done;
	}

	error = pci_iov_schema_validate_config(iov->iov_schema, nvl);
	if (error == 0)
		error = nvlist_error(nvl);

	if (error == 0) {
		/* Hand the nvlist to the caller; keep cleanup from freeing it. */
		*ret = nvl;
		nvl = NULL;
	}

done:
	/* Both calls are reached with NULL on the early-exit paths. */
	nvlist_destroy(nvl);
	free(buf, M_SRIOV);
	return (error);
}
/*
* Set the ARI_EN bit in the lowest-numbered PCI function with the SR-IOV
* capability. This bit is only writeable on the lowest-numbered PF but
@ -421,6 +478,16 @@ pci_iov_config_page_size(struct pci_devinfo *dinfo)
return (0);
}
/*
 * Look up the PF node in the configuration nvlist, pull out its
 * driver-specific sub-node and pass that to PCI_INIT_IOV() together with
 * the requested number of VFs.  Returns whatever PCI_INIT_IOV() returns.
 */
static int
pci_init_iov(device_t dev, uint16_t num_vfs, const nvlist_t *config)
{
	const nvlist_t *pf_node;

	pf_node = nvlist_get_nvlist(config, PF_CONFIG_NAME);

	return (PCI_INIT_IOV(dev, num_vfs,
	    nvlist_get_nvlist(pf_node, DRIVER_CONFIG_NAME)));
}
static int
pci_iov_init_rman(device_t pf, struct pcicfg_iov *iov)
{
@ -479,9 +546,11 @@ pci_iov_setup_bars(struct pci_devinfo *dinfo)
}
static void
pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const char *driver,
pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const nvlist_t *config,
uint16_t first_rid, uint16_t rid_stride)
{
char device_name[VF_MAX_NAME];
const nvlist_t *device, *driver_config, *iov_config;
device_t bus, dev, vf;
struct pcicfg_iov *iov;
struct pci_devinfo *vfinfo;
@ -498,12 +567,23 @@ pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const char *driver,
did = IOV_READ(dinfo, PCIR_SRIOV_VF_DID, 2);
for (i = 0; i < iov->iov_num_vfs; i++, next_rid += rid_stride) {
snprintf(device_name, sizeof(device_name), VF_PREFIX"%d", i);
device = nvlist_get_nvlist(config, device_name);
iov_config = nvlist_get_nvlist(device, IOV_CONFIG_NAME);
driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME);
vf = PCI_CREATE_IOV_CHILD(bus, dev, next_rid, vid, did);
if (vf == NULL)
break;
/*
* If we are creating passthrough devices then force the ppt
* driver to attach to prevent a VF driver from claiming the
* VFs.
*/
if (nvlist_get_bool(iov_config, "passthrough"))
device_set_devclass(vf, "ppt");
vfinfo = device_get_ivars(vf);
vfinfo->cfg.iov = iov;
@ -511,7 +591,7 @@ pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const char *driver,
pci_iov_add_bars(iov, vfinfo);
error = PCI_ADD_VF(dev, i);
error = PCI_ADD_VF(dev, i, driver_config);
if (error != 0) {
device_printf(dev, "Failed to add VF %d\n", i);
pci_delete_child(bus, vf);
@ -525,14 +605,14 @@ static int
pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
{
device_t bus, dev;
const char *driver;
struct pci_devinfo *dinfo;
struct pcicfg_iov *iov;
nvlist_t *config;
int i, error;
uint16_t rid_off, rid_stride;
uint16_t first_rid, last_rid;
uint16_t iov_ctl;
uint16_t total_vfs;
uint16_t num_vfs, total_vfs;
int iov_inited;
mtx_lock(&Giant);
@ -541,6 +621,7 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
dev = dinfo->cfg.dev;
bus = device_get_parent(dev);
iov_inited = 0;
config = NULL;
if ((iov->iov_flags & IOV_BUSY) || iov->iov_num_vfs != 0) {
mtx_unlock(&Giant);
@ -548,22 +629,17 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
}
iov->iov_flags |= IOV_BUSY;
total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2);
error = pci_iov_parse_config(iov, arg, &config);
if (error != 0)
goto out;
if (arg->num_vfs > total_vfs) {
num_vfs = pci_iov_config_get_num_vfs(config);
total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2);
if (num_vfs > total_vfs) {
error = EINVAL;
goto out;
}
/*
* If we are creating passthrough devices then force the ppt driver to
* attach to prevent a VF driver from claming the VFs.
*/
if (arg->passthrough)
driver = "ppt";
else
driver = NULL;
error = pci_iov_config_page_size(dinfo);
if (error != 0)
goto out;
@ -572,19 +648,18 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
if (error != 0)
goto out;
error = PCI_INIT_IOV(dev, arg->num_vfs);
error = pci_init_iov(dev, num_vfs, config);
if (error != 0)
goto out;
iov_inited = 1;
IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, arg->num_vfs, 2);
IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, num_vfs, 2);
rid_off = IOV_READ(dinfo, PCIR_SRIOV_VF_OFF, 2);
rid_stride = IOV_READ(dinfo, PCIR_SRIOV_VF_STRIDE, 2);
first_rid = pci_get_rid(dev) + rid_off;
last_rid = first_rid + (arg->num_vfs - 1) * rid_stride;
last_rid = first_rid + (num_vfs - 1) * rid_stride;
/* We don't yet support allocating extra bus numbers for VFs. */
if (pci_get_bus(dev) != PCI_RID2BUS(last_rid)) {
@ -600,7 +675,7 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
if (error != 0)
goto out;
iov->iov_num_vfs = arg->num_vfs;
iov->iov_num_vfs = num_vfs;
error = pci_iov_setup_bars(dinfo);
if (error != 0)
@ -612,7 +687,10 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
/* Per specification, we must wait 100ms before accessing VFs. */
pause("iov", roundup(hz, 10));
pci_iov_enumerate_vfs(dinfo, driver, first_rid, rid_stride);
pci_iov_enumerate_vfs(dinfo, config, first_rid, rid_stride);
nvlist_destroy(config);
iov->iov_flags &= ~IOV_BUSY;
mtx_unlock(&Giant);
return (0);
@ -635,6 +713,8 @@ pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
rman_fini(&iov->rman);
iov->iov_flags &= ~IOV_RMAN_INITED;
}
nvlist_destroy(config);
iov->iov_num_vfs = 0;
iov->iov_flags &= ~IOV_BUSY;
mtx_unlock(&Giant);

View File

@ -46,12 +46,6 @@
#define DEFAULT_SCHEMA_NAME "DEFAULT"
#define REQUIRED_SCHEMA_NAME "REQUIRED"
struct pci_iov_arg
{
int num_vfs;
int passthrough;
};
/*
* Because each PF device is expected to expose a unique set of possible
* configurations, the SR-IOV infrastructure dynamically queries the PF
@ -168,7 +162,94 @@ struct pci_iov_schema
int error;
};
#define IOV_CONFIG _IOWR('p', 10, struct pci_iov_arg)
/*
* SR-IOV configuration is passed to the kernel as a packed nvlist. See nv(3)
* for the details of the nvlist API. The expected format of the nvlist is:
*
* BASIC RULES
* 1) All keys are case-insensitive.
* 2) No keys that are not specified below may exist at any level of the
* config nvlist.
* 3) Unless otherwise specified, all keys are optional. It should go without
* saying that a key being mandatory is transitive: that is, if a key is
* specified to contain a sub-node that contains a mandatory key, then
* the outer key is implicitly mandatory. If a key is mandatory then the
* associated value is also mandatory.
* 4) Order of keys is irrelevant.
*
* TOP LEVEL OF CONFIG NVLIST
* 1) All keys specified in this section are mandatory.
* 2) There must be a top-level key with the name PF_CONFIG_NAME. The value
* associated is an nvlist that follows the "device node" format. The
* parameters in this node specify parameters that apply to the PF.
* 3) For every VF being configured (this is set via the "num_vfs" parameter
* in the PF section), there must be a top-level key whose name is VF_PREFIX
* immediately followed by the index of the VF as a decimal integer. For
* example, this would be VF-0 for the first VF. VFs are numbered starting
* from 0. The value associated with this key follows the "device node"
* format. The parameters in this node specify configuration that applies
* to the VF specified in the key. Leading zeros are not permitted in VF
* index. Configuration for the second VF must be specified in a node with
* the key VF-1. VF-01 is not a valid key.
*
* DEVICE NODES
* 1) All keys specified in this section are mandatory.
* 2) The device node must contain a key with the name DRIVER_CONFIG_NAME. The
* value associated with this key is an nvlist following the subsystem node
* format. The parameters in this key specify configuration that is specific
* to a particular device driver.
* 3) The device node must contain a key with the name IOV_CONFIG_NAME. The
* value associated with this key is an nvlist following the subsystem node
* format. The parameters in this key specify configuration that is consumed
* by the SR-IOV infrastructure.
*
* SUBSYSTEM NODES
* 1) A subsystem node specifies configuration parameters that apply to a
* particular subsystem (driver or infrastructure) of a particular device
* (PF or individual VF).
* Note: We will refer to the section of the configuration schema that
* specifies the parameters for this subsystem and device
* configuration as the device/subsystem schema.
* 2) The subsystem node must contain only keys that correspond to parameters
* that are specified in the device/subsystem schema.
* 3) Every parameter specified as required in the device/subsystem schema is
* a mandatory key in the subsystem node.
* Note: All parameters that are not required in device/subsystem schema are
* optional keys. In particular, any parameter specified to have a
* default value in the device/subsystem schema is optional. The
* kernel is responsible for applying default values.
* 4) The value of every parameter in the device node must conform to the
* restrictions of the type specified for that parameter in the device/
* subsystem schema.
*
* The following is an example of a valid configuration, when validated against
* the schema example given above.
*
* PF (NVLIST):
* driver (NVLIST):
* iov (NVLIST):
* num_vfs (NUMBER): 3 (3) (0x3)
* device (STRING): [ix0]
* VF-0 (NVLIST):
* driver (NVLIST):
* vlan (NUMBER): 1000 (1000) (0x3e8)
* iov (NVLIST):
* passthrough (BOOL): TRUE
* VF-1 (NVLIST):
* driver (NVLIST):
* iov (NVLIST):
* VF-2 (NVLIST):
* driver (NVLIST):
* mac-addr (BINARY): 6 020102030405
* iov (NVLIST):
*/
/*
 * Argument structure for the IOV_CONFIG ioctl.  It describes a buffer
 * holding the SR-IOV configuration as a packed nvlist.
 */
struct pci_iov_arg
{
/* Address of the packed nvlist; the kernel copies it in with copyin(). */
void *config;
/* Length of the packed nvlist; used as the allocation and copy size. */
size_t len;
};
#define IOV_CONFIG _IOW('p', 10, struct pci_iov_arg)
#define IOV_DELETE _IO('p', 11)
#define IOV_GET_SCHEMA _IOWR('p', 12, struct pci_iov_schema)