Add support for MSI-X interrupts in the virtio network device and make that

the default.

The current behavior of advertising a single MSI vector can be requested by
setting the environment variable "BHYVE_USE_MSI" to "true". The use of MSI
is not compliant with the virtio specification and will be eventually phased
out.

Submitted by:	Gopakumar T
Obtained from:	NetApp
This commit is contained in:
Neel Natu 2013-01-30 04:30:36 +00:00
parent 8faceb3292
commit c9b4e98754
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=246109
4 changed files with 347 additions and 29 deletions

View File

@ -166,6 +166,94 @@ pci_parse_slot(char *opt, int legacy)
}
}
static int
pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
{
if (offset < pi->pi_msix.pba_offset)
return (0);
if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
return (0);
}
return (1);
}
int
pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
uint64_t value)
{
int msix_entry_offset;
int tab_index;
char *dest;
/* support only 4 or 8 byte writes */
if (size != 4 && size != 8)
return (-1);
/*
* Return if table index is beyond what device supports
*/
tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
if (tab_index >= pi->pi_msix.table_count)
return (-1);
msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
/* support only aligned writes */
if ((msix_entry_offset % size) != 0)
return (-1);
dest = (char *)(pi->pi_msix.table + tab_index);
dest += msix_entry_offset;
if (size == 4)
*((uint32_t *)dest) = value;
else
*((uint64_t *)dest) = value;
return (0);
}
uint64_t
pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
{
char *dest;
int msix_entry_offset;
int tab_index;
uint64_t retval = ~0;
/* support only 4 or 8 byte reads */
if (size != 4 && size != 8)
return (retval);
msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
/* support only aligned reads */
if ((msix_entry_offset % size) != 0) {
return (retval);
}
tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
if (tab_index < pi->pi_msix.table_count) {
/* valid MSI-X Table access */
dest = (char *)(pi->pi_msix.table + tab_index);
dest += msix_entry_offset;
if (size == 4)
retval = *((uint32_t *)dest);
else
retval = *((uint64_t *)dest);
} else if (pci_valid_pba_offset(pi, offset)) {
/* return 0 for PBA access */
retval = 0;
}
return (retval);
}
static int
pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
@ -178,8 +266,7 @@ pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
for (i = 0; i <= PCI_BARMAX; i++) {
if (pdi->pi_bar[i].type == PCIBAR_IO &&
port >= pdi->pi_bar[i].addr &&
port + bytes <=
pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
offset = port - pdi->pi_bar[i].addr;
if (in)
*eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
@ -484,13 +571,95 @@ pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
}
static void
pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
uint32_t msix_tab_size, int nextptr)
{
CTASSERT(sizeof(struct msixcap) == 12);
assert(msix_tab_size % 4096 == 0);
bzero(msixcap, sizeof(struct msixcap));
msixcap->capid = PCIY_MSIX;
msixcap->nextptr = nextptr;
/*
* Message Control Register, all fields set to
* zero except for the Table Size.
* Note: Table size N is encoded as N-1
*/
msixcap->msgctrl = msgnum - 1;
/*
* MSI-X BAR setup:
* - MSI-X table start at offset 0
* - PBA table starts at a 4K aligned offset after the MSI-X table
*/
msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK;
msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK);
}
static void
pci_msix_table_init(struct pci_devinst *pi, int table_entries)
{
int i, table_size;
assert(table_entries > 0);
assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
table_size = table_entries * MSIX_TABLE_ENTRY_SIZE;
pi->pi_msix.table = malloc(table_size);
bzero(pi->pi_msix.table, table_size);
/* set mask bit of vector control register */
for (i = 0; i < table_entries; i++)
pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK;
}
int
pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
{
uint16_t pba_index;
uint32_t tab_size;
struct msixcap msixcap;
assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
/* Align table size to nearest 4K */
tab_size = roundup2(tab_size, 4096);
pi->pi_msix.table_bar = barnum;
pi->pi_msix.pba_bar = barnum;
pi->pi_msix.table_offset = 0;
pi->pi_msix.table_count = msgnum;
pi->pi_msix.pba_offset = tab_size;
/* calculate the MMIO size required for MSI-X PBA */
pba_index = (msgnum - 1) / (PBA_TABLE_ENTRY_SIZE * 8);
pi->pi_msix.pba_size = (pba_index + 1) * PBA_TABLE_ENTRY_SIZE;
pci_msix_table_init(pi, msgnum);
pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size, 0);
/* allocate memory for MSI-X Table and PBA */
pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
tab_size + pi->pi_msix.pba_size);
return (pci_emul_add_capability(pi, (u_char *)&msixcap,
sizeof(msixcap)));
}
void
msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val)
{
uint16_t msgctrl, rwmask;
int off, table_bar;
off = offset - capoff;
table_bar = pi->pi_msix.table_bar;
/* Message Control Register */
@ -502,6 +671,7 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
val = msgctrl;
pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
}
CFGWRITE(pi, offset, val, bytes);
@ -589,6 +759,9 @@ pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
case PCIY_MSI:
msicap_cfgwrite(pi, capoff, offset, bytes, val);
break;
case PCIY_MSIX:
msixcap_cfgwrite(pi, capoff, offset, bytes, val);
break;
default:
break;
}
@ -668,6 +841,35 @@ pci_msi_msgnum(struct pci_devinst *pi)
return (0);
}
int
pci_msix_enabled(struct pci_devinst *pi)
{
return (pi->pi_msix.enabled && !pi->pi_msi.enabled);
}
void
pci_generate_msix(struct pci_devinst *pi, int index)
{
struct msix_table_entry *mte;
if (!pci_msix_enabled(pi))
return;
if (pi->pi_msix.function_mask)
return;
if (index >= pi->pi_msix.table_count)
return;
mte = &pi->pi_msix.table[index];
if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
/* XXX Set PBA bit if interrupt is disabled */
vm_lapic_irq(pi->pi_vmctx,
(mte->addr >> 12) & 0xff, mte->msg_data & 0xff);
}
}
void
pci_generate_msi(struct pci_devinst *pi, int msg)
{

View File

@ -96,6 +96,8 @@ struct msix_table_entry {
* for the size that should be emulated.
*/
#define MSIX_TABLE_ENTRY_SIZE 16
#define MAX_MSIX_TABLE_ENTRIES 2048
#define PBA_TABLE_ENTRY_SIZE 8
struct pci_devinst {
struct pci_devemu *pi_d;
@ -120,6 +122,8 @@ struct pci_devinst {
size_t table_offset;
int table_count;
size_t pba_offset;
size_t pba_size;
int function_mask;
struct msix_table_entry *table; /* allocated at runtime */
} pi_msix;
@ -168,6 +172,10 @@ int pci_msix_enabled(struct pci_devinst *pi);
int pci_msi_msgnum(struct pci_devinst *pi);
void pci_parse_slot(char *opt, int legacy);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
uint64_t value);
uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)

View File

@ -59,17 +59,17 @@ __FBSDID("$FreeBSD$");
/*
* PCI config-space register offsets
*/
#define VTNET_R_CFG0 20
#define VTNET_R_CFG1 21
#define VTNET_R_CFG2 22
#define VTNET_R_CFG3 23
#define VTNET_R_CFG4 24
#define VTNET_R_CFG5 25
#define VTNET_R_CFG6 26
#define VTNET_R_CFG7 27
#define VTNET_R_MAX 27
#define VTNET_R_CFG0 24
#define VTNET_R_CFG1 25
#define VTNET_R_CFG2 26
#define VTNET_R_CFG3 27
#define VTNET_R_CFG4 28
#define VTNET_R_CFG5 29
#define VTNET_R_CFG6 30
#define VTNET_R_CFG7 31
#define VTNET_R_MAX 31
#define VTNET_REGSZ VTNET_R_MAX+1
#define VTNET_REGSZ VTNET_R_MAX+1
/*
* Host capabilities
@ -88,6 +88,8 @@ __FBSDID("$FreeBSD$");
#define VTNET_MAXQ 3
static int use_msix = 1;
struct vring_hqueue {
/* Internal state */
uint16_t hq_size;
@ -144,8 +146,23 @@ struct pci_vtnet_softc {
uint64_t vsc_pfn[VTNET_MAXQ];
struct vring_hqueue vsc_hq[VTNET_MAXQ];
uint16_t vsc_msix_table_idx[VTNET_MAXQ];
};
/*
* Return the size of IO BAR that maps virtio header and device specific
* region. The size would vary depending on whether MSI-X is enabled or
* not.
*/
static uint64_t
pci_vtnet_iosize(struct pci_devinst *pi)
{
if (pci_msix_enabled(pi))
return (VTNET_REGSZ);
else
return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
}
/*
* Return the number of available descriptors in the vring taking care
* of the 16-bit index wraparound.
@ -344,8 +361,13 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
hq->hq_cur_aidx = aidx;
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
sc->vsc_isr |= 1;
pci_generate_msi(sc->vsc_pi, 0);
if (use_msix) {
pci_generate_msix(sc->vsc_pi,
sc->vsc_msix_table_idx[VTNET_RXQ]);
} else {
sc->vsc_isr |= 1;
pci_generate_msi(sc->vsc_pi, 0);
}
}
}
@ -438,8 +460,13 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
* Generate an interrupt if able
*/
if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
sc->vsc_isr |= 1;
pci_generate_msi(sc->vsc_pi, 0);
if (use_msix) {
pci_generate_msix(sc->vsc_pi,
sc->vsc_msix_table_idx[VTNET_TXQ]);
} else {
sc->vsc_isr |= 1;
pci_generate_msi(sc->vsc_pi, 0);
}
}
}
@ -512,6 +539,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
unsigned char digest[16];
char nstr[80];
struct pci_vtnet_softc *sc;
const char *env_msi;
/*
* Access to guest memory is required. Fail if
@ -527,6 +555,14 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
sc->vsc_pi = pi;
pthread_mutex_init(&sc->vsc_mtx, NULL);
/*
* Use MSI if set by user
*/
if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
if (strcasecmp(env_msi, "yes") == 0)
use_msix = 0;
}
/*
* Attempt to open the tap device
@ -594,7 +630,24 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_emul_add_msicap(pi, 1);
if (use_msix) {
/* MSI-X support */
int i;
for (i = 0; i < VTNET_MAXQ; i++)
sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
/*
* BAR 1 used to map MSI-X table and PBA
*/
if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
return (1);
} else {
/* MSI support */
pci_emul_add_msicap(pi, 1);
}
pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
return (0);
@ -609,6 +662,21 @@ static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
pci_vtnet_ping_ctlq
};
static uint64_t
vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
{
/*
* Device specific offsets used by guest would change based on
* whether MSI-X capability is enabled or not
*/
if (!pci_msix_enabled(pi)) {
if (offset >= VTCFG_R_MSIX)
return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
}
return (offset);
}
static void
pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
@ -616,9 +684,17 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
struct pci_vtnet_softc *sc = pi->pi_arg;
void *ptr;
if (use_msix) {
if (baridx == pi->pi_msix.table_bar ||
baridx == pi->pi_msix.pba_bar) {
pci_emul_msix_twrite(pi, offset, size, value);
return;
}
}
assert(baridx == 0);
if (offset + size > VTNET_REGSZ) {
if (offset + size > pci_vtnet_iosize(pi)) {
DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
offset, size));
return;
@ -626,6 +702,8 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
pthread_mutex_lock(&sc->vsc_mtx);
offset = vtnet_adjust_offset(pi, offset);
switch (offset) {
case VTCFG_R_GUESTCAP:
assert(size == 4);
@ -649,6 +727,15 @@ pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
assert(size == 1);
pci_vtnet_update_status(sc, value);
break;
case VTCFG_R_CFGVEC:
assert(size == 2);
sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
break;
case VTCFG_R_QVEC:
assert(size == 2);
assert(sc->vsc_curq != VTNET_CTLQ);
sc->vsc_msix_table_idx[sc->vsc_curq] = value;
break;
case VTNET_R_CFG0:
case VTNET_R_CFG1:
case VTNET_R_CFG2:
@ -693,9 +780,16 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
void *ptr;
uint64_t value;
if (use_msix) {
if (baridx == pi->pi_msix.table_bar ||
baridx == pi->pi_msix.pba_bar) {
return (pci_emul_msix_tread(pi, offset, size));
}
}
assert(baridx == 0);
if (offset + size > VTNET_REGSZ) {
if (offset + size > pci_vtnet_iosize(pi)) {
DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
offset, size));
return (0);
@ -703,6 +797,8 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
pthread_mutex_lock(&sc->vsc_mtx);
offset = vtnet_adjust_offset(pi, offset);
switch (offset) {
case VTCFG_R_HOSTCAP:
assert(size == 4);
@ -737,21 +833,30 @@ pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
value = sc->vsc_isr;
sc->vsc_isr = 0; /* a read clears this flag */
break;
case VTCFG_R_CFGVEC:
assert(size == 2);
value = sc->vsc_msix_table_idx[VTNET_CTLQ];
break;
case VTCFG_R_QVEC:
assert(size == 2);
assert(sc->vsc_curq != VTNET_CTLQ);
value = sc->vsc_msix_table_idx[sc->vsc_curq];
break;
case VTNET_R_CFG0:
case VTNET_R_CFG1:
case VTNET_R_CFG2:
case VTNET_R_CFG3:
case VTNET_R_CFG4:
case VTNET_R_CFG5:
assert((size + offset) <= (VTNET_R_CFG5 + 1));
ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
if (size == 1) {
value = *(uint8_t *) ptr;
} else if (size == 2) {
value = *(uint16_t *) ptr;
} else {
value = *(uint32_t *) ptr;
}
assert((size + offset) <= (VTNET_R_CFG5 + 1));
ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
if (size == 1) {
value = *(uint8_t *) ptr;
} else if (size == 2) {
value = *(uint16_t *) ptr;
} else {
value = *(uint32_t *) ptr;
}
break;
case VTNET_R_CFG6:
assert(size != 4);

View File

@ -36,6 +36,7 @@
#define VRING_DESC_F_INDIRECT (1 << 2)
#define VRING_AVAIL_F_NO_INTERRUPT 1
#define VIRTIO_MSI_NO_VECTOR 0xFFFF
struct virtio_desc {
uint64_t vd_addr;
@ -78,6 +79,8 @@ struct virtio_used {
#define VTCFG_R_QNOTIFY 16
#define VTCFG_R_STATUS 18
#define VTCFG_R_ISR 19
#define VTCFG_R_CFGVEC 20
#define VTCFG_R_QVEC 22
#define VTCFG_R_CFG0 20 /* No MSI-X */
#define VTCFG_R_CFG1 24 /* With MSI-X */
#define VTCFG_R_MSIX 20