Create branch for bhyve graphics import.

This commit is contained in:
Peter Grehan 2016-05-27 06:22:24 +00:00
commit 361da73864
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/projects/bhyve_graphics/; revision=300828
57 changed files with 19959 additions and 0 deletions

55
Makefile Normal file
View File

@ -0,0 +1,55 @@
#
# $FreeBSD$
#
PROG= bhyve
PACKAGE= bhyve
DEBUG_FLAGS= -g -O0
MAN= bhyve.8
BHYVE_SYSDIR?=${SRCTOP}
SRCS= \
atkbdc.c \
acpi.c \
bhyverun.c \
block_if.c \
bootrom.c \
consport.c \
dbgport.c \
fwctl.c \
inout.c \
ioapic.c \
mem.c \
mevent.c \
mptbl.c \
pci_ahci.c \
pci_emul.c \
pci_hostbridge.c \
pci_irq.c \
pci_lpc.c \
pci_passthru.c \
pci_virtio_block.c \
pci_virtio_net.c \
pci_virtio_rnd.c \
pci_uart.c \
pm.c \
post.c \
rtc.c \
smbiostbl.c \
task_switch.c \
uart_emul.c \
virtio.c \
xmsr.c \
spinup_ap.c
.PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm
SRCS+= vmm_instruction_emul.c
LIBADD= vmmapi md pthread
WARNS?= 2
.include <bsd.prog.mk>

22
Makefile.depend Normal file
View File

@ -0,0 +1,22 @@
# $FreeBSD$
# Autogenerated - do NOT edit!
DIRDEPS = \
gnu/lib/csu \
gnu/lib/libgcc \
include \
include/xlocale \
lib/${CSU_DIR} \
lib/libc \
lib/libcompiler_rt \
lib/libmd \
lib/libthr \
lib/libutil \
lib/libvmmapi \
.include <dirdeps.mk>
.if ${DEP_RELDIR} == ${_DEP_RELDIR}
# local dependencies - needed for -jN in clean tree
.endif

1012
acpi.c Normal file

File diff suppressed because it is too large Load Diff

54
acpi.h Normal file
View File

@ -0,0 +1,54 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _ACPI_H_
#define _ACPI_H_
#define SCI_INT 9
#define SMI_CMD 0xb2
#define BHYVE_ACPI_ENABLE 0xa0
#define BHYVE_ACPI_DISABLE 0xa1
#define PM1A_EVT_ADDR 0x400
#define PM1A_CNT_ADDR 0x404
#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */
struct vmctx;
int acpi_build(struct vmctx *ctx, int ncpu);
void dsdt_line(const char *fmt, ...);
void dsdt_fixed_ioport(uint16_t iobase, uint16_t length);
void dsdt_fixed_irq(uint8_t irq);
void dsdt_fixed_mem32(uint32_t base, uint32_t length);
void dsdt_indent(int levels);
void dsdt_unindent(int levels);
void sci_init(struct vmctx *ctx);
#endif /* _ACPI_H_ */

322
ahci.h Normal file
View File

@ -0,0 +1,322 @@
/*-
* Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
* Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _AHCI_H_
#define _AHCI_H_
/* ATA register defines */
#define ATA_DATA 0 /* (RW) data */
#define ATA_FEATURE 1 /* (W) feature */
#define ATA_F_DMA 0x01 /* enable DMA */
#define ATA_F_OVL 0x02 /* enable overlap */
#define ATA_COUNT 2 /* (W) sector count */
#define ATA_SECTOR 3 /* (RW) sector # */
#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */
#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */
#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */
#define ATA_D_LBA 0x40 /* use LBA addressing */
#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */
#define ATA_COMMAND 7 /* (W) command */
#define ATA_ERROR 8 /* (R) error */
#define ATA_E_ILI 0x01 /* illegal length */
#define ATA_E_NM 0x02 /* no media */
#define ATA_E_ABORT 0x04 /* command aborted */
#define ATA_E_MCR 0x08 /* media change request */
#define ATA_E_IDNF 0x10 /* ID not found */
#define ATA_E_MC 0x20 /* media changed */
#define ATA_E_UNC 0x40 /* uncorrectable data */
#define ATA_E_ICRC 0x80 /* UDMA crc error */
#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */
#define ATA_IREASON 9 /* (R) interrupt reason */
#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */
#define ATA_I_IN 0x02 /* read (1) | write (0) */
#define ATA_I_RELEASE 0x04 /* released bus (1) */
#define ATA_I_TAGMASK 0xf8 /* tag mask */
#define ATA_STATUS 10 /* (R) status */
#define ATA_ALTSTAT 11 /* (R) alternate status */
#define ATA_S_ERROR 0x01 /* error */
#define ATA_S_INDEX 0x02 /* index */
#define ATA_S_CORR 0x04 /* data corrected */
#define ATA_S_DRQ 0x08 /* data request */
#define ATA_S_DSC 0x10 /* drive seek completed */
#define ATA_S_SERVICE 0x10 /* drive needs service */
#define ATA_S_DWF 0x20 /* drive write fault */
#define ATA_S_DMA 0x20 /* DMA ready */
#define ATA_S_READY 0x40 /* drive ready */
#define ATA_S_BUSY 0x80 /* busy */
#define ATA_CONTROL 12 /* (W) control */
#define ATA_A_IDS 0x02 /* disable interrupts */
#define ATA_A_RESET 0x04 /* RESET controller */
#define ATA_A_4BIT 0x08 /* 4 head bits */
#define ATA_A_HOB 0x80 /* High Order Byte enable */
/* SATA register defines */
#define ATA_SSTATUS 13
#define ATA_SS_DET_MASK 0x0000000f
#define ATA_SS_DET_NO_DEVICE 0x00000000
#define ATA_SS_DET_DEV_PRESENT 0x00000001
#define ATA_SS_DET_PHY_ONLINE 0x00000003
#define ATA_SS_DET_PHY_OFFLINE 0x00000004
#define ATA_SS_SPD_MASK 0x000000f0
#define ATA_SS_SPD_NO_SPEED 0x00000000
#define ATA_SS_SPD_GEN1 0x00000010
#define ATA_SS_SPD_GEN2 0x00000020
#define ATA_SS_SPD_GEN3 0x00000030
#define ATA_SS_IPM_MASK 0x00000f00
#define ATA_SS_IPM_NO_DEVICE 0x00000000
#define ATA_SS_IPM_ACTIVE 0x00000100
#define ATA_SS_IPM_PARTIAL 0x00000200
#define ATA_SS_IPM_SLUMBER 0x00000600
#define ATA_SS_IPM_DEVSLEEP 0x00000800
#define ATA_SERROR 14
#define ATA_SE_DATA_CORRECTED 0x00000001
#define ATA_SE_COMM_CORRECTED 0x00000002
#define ATA_SE_DATA_ERR 0x00000100
#define ATA_SE_COMM_ERR 0x00000200
#define ATA_SE_PROT_ERR 0x00000400
#define ATA_SE_HOST_ERR 0x00000800
#define ATA_SE_PHY_CHANGED 0x00010000
#define ATA_SE_PHY_IERROR 0x00020000
#define ATA_SE_COMM_WAKE 0x00040000
#define ATA_SE_DECODE_ERR 0x00080000
#define ATA_SE_PARITY_ERR 0x00100000
#define ATA_SE_CRC_ERR 0x00200000
#define ATA_SE_HANDSHAKE_ERR 0x00400000
#define ATA_SE_LINKSEQ_ERR 0x00800000
#define ATA_SE_TRANSPORT_ERR 0x01000000
#define ATA_SE_UNKNOWN_FIS 0x02000000
#define ATA_SE_EXCHANGED 0x04000000
#define ATA_SCONTROL 15
#define ATA_SC_DET_MASK 0x0000000f
#define ATA_SC_DET_IDLE 0x00000000
#define ATA_SC_DET_RESET 0x00000001
#define ATA_SC_DET_DISABLE 0x00000004
#define ATA_SC_SPD_MASK 0x000000f0
#define ATA_SC_SPD_NO_SPEED 0x00000000
#define ATA_SC_SPD_SPEED_GEN1 0x00000010
#define ATA_SC_SPD_SPEED_GEN2 0x00000020
#define ATA_SC_SPD_SPEED_GEN3 0x00000030
#define ATA_SC_IPM_MASK 0x00000f00
#define ATA_SC_IPM_NONE 0x00000000
#define ATA_SC_IPM_DIS_PARTIAL 0x00000100
#define ATA_SC_IPM_DIS_SLUMBER 0x00000200
#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400
#define ATA_SACTIVE 16
#define AHCI_MAX_PORTS 32
#define AHCI_MAX_SLOTS 32
#define AHCI_MAX_IRQS 16
/* SATA AHCI v1.0 register defines */
#define AHCI_CAP 0x00
#define AHCI_CAP_NPMASK 0x0000001f
#define AHCI_CAP_SXS 0x00000020
#define AHCI_CAP_EMS 0x00000040
#define AHCI_CAP_CCCS 0x00000080
#define AHCI_CAP_NCS 0x00001F00
#define AHCI_CAP_NCS_SHIFT 8
#define AHCI_CAP_PSC 0x00002000
#define AHCI_CAP_SSC 0x00004000
#define AHCI_CAP_PMD 0x00008000
#define AHCI_CAP_FBSS 0x00010000
#define AHCI_CAP_SPM 0x00020000
#define AHCI_CAP_SAM 0x00080000
#define AHCI_CAP_ISS 0x00F00000
#define AHCI_CAP_ISS_SHIFT 20
#define AHCI_CAP_SCLO 0x01000000
#define AHCI_CAP_SAL 0x02000000
#define AHCI_CAP_SALP 0x04000000
#define AHCI_CAP_SSS 0x08000000
#define AHCI_CAP_SMPS 0x10000000
#define AHCI_CAP_SSNTF 0x20000000
#define AHCI_CAP_SNCQ 0x40000000
#define AHCI_CAP_64BIT 0x80000000
#define AHCI_GHC 0x04
#define AHCI_GHC_AE 0x80000000
#define AHCI_GHC_MRSM 0x00000004
#define AHCI_GHC_IE 0x00000002
#define AHCI_GHC_HR 0x00000001
#define AHCI_IS 0x08
#define AHCI_PI 0x0c
#define AHCI_VS 0x10
#define AHCI_CCCC 0x14
#define AHCI_CCCC_TV_MASK 0xffff0000
#define AHCI_CCCC_TV_SHIFT 16
#define AHCI_CCCC_CC_MASK 0x0000ff00
#define AHCI_CCCC_CC_SHIFT 8
#define AHCI_CCCC_INT_MASK 0x000000f8
#define AHCI_CCCC_INT_SHIFT 3
#define AHCI_CCCC_EN 0x00000001
#define AHCI_CCCP 0x18
#define AHCI_EM_LOC 0x1C
#define AHCI_EM_CTL 0x20
#define AHCI_EM_MR 0x00000001
#define AHCI_EM_TM 0x00000100
#define AHCI_EM_RST 0x00000200
#define AHCI_EM_LED 0x00010000
#define AHCI_EM_SAFTE 0x00020000
#define AHCI_EM_SES2 0x00040000
#define AHCI_EM_SGPIO 0x00080000
#define AHCI_EM_SMB 0x01000000
#define AHCI_EM_XMT 0x02000000
#define AHCI_EM_ALHD 0x04000000
#define AHCI_EM_PM 0x08000000
#define AHCI_CAP2 0x24
#define AHCI_CAP2_BOH 0x00000001
#define AHCI_CAP2_NVMP 0x00000002
#define AHCI_CAP2_APST 0x00000004
#define AHCI_CAP2_SDS 0x00000008
#define AHCI_CAP2_SADM 0x00000010
#define AHCI_CAP2_DESO 0x00000020
#define AHCI_OFFSET 0x100
#define AHCI_STEP 0x80
#define AHCI_P_CLB 0x00
#define AHCI_P_CLBU 0x04
#define AHCI_P_FB 0x08
#define AHCI_P_FBU 0x0c
#define AHCI_P_IS 0x10
#define AHCI_P_IE 0x14
#define AHCI_P_IX_DHR 0x00000001
#define AHCI_P_IX_PS 0x00000002
#define AHCI_P_IX_DS 0x00000004
#define AHCI_P_IX_SDB 0x00000008
#define AHCI_P_IX_UF 0x00000010
#define AHCI_P_IX_DP 0x00000020
#define AHCI_P_IX_PC 0x00000040
#define AHCI_P_IX_MP 0x00000080
#define AHCI_P_IX_PRC 0x00400000
#define AHCI_P_IX_IPM 0x00800000
#define AHCI_P_IX_OF 0x01000000
#define AHCI_P_IX_INF 0x04000000
#define AHCI_P_IX_IF 0x08000000
#define AHCI_P_IX_HBD 0x10000000
#define AHCI_P_IX_HBF 0x20000000
#define AHCI_P_IX_TFE 0x40000000
#define AHCI_P_IX_CPD 0x80000000
#define AHCI_P_CMD 0x18
#define AHCI_P_CMD_ST 0x00000001
#define AHCI_P_CMD_SUD 0x00000002
#define AHCI_P_CMD_POD 0x00000004
#define AHCI_P_CMD_CLO 0x00000008
#define AHCI_P_CMD_FRE 0x00000010
#define AHCI_P_CMD_CCS_MASK 0x00001f00
#define AHCI_P_CMD_CCS_SHIFT 8
#define AHCI_P_CMD_ISS 0x00002000
#define AHCI_P_CMD_FR 0x00004000
#define AHCI_P_CMD_CR 0x00008000
#define AHCI_P_CMD_CPS 0x00010000
#define AHCI_P_CMD_PMA 0x00020000
#define AHCI_P_CMD_HPCP 0x00040000
#define AHCI_P_CMD_MPSP 0x00080000
#define AHCI_P_CMD_CPD 0x00100000
#define AHCI_P_CMD_ESP 0x00200000
#define AHCI_P_CMD_FBSCP 0x00400000
#define AHCI_P_CMD_APSTE 0x00800000
#define AHCI_P_CMD_ATAPI 0x01000000
#define AHCI_P_CMD_DLAE 0x02000000
#define AHCI_P_CMD_ALPE 0x04000000
#define AHCI_P_CMD_ASP 0x08000000
#define AHCI_P_CMD_ICC_MASK 0xf0000000
#define AHCI_P_CMD_NOOP 0x00000000
#define AHCI_P_CMD_ACTIVE 0x10000000
#define AHCI_P_CMD_PARTIAL 0x20000000
#define AHCI_P_CMD_SLUMBER 0x60000000
#define AHCI_P_CMD_DEVSLEEP 0x80000000
#define AHCI_P_TFD 0x20
#define AHCI_P_SIG 0x24
#define AHCI_P_SSTS 0x28
#define AHCI_P_SCTL 0x2c
#define AHCI_P_SERR 0x30
#define AHCI_P_SACT 0x34
#define AHCI_P_CI 0x38
#define AHCI_P_SNTF 0x3C
#define AHCI_P_FBS 0x40
#define AHCI_P_FBS_EN 0x00000001
#define AHCI_P_FBS_DEC 0x00000002
#define AHCI_P_FBS_SDE 0x00000004
#define AHCI_P_FBS_DEV 0x00000f00
#define AHCI_P_FBS_DEV_SHIFT 8
#define AHCI_P_FBS_ADO 0x0000f000
#define AHCI_P_FBS_ADO_SHIFT 12
#define AHCI_P_FBS_DWE 0x000f0000
#define AHCI_P_FBS_DWE_SHIFT 16
#define AHCI_P_DEVSLP 0x44
#define AHCI_P_DEVSLP_ADSE 0x00000001
#define AHCI_P_DEVSLP_DSP 0x00000002
#define AHCI_P_DEVSLP_DETO 0x000003fc
#define AHCI_P_DEVSLP_DETO_SHIFT 2
#define AHCI_P_DEVSLP_MDAT 0x00007c00
#define AHCI_P_DEVSLP_MDAT_SHIFT 10
#define AHCI_P_DEVSLP_DITO 0x01ff8000
#define AHCI_P_DEVSLP_DITO_SHIFT 15
#define AHCI_P_DEVSLP_DM 0x0e000000
#define AHCI_P_DEVSLP_DM_SHIFT 25
/* Just to be sure, if building as module. */
#if MAXPHYS < 512 * 1024
#undef MAXPHYS
#define MAXPHYS 512 * 1024
#endif
/* Pessimistic prognosis on number of required S/G entries */
#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8))
/* Command list. 32 commands. First, 1Kbyte aligned. */
#define AHCI_CL_OFFSET 0
#define AHCI_CL_SIZE 32
/* Command tables. Up to 32 commands, Each, 128byte aligned. */
#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS)
#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16)
/* Total main work area. */
#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots)
#endif /* _AHCI_H_ */

90
atkbdc.c Normal file
View File

@ -0,0 +1,90 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include "inout.h"
#include "pci_lpc.h"
#define KBD_DATA_PORT 0x60
#define KBD_STS_CTL_PORT 0x64
#define KBD_SYS_FLAG 0x4
#define KBDC_RESET 0xfe
static int
atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 1)
return (-1);
*eax = 0;
return (0);
}
static int
atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg)
{
int error, retval;
if (bytes != 1)
return (-1);
retval = 0;
if (in) {
*eax = KBD_SYS_FLAG; /* system passed POST */
} else {
switch (*eax) {
case KBDC_RESET: /* Pulse "reset" line. */
error = vm_suspend(ctx, VM_SUSPEND_RESET);
assert(error == 0 || errno == EALREADY);
break;
}
}
return (retval);
}
INOUT_PORT(atkdbc, KBD_DATA_PORT, IOPORT_F_INOUT, atkbdc_data_handler);
SYSRES_IO(KBD_DATA_PORT, 1);
INOUT_PORT(atkbdc, KBD_STS_CTL_PORT, IOPORT_F_INOUT,
atkbdc_sts_ctl_handler);
SYSRES_IO(KBD_STS_CTL_PORT, 1);

373
bhyve.8 Normal file
View File

@ -0,0 +1,373 @@
.\" Copyright (c) 2013 Peter Grehan
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
.\"
.Dd April 18, 2016
.Dt BHYVE 8
.Os
.Sh NAME
.Nm bhyve
.Nd "run a guest operating system inside a virtual machine"
.Sh SYNOPSIS
.Nm
.Op Fl abehuwxACHPSWY
.Op Fl c Ar numcpus
.Op Fl g Ar gdbport
.Op Fl l Ar lpcdev Ns Op , Ns Ar conf
.Op Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
.Op Fl p Ar vcpu:hostcpu
.Op Fl s Ar slot,emulation Ns Op , Ns Ar conf
.Op Fl U Ar uuid
.Ar vmname
.Sh DESCRIPTION
.Nm
is a hypervisor that runs guest operating systems inside a
virtual machine.
.Pp
Parameters such as the number of virtual CPUs, amount of guest memory, and
I/O connectivity can be specified with command-line parameters.
.Pp
The guest operating system must be loaded with
.Xr bhyveload 8
or a similar boot loader before running
.Nm .
.Pp
.Nm
runs until the guest operating system reboots or an unhandled hypervisor
exit is detected.
.Sh OPTIONS
.Bl -tag -width 10n
.It Fl a
The guest's local APIC is configured in xAPIC mode.
The xAPIC mode is the default setting so this option is redundant.
It will be deprecated in a future version.
.It Fl A
Generate ACPI tables.
Required for
.Fx Ns /amd64
guests.
.It Fl b
Enable a low-level console device supported by
.Fx
kernels compiled with
.Cd "device bvmconsole" .
This option will be deprecated in a future version.
.It Fl c Ar numcpus
Number of guest virtual CPUs.
The default is 1 and the maximum is 16.
.It Fl C
Include guest memory in core file.
.It Fl e
Force
.Nm
to exit when a guest issues an access to an I/O port that is not emulated.
This is intended for debug purposes.
.It Fl g Ar gdbport
For
.Fx
kernels compiled with
.Cd "device bvmdebug" ,
allow a remote kernel kgdb to be relayed to the guest kernel gdb stub
via a local IPv4 address and this port.
This option will be deprecated in a future version.
.It Fl h
Print help message and exit.
.It Fl H
Yield the virtual CPU thread when a HLT instruction is detected.
If this option is not specified, virtual CPUs will use 100% of a host CPU.
.It Fl l Ar lpcdev Ns Op , Ns Ar conf
Allow devices behind the LPC PCI-ISA bridge to be configured.
The only supported devices are the TTY-class devices
.Ar com1
and
.Ar com2
and the boot ROM device
.Ar bootrom .
.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
Guest physical memory size in bytes.
This must be the same size that was given to
.Xr bhyveload 8 .
.Pp
The size argument may be suffixed with one of K, M, G or T (either upper
or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes,
or terabytes.
If no suffix is given, the value is assumed to be in megabytes.
.It Fl p Ar vcpu:hostcpu
Pin guest's virtual CPU
.Em vcpu
to
.Em hostcpu .
.It Fl P
Force the guest virtual CPU to exit when a PAUSE instruction is detected.
.It Fl s Ar slot,emulation Ns Op , Ns Ar conf
Configure a virtual PCI slot and function.
.Pp
.Nm
provides PCI bus emulation and virtual devices that can be attached to
slots on the bus.
There are 32 available slots, with the option of providing up to 8 functions
per slot.
.Bl -tag -width 10n
.It Ar slot
.Ar pcislot[:function]
.Ar bus:pcislot:function
.Pp
The
.Ar pcislot
value is 0 to 31.
The optional
.Ar function
value is 0 to 7.
The optional
.Ar bus
value is 0 to 255.
If not specified, the
.Ar function
value defaults to 0.
If not specified, the
.Ar bus
value defaults to 0.
.It Ar emulation
.Bl -tag -width 10n
.It Li hostbridge | Li amd_hostbridge
.Pp
Provide a simple host bridge.
This is usually configured at slot 0, and is required by most guest
operating systems.
The
.Li amd_hostbridge
emulation is identical but uses a PCI vendor ID of
.Li AMD .
.It Li passthru
PCI pass-through device.
.It Li virtio-net
Virtio network interface.
.It Li virtio-blk
Virtio block storage interface.
.It Li virtio-rnd
Virtio RNG interface.
.It Li ahci-cd
AHCI controller attached to an ATAPI CD/DVD.
.It Li ahci-hd
AHCI controller attached to a SATA hard-drive.
.It Li uart
PCI 16550 serial device.
.It Li lpc
LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports and a boot ROM.
The LPC bridge emulation can only be configured on bus 0.
.El
.It Op Ar conf
This optional parameter describes the backend for device emulations.
If
.Ar conf
is not specified, the device emulation has no backend and can be
considered unconnected.
.Pp
Network devices:
.Bl -tag -width 10n
.It Ar tapN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx
.It Ar vmnetN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx
.Pp
If
.Ar mac
is not specified, the MAC address is derived from a fixed OUI and the
remaining bytes from an MD5 hash of the slot and function numbers and
the device name.
.Pp
The MAC address is an ASCII string in
.Xr ethers 5
format.
.El
.Pp
Block storage devices:
.Bl -tag -width 10n
.It Pa /filename Ns Oo , Ns Ar block-device-options Oc
.It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc
.El
.Pp
The
.Ar block-device-options
are:
.Bl -tag -width 8n
.It Li nocache
Open the file with
.Dv O_DIRECT .
.It Li direct
Open the file using
.Dv O_SYNC .
.It Li ro
Force the file to be opened read-only.
.It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc
Specify the logical and physical sector sizes of the emulated disk.
The physical sector size is optional and is equal to the logical sector size
if not explicitly specified.
.El
.Pp
TTY devices:
.Bl -tag -width 10n
.It Li stdio
Connect the serial port to the standard input and output of
the
.Nm
process.
.It Pa /dev/xxx
Use the host TTY device for serial port I/O.
.El
.Pp
Boot ROM device:
.Bl -tag -width 10n
.It Pa romfile
Map
.Ar romfile
in the guest address space reserved for boot firmware.
.El
.Pp
Pass-through devices:
.Bl -tag -width 10n
.It Ns Ar slot Ns / Ns Ar bus Ns / Ns Ar function
Connect to a PCI device on the host at the selector described by
.Ar slot ,
.Ar bus ,
and
.Ar function
numbers.
.El
.Pp
Guest memory must be wired using the
.Fl S
option when a pass-through device is configured.
.Pp
The host device must have been reserved at boot-time using the
.Va pptdev
loader variable as described in
.Xr vmm 4 .
.El
.It Fl S
Wire guest memory.
.It Fl u
RTC keeps UTC time.
.It Fl U Ar uuid
Set the universally unique identifier
.Pq UUID
in the guest's System Management BIOS System Information structure.
By default a UUID is generated from the host's hostname and
.Ar vmname .
.It Fl w
Ignore accesses to unimplemented Model Specific Registers (MSRs).
This is intended for debug purposes.
.It Fl W
Force virtio PCI device emulations to use MSI interrupts instead of MSI-X
interrupts.
.It Fl x
The guest's local APIC is configured in x2APIC mode.
.It Fl Y
Disable MPtable generation.
.It Ar vmname
Alphanumeric name of the guest.
This should be the same as that created by
.Xr bhyveload 8 .
.El
.Sh SIGNAL HANDLING
.Nm
deals with the following signals:
.Pp
.Bl -tag -width indent -compact
.It SIGTERM
Trigger ACPI poweroff for a VM
.El
.Sh EXIT STATUS
Exit status indicates how the VM was terminated:
.Pp
.Bl -tag -width indent -compact
.It 0
rebooted
.It 1
powered off
.It 2
halted
.It 3
triple fault
.El
.Sh EXAMPLES
The guest operating system must have been loaded with
.Xr bhyveload 8
or a similar boot loader before
.Xr bhyve 4
can be run.
.Pp
To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio
block device backed by the
.Pa /my/image
filesystem image, and a serial port for the console:
.Bd -literal -offset indent
bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\
-l com1,stdio -A -H -P -m 1G vm1
.Ed
.Pp
Run a 24GB single-CPU virtual machine with three network ports, one of which
has a MAC address specified:
.Bd -literal -offset indent
bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\
-s 2:1,virtio-net,tap1 \\
-s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\
-s 3,virtio-blk,/my/image -l com1,stdio \\
-A -H -P -m 24G bigvm
.Ed
.Pp
Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI
CD-ROM, a single virtio network port, an AMD hostbridge, and the console
port connected to an
.Xr nmdm 4
null-modem device.
.Bd -literal -offset indent
bhyve -c 4 \\
-s 0,amd_hostbridge -s 1,lpc \\
-s 1:0,ahci-hd,/images/disk.1 \\
-s 1:1,ahci-hd,/images/disk.2 \\
-s 1:2,ahci-hd,/images/disk.3 \\
-s 1:3,ahci-hd,/images/disk.4 \\
-s 1:4,ahci-hd,/images/disk.5 \\
-s 1:5,ahci-hd,/images/disk.6 \\
-s 1:6,ahci-hd,/images/disk.7 \\
-s 1:7,ahci-hd,/images/disk.8 \\
-s 2,ahci-cd,/images/install.iso \\
-s 3,virtio-net,tap0 \\
-l com1,/dev/nmdm0A \\
-A -H -P -m 8G
.Ed
.Sh SEE ALSO
.Xr bhyve 4 ,
.Xr nmdm 4 ,
.Xr vmm 4 ,
.Xr ethers 5 ,
.Xr bhyvectl 8 ,
.Xr bhyveload 8
.Sh HISTORY
.Nm
first appeared in
.Fx 10.0 .
.Sh AUTHORS
.An Neel Natu Aq Mt neel@freebsd.org
.An Peter Grehan Aq Mt grehan@freebsd.org

971
bhyverun.c Normal file
View File

@ -0,0 +1,971 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <machine/atomic.h>
#include <machine/segments.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#include <stdbool.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "bhyverun.h"
#include "acpi.h"
#include "inout.h"
#include "dbgport.h"
#include "fwctl.h"
#include "ioapic.h"
#include "mem.h"
#include "mevent.h"
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
#include "smbiostbl.h"
#include "xmsr.h"
#include "spinup_ap.h"
#include "rtc.h"
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
#define MB (1024UL * 1024)
#define GB (1024UL * MB)
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
char *vmname;
int guest_ncpus;
char *guest_uuid_str;
static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
static int virtio_msix = 1;
static int x2apic_mode = 0; /* default is xAPIC */
static int strictio;
static int strictmsr = 1;
static int acpi;
static char *progname;
static const int BSP = 0;
static cpuset_t cpumask;
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
static struct vm_exit vmexit[VM_MAXCPU];
struct bhyvestats {
uint64_t vmexit_bogus;
uint64_t vmexit_reqidle;
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
} stats;
struct mt_vmm_info {
pthread_t mt_thr;
struct vmctx *mt_ctx;
int mt_vcpu;
} mt_vmm_info[VM_MAXCPU];
static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
static void
usage(int code)
{
fprintf(stderr,
"Usage: %s [-abehuwxACHPSWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n"
" %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
" -a: local apic is in xAPIC mode (deprecated)\n"
" -A: create ACPI tables\n"
" -c: # cpus (default 1)\n"
" -C: include guest memory in core file\n"
" -e: exit on unhandled I/O access\n"
" -g: gdb port\n"
" -h: help\n"
" -H: vmexit from the guest on hlt\n"
" -l: LPC device configuration\n"
" -m: memory size in MB\n"
" -p: pin 'vcpu' to 'hostcpu'\n"
" -P: vmexit from the guest on pause\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
" -S: guest memory cannot be swapped\n"
" -u: RTC keeps UTC time\n"
" -U: uuid\n"
" -w: ignore unimplemented MSRs\n"
" -W: force virtio to use single-vector MSI\n"
" -x: local apic is in x2APIC mode\n"
" -Y: disable MPtable generation\n",
progname, (int)strlen(progname), "");
exit(code);
}
static int
pincpu_parse(const char *opt)
{
int vcpu, pcpu;
if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
fprintf(stderr, "invalid format: %s\n", opt);
return (-1);
}
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
vcpu, VM_MAXCPU - 1);
return (-1);
}
if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
fprintf(stderr, "hostcpu '%d' outside valid range from "
"0 to %d\n", pcpu, CPU_SETSIZE - 1);
return (-1);
}
if (vcpumap[vcpu] == NULL) {
if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
perror("malloc");
return (-1);
}
CPU_ZERO(vcpumap[vcpu]);
}
CPU_SET(pcpu, vcpumap[vcpu]);
return (0);
}
void
vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
int errcode)
{
struct vmctx *ctx;
int error, restart_instruction;
ctx = arg;
restart_instruction = 1;
error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
restart_instruction);
assert(error == 0);
}
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
return (vm_map_gpa(ctx, gaddr, len));
}
int
fbsdrun_vmexit_on_pause(void)
{
return (guest_vmexit_on_pause);
}
int
fbsdrun_vmexit_on_hlt(void)
{
return (guest_vmexit_on_hlt);
}
int
fbsdrun_virtio_msix(void)
{
return (virtio_msix);
}
static void *
fbsdrun_start_thread(void *param)
{
char tname[MAXCOMLEN + 1];
struct mt_vmm_info *mtp;
int vcpu;
mtp = param;
vcpu = mtp->mt_vcpu;
snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
pthread_set_name_np(mtp->mt_thr, tname);
vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
/* not reached */
exit(1);
return (NULL);
}
void
fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
{
int error;
assert(fromcpu == BSP);
/*
* The 'newcpu' must be activated in the context of 'fromcpu'. If
* vm_activate_cpu() is delayed until newcpu's pthread starts running
* then vmm.ko is out-of-sync with bhyve and this can create a race
* with vm_suspend().
*/
error = vm_activate_cpu(ctx, newcpu);
if (error != 0)
err(EX_OSERR, "could not activate CPU %d", newcpu);
CPU_SET_ATOMIC(newcpu, &cpumask);
/*
* Set up the vmexit struct to allow execution to start
* at the given RIP
*/
vmexit[newcpu].rip = rip;
vmexit[newcpu].inst_length = 0;
mt_vmm_info[newcpu].mt_ctx = ctx;
mt_vmm_info[newcpu].mt_vcpu = newcpu;
error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[newcpu]);
assert(error == 0);
}
static int
fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
{
if (!CPU_ISSET(vcpu, &cpumask)) {
fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
exit(1);
}
CPU_CLR_ATOMIC(vcpu, &cpumask);
return (CPU_EMPTY(&cpumask));
}
static int
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
uint32_t eax)
{
#if BHYVE_DEBUG
/*
* put guest-driven debug here
*/
#endif
return (VMEXIT_CONTINUE);
}
static int
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
int bytes, port, in, out;
int vcpu;
vcpu = *pvcpu;
port = vme->u.inout.port;
bytes = vme->u.inout.bytes;
in = vme->u.inout.in;
out = !in;
/* Extra-special case of host notifications */
if (out && port == GUEST_NIO_PORT) {
error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
return (error);
}
error = emulate_inout(ctx, vcpu, vme, strictio);
if (error) {
fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
in ? "in" : "out",
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
port, vmexit->rip);
return (VMEXIT_ABORT);
} else {
return (VMEXIT_CONTINUE);
}
}
static int
vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
uint64_t val;
uint32_t eax, edx;
int error;
val = 0;
error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
if (error != 0) {
fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
vme->u.msr.code, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_CONTINUE);
}
}
eax = val;
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
assert(error == 0);
edx = val >> 32;
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
assert(error == 0);
return (VMEXIT_CONTINUE);
}
static int
vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
if (error != 0) {
fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_CONTINUE);
}
}
return (VMEXIT_CONTINUE);
}
static int
vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int newcpu;
int retval = VMEXIT_CONTINUE;
newcpu = spinup_ap(ctx, *pvcpu,
vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
return (retval);
}
#define DEBUG_EPT_MISCONFIG
#ifdef DEBUG_EPT_MISCONFIG
#define EXIT_REASON_EPT_MISCONFIG 49
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
#define VMCS_IDENT(x) ((x) | 0x80000000)
static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
static int ept_misconfig_ptenum;
#endif
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tVMX\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
fprintf(stderr, "\tqualification\t0x%016lx\n",
vmexit->u.vmx.exit_qualification);
fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
#ifdef DEBUG_EPT_MISCONFIG
if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
vm_get_register(ctx, *pvcpu,
VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
&ept_misconfig_gpa);
vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
&ept_misconfig_ptenum);
fprintf(stderr, "\tEPT misconfiguration:\n");
fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
ept_misconfig_ptenum, ept_misconfig_pte[0],
ept_misconfig_pte[1], ept_misconfig_pte[2],
ept_misconfig_pte[3]);
}
#endif /* DEBUG_EPT_MISCONFIG */
return (VMEXIT_ABORT);
}
static int
vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tSVM\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
return (VMEXIT_ABORT);
}
static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_bogus++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_reqidle++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_hlt++;
/*
* Just continue execution with the next instruction. We use
* the HLT VM exit as a way to be friendly with the host
* scheduler.
*/
return (VMEXIT_CONTINUE);
}
static int
vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_pause++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_mtrap++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
int err, i;
struct vie *vie;
stats.vmexit_inst_emul++;
vie = &vmexit->u.inst_emul.vie;
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
vie, &vmexit->u.inst_emul.paging);
if (err) {
if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
vmexit->u.inst_emul.gpa);
}
fprintf(stderr, "Failed to emulate instruction [");
for (i = 0; i < vie->num_valid; i++) {
fprintf(stderr, "0x%02x%s", vie->inst[i],
i != (vie->num_valid - 1) ? " " : "");
}
fprintf(stderr, "] at 0x%lx\n", vmexit->rip);
return (VMEXIT_ABORT);
}
return (VMEXIT_CONTINUE);
}
static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
static int
vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
enum vm_suspend_how how;
how = vmexit->u.suspended.how;
fbsdrun_deletecpu(ctx, *pvcpu);
if (*pvcpu != BSP) {
pthread_mutex_lock(&resetcpu_mtx);
pthread_cond_signal(&resetcpu_cond);
pthread_mutex_unlock(&resetcpu_mtx);
pthread_exit(NULL);
}
pthread_mutex_lock(&resetcpu_mtx);
while (!CPU_EMPTY(&cpumask)) {
pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
}
pthread_mutex_unlock(&resetcpu_mtx);
switch (how) {
case VM_SUSPEND_RESET:
exit(0);
case VM_SUSPEND_POWEROFF:
exit(1);
case VM_SUSPEND_HALT:
exit(2);
case VM_SUSPEND_TRIPLEFAULT:
exit(3);
default:
fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
exit(100);
}
return (0); /* NOTREACHED */
}
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_INOUT] = vmexit_inout,
[VM_EXITCODE_INOUT_STR] = vmexit_inout,
[VM_EXITCODE_VMX] = vmexit_vmx,
[VM_EXITCODE_SVM] = vmexit_svm,
[VM_EXITCODE_BOGUS] = vmexit_bogus,
[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
};
static void
vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
{
int error, rc;
enum vm_exitcode exitcode;
cpuset_t active_cpus;
if (vcpumap[vcpu] != NULL) {
error = pthread_setaffinity_np(pthread_self(),
sizeof(cpuset_t), vcpumap[vcpu]);
assert(error == 0);
}
error = vm_active_cpus(ctx, &active_cpus);
assert(CPU_ISSET(vcpu, &active_cpus));
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
assert(error == 0);
while (1) {
error = vm_run(ctx, vcpu, &vmexit[vcpu]);
if (error != 0)
break;
exitcode = vmexit[vcpu].exitcode;
if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
exitcode);
exit(1);
}
rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
switch (rc) {
case VMEXIT_CONTINUE:
break;
case VMEXIT_ABORT:
abort();
default:
exit(1);
}
}
fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
}
static int
num_vcpus_allowed(struct vmctx *ctx)
{
int tmp, error;
error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
/*
* The guest is allowed to spinup more than one processor only if the
* UNRESTRICTED_GUEST capability is available.
*/
if (error == 0)
return (VM_MAXCPU);
else
return (1);
}
void
fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
{
int err, tmp;
if (fbsdrun_vmexit_on_hlt()) {
err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
if (err < 0) {
fprintf(stderr, "VM exit on HLT not supported\n");
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
if (cpu == BSP)
handler[VM_EXITCODE_HLT] = vmexit_hlt;
}
if (fbsdrun_vmexit_on_pause()) {
/*
* pause exit support required for this mode
*/
err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
if (err < 0) {
fprintf(stderr,
"SMP mux requested, no pause support\n");
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
if (cpu == BSP)
handler[VM_EXITCODE_PAUSE] = vmexit_pause;
}
if (x2apic_mode)
err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
else
err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
if (err) {
fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
}
static struct vmctx *
do_open(const char *vmname)
{
struct vmctx *ctx;
int error;
bool reinit, romboot;
reinit = romboot = false;
if (lpc_bootrom())
romboot = true;
error = vm_create(vmname);
if (error) {
if (errno == EEXIST) {
if (romboot) {
reinit = true;
} else {
/*
* The virtual machine has been setup by the
* userspace bootloader.
*/
}
} else {
perror("vm_create");
exit(1);
}
} else {
if (!romboot) {
/*
* If the virtual machine was just created then a
* bootrom must be configured to boot it.
*/
fprintf(stderr, "virtual machine cannot be booted\n");
exit(1);
}
}
ctx = vm_open(vmname);
if (ctx == NULL) {
perror("vm_open");
exit(1);
}
if (reinit) {
error = vm_reinit(ctx);
if (error) {
perror("vm_reinit");
exit(1);
}
}
return (ctx);
}
int
main(int argc, char *argv[])
{
int c, error, gdb_port, err, bvmcons;
int max_vcpus, mptgen, memflags;
int rtc_localtime;
struct vmctx *ctx;
uint64_t rip;
size_t memsize;
char *optstr;
bvmcons = 0;
progname = basename(argv[0]);
gdb_port = 0;
guest_ncpus = 1;
memsize = 256 * MB;
mptgen = 1;
rtc_localtime = 1;
memflags = 0;
optstr = "abehuwxACHIPSWYp:g:c:s:m:l:U:";
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
case 'a':
x2apic_mode = 0;
break;
case 'A':
acpi = 1;
break;
case 'b':
bvmcons = 1;
break;
case 'p':
if (pincpu_parse(optarg) != 0) {
errx(EX_USAGE, "invalid vcpu pinning "
"configuration '%s'", optarg);
}
break;
case 'c':
guest_ncpus = atoi(optarg);
break;
case 'C':
memflags |= VM_MEM_F_INCORE;
break;
case 'g':
gdb_port = atoi(optarg);
break;
case 'l':
if (lpc_device_parse(optarg) != 0) {
errx(EX_USAGE, "invalid lpc device "
"configuration '%s'", optarg);
}
break;
case 's':
if (pci_parse_slot(optarg) != 0)
exit(1);
else
break;
case 'S':
memflags |= VM_MEM_F_WIRED;
break;
case 'm':
error = vm_parse_memsize(optarg, &memsize);
if (error)
errx(EX_USAGE, "invalid memsize '%s'", optarg);
break;
case 'H':
guest_vmexit_on_hlt = 1;
break;
case 'I':
/*
* The "-I" option was used to add an ioapic to the
* virtual machine.
*
* An ioapic is now provided unconditionally for each
* virtual machine and this option is now deprecated.
*/
break;
case 'P':
guest_vmexit_on_pause = 1;
break;
case 'e':
strictio = 1;
break;
case 'u':
rtc_localtime = 0;
break;
case 'U':
guest_uuid_str = optarg;
break;
case 'w':
strictmsr = 0;
break;
case 'W':
virtio_msix = 0;
break;
case 'x':
x2apic_mode = 1;
break;
case 'Y':
mptgen = 0;
break;
case 'h':
usage(0);
default:
usage(1);
}
}
argc -= optind;
argv += optind;
if (argc != 1)
usage(1);
vmname = argv[0];
ctx = do_open(vmname);
if (guest_ncpus < 1) {
fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
exit(1);
}
max_vcpus = num_vcpus_allowed(ctx);
if (guest_ncpus > max_vcpus) {
fprintf(stderr, "%d vCPUs requested but only %d available\n",
guest_ncpus, max_vcpus);
exit(1);
}
fbsdrun_set_capabilities(ctx, BSP);
vm_set_memflags(ctx, memflags);
err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
if (err) {
fprintf(stderr, "Unable to setup memory (%d)\n", errno);
exit(1);
}
error = init_msr();
if (error) {
fprintf(stderr, "init_msr error %d", error);
exit(1);
}
init_mem();
init_inout();
pci_irq_init(ctx);
ioapic_init(ctx);
rtc_init(ctx, rtc_localtime);
sci_init(ctx);
/*
* Exit if a device emulation finds an error in it's initilization
*/
if (init_pci(ctx) != 0)
exit(1);
if (gdb_port != 0)
init_dbgport(gdb_port);
if (bvmcons)
init_bvmcons();
if (lpc_bootrom()) {
if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
fprintf(stderr, "ROM boot failed: unrestricted guest "
"capability not available\n");
exit(1);
}
error = vcpu_reset(ctx, BSP);
assert(error == 0);
}
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
/*
* build the guest tables, MP etc.
*/
if (mptgen) {
error = mptable_build(ctx, guest_ncpus);
if (error)
exit(1);
}
error = smbios_build(ctx);
assert(error == 0);
if (acpi) {
error = acpi_build(ctx, guest_ncpus);
assert(error == 0);
}
if (lpc_bootrom())
fwctl_init();
/*
* Change the proc title to include the VM name.
*/
setproctitle("%s", vmname);
/*
* Add CPU 0
*/
fbsdrun_addcpu(ctx, BSP, BSP, rip);
/*
* Head off to the main event dispatch loop
*/
mevent_dispatch();
exit(1);
}

55
bhyverun.h Normal file
View File

@ -0,0 +1,55 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _FBSDRUN_H_
#define _FBSDRUN_H_
#ifndef CTASSERT /* Allow lint to override */
#define CTASSERT(x) _CTASSERT(x, __LINE__)
#define _CTASSERT(x, y) __CTASSERT(x, y)
#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
#endif
#define VMEXIT_CONTINUE (0)
#define VMEXIT_ABORT (-1)
struct vmctx;
extern int guest_ncpus;
extern char *guest_uuid_str;
extern char *vmname;
void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len);
void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu);
void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip);
int fbsdrun_muxed(void);
int fbsdrun_vmexit_on_hlt(void);
int fbsdrun_vmexit_on_pause(void);
int fbsdrun_disable_x2apic(void);
int fbsdrun_virtio_msix(void);
#endif

820
block_if.c Normal file
View File

@ -0,0 +1,820 @@
/*-
* Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <unistd.h>
#include <machine/atomic.h>
#include "bhyverun.h"
#include "mevent.h"
#include "block_if.h"
#define BLOCKIF_SIG 0xb109b109
#define BLOCKIF_NUMTHR 8
#define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR)
enum blockop {
BOP_READ,
BOP_WRITE,
BOP_FLUSH,
BOP_DELETE
};
enum blockstat {
BST_FREE,
BST_BLOCK,
BST_PEND,
BST_BUSY,
BST_DONE
};
struct blockif_elem {
TAILQ_ENTRY(blockif_elem) be_link;
struct blockif_req *be_req;
enum blockop be_op;
enum blockstat be_status;
pthread_t be_tid;
off_t be_block;
};
struct blockif_ctxt {
int bc_magic;
int bc_fd;
int bc_ischr;
int bc_isgeom;
int bc_candelete;
int bc_rdonly;
off_t bc_size;
int bc_sectsz;
int bc_psectsz;
int bc_psectoff;
int bc_closing;
pthread_t bc_btid[BLOCKIF_NUMTHR];
pthread_mutex_t bc_mtx;
pthread_cond_t bc_cond;
/* Request elements and free/pending/busy queues */
TAILQ_HEAD(, blockif_elem) bc_freeq;
TAILQ_HEAD(, blockif_elem) bc_pendq;
TAILQ_HEAD(, blockif_elem) bc_busyq;
struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
};
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
struct blockif_sig_elem {
pthread_mutex_t bse_mtx;
pthread_cond_t bse_cond;
int bse_pending;
struct blockif_sig_elem *bse_next;
};
static struct blockif_sig_elem *blockif_bse_head;
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
enum blockop op)
{
struct blockif_elem *be, *tbe;
off_t off;
int i;
be = TAILQ_FIRST(&bc->bc_freeq);
assert(be != NULL);
assert(be->be_status == BST_FREE);
TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
be->be_req = breq;
be->be_op = op;
switch (op) {
case BOP_READ:
case BOP_WRITE:
case BOP_DELETE:
off = breq->br_offset;
for (i = 0; i < breq->br_iovcnt; i++)
off += breq->br_iov[i].iov_len;
break;
default:
off = OFF_MAX;
}
be->be_block = off;
TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
if (tbe->be_block == breq->br_offset)
break;
}
if (tbe == NULL) {
TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
if (tbe->be_block == breq->br_offset)
break;
}
}
if (tbe == NULL)
be->be_status = BST_PEND;
else
be->be_status = BST_BLOCK;
TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
return (be->be_status == BST_PEND);
}
static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
struct blockif_elem *be;
TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
if (be->be_status == BST_PEND)
break;
assert(be->be_status == BST_BLOCK);
}
if (be == NULL)
return (0);
TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
be->be_status = BST_BUSY;
be->be_tid = t;
TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
*bep = be;
return (1);
}
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
struct blockif_elem *tbe;
if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
else
TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
if (tbe->be_req->br_offset == be->be_block)
tbe->be_status = BST_PEND;
}
be->be_tid = 0;
be->be_status = BST_FREE;
be->be_req = NULL;
TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
struct blockif_req *br;
off_t arg[2];
ssize_t clen, len, off, boff, voff;
int i, err;
br = be->be_req;
if (br->br_iovcnt <= 1)
buf = NULL;
err = 0;
switch (be->be_op) {
case BOP_READ:
if (buf == NULL) {
if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
br->br_offset)) < 0)
err = errno;
else
br->br_resid -= len;
break;
}
i = 0;
off = voff = 0;
while (br->br_resid > 0) {
len = MIN(br->br_resid, MAXPHYS);
if (pread(bc->bc_fd, buf, len, br->br_offset +
off) < 0) {
err = errno;
break;
}
boff = 0;
do {
clen = MIN(len - boff, br->br_iov[i].iov_len -
voff);
memcpy(br->br_iov[i].iov_base + voff,
buf + boff, clen);
if (clen < br->br_iov[i].iov_len - voff)
voff += clen;
else {
i++;
voff = 0;
}
boff += clen;
} while (boff < len);
off += len;
br->br_resid -= len;
}
break;
case BOP_WRITE:
if (bc->bc_rdonly) {
err = EROFS;
break;
}
if (buf == NULL) {
if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
br->br_offset)) < 0)
err = errno;
else
br->br_resid -= len;
break;
}
i = 0;
off = voff = 0;
while (br->br_resid > 0) {
len = MIN(br->br_resid, MAXPHYS);
boff = 0;
do {
clen = MIN(len - boff, br->br_iov[i].iov_len -
voff);
memcpy(buf + boff,
br->br_iov[i].iov_base + voff, clen);
if (clen < br->br_iov[i].iov_len - voff)
voff += clen;
else {
i++;
voff = 0;
}
boff += clen;
} while (boff < len);
if (pwrite(bc->bc_fd, buf, len, br->br_offset +
off) < 0) {
err = errno;
break;
}
off += len;
br->br_resid -= len;
}
break;
case BOP_FLUSH:
if (bc->bc_ischr) {
if (ioctl(bc->bc_fd, DIOCGFLUSH))
err = errno;
} else if (fsync(bc->bc_fd))
err = errno;
break;
case BOP_DELETE:
if (!bc->bc_candelete)
err = EOPNOTSUPP;
else if (bc->bc_rdonly)
err = EROFS;
else if (bc->bc_ischr) {
arg[0] = br->br_offset;
arg[1] = br->br_resid;
if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
err = errno;
else
br->br_resid = 0;
} else
err = EOPNOTSUPP;
break;
default:
err = EINVAL;
break;
}
be->be_status = BST_DONE;
(*br->br_callback)(br, err);
}
static void *
blockif_thr(void *arg)
{
struct blockif_ctxt *bc;
struct blockif_elem *be;
pthread_t t;
uint8_t *buf;
bc = arg;
if (bc->bc_isgeom)
buf = malloc(MAXPHYS);
else
buf = NULL;
t = pthread_self();
pthread_mutex_lock(&bc->bc_mtx);
for (;;) {
while (blockif_dequeue(bc, t, &be)) {
pthread_mutex_unlock(&bc->bc_mtx);
blockif_proc(bc, be, buf);
pthread_mutex_lock(&bc->bc_mtx);
blockif_complete(bc, be);
}
/* Check ctxt status here to see if exit requested */
if (bc->bc_closing)
break;
pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
}
pthread_mutex_unlock(&bc->bc_mtx);
if (buf)
free(buf);
pthread_exit(NULL);
return (NULL);
}
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
struct blockif_sig_elem *bse;
for (;;) {
/*
* Process the entire list even if not intended for
* this thread.
*/
do {
bse = blockif_bse_head;
if (bse == NULL)
return;
} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
(uintptr_t)bse,
(uintptr_t)bse->bse_next));
pthread_mutex_lock(&bse->bse_mtx);
bse->bse_pending = 0;
pthread_cond_signal(&bse->bse_cond);
pthread_mutex_unlock(&bse->bse_mtx);
}
}
static void
blockif_init(void)
{
mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
(void) signal(SIGCONT, SIG_IGN);
}
struct blockif_ctxt *
blockif_open(const char *optstr, const char *ident)
{
char tname[MAXCOMLEN + 1];
char name[MAXPATHLEN];
char *nopt, *xopts, *cp;
struct blockif_ctxt *bc;
struct stat sbuf;
struct diocgattr_arg arg;
off_t size, psectsz, psectoff;
int extra, fd, i, sectsz;
int nocache, sync, ro, candelete, geom, ssopt, pssopt;
pthread_once(&blockif_once, blockif_init);
fd = -1;
ssopt = 0;
nocache = 0;
sync = 0;
ro = 0;
/*
* The first element in the optstring is always a pathname.
* Optional elements follow
*/
nopt = xopts = strdup(optstr);
while (xopts != NULL) {
cp = strsep(&xopts, ",");
if (cp == nopt) /* file or device pathname */
continue;
else if (!strcmp(cp, "nocache"))
nocache = 1;
else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
sync = 1;
else if (!strcmp(cp, "ro"))
ro = 1;
else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
;
else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
pssopt = ssopt;
else {
fprintf(stderr, "Invalid device option \"%s\"\n", cp);
goto err;
}
}
extra = 0;
if (nocache)
extra |= O_DIRECT;
if (sync)
extra |= O_SYNC;
fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
if (fd < 0 && !ro) {
/* Attempt a r/w fail with a r/o open */
fd = open(nopt, O_RDONLY | extra);
ro = 1;
}
if (fd < 0) {
perror("Could not open backing file");
goto err;
}
if (fstat(fd, &sbuf) < 0) {
perror("Could not stat backing file");
goto err;
}
/*
* Deal with raw devices
*/
size = sbuf.st_size;
sectsz = DEV_BSIZE;
psectsz = psectoff = 0;
candelete = geom = 0;
if (S_ISCHR(sbuf.st_mode)) {
if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
perror("Could not fetch dev blk/sector size");
goto err;
}
assert(size != 0);
assert(sectsz != 0);
if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
arg.len = sizeof(arg.value.i);
if (ioctl(fd, DIOCGATTR, &arg) == 0)
candelete = arg.value.i;
if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
geom = 1;
} else
psectsz = sbuf.st_blksize;
if (ssopt != 0) {
if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
ssopt > pssopt) {
fprintf(stderr, "Invalid sector size %d/%d\n",
ssopt, pssopt);
goto err;
}
/*
* Some backend drivers (e.g. cd0, ada0) require that the I/O
* size be a multiple of the device's sector size.
*
* Validate that the emulated sector size complies with this
* requirement.
*/
if (S_ISCHR(sbuf.st_mode)) {
if (ssopt < sectsz || (ssopt % sectsz) != 0) {
fprintf(stderr, "Sector size %d incompatible "
"with underlying device sector size %d\n",
ssopt, sectsz);
goto err;
}
}
sectsz = ssopt;
psectsz = pssopt;
psectoff = 0;
}
bc = calloc(1, sizeof(struct blockif_ctxt));
if (bc == NULL) {
perror("calloc");
goto err;
}
bc->bc_magic = BLOCKIF_SIG;
bc->bc_fd = fd;
bc->bc_ischr = S_ISCHR(sbuf.st_mode);
bc->bc_isgeom = geom;
bc->bc_candelete = candelete;
bc->bc_rdonly = ro;
bc->bc_size = size;
bc->bc_sectsz = sectsz;
bc->bc_psectsz = psectsz;
bc->bc_psectoff = psectoff;
pthread_mutex_init(&bc->bc_mtx, NULL);
pthread_cond_init(&bc->bc_cond, NULL);
TAILQ_INIT(&bc->bc_freeq);
TAILQ_INIT(&bc->bc_pendq);
TAILQ_INIT(&bc->bc_busyq);
for (i = 0; i < BLOCKIF_MAXREQ; i++) {
bc->bc_reqs[i].be_status = BST_FREE;
TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
}
for (i = 0; i < BLOCKIF_NUMTHR; i++) {
pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
pthread_set_name_np(bc->bc_btid[i], tname);
}
return (bc);
err:
if (fd >= 0)
close(fd);
return (NULL);
}
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
enum blockop op)
{
int err;
err = 0;
pthread_mutex_lock(&bc->bc_mtx);
if (!TAILQ_EMPTY(&bc->bc_freeq)) {
/*
* Enqueue and inform the block i/o thread
* that there is work available
*/
if (blockif_enqueue(bc, breq, op))
pthread_cond_signal(&bc->bc_cond);
} else {
/*
* Callers are not allowed to enqueue more than
* the specified blockif queue limit. Return an
* error to indicate that the queue length has been
* exceeded.
*/
err = E2BIG;
}
pthread_mutex_unlock(&bc->bc_mtx);
return (err);
}
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_READ));
}
int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_WRITE));
}
int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_FLUSH));
}
int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_DELETE));
}
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
struct blockif_elem *be;
assert(bc->bc_magic == BLOCKIF_SIG);
pthread_mutex_lock(&bc->bc_mtx);
/*
* Check pending requests.
*/
TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
if (be->be_req == breq)
break;
}
if (be != NULL) {
/*
* Found it.
*/
blockif_complete(bc, be);
pthread_mutex_unlock(&bc->bc_mtx);
return (0);
}
/*
* Check in-flight requests.
*/
TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
if (be->be_req == breq)
break;
}
if (be == NULL) {
/*
* Didn't find it.
*/
pthread_mutex_unlock(&bc->bc_mtx);
return (EINVAL);
}
/*
* Interrupt the processing thread to force it return
* prematurely via it's normal callback path.
*/
while (be->be_status == BST_BUSY) {
struct blockif_sig_elem bse, *old_head;
pthread_mutex_init(&bse.bse_mtx, NULL);
pthread_cond_init(&bse.bse_cond, NULL);
bse.bse_pending = 1;
do {
old_head = blockif_bse_head;
bse.bse_next = old_head;
} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
(uintptr_t)old_head,
(uintptr_t)&bse));
pthread_kill(be->be_tid, SIGCONT);
pthread_mutex_lock(&bse.bse_mtx);
while (bse.bse_pending)
pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
pthread_mutex_unlock(&bse.bse_mtx);
}
pthread_mutex_unlock(&bc->bc_mtx);
/*
* The processing thread has been interrupted. Since it's not
* clear if the callback has been invoked yet, return EBUSY.
*/
return (EBUSY);
}
int
blockif_close(struct blockif_ctxt *bc)
{
void *jval;
int i;
assert(bc->bc_magic == BLOCKIF_SIG);
/*
* Stop the block i/o thread
*/
pthread_mutex_lock(&bc->bc_mtx);
bc->bc_closing = 1;
pthread_mutex_unlock(&bc->bc_mtx);
pthread_cond_broadcast(&bc->bc_cond);
for (i = 0; i < BLOCKIF_NUMTHR; i++)
pthread_join(bc->bc_btid[i], &jval);
/* XXX Cancel queued i/o's ??? */
/*
* Release resources
*/
bc->bc_magic = 0;
close(bc->bc_fd);
free(bc);
return (0);
}
/*
* Return virtual C/H/S values for a given block. Use the algorithm
* outlined in the VHD specification to calculate values.
*/
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
off_t sectors; /* total sectors of the block dev */
off_t hcyl; /* cylinders times heads */
uint16_t secpt; /* sectors per track */
uint8_t heads;
assert(bc->bc_magic == BLOCKIF_SIG);
sectors = bc->bc_size / bc->bc_sectsz;
/* Clamp the size to the largest possible with CHS */
if (sectors > 65535UL*16*255)
sectors = 65535UL*16*255;
if (sectors >= 65536UL*16*63) {
secpt = 255;
heads = 16;
hcyl = sectors / secpt;
} else {
secpt = 17;
hcyl = sectors / secpt;
heads = (hcyl + 1023) / 1024;
if (heads < 4)
heads = 4;
if (hcyl >= (heads * 1024) || heads > 16) {
secpt = 31;
heads = 16;
hcyl = sectors / secpt;
}
if (hcyl >= (heads * 1024)) {
secpt = 63;
heads = 16;
hcyl = sectors / secpt;
}
}
*c = hcyl / heads;
*h = heads;
*s = secpt;
}
/*
* Accessors
*/
off_t
blockif_size(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_size);
}
int
blockif_sectsz(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_sectsz);
}
void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
assert(bc->bc_magic == BLOCKIF_SIG);
*size = bc->bc_psectsz;
*off = bc->bc_psectoff;
}
int
blockif_queuesz(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (BLOCKIF_MAXREQ - 1);
}
int
blockif_is_ro(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_rdonly);
}
int
blockif_candelete(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_candelete);
}

70
block_if.h Normal file
View File

@ -0,0 +1,70 @@
/*-
* Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* The block API to be used by bhyve block-device emulations. The routines
* are thread safe, with no assumptions about the context of the completion
* callback - it may occur in the caller's context, or asynchronously in
* another thread.
*/
#ifndef _BLOCK_IF_H_
#define _BLOCK_IF_H_
#include <sys/uio.h>
#include <sys/unistd.h>
#define BLOCKIF_IOV_MAX 33 /* not practical to be IOV_MAX */
struct blockif_req {
struct iovec br_iov[BLOCKIF_IOV_MAX];
int br_iovcnt;
off_t br_offset;
ssize_t br_resid;
void (*br_callback)(struct blockif_req *req, int err);
void *br_param;
};
struct blockif_ctxt;
struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);
off_t blockif_size(struct blockif_ctxt *bc);
void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
uint8_t *s);
int blockif_sectsz(struct blockif_ctxt *bc);
void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
int blockif_queuesz(struct blockif_ctxt *bc);
int blockif_is_ro(struct blockif_ctxt *bc);
int blockif_candelete(struct blockif_ctxt *bc);
int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_close(struct blockif_ctxt *bc);
#endif /* _BLOCK_IF_H_ */

111
bootrom.c Normal file
View File

@ -0,0 +1,111 @@
/*-
* Copyright (c) 2015 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <machine/vmm.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdbool.h>
#include <vmmapi.h>
#include "bhyverun.h"
#include "bootrom.h"
#define MAX_BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */
int
bootrom_init(struct vmctx *ctx, const char *romfile)
{
struct stat sbuf;
vm_paddr_t gpa;
ssize_t rlen;
char *ptr;
int fd, i, rv, prot;
rv = -1;
fd = open(romfile, O_RDONLY);
if (fd < 0) {
fprintf(stderr, "Error opening bootrom \"%s\": %s\n",
romfile, strerror(errno));
goto done;
}
if (fstat(fd, &sbuf) < 0) {
fprintf(stderr, "Could not fstat bootrom file \"%s\": %s\n",
romfile, strerror(errno));
goto done;
}
/*
* Limit bootrom size to 16MB so it doesn't encroach into reserved
* MMIO space (e.g. APIC, HPET, MSI).
*/
if (sbuf.st_size > MAX_BOOTROM_SIZE || sbuf.st_size < PAGE_SIZE) {
fprintf(stderr, "Invalid bootrom size %ld\n", sbuf.st_size);
goto done;
}
if (sbuf.st_size & PAGE_MASK) {
fprintf(stderr, "Bootrom size %ld is not a multiple of the "
"page size\n", sbuf.st_size);
goto done;
}
ptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", sbuf.st_size);
if (ptr == MAP_FAILED)
goto done;
/* Map the bootrom into the guest address space */
prot = PROT_READ | PROT_EXEC;
gpa = (1ULL << 32) - sbuf.st_size;
if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, sbuf.st_size, prot) != 0)
goto done;
/* Read 'romfile' into the guest address space */
for (i = 0; i < sbuf.st_size / PAGE_SIZE; i++) {
rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE);
if (rlen != PAGE_SIZE) {
fprintf(stderr, "Incomplete read of page %d of bootrom "
"file %s: %ld bytes\n", i, romfile, rlen);
goto done;
}
}
rv = 0;
done:
if (fd >= 0)
close(fd);
return (rv);
}

38
bootrom.h Normal file
View File

@ -0,0 +1,38 @@
/*-
* Copyright (c) 2015 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _BOOTROM_H_
#define _BOOTROM_H_
#include <stdbool.h>
struct vmctx;
int bootrom_init(struct vmctx *ctx, const char *romfile);
#endif

153
consport.c Normal file
View File

@ -0,0 +1,153 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/select.h>
#include <stdio.h>
#include <stdlib.h>
#include <termios.h>
#include <unistd.h>
#include <stdbool.h>
#include "inout.h"
#include "pci_lpc.h"
#define BVM_CONSOLE_PORT 0x220
#define BVM_CONS_SIG ('b' << 8 | 'v')
static struct termios tio_orig, tio_new;
static void
ttyclose(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
}
static void
ttyopen(void)
{
tcgetattr(STDIN_FILENO, &tio_orig);
cfmakeraw(&tio_new);
tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
atexit(ttyclose);
}
static bool
tty_char_available(void)
{
fd_set rfds;
struct timeval tv;
FD_ZERO(&rfds);
FD_SET(STDIN_FILENO, &rfds);
tv.tv_sec = 0;
tv.tv_usec = 0;
if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
return (true);
} else {
return (false);
}
}
static int
ttyread(void)
{
char rb;
if (tty_char_available()) {
read(STDIN_FILENO, &rb, 1);
return (rb & 0xff);
} else {
return (-1);
}
}
static void
ttywrite(unsigned char wb)
{
(void) write(STDOUT_FILENO, &wb, 1);
}
static int
console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
static int opened;
if (bytes == 2 && in) {
*eax = BVM_CONS_SIG;
return (0);
}
/*
* Guests might probe this port to look for old ISA devices
* using single-byte reads. Return 0xff for those.
*/
if (bytes == 1 && in) {
*eax = 0xff;
return (0);
}
if (bytes != 4)
return (-1);
if (!opened) {
ttyopen();
opened = 1;
}
if (in)
*eax = ttyread();
else
ttywrite(*eax);
return (0);
}
SYSRES_IO(BVM_CONSOLE_PORT, 4);
static struct inout_port consport = {
"bvmcons",
BVM_CONSOLE_PORT,
1,
IOPORT_F_INOUT,
console_handler
};
void
init_bvmcons(void)
{
register_inout(&consport);
}

151
dbgport.c Normal file
View File

@ -0,0 +1,151 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include "inout.h"
#include "dbgport.h"
#include "pci_lpc.h"
#define BVM_DBG_PORT 0x224
#define BVM_DBG_SIG ('B' << 8 | 'V')
static int listen_fd, conn_fd;
static struct sockaddr_in sin;
static int
dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
char ch;
int nwritten, nread, printonce;
if (bytes == 2 && in) {
*eax = BVM_DBG_SIG;
return (0);
}
if (bytes != 4)
return (-1);
again:
printonce = 0;
while (conn_fd < 0) {
if (!printonce) {
printf("Waiting for connection from gdb\r\n");
printonce = 1;
}
conn_fd = accept(listen_fd, NULL, NULL);
if (conn_fd >= 0)
fcntl(conn_fd, F_SETFL, O_NONBLOCK);
else if (errno != EINTR)
perror("accept");
}
if (in) {
nread = read(conn_fd, &ch, 1);
if (nread == -1 && errno == EAGAIN)
*eax = -1;
else if (nread == 1)
*eax = ch;
else {
close(conn_fd);
conn_fd = -1;
goto again;
}
} else {
ch = *eax;
nwritten = write(conn_fd, &ch, 1);
if (nwritten != 1) {
close(conn_fd);
conn_fd = -1;
goto again;
}
}
return (0);
}
static struct inout_port dbgport = {
"bvmdbg",
BVM_DBG_PORT,
1,
IOPORT_F_INOUT,
dbg_handler
};
SYSRES_IO(BVM_DBG_PORT, 4);
void
init_dbgport(int sport)
{
int reuse;
conn_fd = -1;
if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(1);
}
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(sport);
reuse = 1;
if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse,
sizeof(reuse)) < 0) {
perror("setsockopt");
exit(1);
}
if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("bind");
exit(1);
}
if (listen(listen_fd, 1) < 0) {
perror("listen");
exit(1);
}
register_inout(&dbgport);
}

34
dbgport.h Normal file
View File

@ -0,0 +1,34 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _DBGPORT_H_
#define _DBGPORT_H_
void init_dbgport(int port);
#endif

549
fwctl.c Normal file
View File

@ -0,0 +1,549 @@
/*-
* Copyright (c) 2015 Peter Grehan <grehan@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Guest firmware interface. Uses i/o ports x510/x511 as Qemu does,
* but with a request/response messaging protocol.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bhyverun.h"
#include "inout.h"
#include "fwctl.h"
/*
* Messaging protocol base operations
*/
#define OP_NULL 1
#define OP_ECHO 2
#define OP_GET 3
#define OP_GET_LEN 4
#define OP_SET 5
#define OP_MAX OP_SET
/* I/O ports */
#define FWCTL_OUT 0x510
#define FWCTL_IN 0x511
/*
* Back-end state-machine
*/
enum state {
DORMANT,
IDENT_WAIT,
IDENT_SEND,
REQ,
RESP
} be_state = DORMANT;
static uint8_t sig[] = { 'B', 'H', 'Y', 'V' };
static u_int ident_idx;
struct op_info {
int op;
int (*op_start)(int len);
void (*op_data)(uint32_t data, int len);
int (*op_result)(struct iovec **data);
void (*op_done)(struct iovec *data);
};
static struct op_info *ops[OP_MAX+1];
/* Return 0-padded uint32_t */
static uint32_t
fwctl_send_rest(uint32_t *data, size_t len)
{
union {
uint8_t c[4];
uint32_t w;
} u;
uint8_t *cdata;
int i;
cdata = (uint8_t *) data;
u.w = 0;
for (i = 0, u.w = 0; i < len; i++)
u.c[i] = *cdata++;
return (u.w);
}
/*
* error op dummy proto - drop all data sent and return an error
*/
static int errop_code;
static void
errop_set(int err)
{
errop_code = err;
}
static int
errop_start(int len)
{
errop_code = ENOENT;
/* accept any length */
return (errop_code);
}
static void
errop_data(uint32_t data, int len)
{
/* ignore */
}
static int
errop_result(struct iovec **data)
{
/* no data to send back; always successful */
*data = NULL;
return (errop_code);
}
static void
errop_done(struct iovec *data)
{
/* assert data is NULL */
}
static struct op_info errop_info = {
.op_start = errop_start,
.op_data = errop_data,
.op_result = errop_result,
.op_done = errop_done
};
/* OID search */
SET_DECLARE(ctl_set, struct ctl);
CTL_NODE("hw.ncpu", &guest_ncpus, sizeof(guest_ncpus));
static struct ctl *
ctl_locate(const char *str, int maxlen)
{
struct ctl *cp, **cpp;
SET_FOREACH(cpp, ctl_set) {
cp = *cpp;
if (!strncmp(str, cp->c_oid, maxlen))
return (cp);
}
return (NULL);
}
/* uefi-sysctl get-len */
#define FGET_STRSZ 80
static struct iovec fget_biov[2];
static char fget_str[FGET_STRSZ];
static struct {
size_t f_sz;
uint32_t f_data[1024];
} fget_buf;
static int fget_cnt;
static size_t fget_size;
static int
fget_start(int len)
{
if (len > FGET_STRSZ)
return(E2BIG);
fget_cnt = 0;
return (0);
}
static void
fget_data(uint32_t data, int len)
{
*((uint32_t *) &fget_str[fget_cnt]) = data;
fget_cnt += sizeof(uint32_t);
}
static int
fget_result(struct iovec **data, int val)
{
struct ctl *cp;
int err;
err = 0;
/* Locate the OID */
cp = ctl_locate(fget_str, fget_cnt);
if (cp == NULL) {
*data = NULL;
err = ENOENT;
} else {
if (val) {
/* For now, copy the len/data into a buffer */
memset(&fget_buf, 0, sizeof(fget_buf));
fget_buf.f_sz = cp->c_len;
memcpy(fget_buf.f_data, cp->c_data, cp->c_len);
fget_biov[0].iov_base = (char *)&fget_buf;
fget_biov[0].iov_len = sizeof(fget_buf.f_sz) +
cp->c_len;
} else {
fget_size = cp->c_len;
fget_biov[0].iov_base = (char *)&fget_size;
fget_biov[0].iov_len = sizeof(fget_size);
}
fget_biov[1].iov_base = NULL;
fget_biov[1].iov_len = 0;
*data = fget_biov;
}
return (err);
}
static void
fget_done(struct iovec *data)
{
/* nothing needs to be freed */
}
static int
fget_len_result(struct iovec **data)
{
return (fget_result(data, 0));
}
static int
fget_val_result(struct iovec **data)
{
return (fget_result(data, 1));
}
static struct op_info fgetlen_info = {
.op_start = fget_start,
.op_data = fget_data,
.op_result = fget_len_result,
.op_done = fget_done
};
static struct op_info fgetval_info = {
.op_start = fget_start,
.op_data = fget_data,
.op_result = fget_val_result,
.op_done = fget_done
};
static struct req_info {
int req_error;
u_int req_count;
uint32_t req_size;
uint32_t req_type;
uint32_t req_txid;
struct op_info *req_op;
int resp_error;
int resp_count;
int resp_size;
int resp_off;
struct iovec *resp_biov;
} rinfo;
static void
fwctl_response_done(void)
{
(*rinfo.req_op->op_done)(rinfo.resp_biov);
/* reinit the req data struct */
memset(&rinfo, 0, sizeof(rinfo));
}
static void
fwctl_request_done(void)
{
rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov);
/* XXX only a single vector supported at the moment */
rinfo.resp_off = 0;
if (rinfo.resp_biov == NULL) {
rinfo.resp_size = 0;
} else {
rinfo.resp_size = rinfo.resp_biov[0].iov_len;
}
}
static int
fwctl_request_start(void)
{
int err;
/* Data size doesn't include header */
rinfo.req_size -= 12;
rinfo.req_op = &errop_info;
if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL)
rinfo.req_op = ops[rinfo.req_type];
err = (*rinfo.req_op->op_start)(rinfo.req_size);
if (err) {
errop_set(err);
rinfo.req_op = &errop_info;
}
/* Catch case of zero-length message here */
if (rinfo.req_size == 0) {
fwctl_request_done();
return (1);
}
return (0);
}
static int
fwctl_request_data(uint32_t value)
{
int remlen;
/* Make sure remaining size is >= 0 */
rinfo.req_size -= sizeof(uint32_t);
remlen = MAX(rinfo.req_size, 0);
(*rinfo.req_op->op_data)(value, remlen);
if (rinfo.req_size < sizeof(uint32_t)) {
fwctl_request_done();
return (1);
}
return (0);
}
static int
fwctl_request(uint32_t value)
{
int ret;
ret = 0;
switch (rinfo.req_count) {
case 0:
/* Verify size */
if (value < 12) {
printf("msg size error");
exit(1);
}
rinfo.req_size = value;
rinfo.req_count = 1;
break;
case 1:
rinfo.req_type = value;
rinfo.req_count++;
break;
case 2:
rinfo.req_txid = value;
rinfo.req_count++;
ret = fwctl_request_start();
break;
default:
ret = fwctl_request_data(value);
break;
}
return (ret);
}
static int
fwctl_response(uint32_t *retval)
{
uint32_t *dp;
int remlen;
switch(rinfo.resp_count) {
case 0:
/* 4 x u32 header len + data */
*retval = 4*sizeof(uint32_t) +
roundup(rinfo.resp_size, sizeof(uint32_t));
rinfo.resp_count++;
break;
case 1:
*retval = rinfo.req_type;
rinfo.resp_count++;
break;
case 2:
*retval = rinfo.req_txid;
rinfo.resp_count++;
break;
case 3:
*retval = rinfo.resp_error;
rinfo.resp_count++;
break;
default:
remlen = rinfo.resp_size - rinfo.resp_off;
dp = (uint32_t *)
((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off);
if (remlen >= sizeof(uint32_t)) {
*retval = *dp;
} else if (remlen > 0) {
*retval = fwctl_send_rest(dp, remlen);
}
rinfo.resp_off += sizeof(uint32_t);
break;
}
if (rinfo.resp_count > 3 &&
rinfo.resp_size - rinfo.resp_off <= 0) {
fwctl_response_done();
return (1);
}
return (0);
}
/*
* i/o port handling.
*/
static uint8_t
fwctl_inb(void)
{
uint8_t retval;
retval = 0xff;
switch (be_state) {
case IDENT_SEND:
retval = sig[ident_idx++];
if (ident_idx >= sizeof(sig))
be_state = REQ;
break;
default:
break;
}
return (retval);
}
static void
fwctl_outw(uint16_t val)
{
switch (be_state) {
case IDENT_WAIT:
if (val == 0) {
be_state = IDENT_SEND;
ident_idx = 0;
}
break;
default:
/* ignore */
break;
}
}
static uint32_t
fwctl_inl(void)
{
uint32_t retval;
switch (be_state) {
case RESP:
if (fwctl_response(&retval))
be_state = REQ;
break;
default:
retval = 0xffffffff;
break;
}
return (retval);
}
static void
fwctl_outl(uint32_t val)
{
switch (be_state) {
case REQ:
if (fwctl_request(val))
be_state = RESP;
default:
break;
}
}
static int
fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (in) {
if (bytes == 1)
*eax = fwctl_inb();
else if (bytes == 4)
*eax = fwctl_inl();
else
*eax = 0xffff;
} else {
if (bytes == 2)
fwctl_outw(*eax);
else if (bytes == 4)
fwctl_outl(*eax);
}
return (0);
}
INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler);
INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler);
void
fwctl_init(void)
{
ops[OP_GET_LEN] = &fgetlen_info;
ops[OP_GET] = &fgetval_info;
be_state = IDENT_WAIT;
}

54
fwctl.h Normal file
View File

@ -0,0 +1,54 @@
/*-
* Copyright (c) 2015 Peter Grehan <grehan@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _FWCTL_H_
#define _FWCTL_H_
#include <sys/linker_set.h>
/*
* Linker set api for export of information to guest firmware via
* a sysctl-like OID interface
*/
struct ctl {
const char *c_oid;
const void *c_data;
const int c_len;
};
#define CTL_NODE(oid, data, len) \
static struct ctl __CONCAT(__ctl, __LINE__) = { \
oid, \
(data), \
(len), \
}; \
DATA_SET(ctl_set, __CONCAT(__ctl, __LINE__))
void fwctl_init(void);
#endif /* _FWCTL_H_ */

297
inout.c Normal file
View File

@ -0,0 +1,297 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/_iovec.h>
#include <sys/mman.h>
#include <x86/psl.h>
#include <x86/segments.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <vmmapi.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "bhyverun.h"
#include "inout.h"
SET_DECLARE(inout_port_set, struct inout_port);
#define MAX_IOPORTS (1 << 16)
#define VERIFY_IOPORT(port, size) \
assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS)
static struct {
const char *name;
int flags;
inout_func_t handler;
void *arg;
} inout_handlers[MAX_IOPORTS];
static int
default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (in) {
switch (bytes) {
case 4:
*eax = 0xffffffff;
break;
case 2:
*eax = 0xffff;
break;
case 1:
*eax = 0xff;
break;
}
}
return (0);
}
static void
register_default_iohandler(int start, int size)
{
struct inout_port iop;
VERIFY_IOPORT(start, size);
bzero(&iop, sizeof(iop));
iop.name = "default";
iop.port = start;
iop.size = size;
iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT;
iop.handler = default_inout;
register_inout(&iop);
}
int
emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
{
int addrsize, bytes, flags, in, port, prot, rep;
uint32_t eax, val;
inout_func_t handler;
void *arg;
int error, fault, retval;
enum vm_reg_name idxreg;
uint64_t gla, index, iterations, count;
struct vm_inout_str *vis;
struct iovec iov[2];
bytes = vmexit->u.inout.bytes;
in = vmexit->u.inout.in;
port = vmexit->u.inout.port;
assert(port < MAX_IOPORTS);
assert(bytes == 1 || bytes == 2 || bytes == 4);
handler = inout_handlers[port].handler;
if (strict && handler == default_inout)
return (-1);
flags = inout_handlers[port].flags;
arg = inout_handlers[port].arg;
if (in) {
if (!(flags & IOPORT_F_IN))
return (-1);
} else {
if (!(flags & IOPORT_F_OUT))
return (-1);
}
retval = 0;
if (vmexit->u.inout.string) {
vis = &vmexit->u.inout_str;
rep = vis->inout.rep;
addrsize = vis->addrsize;
prot = in ? PROT_WRITE : PROT_READ;
assert(addrsize == 2 || addrsize == 4 || addrsize == 8);
/* Index register */
idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
index = vis->index & vie_size2mask(addrsize);
/* Count register */
count = vis->count & vie_size2mask(addrsize);
/* Limit number of back-to-back in/out emulations to 16 */
iterations = MIN(count, 16);
while (iterations > 0) {
assert(retval == 0);
if (vie_calculate_gla(vis->paging.cpu_mode,
vis->seg_name, &vis->seg_desc, index, bytes,
addrsize, prot, &gla)) {
vm_inject_gp(ctx, vcpu);
break;
}
error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
bytes, prot, iov, nitems(iov), &fault);
if (error) {
retval = -1; /* Unrecoverable error */
break;
} else if (fault) {
retval = 0; /* Resume guest to handle fault */
break;
}
if (vie_alignment_check(vis->paging.cpl, bytes,
vis->cr0, vis->rflags, gla)) {
vm_inject_ac(ctx, vcpu, 0);
break;
}
val = 0;
if (!in)
vm_copyin(ctx, vcpu, iov, &val, bytes);
retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
if (retval != 0)
break;
if (in)
vm_copyout(ctx, vcpu, &val, iov, bytes);
/* Update index */
if (vis->rflags & PSL_D)
index -= bytes;
else
index += bytes;
count--;
iterations--;
}
/* Update index register */
error = vie_update_register(ctx, vcpu, idxreg, index, addrsize);
assert(error == 0);
/*
* Update count register only if the instruction had a repeat
* prefix.
*/
if (rep) {
error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX,
count, addrsize);
assert(error == 0);
}
/* Restart the instruction if more iterations remain */
if (retval == 0 && count != 0) {
error = vm_restart_instruction(ctx, vcpu);
assert(error == 0);
}
} else {
eax = vmexit->u.inout.eax;
val = eax & vie_size2mask(bytes);
retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
if (retval == 0 && in) {
eax &= ~vie_size2mask(bytes);
eax |= val & vie_size2mask(bytes);
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
eax);
assert(error == 0);
}
}
return (retval);
}
void
init_inout(void)
{
struct inout_port **iopp, *iop;
/*
* Set up the default handler for all ports
*/
register_default_iohandler(0, MAX_IOPORTS);
/*
* Overwrite with specified handlers
*/
SET_FOREACH(iopp, inout_port_set) {
iop = *iopp;
assert(iop->port < MAX_IOPORTS);
inout_handlers[iop->port].name = iop->name;
inout_handlers[iop->port].flags = iop->flags;
inout_handlers[iop->port].handler = iop->handler;
inout_handlers[iop->port].arg = NULL;
}
}
int
register_inout(struct inout_port *iop)
{
int i;
VERIFY_IOPORT(iop->port, iop->size);
/*
* Verify that the new registration is not overwriting an already
* allocated i/o range.
*/
if ((iop->flags & IOPORT_F_DEFAULT) == 0) {
for (i = iop->port; i < iop->port + iop->size; i++) {
if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0)
return (-1);
}
}
for (i = iop->port; i < iop->port + iop->size; i++) {
inout_handlers[i].name = iop->name;
inout_handlers[i].flags = iop->flags;
inout_handlers[i].handler = iop->handler;
inout_handlers[i].arg = iop->arg;
}
return (0);
}
int
unregister_inout(struct inout_port *iop)
{
VERIFY_IOPORT(iop->port, iop->size);
assert(inout_handlers[iop->port].name == iop->name);
register_default_iohandler(iop->port, iop->size);
return (0);
}

79
inout.h Normal file
View File

@ -0,0 +1,79 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _INOUT_H_
#define _INOUT_H_
#include <sys/linker_set.h>
struct vmctx;
struct vm_exit;
/*
* inout emulation handlers return 0 on success and -1 on failure.
*/
typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg);
struct inout_port {
const char *name;
int port;
int size;
int flags;
inout_func_t handler;
void *arg;
};
#define IOPORT_F_IN 0x1
#define IOPORT_F_OUT 0x2
#define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT)
/*
* The following flags are used internally and must not be used by
* device models.
*/
#define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */
#define INOUT_PORT(name, port, flags, handler) \
static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
#name, \
(port), \
1, \
(flags), \
(handler), \
0 \
}; \
DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
void init_inout(void);
int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit,
int strict);
int register_inout(struct inout_port *iop);
int unregister_inout(struct inout_port *iop);
void init_bvmcons(void);
#endif /* _INOUT_H_ */

74
ioapic.c Normal file
View File

@ -0,0 +1,74 @@
/*-
* Copyright (c) 2014 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "ioapic.h"
/*
* Assign PCI INTx interrupts to I/O APIC pins in a round-robin
* fashion. Note that we have no idea what the HPET is using, but the
* HPET is also programmable whereas this is intended for hardwired
* PCI interrupts.
*
* This assumes a single I/O APIC where pins >= 16 are permitted for
* PCI devices.
*/
static int pci_pins;
void
ioapic_init(struct vmctx *ctx)
{
if (vm_ioapic_pincount(ctx, &pci_pins) < 0) {
pci_pins = 0;
return;
}
/* Ignore the first 16 pins. */
if (pci_pins <= 16) {
pci_pins = 0;
return;
}
pci_pins -= 16;
}
int
ioapic_pci_alloc_irq(void)
{
static int last_pin;
if (pci_pins == 0)
return (-1);
return (16 + (last_pin++ % pci_pins));
}

39
ioapic.h Normal file
View File

@ -0,0 +1,39 @@
/*-
* Copyright (c) 2014 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IOAPIC_H_
#define _IOAPIC_H_
/*
* Allocate a PCI IRQ from the I/O APIC.
*/
void ioapic_init(struct vmctx *ctx);
int ioapic_pci_alloc_irq(void);
#endif

291
mem.c Normal file
View File

@ -0,0 +1,291 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Memory ranges are represented with an RB tree. On insertion, the range
* is checked for overlaps. On lookup, the key has the same base and limit
* so it can be searched within the range.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/tree.h>
#include <sys/errno.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <pthread.h>
#include "mem.h"
struct mmio_rb_range {
RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
struct mem_range mr_param;
uint64_t mr_base;
uint64_t mr_end;
};
struct mmio_rb_tree;
RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback;
/*
* Per-vCPU cache. Since most accesses from a vCPU will be to
* consecutive addresses in a range, it makes sense to cache the
* result of a lookup.
*/
static struct mmio_rb_range *mmio_hint[VM_MAXCPU];
static pthread_rwlock_t mmio_rwlock;
static int
mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
{
if (a->mr_end < b->mr_base)
return (-1);
else if (a->mr_base > b->mr_end)
return (1);
return (0);
}
static int
mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr,
struct mmio_rb_range **entry)
{
struct mmio_rb_range find, *res;
find.mr_base = find.mr_end = addr;
res = RB_FIND(mmio_rb_tree, rbt, &find);
if (res != NULL) {
*entry = res;
return (0);
}
return (ENOENT);
}
static int
mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new)
{
struct mmio_rb_range *overlap;
overlap = RB_INSERT(mmio_rb_tree, rbt, new);
if (overlap != NULL) {
#ifdef RB_DEBUG
printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
new->mr_base, new->mr_end,
overlap->mr_base, overlap->mr_end);
#endif
return (EEXIST);
}
return (0);
}
#if 0
static void
mmio_rb_dump(struct mmio_rb_tree *rbt)
{
struct mmio_rb_range *np;
pthread_rwlock_rdlock(&mmio_rwlock);
RB_FOREACH(np, mmio_rb_tree, rbt) {
printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
np->mr_param.name);
}
pthread_rwlock_unlock(&mmio_rwlock);
}
#endif
RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
static int
mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
{
int error;
struct mem_range *mr = arg;
error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
rval, mr->arg1, mr->arg2);
return (error);
}
static int
mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
{
int error;
struct mem_range *mr = arg;
error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
&wval, mr->arg1, mr->arg2);
return (error);
}
int
emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
struct vm_guest_paging *paging)
{
struct mmio_rb_range *entry;
int err, immutable;
pthread_rwlock_rdlock(&mmio_rwlock);
/*
* First check the per-vCPU cache
*/
if (mmio_hint[vcpu] &&
paddr >= mmio_hint[vcpu]->mr_base &&
paddr <= mmio_hint[vcpu]->mr_end) {
entry = mmio_hint[vcpu];
} else
entry = NULL;
if (entry == NULL) {
if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) {
/* Update the per-vCPU cache */
mmio_hint[vcpu] = entry;
} else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) {
pthread_rwlock_unlock(&mmio_rwlock);
return (ESRCH);
}
}
assert(entry != NULL);
/*
* An 'immutable' memory range is guaranteed to be never removed
* so there is no need to hold 'mmio_rwlock' while calling the
* handler.
*
* XXX writes to the PCIR_COMMAND register can cause register_mem()
* to be called. If the guest is using PCI extended config space
* to modify the PCIR_COMMAND register then register_mem() can
* deadlock on 'mmio_rwlock'. However by registering the extended
* config space window as 'immutable' the deadlock can be avoided.
*/
immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
if (immutable)
pthread_rwlock_unlock(&mmio_rwlock);
err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
mem_read, mem_write, &entry->mr_param);
if (!immutable)
pthread_rwlock_unlock(&mmio_rwlock);
return (err);
}
static int
register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp)
{
struct mmio_rb_range *entry, *mrp;
int err;
err = 0;
mrp = malloc(sizeof(struct mmio_rb_range));
if (mrp != NULL) {
mrp->mr_param = *memp;
mrp->mr_base = memp->base;
mrp->mr_end = memp->base + memp->size - 1;
pthread_rwlock_wrlock(&mmio_rwlock);
if (mmio_rb_lookup(rbt, memp->base, &entry) != 0)
err = mmio_rb_add(rbt, mrp);
pthread_rwlock_unlock(&mmio_rwlock);
if (err)
free(mrp);
} else
err = ENOMEM;
return (err);
}
int
register_mem(struct mem_range *memp)
{
return (register_mem_int(&mmio_rb_root, memp));
}
int
register_mem_fallback(struct mem_range *memp)
{
return (register_mem_int(&mmio_rb_fallback, memp));
}
int
unregister_mem(struct mem_range *memp)
{
struct mem_range *mr;
struct mmio_rb_range *entry = NULL;
int err, i;
pthread_rwlock_wrlock(&mmio_rwlock);
err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry);
if (err == 0) {
mr = &entry->mr_param;
assert(mr->name == memp->name);
assert(mr->base == memp->base && mr->size == memp->size);
assert((mr->flags & MEM_F_IMMUTABLE) == 0);
RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
/* flush Per-vCPU cache */
for (i=0; i < VM_MAXCPU; i++) {
if (mmio_hint[i] == entry)
mmio_hint[i] = NULL;
}
}
pthread_rwlock_unlock(&mmio_rwlock);
if (entry)
free(entry);
return (err);
}
void
init_mem(void)
{
RB_INIT(&mmio_rb_root);
RB_INIT(&mmio_rb_fallback);
pthread_rwlock_init(&mmio_rwlock, NULL);
}

61
mem.h Normal file
View File

@ -0,0 +1,61 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MEM_H_
#define _MEM_H_
#include <sys/linker_set.h>
struct vmctx;
typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
int size, uint64_t *val, void *arg1, long arg2);
struct mem_range {
const char *name;
int flags;
mem_func_t handler;
void *arg1;
long arg2;
uint64_t base;
uint64_t size;
};
#define MEM_F_READ 0x1
#define MEM_F_WRITE 0x2
#define MEM_F_RW 0x3
#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */
void init_mem(void);
int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,
struct vm_guest_paging *paging);
int register_mem(struct mem_range *memp);
int register_mem_fallback(struct mem_range *memp);
int unregister_mem(struct mem_range *memp);
#endif /* _MEM_H_ */

456
mevent.c Normal file
View File

@ -0,0 +1,456 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Micro event library for FreeBSD, designed for a single i/o thread
* using kqueue, and having events be persistent by default.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <pthread.h>
#include <pthread_np.h>
#include "mevent.h"
#define MEVENT_MAX 64
#define MEV_ADD 1
#define MEV_ENABLE 2
#define MEV_DISABLE 3
#define MEV_DEL_PENDING 4
extern char *vmname;
static pthread_t mevent_tid;
static int mevent_timid = 43;
static int mevent_pipefd[2];
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
struct mevent {
void (*me_func)(int, enum ev_type, void *);
#define me_msecs me_fd
int me_fd;
int me_timid;
enum ev_type me_type;
void *me_param;
int me_cq;
int me_state;
int me_closefd;
LIST_ENTRY(mevent) me_list;
};
static LIST_HEAD(listhead, mevent) global_head, change_head;
static void
mevent_qlock(void)
{
pthread_mutex_lock(&mevent_lmutex);
}
static void
mevent_qunlock(void)
{
pthread_mutex_unlock(&mevent_lmutex);
}
static void
mevent_pipe_read(int fd, enum ev_type type, void *param)
{
char buf[MEVENT_MAX];
int status;
/*
* Drain the pipe read side. The fd is non-blocking so this is
* safe to do.
*/
do {
status = read(fd, buf, sizeof(buf));
} while (status == MEVENT_MAX);
}
static void
mevent_notify(void)
{
char c;
/*
* If calling from outside the i/o thread, write a byte on the
* pipe to force the i/o thread to exit the blocking kevent call.
*/
if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
write(mevent_pipefd[1], &c, 1);
}
}
static int
mevent_kq_filter(struct mevent *mevp)
{
int retval;
retval = 0;
if (mevp->me_type == EVF_READ)
retval = EVFILT_READ;
if (mevp->me_type == EVF_WRITE)
retval = EVFILT_WRITE;
if (mevp->me_type == EVF_TIMER)
retval = EVFILT_TIMER;
if (mevp->me_type == EVF_SIGNAL)
retval = EVFILT_SIGNAL;
return (retval);
}
static int
mevent_kq_flags(struct mevent *mevp)
{
int ret;
switch (mevp->me_state) {
case MEV_ADD:
ret = EV_ADD; /* implicitly enabled */
break;
case MEV_ENABLE:
ret = EV_ENABLE;
break;
case MEV_DISABLE:
ret = EV_DISABLE;
break;
case MEV_DEL_PENDING:
ret = EV_DELETE;
break;
default:
assert(0);
break;
}
return (ret);
}
static int
mevent_kq_fflags(struct mevent *mevp)
{
/* XXX nothing yet, perhaps EV_EOF for reads ? */
return (0);
}
static int
mevent_build(int mfd, struct kevent *kev)
{
struct mevent *mevp, *tmpp;
int i;
i = 0;
mevent_qlock();
LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
if (mevp->me_closefd) {
/*
* A close of the file descriptor will remove the
* event
*/
close(mevp->me_fd);
} else {
if (mevp->me_type == EVF_TIMER) {
kev[i].ident = mevp->me_timid;
kev[i].data = mevp->me_msecs;
} else {
kev[i].ident = mevp->me_fd;
kev[i].data = 0;
}
kev[i].filter = mevent_kq_filter(mevp);
kev[i].flags = mevent_kq_flags(mevp);
kev[i].fflags = mevent_kq_fflags(mevp);
kev[i].udata = mevp;
i++;
}
mevp->me_cq = 0;
LIST_REMOVE(mevp, me_list);
if (mevp->me_state == MEV_DEL_PENDING) {
free(mevp);
} else {
LIST_INSERT_HEAD(&global_head, mevp, me_list);
}
assert(i < MEVENT_MAX);
}
mevent_qunlock();
return (i);
}
static void
mevent_handle(struct kevent *kev, int numev)
{
struct mevent *mevp;
int i;
for (i = 0; i < numev; i++) {
mevp = kev[i].udata;
/* XXX check for EV_ERROR ? */
(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
}
}
struct mevent *
mevent_add(int tfd, enum ev_type type,
void (*func)(int, enum ev_type, void *), void *param)
{
struct mevent *lp, *mevp;
if (tfd < 0 || func == NULL) {
return (NULL);
}
mevp = NULL;
mevent_qlock();
/*
* Verify that the fd/type tuple is not present in any list
*/
LIST_FOREACH(lp, &global_head, me_list) {
if (type != EVF_TIMER && lp->me_fd == tfd &&
lp->me_type == type) {
goto exit;
}
}
LIST_FOREACH(lp, &change_head, me_list) {
if (type != EVF_TIMER && lp->me_fd == tfd &&
lp->me_type == type) {
goto exit;
}
}
/*
* Allocate an entry, populate it, and add it to the change list.
*/
mevp = calloc(1, sizeof(struct mevent));
if (mevp == NULL) {
goto exit;
}
if (type == EVF_TIMER) {
mevp->me_msecs = tfd;
mevp->me_timid = mevent_timid++;
} else
mevp->me_fd = tfd;
mevp->me_type = type;
mevp->me_func = func;
mevp->me_param = param;
LIST_INSERT_HEAD(&change_head, mevp, me_list);
mevp->me_cq = 1;
mevp->me_state = MEV_ADD;
mevent_notify();
exit:
mevent_qunlock();
return (mevp);
}
static int
mevent_update(struct mevent *evp, int newstate)
{
/*
* It's not possible to enable/disable a deleted event
*/
if (evp->me_state == MEV_DEL_PENDING)
return (EINVAL);
/*
* No update needed if state isn't changing
*/
if (evp->me_state == newstate)
return (0);
mevent_qlock();
evp->me_state = newstate;
/*
* Place the entry onto the changed list if not already there.
*/
if (evp->me_cq == 0) {
evp->me_cq = 1;
LIST_REMOVE(evp, me_list);
LIST_INSERT_HEAD(&change_head, evp, me_list);
mevent_notify();
}
mevent_qunlock();
return (0);
}
int
mevent_enable(struct mevent *evp)
{
return (mevent_update(evp, MEV_ENABLE));
}
int
mevent_disable(struct mevent *evp)
{
return (mevent_update(evp, MEV_DISABLE));
}
static int
mevent_delete_event(struct mevent *evp, int closefd)
{
mevent_qlock();
/*
* Place the entry onto the changed list if not already there, and
* mark as to be deleted.
*/
if (evp->me_cq == 0) {
evp->me_cq = 1;
LIST_REMOVE(evp, me_list);
LIST_INSERT_HEAD(&change_head, evp, me_list);
mevent_notify();
}
evp->me_state = MEV_DEL_PENDING;
if (closefd)
evp->me_closefd = 1;
mevent_qunlock();
return (0);
}
int
mevent_delete(struct mevent *evp)
{
return (mevent_delete_event(evp, 0));
}
int
mevent_delete_close(struct mevent *evp)
{
return (mevent_delete_event(evp, 1));
}
static void
mevent_set_name(void)
{
pthread_set_name_np(mevent_tid, "mevent");
}
void
mevent_dispatch(void)
{
struct kevent changelist[MEVENT_MAX];
struct kevent eventlist[MEVENT_MAX];
struct mevent *pipev;
int mfd;
int numev;
int ret;
mevent_tid = pthread_self();
mevent_set_name();
mfd = kqueue();
assert(mfd > 0);
/*
* Open the pipe that will be used for other threads to force
* the blocking kqueue call to exit by writing to it. Set the
* descriptor to non-blocking.
*/
ret = pipe(mevent_pipefd);
if (ret < 0) {
perror("pipe");
exit(0);
}
/*
* Add internal event handler for the pipe write fd
*/
pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
assert(pipev != NULL);
for (;;) {
/*
* Build changelist if required.
* XXX the changelist can be put into the blocking call
* to eliminate the extra syscall. Currently better for
* debug.
*/
numev = mevent_build(mfd, changelist);
if (numev) {
ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
if (ret == -1) {
perror("Error return from kevent change");
}
}
/*
* Block awaiting events
*/
ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
if (ret == -1 && errno != EINTR) {
perror("Error return from kevent monitor");
}
/*
* Handle reported events
*/
mevent_handle(eventlist, ret);
}
}

51
mevent.h Normal file
View File

@ -0,0 +1,51 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MEVENT_H_
#define _MEVENT_H_
enum ev_type {
EVF_READ,
EVF_WRITE,
EVF_TIMER,
EVF_SIGNAL
};
struct mevent;
struct mevent *mevent_add(int fd, enum ev_type type,
void (*func)(int, enum ev_type, void *),
void *param);
int mevent_enable(struct mevent *evp);
int mevent_disable(struct mevent *evp);
int mevent_delete(struct mevent *evp);
int mevent_delete_close(struct mevent *evp);
void mevent_dispatch(void);
#endif /* _MEVENT_H_ */

256
mevent_test.c Normal file
View File

@ -0,0 +1,256 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Test program for the micro event library. Set up a simple TCP echo
* service.
*
* cc mevent_test.c mevent.c -lpthread
*/
#include <sys/types.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <machine/cpufunc.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>
#include "mevent.h"
#define TEST_PORT 4321
static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
static struct mevent *tevp;
char *vmname = "test vm";
#define MEVENT_ECHO
/* Number of timer events to capture */
#define TEVSZ 4096
uint64_t tevbuf[TEVSZ];
static void
timer_print(void)
{
uint64_t min, max, diff, sum, tsc_freq;
size_t len;
int j;
min = UINT64_MAX;
max = 0;
sum = 0;
len = sizeof(tsc_freq);
sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0);
for (j = 1; j < TEVSZ; j++) {
/* Convert a tsc diff into microseconds */
diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq;
sum += diff;
if (min > diff)
min = diff;
if (max < diff)
max = diff;
}
printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max,
sum/(TEVSZ - 1));
}
static void
timer_callback(int fd, enum ev_type type, void *param)
{
static int i;
if (i >= TEVSZ)
abort();
tevbuf[i++] = rdtsc();
if (i == TEVSZ) {
mevent_delete(tevp);
timer_print();
}
}
#ifdef MEVENT_ECHO
struct esync {
pthread_mutex_t e_mt;
pthread_cond_t e_cond;
};
static void
echoer_callback(int fd, enum ev_type type, void *param)
{
struct esync *sync = param;
pthread_mutex_lock(&sync->e_mt);
pthread_cond_signal(&sync->e_cond);
pthread_mutex_unlock(&sync->e_mt);
}
static void *
echoer(void *param)
{
struct esync sync;
struct mevent *mev;
char buf[128];
int fd = (int)(uintptr_t) param;
int len;
pthread_mutex_init(&sync.e_mt, NULL);
pthread_cond_init(&sync.e_cond, NULL);
pthread_mutex_lock(&sync.e_mt);
mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
if (mev == NULL) {
printf("Could not allocate echoer event\n");
exit(1);
}
while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
len = read(fd, buf, sizeof(buf));
if (len > 0) {
write(fd, buf, len);
write(0, buf, len);
} else {
break;
}
}
mevent_delete_close(mev);
pthread_mutex_unlock(&sync.e_mt);
pthread_mutex_destroy(&sync.e_mt);
pthread_cond_destroy(&sync.e_cond);
return (NULL);
}
#else
static void *
echoer(void *param)
{
char buf[128];
int fd = (int)(uintptr_t) param;
int len;
while ((len = read(fd, buf, sizeof(buf))) > 0) {
write(1, buf, len);
}
return (NULL);
}
#endif /* MEVENT_ECHO */
static void
acceptor_callback(int fd, enum ev_type type, void *param)
{
pthread_mutex_lock(&accept_mutex);
pthread_cond_signal(&accept_condvar);
pthread_mutex_unlock(&accept_mutex);
}
static void *
acceptor(void *param)
{
struct sockaddr_in sin;
pthread_t tid;
int news;
int s;
static int first;
if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(1);
}
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(TEST_PORT);
if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("bind");
exit(1);
}
if (listen(s, 1) < 0) {
perror("listen");
exit(1);
}
(void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
pthread_mutex_lock(&accept_mutex);
while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
news = accept(s, NULL, NULL);
if (news < 0) {
perror("accept error");
} else {
static int first = 1;
if (first) {
/*
* Start a timer
*/
first = 0;
tevp = mevent_add(1, EVF_TIMER, timer_callback,
NULL);
}
printf("incoming connection, spawning thread\n");
pthread_create(&tid, NULL, echoer,
(void *)(uintptr_t)news);
}
}
return (NULL);
}
main()
{
pthread_t tid;
pthread_create(&tid, NULL, acceptor, NULL);
mevent_dispatch();
}

377
mptbl.c Normal file
View File

@ -0,0 +1,377 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/errno.h>
#include <x86/mptable.h>
#include <stdio.h>
#include <string.h>
#include "acpi.h"
#include "bhyverun.h"
#include "mptbl.h"
#include "pci_emul.h"
#define MPTABLE_BASE 0xF0000
/* floating pointer length + maximum length of configuration table */
#define MPTABLE_MAX_LENGTH (65536 + 16)
#define LAPIC_PADDR 0xFEE00000
#define LAPIC_VERSION 16
#define IOAPIC_PADDR 0xFEC00000
#define IOAPIC_VERSION 0x11
#define MP_SPECREV 4
#define MPFP_SIG "_MP_"
/* Configuration header defines */
#define MPCH_SIG "PCMP"
#define MPCH_OEMID "BHyVe "
#define MPCH_OEMID_LEN 8
#define MPCH_PRODID "Hypervisor "
#define MPCH_PRODID_LEN 12
/* Processor entry defines */
#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */
#define MPEP_SIG_MODEL 26
#define MPEP_SIG_STEPPING 5
#define MPEP_SIG \
((MPEP_SIG_FAMILY << 8) | \
(MPEP_SIG_MODEL << 4) | \
(MPEP_SIG_STEPPING))
#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */
/* Number of local intr entries */
#define MPEII_NUM_LOCAL_IRQ 2
/* Bus entry defines */
#define MPE_NUM_BUSES 2
#define MPE_BUSNAME_LEN 6
#define MPE_BUSNAME_ISA "ISA "
#define MPE_BUSNAME_PCI "PCI "
static void *oem_tbl_start;
static int oem_tbl_size;
static uint8_t
mpt_compute_checksum(void *base, size_t len)
{
uint8_t *bytes;
uint8_t sum;
for(bytes = base, sum = 0; len > 0; len--) {
sum += *bytes++;
}
return (256 - sum);
}
static void
mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa)
{
memset(mpfp, 0, sizeof(*mpfp));
memcpy(mpfp->signature, MPFP_SIG, 4);
mpfp->pap = gpa + sizeof(*mpfp);
mpfp->length = 1;
mpfp->spec_rev = MP_SPECREV;
mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp));
}
static void
mpt_build_mpch(mpcth_t mpch)
{
memset(mpch, 0, sizeof(*mpch));
memcpy(mpch->signature, MPCH_SIG, 4);
mpch->spec_rev = MP_SPECREV;
memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN);
memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN);
mpch->apic_address = LAPIC_PADDR;
}
static void
mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu)
{
int i;
for (i = 0; i < ncpu; i++) {
memset(mpep, 0, sizeof(*mpep));
mpep->type = MPCT_ENTRY_PROCESSOR;
mpep->apic_id = i; // XXX
mpep->apic_version = LAPIC_VERSION;
mpep->cpu_flags = PROCENTRY_FLAG_EN;
if (i == 0)
mpep->cpu_flags |= PROCENTRY_FLAG_BP;
mpep->cpu_signature = MPEP_SIG;
mpep->feature_flags = MPEP_FEATURES;
mpep++;
}
}
static void
mpt_build_localint_entries(int_entry_ptr mpie)
{
/* Hardcode LINT0 as ExtINT on all CPUs. */
memset(mpie, 0, sizeof(*mpie));
mpie->type = MPCT_ENTRY_LOCAL_INT;
mpie->int_type = INTENTRY_TYPE_EXTINT;
mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM |
INTENTRY_FLAGS_TRIGGER_CONFORM;
mpie->dst_apic_id = 0xff;
mpie->dst_apic_int = 0;
mpie++;
/* Hardcode LINT1 as NMI on all CPUs. */
memset(mpie, 0, sizeof(*mpie));
mpie->type = MPCT_ENTRY_LOCAL_INT;
mpie->int_type = INTENTRY_TYPE_NMI;
mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM |
INTENTRY_FLAGS_TRIGGER_CONFORM;
mpie->dst_apic_id = 0xff;
mpie->dst_apic_int = 1;
}
static void
mpt_build_bus_entries(bus_entry_ptr mpeb)
{
memset(mpeb, 0, sizeof(*mpeb));
mpeb->type = MPCT_ENTRY_BUS;
mpeb->bus_id = 0;
memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
mpeb++;
memset(mpeb, 0, sizeof(*mpeb));
mpeb->type = MPCT_ENTRY_BUS;
mpeb->bus_id = 1;
memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
}
static void
mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
{
memset(mpei, 0, sizeof(*mpei));
mpei->type = MPCT_ENTRY_IOAPIC;
mpei->apic_id = id;
mpei->apic_version = IOAPIC_VERSION;
mpei->apic_flags = IOAPICENTRY_FLAG_EN;
mpei->apic_address = IOAPIC_PADDR;
}
static int
mpt_count_ioint_entries(void)
{
int bus, count;
count = 0;
for (bus = 0; bus <= PCI_BUSMAX; bus++)
count += pci_count_lintr(bus);
/*
* Always include entries for the first 16 pins along with a entry
* for each active PCI INTx pin.
*/
return (16 + count);
}
static void
mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
void *arg)
{
int_entry_ptr *mpiep, mpie;
mpiep = arg;
mpie = *mpiep;
memset(mpie, 0, sizeof(*mpie));
/*
* This is always after another I/O interrupt entry, so cheat
* and fetch the I/O APIC ID from the prior entry.
*/
mpie->type = MPCT_ENTRY_INT;
mpie->int_type = INTENTRY_TYPE_INT;
mpie->src_bus_id = bus;
mpie->src_bus_irq = slot << 2 | (pin - 1);
mpie->dst_apic_id = mpie[-1].dst_apic_id;
mpie->dst_apic_int = ioapic_irq;
*mpiep = mpie + 1;
}
static void
mpt_build_ioint_entries(int_entry_ptr mpie, int id)
{
int pin, bus;
/*
* The following config is taken from kernel mptable.c
* mptable_parse_default_config_ints(...), for now
* just use the default config, tweek later if needed.
*/
/* First, generate the first 16 pins. */
for (pin = 0; pin < 16; pin++) {
memset(mpie, 0, sizeof(*mpie));
mpie->type = MPCT_ENTRY_INT;
mpie->src_bus_id = 1;
mpie->dst_apic_id = id;
/*
* All default configs route IRQs from bus 0 to the first 16
* pins of the first I/O APIC with an APIC ID of 2.
*/
mpie->dst_apic_int = pin;
switch (pin) {
case 0:
/* Pin 0 is an ExtINT pin. */
mpie->int_type = INTENTRY_TYPE_EXTINT;
break;
case 2:
/* IRQ 0 is routed to pin 2. */
mpie->int_type = INTENTRY_TYPE_INT;
mpie->src_bus_irq = 0;
break;
case SCI_INT:
/* ACPI SCI is level triggered and active-lo. */
mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO |
INTENTRY_FLAGS_TRIGGER_LEVEL;
mpie->int_type = INTENTRY_TYPE_INT;
mpie->src_bus_irq = SCI_INT;
break;
default:
/* All other pins are identity mapped. */
mpie->int_type = INTENTRY_TYPE_INT;
mpie->src_bus_irq = pin;
break;
}
mpie++;
}
/* Next, generate entries for any PCI INTx interrupts. */
for (bus = 0; bus <= PCI_BUSMAX; bus++)
pci_walk_lintr(bus, mpt_generate_pci_int, &mpie);
}
void
mptable_add_oemtbl(void *tbl, int tblsz)
{
oem_tbl_start = tbl;
oem_tbl_size = tblsz;
}
int
mptable_build(struct vmctx *ctx, int ncpu)
{
mpcth_t mpch;
bus_entry_ptr mpeb;
io_apic_entry_ptr mpei;
proc_entry_ptr mpep;
mpfps_t mpfp;
int_entry_ptr mpie;
int ioints, bus;
char *curraddr;
char *startaddr;
startaddr = paddr_guest2host(ctx, MPTABLE_BASE, MPTABLE_MAX_LENGTH);
if (startaddr == NULL) {
fprintf(stderr, "mptable requires mapped mem\n");
return (ENOMEM);
}
/*
* There is no way to advertise multiple PCI hierarchies via MPtable
* so require that there is no PCI hierarchy with a non-zero bus
* number.
*/
for (bus = 1; bus <= PCI_BUSMAX; bus++) {
if (pci_bus_configured(bus)) {
fprintf(stderr, "MPtable is incompatible with "
"multiple PCI hierarchies.\r\n");
fprintf(stderr, "MPtable generation can be disabled "
"by passing the -Y option to bhyve(8).\r\n");
return (EINVAL);
}
}
curraddr = startaddr;
mpfp = (mpfps_t)curraddr;
mpt_build_mpfp(mpfp, MPTABLE_BASE);
curraddr += sizeof(*mpfp);
mpch = (mpcth_t)curraddr;
mpt_build_mpch(mpch);
curraddr += sizeof(*mpch);
mpep = (proc_entry_ptr)curraddr;
mpt_build_proc_entries(mpep, ncpu);
curraddr += sizeof(*mpep) * ncpu;
mpch->entry_count += ncpu;
mpeb = (bus_entry_ptr) curraddr;
mpt_build_bus_entries(mpeb);
curraddr += sizeof(*mpeb) * MPE_NUM_BUSES;
mpch->entry_count += MPE_NUM_BUSES;
mpei = (io_apic_entry_ptr)curraddr;
mpt_build_ioapic_entries(mpei, 0);
curraddr += sizeof(*mpei);
mpch->entry_count++;
mpie = (int_entry_ptr) curraddr;
ioints = mpt_count_ioint_entries();
mpt_build_ioint_entries(mpie, 0);
curraddr += sizeof(*mpie) * ioints;
mpch->entry_count += ioints;
mpie = (int_entry_ptr)curraddr;
mpt_build_localint_entries(mpie);
curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ;
mpch->entry_count += MPEII_NUM_LOCAL_IRQ;
if (oem_tbl_start) {
mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE;
mpch->oem_table_size = oem_tbl_size;
memcpy(curraddr, oem_tbl_start, oem_tbl_size);
}
mpch->base_table_length = curraddr - (char *)mpch;
mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length);
return (0);
}

35
mptbl.h Normal file
View File

@ -0,0 +1,35 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MPTBL_H_
#define _MPTBL_H_
int mptable_build(struct vmctx *ctx, int ncpu);
void mptable_add_oemtbl(void *tbl, int tblsz);
#endif /* _MPTBL_H_ */

2347
pci_ahci.c Normal file

File diff suppressed because it is too large Load Diff

2108
pci_emul.c Normal file

File diff suppressed because it is too large Load Diff

285
pci_emul.h Normal file
View File

@ -0,0 +1,285 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PCI_EMUL_H_
#define _PCI_EMUL_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/_pthreadtypes.h>
#include <dev/pci/pcireg.h>
#include <assert.h>
#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
struct vmctx;
struct pci_devinst;
struct memory_region;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
/* instance creation */
int (*pe_init)(struct vmctx *, struct pci_devinst *,
char *opts);
/* ACPI DSDT enumeration */
void (*pe_write_dsdt)(struct pci_devinst *);
/* config space read/write callbacks */
int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t val);
int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t *retval);
/* BAR read/write callbacks */
void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size, uint64_t value);
uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size);
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
enum pcibar_type {
PCIBAR_NONE,
PCIBAR_IO,
PCIBAR_MEM32,
PCIBAR_MEM64,
PCIBAR_MEMHI64
};
struct pcibar {
enum pcibar_type type; /* io or memory */
uint64_t size;
uint64_t addr;
};
#define PI_NAMESZ 40
struct msix_table_entry {
uint64_t addr;
uint32_t msg_data;
uint32_t vector_control;
} __packed;
/*
* In case the structure is modified to hold extra information, use a define
* for the size that should be emulated.
*/
#define MSIX_TABLE_ENTRY_SIZE 16
#define MAX_MSIX_TABLE_ENTRIES 2048
#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8)
enum lintr_stat {
IDLE,
ASSERTED,
PENDING
};
struct pci_devinst {
struct pci_devemu *pi_d;
struct vmctx *pi_vmctx;
uint8_t pi_bus, pi_slot, pi_func;
char pi_name[PI_NAMESZ];
int pi_bar_getsize;
int pi_prevcap;
int pi_capend;
struct {
int8_t pin;
enum lintr_stat state;
int pirq_pin;
int ioapic_irq;
pthread_mutex_t lock;
} pi_lintr;
struct {
int enabled;
uint64_t addr;
uint64_t msg_data;
int maxmsgnum;
} pi_msi;
struct {
int enabled;
int table_bar;
int pba_bar;
uint32_t table_offset;
int table_count;
uint32_t pba_offset;
int pba_size;
int function_mask;
struct msix_table_entry *table; /* allocated at runtime */
void *pba_page;
int pba_page_offset;
} pi_msix;
void *pi_arg; /* devemu-private data */
u_char pi_cfgdata[PCI_REGMAX + 1];
struct pcibar pi_bar[PCI_BARMAX + 1];
};
struct msicap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t addrlo;
uint32_t addrhi;
uint16_t msgdata;
} __packed;
struct msixcap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t table_info; /* bar index and offset within it */
uint32_t pba_info; /* bar index and offset within it */
} __packed;
struct pciecap {
uint8_t capid;
uint8_t nextptr;
uint16_t pcie_capabilities;
uint32_t dev_capabilities; /* all devices */
uint16_t dev_control;
uint16_t dev_status;
uint32_t link_capabilities; /* devices with links */
uint16_t link_control;
uint16_t link_status;
uint32_t slot_capabilities; /* ports with slots */
uint16_t slot_control;
uint16_t slot_status;
uint16_t root_control; /* root ports */
uint16_t root_capabilities;
uint32_t root_status;
uint32_t dev_capabilities2; /* all devices */
uint16_t dev_control2;
uint16_t dev_status2;
uint32_t link_capabilities2; /* devices with links */
uint16_t link_control2;
uint16_t link_status2;
uint32_t slot_capabilities2; /* ports with slots */
uint16_t slot_control2;
uint16_t slot_status2;
} __packed;
typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin,
int ioapic_irq, void *arg);
int init_pci(struct vmctx *ctx);
void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
void pci_callback(void);
int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
enum pcibar_type type, uint64_t size);
int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
uint64_t hostbase, enum pcibar_type type, uint64_t size);
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
void pci_generate_msi(struct pci_devinst *pi, int msgnum);
void pci_generate_msix(struct pci_devinst *pi, int msgnum);
void pci_lintr_assert(struct pci_devinst *pi);
void pci_lintr_deassert(struct pci_devinst *pi);
void pci_lintr_request(struct pci_devinst *pi);
int pci_msi_enabled(struct pci_devinst *pi);
int pci_msix_enabled(struct pci_devinst *pi);
int pci_msix_table_bar(struct pci_devinst *pi);
int pci_msix_pba_bar(struct pci_devinst *pi);
int pci_msi_msgnum(struct pci_devinst *pi);
int pci_parse_slot(char *opt);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
uint64_t value);
uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
int pci_count_lintr(int bus);
void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
void pci_write_dsdt(void);
uint64_t pci_ecfg_base(void);
int pci_bus_configured(int bus);
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
{
assert(offset <= PCI_REGMAX);
*(uint8_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
*(uint16_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
*(uint32_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline uint8_t
pci_get_cfgdata8(struct pci_devinst *pi, int offset)
{
assert(offset <= PCI_REGMAX);
return (*(uint8_t *)(pi->pi_cfgdata + offset));
}
static __inline uint16_t
pci_get_cfgdata16(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
return (*(uint16_t *)(pi->pi_cfgdata + offset));
}
static __inline uint32_t
pci_get_cfgdata32(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
return (*(uint32_t *)(pi->pi_cfgdata + offset));
}
#endif /* _PCI_EMUL_H_ */

70
pci_hostbridge.c Normal file
View File

@ -0,0 +1,70 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "pci_emul.h"
static int
pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
/* config space */
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT);
return (0);
}
static int
pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
(void) pci_hostbridge_init(ctx, pi, opts);
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */
return (0);
}
struct pci_devemu pci_de_amd_hostbridge = {
.pe_emu = "amd_hostbridge",
.pe_init = pci_amd_hostbridge_init,
};
PCI_EMUL_SET(pci_de_amd_hostbridge);
struct pci_devemu pci_de_hostbridge = {
.pe_emu = "hostbridge",
.pe_init = pci_hostbridge_init,
};
PCI_EMUL_SET(pci_de_hostbridge);

346
pci_irq.c Normal file
View File

@ -0,0 +1,346 @@
/*-
* Copyright (c) 2014 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <machine/vmm.h>
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <vmmapi.h>
#include "acpi.h"
#include "inout.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
/*
* Implement an 8 pin PCI interrupt router compatible with the router
* present on Intel's ICH10 chip.
*/
/* Fields in each PIRQ register. */
#define PIRQ_DIS 0x80
#define PIRQ_IRQ 0x0f
/* Only IRQs 3-7, 9-12, and 14-15 are permitted. */
#define PERMITTED_IRQS 0xdef8
#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0)
/* IRQ count to disable an IRQ. */
#define IRQ_DISABLED 0xff
static struct pirq {
uint8_t reg;
int use_count;
int active_count;
pthread_mutex_t lock;
} pirqs[8];
static u_char irq_counts[16];
static int pirq_cold = 1;
/*
* Returns true if this pin is enabled with a valid IRQ. Setting the
* register to a reserved IRQ causes interrupts to not be asserted as
* if the pin was disabled.
*/
static bool
pirq_valid_irq(int reg)
{
if (reg & PIRQ_DIS)
return (false);
return (IRQ_PERMITTED(reg & PIRQ_IRQ));
}
uint8_t
pirq_read(int pin)
{
assert(pin > 0 && pin <= nitems(pirqs));
return (pirqs[pin - 1].reg);
}
void
pirq_write(struct vmctx *ctx, int pin, uint8_t val)
{
struct pirq *pirq;
assert(pin > 0 && pin <= nitems(pirqs));
pirq = &pirqs[pin - 1];
pthread_mutex_lock(&pirq->lock);
if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) {
if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ);
if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
}
pthread_mutex_unlock(&pirq->lock);
}
void
pci_irq_reserve(int irq)
{
assert(irq >= 0 && irq < nitems(irq_counts));
assert(pirq_cold);
assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);
irq_counts[irq] = IRQ_DISABLED;
}
void
pci_irq_use(int irq)
{
assert(irq >= 0 && irq < nitems(irq_counts));
assert(pirq_cold);
assert(irq_counts[irq] != IRQ_DISABLED);
irq_counts[irq]++;
}
void
pci_irq_init(struct vmctx *ctx)
{
int i;
for (i = 0; i < nitems(pirqs); i++) {
pirqs[i].reg = PIRQ_DIS;
pirqs[i].use_count = 0;
pirqs[i].active_count = 0;
pthread_mutex_init(&pirqs[i].lock, NULL);
}
for (i = 0; i < nitems(irq_counts); i++) {
if (IRQ_PERMITTED(i))
irq_counts[i] = 0;
else
irq_counts[i] = IRQ_DISABLED;
}
}
void
pci_irq_assert(struct pci_devinst *pi)
{
struct pirq *pirq;
if (pi->pi_lintr.pirq_pin > 0) {
assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
pthread_mutex_lock(&pirq->lock);
pirq->active_count++;
if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) {
vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
pi->pi_lintr.ioapic_irq);
pthread_mutex_unlock(&pirq->lock);
return;
}
pthread_mutex_unlock(&pirq->lock);
}
vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
}
void
pci_irq_deassert(struct pci_devinst *pi)
{
struct pirq *pirq;
if (pi->pi_lintr.pirq_pin > 0) {
assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
pthread_mutex_lock(&pirq->lock);
pirq->active_count--;
if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) {
vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
pi->pi_lintr.ioapic_irq);
pthread_mutex_unlock(&pirq->lock);
return;
}
pthread_mutex_unlock(&pirq->lock);
}
vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
}
int
pirq_alloc_pin(struct vmctx *ctx)
{
int best_count, best_irq, best_pin, irq, pin;
pirq_cold = 0;
/* First, find the least-used PIRQ pin. */
best_pin = 0;
best_count = pirqs[0].use_count;
for (pin = 1; pin < nitems(pirqs); pin++) {
if (pirqs[pin].use_count < best_count) {
best_pin = pin;
best_count = pirqs[pin].use_count;
}
}
pirqs[best_pin].use_count++;
/* Second, route this pin to an IRQ. */
if (pirqs[best_pin].reg == PIRQ_DIS) {
best_irq = -1;
best_count = 0;
for (irq = 0; irq < nitems(irq_counts); irq++) {
if (irq_counts[irq] == IRQ_DISABLED)
continue;
if (best_irq == -1 || irq_counts[irq] < best_count) {
best_irq = irq;
best_count = irq_counts[irq];
}
}
assert(best_irq >= 0);
irq_counts[best_irq]++;
pirqs[best_pin].reg = best_irq;
vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER);
}
return (best_pin + 1);
}
int
pirq_irq(int pin)
{
assert(pin > 0 && pin <= nitems(pirqs));
return (pirqs[pin - 1].reg & PIRQ_IRQ);
}
/* XXX: Generate $PIR table. */
static void
pirq_dsdt(void)
{
char *irq_prs, *old;
int irq, pin;
irq_prs = NULL;
for (irq = 0; irq < nitems(irq_counts); irq++) {
if (!IRQ_PERMITTED(irq))
continue;
if (irq_prs == NULL)
asprintf(&irq_prs, "%d", irq);
else {
old = irq_prs;
asprintf(&irq_prs, "%s,%d", old, irq);
free(old);
}
}
/*
* A helper method to validate a link register's value. This
* duplicates pirq_valid_irq().
*/
dsdt_line("");
dsdt_line("Method (PIRV, 1, NotSerialized)");
dsdt_line("{");
dsdt_line(" If (And (Arg0, 0x%02X))", PIRQ_DIS);
dsdt_line(" {");
dsdt_line(" Return (0x00)");
dsdt_line(" }");
dsdt_line(" And (Arg0, 0x%02X, Local0)", PIRQ_IRQ);
dsdt_line(" If (LLess (Local0, 0x03))");
dsdt_line(" {");
dsdt_line(" Return (0x00)");
dsdt_line(" }");
dsdt_line(" If (LEqual (Local0, 0x08))");
dsdt_line(" {");
dsdt_line(" Return (0x00)");
dsdt_line(" }");
dsdt_line(" If (LEqual (Local0, 0x0D))");
dsdt_line(" {");
dsdt_line(" Return (0x00)");
dsdt_line(" }");
dsdt_line(" Return (0x01)");
dsdt_line("}");
for (pin = 0; pin < nitems(pirqs); pin++) {
dsdt_line("");
dsdt_line("Device (LNK%c)", 'A' + pin);
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0C0F\"))");
dsdt_line(" Name (_UID, 0x%02X)", pin + 1);
dsdt_line(" Method (_STA, 0, NotSerialized)");
dsdt_line(" {");
dsdt_line(" If (PIRV (PIR%c))", 'A' + pin);
dsdt_line(" {");
dsdt_line(" Return (0x0B)");
dsdt_line(" }");
dsdt_line(" Else");
dsdt_line(" {");
dsdt_line(" Return (0x09)");
dsdt_line(" }");
dsdt_line(" }");
dsdt_line(" Name (_PRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_line(" IRQ (Level, ActiveLow, Shared, )");
dsdt_line(" {%s}", irq_prs);
dsdt_line(" })");
dsdt_line(" Name (CB%02X, ResourceTemplate ()", pin + 1);
dsdt_line(" {");
dsdt_line(" IRQ (Level, ActiveLow, Shared, )");
dsdt_line(" {}");
dsdt_line(" })");
dsdt_line(" CreateWordField (CB%02X, 0x01, CIR%c)",
pin + 1, 'A' + pin);
dsdt_line(" Method (_CRS, 0, NotSerialized)");
dsdt_line(" {");
dsdt_line(" And (PIR%c, 0x%02X, Local0)", 'A' + pin,
PIRQ_DIS | PIRQ_IRQ);
dsdt_line(" If (PIRV (Local0))");
dsdt_line(" {");
dsdt_line(" ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin);
dsdt_line(" }");
dsdt_line(" Else");
dsdt_line(" {");
dsdt_line(" Store (0x00, CIR%c)", 'A' + pin);
dsdt_line(" }");
dsdt_line(" Return (CB%02X)", pin + 1);
dsdt_line(" }");
dsdt_line(" Method (_DIS, 0, NotSerialized)");
dsdt_line(" {");
dsdt_line(" Store (0x80, PIR%c)", 'A' + pin);
dsdt_line(" }");
dsdt_line(" Method (_SRS, 1, NotSerialized)");
dsdt_line(" {");
dsdt_line(" CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin);
dsdt_line(" FindSetRightBit (SIR%c, Local0)", 'A' + pin);
dsdt_line(" Store (Decrement (Local0), PIR%c)", 'A' + pin);
dsdt_line(" }");
dsdt_line("}");
}
free(irq_prs);
}
LPC_DSDT(pirq_dsdt);

45
pci_irq.h Normal file
View File

@ -0,0 +1,45 @@
/*-
* Copyright (c) 2014 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef __PCI_IRQ_H__
#define __PCI_IRQ_H__
struct pci_devinst;
void pci_irq_assert(struct pci_devinst *pi);
void pci_irq_deassert(struct pci_devinst *pi);
void pci_irq_init(struct vmctx *ctx);
void pci_irq_reserve(int irq);
void pci_irq_use(int irq);
int pirq_alloc_pin(struct vmctx *ctx);
int pirq_irq(int pin);
uint8_t pirq_read(int pin);
void pirq_write(struct vmctx *ctx, int pin, uint8_t val);
#endif

450
pci_lpc.c Normal file
View File

@ -0,0 +1,450 @@
/*-
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vmmapi.h>
#include "acpi.h"
#include "bootrom.h"
#include "inout.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
#include "uart_emul.h"
#define IO_ICU1 0x20
#define IO_ICU2 0xA0
SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt);
SET_DECLARE(lpc_sysres_set, struct lpc_sysres);
#define ELCR_PORT 0x4d0
SYSRES_IO(ELCR_PORT, 2);
#define IO_TIMER1_PORT 0x40
#define NMISC_PORT 0x61
SYSRES_IO(NMISC_PORT, 1);
static struct pci_devinst *lpc_bridge;
static const char *romfile;
#define LPC_UART_NUM 2
static struct lpc_uart_softc {
struct uart_softc *uart_softc;
const char *opts;
int iobase;
int irq;
int enabled;
} lpc_uart_softc[LPC_UART_NUM];
static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
/*
* LPC device configuration is in the following form:
* <lpc_device_name>[,<options>]
* For e.g. "com1,stdio" or "bootrom,/var/romfile"
*/
int
lpc_device_parse(const char *opts)
{
int unit, error;
char *str, *cpy, *lpcdev;
error = -1;
str = cpy = strdup(opts);
lpcdev = strsep(&str, ",");
if (lpcdev != NULL) {
if (strcasecmp(lpcdev, "bootrom") == 0) {
romfile = str;
error = 0;
goto done;
}
for (unit = 0; unit < LPC_UART_NUM; unit++) {
if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) {
lpc_uart_softc[unit].opts = str;
error = 0;
goto done;
}
}
}
done:
if (error)
free(cpy);
return (error);
}
const char *
lpc_bootrom(void)
{
return (romfile);
}
static void
lpc_uart_intr_assert(void *arg)
{
struct lpc_uart_softc *sc = arg;
assert(sc->irq >= 0);
vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq);
}
static void
lpc_uart_intr_deassert(void *arg)
{
/*
* The COM devices on the LPC bus generate edge triggered interrupts,
* so nothing more to do here.
*/
}
static int
lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int offset;
struct lpc_uart_softc *sc = arg;
offset = port - sc->iobase;
switch (bytes) {
case 1:
if (in)
*eax = uart_read(sc->uart_softc, offset);
else
uart_write(sc->uart_softc, offset, *eax);
break;
case 2:
if (in) {
*eax = uart_read(sc->uart_softc, offset);
*eax |= uart_read(sc->uart_softc, offset + 1) << 8;
} else {
uart_write(sc->uart_softc, offset, *eax);
uart_write(sc->uart_softc, offset + 1, *eax >> 8);
}
break;
default:
return (-1);
}
return (0);
}
static int
lpc_init(struct vmctx *ctx)
{
struct lpc_uart_softc *sc;
struct inout_port iop;
const char *name;
int unit, error;
if (romfile != NULL) {
error = bootrom_init(ctx, romfile);
if (error)
return (error);
}
/* COM1 and COM2 */
for (unit = 0; unit < LPC_UART_NUM; unit++) {
sc = &lpc_uart_softc[unit];
name = lpc_uart_names[unit];
if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) {
fprintf(stderr, "Unable to allocate resources for "
"LPC device %s\n", name);
return (-1);
}
pci_irq_reserve(sc->irq);
sc->uart_softc = uart_init(lpc_uart_intr_assert,
lpc_uart_intr_deassert, sc);
if (uart_set_backend(sc->uart_softc, sc->opts) != 0) {
fprintf(stderr, "Unable to initialize backend '%s' "
"for LPC device %s\n", sc->opts, name);
return (-1);
}
bzero(&iop, sizeof(struct inout_port));
iop.name = name;
iop.port = sc->iobase;
iop.size = UART_IO_BAR_SIZE;
iop.flags = IOPORT_F_INOUT;
iop.handler = lpc_uart_io_handler;
iop.arg = sc;
error = register_inout(&iop);
assert(error == 0);
sc->enabled = 1;
}
return (0);
}
static void
pci_lpc_write_dsdt(struct pci_devinst *pi)
{
struct lpc_dsdt **ldpp, *ldp;
dsdt_line("");
dsdt_line("Device (ISA)");
dsdt_line("{");
dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func);
dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)");
dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)");
dsdt_line(" {");
dsdt_line(" Offset (0x60),");
dsdt_line(" PIRA, 8,");
dsdt_line(" PIRB, 8,");
dsdt_line(" PIRC, 8,");
dsdt_line(" PIRD, 8,");
dsdt_line(" Offset (0x68),");
dsdt_line(" PIRE, 8,");
dsdt_line(" PIRF, 8,");
dsdt_line(" PIRG, 8,");
dsdt_line(" PIRH, 8");
dsdt_line(" }");
dsdt_line("");
dsdt_indent(1);
SET_FOREACH(ldpp, lpc_dsdt_set) {
ldp = *ldpp;
ldp->handler();
}
dsdt_line("");
dsdt_line("Device (PIC)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(IO_ICU1, 2);
dsdt_fixed_ioport(IO_ICU2, 2);
dsdt_fixed_irq(2);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
dsdt_line("");
dsdt_line("Device (TIMR)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(IO_TIMER1_PORT, 4);
dsdt_fixed_irq(0);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
dsdt_unindent(1);
dsdt_line("}");
}
static void
pci_lpc_sysres_dsdt(void)
{
struct lpc_sysres **lspp, *lsp;
dsdt_line("");
dsdt_line("Device (SIO)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
SET_FOREACH(lspp, lpc_sysres_set) {
lsp = *lspp;
switch (lsp->type) {
case LPC_SYSRES_IO:
dsdt_fixed_ioport(lsp->base, lsp->length);
break;
case LPC_SYSRES_MEM:
dsdt_fixed_mem32(lsp->base, lsp->length);
break;
}
}
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
}
LPC_DSDT(pci_lpc_sysres_dsdt);
static void
pci_lpc_uart_dsdt(void)
{
struct lpc_uart_softc *sc;
int unit;
for (unit = 0; unit < LPC_UART_NUM; unit++) {
sc = &lpc_uart_softc[unit];
if (!sc->enabled)
continue;
dsdt_line("");
dsdt_line("Device (%s)", lpc_uart_names[unit]);
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))");
dsdt_line(" Name (_UID, %d)", unit + 1);
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE);
dsdt_fixed_irq(sc->irq);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
}
}
LPC_DSDT(pci_lpc_uart_dsdt);
static int
pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int coff, int bytes, uint32_t val)
{
int pirq_pin;
if (bytes == 1) {
pirq_pin = 0;
if (coff >= 0x60 && coff <= 0x63)
pirq_pin = coff - 0x60 + 1;
if (coff >= 0x68 && coff <= 0x6b)
pirq_pin = coff - 0x68 + 5;
if (pirq_pin != 0) {
pirq_write(ctx, pirq_pin, val);
pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin));
return (0);
}
}
return (-1);
}
static void
pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
}
static uint64_t
pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
return (0);
}
#define LPC_DEV 0x7000
#define LPC_VENDOR 0x8086
static int
pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
/*
* Do not allow more than one LPC bridge to be configured.
*/
if (lpc_bridge != NULL) {
fprintf(stderr, "Only one LPC bridge is allowed.\n");
return (-1);
}
/*
* Enforce that the LPC can only be configured on bus 0. This
* simplifies the ACPI DSDT because it can provide a decode for
* all legacy i/o ports behind bus 0.
*/
if (pi->pi_bus != 0) {
fprintf(stderr, "LPC bridge can be present only on bus 0.\n");
return (-1);
}
if (lpc_init(ctx) != 0)
return (-1);
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV);
pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA);
lpc_bridge = pi;
return (0);
}
char *
lpc_pirq_name(int pin)
{
char *name;
if (lpc_bridge == NULL)
return (NULL);
asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1);
return (name);
}
void
lpc_pirq_routed(void)
{
int pin;
if (lpc_bridge == NULL)
return;
for (pin = 0; pin < 4; pin++)
pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1));
for (pin = 0; pin < 4; pin++)
pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5));
}
struct pci_devemu pci_de_lpc = {
.pe_emu = "lpc",
.pe_init = pci_lpc_init,
.pe_write_dsdt = pci_lpc_write_dsdt,
.pe_cfgwrite = pci_lpc_cfgwrite,
.pe_barwrite = pci_lpc_write,
.pe_barread = pci_lpc_read
};
PCI_EMUL_SET(pci_de_lpc);

73
pci_lpc.h Normal file
View File

@ -0,0 +1,73 @@
/*-
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _LPC_H_
#define _LPC_H_
#include <sys/linker_set.h>
typedef void (*lpc_write_dsdt_t)(void);
struct lpc_dsdt {
lpc_write_dsdt_t handler;
};
#define LPC_DSDT(handler) \
static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \
(handler), \
}; \
DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__))
enum lpc_sysres_type {
LPC_SYSRES_IO,
LPC_SYSRES_MEM
};
struct lpc_sysres {
enum lpc_sysres_type type;
uint32_t base;
uint32_t length;
};
#define LPC_SYSRES(type, base, length) \
static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \
(type), \
(base), \
(length) \
}; \
DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__))
#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length)
#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length)
int lpc_device_parse(const char *opt);
char *lpc_pirq_name(int pin);
void lpc_pirq_routed(void);
const char *lpc_bootrom(void);
#endif

897
pci_passthru.c Normal file
View File

@ -0,0 +1,897 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/pciio.h>
#include <sys/ioctl.h>
#include <dev/io/iodev.h>
#include <dev/pci/pcireg.h>
#include <machine/iodev.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "pci_emul.h"
#include "mem.h"
#ifndef _PATH_DEVPCI
#define _PATH_DEVPCI "/dev/pci"
#endif
#ifndef _PATH_DEVIO
#define _PATH_DEVIO "/dev/io"
#endif
#ifndef _PATH_MEM
#define _PATH_MEM "/dev/mem"
#endif
#define LEGACY_SUPPORT 1
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
#define MSIX_CAPLEN 12
static int pcifd = -1;
static int iofd = -1;
static int memfd = -1;
struct passthru_softc {
struct pci_devinst *psc_pi;
struct pcibar psc_bar[PCI_BARMAX + 1];
struct {
int capoff;
int msgctrl;
int emulated;
} psc_msi;
struct {
int capoff;
} psc_msix;
struct pcisel psc_sel;
};
static int
msi_caplen(int msgctrl)
{
int len;
len = 10; /* minimum length of msi capability */
if (msgctrl & PCIM_MSICTRL_64BIT)
len += 4;
#if 0
/*
* Ignore the 'mask' and 'pending' bits in the MSI capability.
* We'll let the guest manipulate them directly.
*/
if (msgctrl & PCIM_MSICTRL_VECTOR)
len += 10;
#endif
return (len);
}
static uint32_t
read_config(const struct pcisel *sel, long reg, int width)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
return (0); /* XXX */
else
return (pi.pi_data);
}
static void
write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
pi.pi_data = data;
(void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
}
#ifdef LEGACY_SUPPORT
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
int capoff, i;
struct msicap msicap;
u_char *capdata;
pci_populate_msicap(&msicap, msgnum, nextptr);
/*
* XXX
* Copy the msi capability structure in the last 16 bytes of the
* config space. This is wrong because it could shadow something
* useful to the device.
*/
capoff = 256 - roundup(sizeof(msicap), 4);
capdata = (u_char *)&msicap;
for (i = 0; i < sizeof(msicap); i++)
pci_set_cfgdata8(pi, capoff + i, capdata[i]);
return (capoff);
}
#endif /* LEGACY_SUPPORT */
static int
cfginitmsi(struct passthru_softc *sc)
{
int i, ptr, capptr, cap, sts, caplen, table_size;
uint32_t u32;
struct pcisel sel;
struct pci_devinst *pi;
struct msixcap msixcap;
uint32_t *msixcap_ptr;
pi = sc->psc_pi;
sel = sc->psc_sel;
/*
* Parse the capabilities and cache the location of the MSI
* and MSI-X capabilities.
*/
sts = read_config(&sel, PCIR_STATUS, 2);
if (sts & PCIM_STATUS_CAPPRESENT) {
ptr = read_config(&sel, PCIR_CAP_PTR, 1);
while (ptr != 0 && ptr != 0xff) {
cap = read_config(&sel, ptr + PCICAP_ID, 1);
if (cap == PCIY_MSI) {
/*
* Copy the MSI capability into the config
* space of the emulated pci device
*/
sc->psc_msi.capoff = ptr;
sc->psc_msi.msgctrl = read_config(&sel,
ptr + 2, 2);
sc->psc_msi.emulated = 0;
caplen = msi_caplen(sc->psc_msi.msgctrl);
capptr = ptr;
while (caplen > 0) {
u32 = read_config(&sel, capptr, 4);
pci_set_cfgdata32(pi, capptr, u32);
caplen -= 4;
capptr += 4;
}
} else if (cap == PCIY_MSIX) {
/*
* Copy the MSI-X capability
*/
sc->psc_msix.capoff = ptr;
caplen = 12;
msixcap_ptr = (uint32_t*) &msixcap;
capptr = ptr;
while (caplen > 0) {
u32 = read_config(&sel, capptr, 4);
*msixcap_ptr = u32;
pci_set_cfgdata32(pi, capptr, u32);
caplen -= 4;
capptr += 4;
msixcap_ptr++;
}
}
ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
}
}
if (sc->psc_msix.capoff != 0) {
pi->pi_msix.pba_bar =
msixcap.pba_info & PCIM_MSIX_BIR_MASK;
pi->pi_msix.pba_offset =
msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
pi->pi_msix.table_bar =
msixcap.table_info & PCIM_MSIX_BIR_MASK;
pi->pi_msix.table_offset =
msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
/* Allocate the emulated MSI-X table array */
table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
pi->pi_msix.table = calloc(1, table_size);
/* Mask all table entries */
for (i = 0; i < pi->pi_msix.table_count; i++) {
pi->pi_msix.table[i].vector_control |=
PCIM_MSIX_VCTRL_MASK;
}
}
#ifdef LEGACY_SUPPORT
/*
* If the passthrough device does not support MSI then craft a
* MSI capability for it. We link the new MSI capability at the
* head of the list of capabilities.
*/
if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
int origptr, msiptr;
origptr = read_config(&sel, PCIR_CAP_PTR, 1);
msiptr = passthru_add_msicap(pi, 1, origptr);
sc->psc_msi.capoff = msiptr;
sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
sc->psc_msi.emulated = 1;
pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
}
#endif
/* Make sure one of the capabilities is present */
if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
return (-1);
else
return (0);
}
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
struct pci_devinst *pi;
struct msix_table_entry *entry;
uint8_t *src8;
uint16_t *src16;
uint32_t *src32;
uint64_t *src64;
uint64_t data;
size_t entry_offset;
int index;
pi = sc->psc_pi;
if (offset >= pi->pi_msix.pba_offset &&
offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
switch(size) {
case 1:
src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
pi->pi_msix.pba_page_offset);
data = *src8;
break;
case 2:
src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
pi->pi_msix.pba_page_offset);
data = *src16;
break;
case 4:
src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
pi->pi_msix.pba_page_offset);
data = *src32;
break;
case 8:
src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
pi->pi_msix.pba_page_offset);
data = *src64;
break;
default:
return (-1);
}
return (data);
}
if (offset < pi->pi_msix.table_offset)
return (-1);
offset -= pi->pi_msix.table_offset;
index = offset / MSIX_TABLE_ENTRY_SIZE;
if (index >= pi->pi_msix.table_count)
return (-1);
entry = &pi->pi_msix.table[index];
entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
switch(size) {
case 1:
src8 = (uint8_t *)((void *)entry + entry_offset);
data = *src8;
break;
case 2:
src16 = (uint16_t *)((void *)entry + entry_offset);
data = *src16;
break;
case 4:
src32 = (uint32_t *)((void *)entry + entry_offset);
data = *src32;
break;
case 8:
src64 = (uint64_t *)((void *)entry + entry_offset);
data = *src64;
break;
default:
return (-1);
}
return (data);
}
static void
msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
uint64_t offset, int size, uint64_t data)
{
struct pci_devinst *pi;
struct msix_table_entry *entry;
uint8_t *dest8;
uint16_t *dest16;
uint32_t *dest32;
uint64_t *dest64;
size_t entry_offset;
uint32_t vector_control;
int error, index;
pi = sc->psc_pi;
if (offset >= pi->pi_msix.pba_offset &&
offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
switch(size) {
case 1:
dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
pi->pi_msix.pba_page_offset);
*dest8 = data;
break;
case 2:
dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
pi->pi_msix.pba_page_offset);
*dest16 = data;
break;
case 4:
dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
pi->pi_msix.pba_page_offset);
*dest32 = data;
break;
case 8:
dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
pi->pi_msix.pba_page_offset);
*dest64 = data;
break;
default:
break;
}
return;
}
if (offset < pi->pi_msix.table_offset)
return;
offset -= pi->pi_msix.table_offset;
index = offset / MSIX_TABLE_ENTRY_SIZE;
if (index >= pi->pi_msix.table_count)
return;
entry = &pi->pi_msix.table[index];
entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
/* Only 4 byte naturally-aligned writes are supported */
assert(size == 4);
assert(entry_offset % 4 == 0);
vector_control = entry->vector_control;
dest32 = (uint32_t *)((void *)entry + entry_offset);
*dest32 = data;
/* If MSI-X hasn't been enabled, do nothing */
if (pi->pi_msix.enabled) {
/* If the entry is masked, don't set it up */
if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
(vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
error = vm_setup_pptdev_msix(ctx, vcpu,
sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
sc->psc_sel.pc_func, index, entry->addr,
entry->msg_data, entry->vector_control);
}
}
}
static int
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
{
int b, s, f;
int error, idx;
size_t len, remaining;
uint32_t table_size, table_offset;
uint32_t pba_size, pba_offset;
vm_paddr_t start;
struct pci_devinst *pi = sc->psc_pi;
assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
b = sc->psc_sel.pc_bus;
s = sc->psc_sel.pc_dev;
f = sc->psc_sel.pc_func;
/*
* If the MSI-X table BAR maps memory intended for
* other uses, it is at least assured that the table
* either resides in its own page within the region,
* or it resides in a page shared with only the PBA.
*/
table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
table_size = pi->pi_msix.table_offset - table_offset;
table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
table_size = roundup2(table_size, 4096);
idx = pi->pi_msix.table_bar;
start = pi->pi_bar[idx].addr;
remaining = pi->pi_bar[idx].size;
if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
pba_offset = pi->pi_msix.pba_offset;
pba_size = pi->pi_msix.pba_size;
if (pba_offset >= table_offset + table_size ||
table_offset >= pba_offset + pba_size) {
/*
* If the PBA does not share a page with the MSI-x
* tables, no PBA emulation is required.
*/
pi->pi_msix.pba_page = NULL;
pi->pi_msix.pba_page_offset = 0;
} else {
/*
* The PBA overlaps with either the first or last
* page of the MSI-X table region. Map the
* appropriate page.
*/
if (pba_offset <= table_offset)
pi->pi_msix.pba_page_offset = table_offset;
else
pi->pi_msix.pba_page_offset = table_offset +
table_size - 4096;
pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
PROT_WRITE, MAP_SHARED, memfd, start +
pi->pi_msix.pba_page_offset);
if (pi->pi_msix.pba_page == MAP_FAILED) {
warn(
"Failed to map PBA page for MSI-X on %d/%d/%d",
b, s, f);
return (-1);
}
}
}
/* Map everything before the MSI-X table */
if (table_offset > 0) {
len = table_offset;
error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
if (error)
return (error);
base += len;
start += len;
remaining -= len;
}
/* Skip the MSI-X table */
base += table_size;
start += table_size;
remaining -= table_size;
/* Map everything beyond the end of the MSI-X table */
if (remaining > 0) {
len = remaining;
error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
if (error)
return (error);
}
return (0);
}
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
int i, error;
struct pci_devinst *pi;
struct pci_bar_io bar;
enum pcibar_type bartype;
uint64_t base, size;
pi = sc->psc_pi;
/*
* Initialize BAR registers
*/
for (i = 0; i <= PCI_BARMAX; i++) {
bzero(&bar, sizeof(bar));
bar.pbi_sel = sc->psc_sel;
bar.pbi_reg = PCIR_BAR(i);
if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
continue;
if (PCI_BAR_IO(bar.pbi_base)) {
bartype = PCIBAR_IO;
base = bar.pbi_base & PCIM_BAR_IO_BASE;
} else {
switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
case PCIM_BAR_MEM_64:
bartype = PCIBAR_MEM64;
break;
default:
bartype = PCIBAR_MEM32;
break;
}
base = bar.pbi_base & PCIM_BAR_MEM_BASE;
}
size = bar.pbi_length;
if (bartype != PCIBAR_IO) {
if (((base | size) & PAGE_MASK) != 0) {
warnx("passthru device %d/%d/%d BAR %d: "
"base %#lx or size %#lx not page aligned\n",
sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
sc->psc_sel.pc_func, i, base, size);
return (-1);
}
}
/* Cache information about the "real" BAR */
sc->psc_bar[i].type = bartype;
sc->psc_bar[i].size = size;
sc->psc_bar[i].addr = base;
/* Allocate the BAR in the guest I/O or MMIO space */
error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
if (error)
return (-1);
/* The MSI-X table needs special handling */
if (i == pci_msix_table_bar(pi)) {
error = init_msix_table(ctx, sc, base);
if (error)
return (-1);
} else if (bartype != PCIBAR_IO) {
/* Map the physical BAR in the guest MMIO space */
error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
if (error)
return (-1);
}
/*
* 64-bit BAR takes up two slots so skip the next one.
*/
if (bartype == PCIBAR_MEM64) {
i++;
assert(i <= PCI_BARMAX);
sc->psc_bar[i].type = PCIBAR_MEMHI64;
}
}
return (0);
}
static int
cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
{
int error;
struct passthru_softc *sc;
error = 1;
sc = pi->pi_arg;
bzero(&sc->psc_sel, sizeof(struct pcisel));
sc->psc_sel.pc_bus = bus;
sc->psc_sel.pc_dev = slot;
sc->psc_sel.pc_func = func;
if (cfginitmsi(sc) != 0) {
warnx("failed to initialize MSI for PCI %d/%d/%d",
bus, slot, func);
goto done;
}
if (cfginitbar(ctx, sc) != 0) {
warnx("failed to initialize BARs for PCI %d/%d/%d",
bus, slot, func);
goto done;
}
error = 0; /* success */
done:
return (error);
}
static int
passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
int bus, slot, func, error, memflags;
struct passthru_softc *sc;
sc = NULL;
error = 1;
memflags = vm_get_memflags(ctx);
if (!(memflags & VM_MEM_F_WIRED)) {
warnx("passthru requires guest memory to be wired");
goto done;
}
if (pcifd < 0) {
pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
if (pcifd < 0) {
warn("failed to open %s", _PATH_DEVPCI);
goto done;
}
}
if (iofd < 0) {
iofd = open(_PATH_DEVIO, O_RDWR, 0);
if (iofd < 0) {
warn("failed to open %s", _PATH_DEVIO);
goto done;
}
}
if (memfd < 0) {
memfd = open(_PATH_MEM, O_RDWR, 0);
if (memfd < 0) {
warn("failed to open %s", _PATH_MEM);
goto done;
}
}
if (opts == NULL ||
sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
warnx("invalid passthru options");
goto done;
}
if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
bus, slot, func);
goto done;
}
sc = calloc(1, sizeof(struct passthru_softc));
pi->pi_arg = sc;
sc->psc_pi = pi;
/* initialize config space */
if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
goto done;
error = 0; /* success */
done:
if (error) {
free(sc);
vm_unassign_pptdev(ctx, bus, slot, func);
}
return (error);
}
static int
bar_access(int coff)
{
if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
return (1);
else
return (0);
}
static int
msicap_access(struct passthru_softc *sc, int coff)
{
int caplen;
if (sc->psc_msi.capoff == 0)
return (0);
caplen = msi_caplen(sc->psc_msi.msgctrl);
if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
return (1);
else
return (0);
}
static int
msixcap_access(struct passthru_softc *sc, int coff)
{
if (sc->psc_msix.capoff == 0)
return (0);
return (coff >= sc->psc_msix.capoff &&
coff < sc->psc_msix.capoff + MSIX_CAPLEN);
}
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int coff, int bytes, uint32_t *rv)
{
struct passthru_softc *sc;
sc = pi->pi_arg;
/*
* PCI BARs and MSI capability is emulated.
*/
if (bar_access(coff) || msicap_access(sc, coff))
return (-1);
#ifdef LEGACY_SUPPORT
/*
* Emulate PCIR_CAP_PTR if this device does not support MSI capability
* natively.
*/
if (sc->psc_msi.emulated) {
if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
return (-1);
}
#endif
/* Everything else just read from the device's config space */
*rv = read_config(&sc->psc_sel, coff, bytes);
return (0);
}
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int coff, int bytes, uint32_t val)
{
int error, msix_table_entries, i;
struct passthru_softc *sc;
sc = pi->pi_arg;
/*
* PCI BARs are emulated
*/
if (bar_access(coff))
return (-1);
/*
* MSI capability is emulated
*/
if (msicap_access(sc, coff)) {
msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
pi->pi_msi.addr, pi->pi_msi.msg_data,
pi->pi_msi.maxmsgnum);
if (error != 0)
err(1, "vm_setup_pptdev_msi");
return (0);
}
if (msixcap_access(sc, coff)) {
msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
if (pi->pi_msix.enabled) {
msix_table_entries = pi->pi_msix.table_count;
for (i = 0; i < msix_table_entries; i++) {
error = vm_setup_pptdev_msix(ctx, vcpu,
sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
sc->psc_sel.pc_func, i,
pi->pi_msix.table[i].addr,
pi->pi_msix.table[i].msg_data,
pi->pi_msix.table[i].vector_control);
if (error)
err(1, "vm_setup_pptdev_msix");
}
}
return (0);
}
#ifdef LEGACY_SUPPORT
/*
* If this device does not support MSI natively then we cannot let
* the guest disable legacy interrupts from the device. It is the
* legacy interrupt that is triggering the virtual MSI to the guest.
*/
if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
if (coff == PCIR_COMMAND && bytes == 2)
val &= ~PCIM_CMD_INTxDIS;
}
#endif
write_config(&sc->psc_sel, coff, bytes, val);
return (0);
}
static void
passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
uint64_t offset, int size, uint64_t value)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
sc = pi->pi_arg;
if (baridx == pci_msix_table_bar(pi)) {
msix_table_write(ctx, vcpu, sc, offset, size, value);
} else {
assert(pi->pi_bar[baridx].type == PCIBAR_IO);
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_WRITE;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = value;
(void)ioctl(iofd, IODEV_PIO, &pio);
}
}
static uint64_t
passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
uint64_t offset, int size)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
uint64_t val;
sc = pi->pi_arg;
if (baridx == pci_msix_table_bar(pi)) {
val = msix_table_read(sc, offset, size);
} else {
assert(pi->pi_bar[baridx].type == PCIBAR_IO);
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_READ;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = 0;
(void)ioctl(iofd, IODEV_PIO, &pio);
val = pio.val;
}
return (val);
}
struct pci_devemu passthru = {
.pe_emu = "passthru",
.pe_init = passthru_init,
.pe_cfgwrite = passthru_cfgwrite,
.pe_cfgread = passthru_cfgread,
.pe_barwrite = passthru_write,
.pe_barread = passthru_read,
};
PCI_EMUL_SET(passthru);

119
pci_uart.c Normal file
View File

@ -0,0 +1,119 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <stdio.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "uart_emul.h"
/*
* Pick a PCI vid/did of a chip with a single uart at
* BAR0, that most versions of FreeBSD can understand:
* Siig CyberSerial 1-port.
*/
#define COM_VENDOR 0x131f
#define COM_DEV 0x2000
static void
pci_uart_intr_assert(void *arg)
{
struct pci_devinst *pi = arg;
pci_lintr_assert(pi);
}
static void
pci_uart_intr_deassert(void *arg)
{
struct pci_devinst *pi = arg;
pci_lintr_deassert(pi);
}
static void
pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
assert(baridx == 0);
assert(size == 1);
uart_write(pi->pi_arg, offset, value);
}
uint64_t
pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
uint8_t val;
assert(baridx == 0);
assert(size == 1);
val = uart_read(pi->pi_arg, offset);
return (val);
}
static int
pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
struct uart_softc *sc;
pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE);
pci_lintr_request(pi);
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi);
pi->pi_arg = sc;
if (uart_set_backend(sc, opts) != 0) {
fprintf(stderr, "Unable to initialize backend '%s' for "
"pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func);
return (-1);
}
return (0);
}
struct pci_devemu pci_de_com = {
.pe_emu = "uart",
.pe_init = pci_uart_init,
.pe_barwrite = pci_uart_write,
.pe_barread = pci_uart_read
};
PCI_EMUL_SET(pci_de_com);

410
pci_virtio_block.c Normal file
View File

@ -0,0 +1,410 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <md5.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "virtio.h"
#include "block_if.h"
#define VTBLK_RINGSZ 64
#define VTBLK_S_OK 0
#define VTBLK_S_IOERR 1
#define VTBLK_S_UNSUPP 2
#define VTBLK_BLK_ID_BYTES 20
/* Capability bits */
#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */
#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */
#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */
#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */
/*
* Host capabilities
*/
#define VTBLK_S_HOSTCAPS \
( VTBLK_F_SEG_MAX | \
VTBLK_F_BLK_SIZE | \
VTBLK_F_FLUSH | \
VTBLK_F_TOPOLOGY | \
VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */
/*
* Config space "registers"
*/
struct vtblk_config {
uint64_t vbc_capacity;
uint32_t vbc_size_max;
uint32_t vbc_seg_max;
struct {
uint16_t cylinders;
uint8_t heads;
uint8_t sectors;
} vbc_geometry;
uint32_t vbc_blk_size;
struct {
uint8_t physical_block_exp;
uint8_t alignment_offset;
uint16_t min_io_size;
uint32_t opt_io_size;
} vbc_topology;
uint8_t vbc_writeback;
} __packed;
/*
* Fixed-size block header
*/
struct virtio_blk_hdr {
#define VBH_OP_READ 0
#define VBH_OP_WRITE 1
#define VBH_OP_FLUSH 4
#define VBH_OP_FLUSH_OUT 5
#define VBH_OP_IDENT 8
#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
uint32_t vbh_type;
uint32_t vbh_ioprio;
uint64_t vbh_sector;
} __packed;
/*
* Debug printf
*/
static int pci_vtblk_debug;
#define DPRINTF(params) if (pci_vtblk_debug) printf params
#define WPRINTF(params) printf params
struct pci_vtblk_ioreq {
struct blockif_req io_req;
struct pci_vtblk_softc *io_sc;
uint8_t *io_status;
uint16_t io_idx;
};
/*
* Per-device softc
*/
struct pci_vtblk_softc {
struct virtio_softc vbsc_vs;
pthread_mutex_t vsc_mtx;
struct vqueue_info vbsc_vq;
struct vtblk_config vbsc_cfg;
struct blockif_ctxt *bc;
char vbsc_ident[VTBLK_BLK_ID_BYTES];
struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
};
static void pci_vtblk_reset(void *);
static void pci_vtblk_notify(void *, struct vqueue_info *);
static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
static struct virtio_consts vtblk_vi_consts = {
"vtblk", /* our name */
1, /* we support 1 virtqueue */
sizeof(struct vtblk_config), /* config reg size */
pci_vtblk_reset, /* reset */
pci_vtblk_notify, /* device-wide qnotify */
pci_vtblk_cfgread, /* read PCI config */
pci_vtblk_cfgwrite, /* write PCI config */
NULL, /* apply negotiated features */
VTBLK_S_HOSTCAPS, /* our capabilities */
};
static void
pci_vtblk_reset(void *vsc)
{
struct pci_vtblk_softc *sc = vsc;
DPRINTF(("vtblk: device reset requested !\n"));
vi_reset_dev(&sc->vbsc_vs);
}
static void
pci_vtblk_done(struct blockif_req *br, int err)
{
struct pci_vtblk_ioreq *io = br->br_param;
struct pci_vtblk_softc *sc = io->io_sc;
/* convert errno into a virtio block error return */
if (err == EOPNOTSUPP || err == ENOSYS)
*io->io_status = VTBLK_S_UNSUPP;
else if (err != 0)
*io->io_status = VTBLK_S_IOERR;
else
*io->io_status = VTBLK_S_OK;
/*
* Return the descriptor back to the host.
* We wrote 1 byte (our status) to host.
*/
pthread_mutex_lock(&sc->vsc_mtx);
vq_relchain(&sc->vbsc_vq, io->io_idx, 1);
vq_endchains(&sc->vbsc_vq, 0);
pthread_mutex_unlock(&sc->vsc_mtx);
}
static void
pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
{
struct virtio_blk_hdr *vbh;
struct pci_vtblk_ioreq *io;
int i, n;
int err;
ssize_t iolen;
int writeop, type;
struct iovec iov[BLOCKIF_IOV_MAX + 2];
uint16_t idx, flags[BLOCKIF_IOV_MAX + 2];
n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags);
/*
* The first descriptor will be the read-only fixed header,
* and the last is for status (hence +2 above and below).
* The remaining iov's are the actual data I/O vectors.
*
* XXX - note - this fails on crash dump, which does a
* VIRTIO_BLK_T_FLUSH with a zero transfer length
*/
assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
io = &sc->vbsc_ios[idx];
assert((flags[0] & VRING_DESC_F_WRITE) == 0);
assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
vbh = iov[0].iov_base;
memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
io->io_req.br_iovcnt = n - 2;
io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE;
io->io_status = iov[--n].iov_base;
assert(iov[n].iov_len == 1);
assert(flags[n] & VRING_DESC_F_WRITE);
/*
* XXX
* The guest should not be setting the BARRIER flag because
* we don't advertise the capability.
*/
type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
writeop = (type == VBH_OP_WRITE);
iolen = 0;
for (i = 1; i < n; i++) {
/*
* - write op implies read-only descriptor,
* - read/ident op implies write-only descriptor,
* therefore test the inverse of the descriptor bit
* to the op.
*/
assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
iolen += iov[i].iov_len;
}
io->io_req.br_resid = iolen;
DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r",
writeop ? "write" : "read/ident", iolen, i - 1,
io->io_req.br_offset));
switch (type) {
case VBH_OP_READ:
err = blockif_read(sc->bc, &io->io_req);
break;
case VBH_OP_WRITE:
err = blockif_write(sc->bc, &io->io_req);
break;
case VBH_OP_FLUSH:
case VBH_OP_FLUSH_OUT:
err = blockif_flush(sc->bc, &io->io_req);
break;
case VBH_OP_IDENT:
/* Assume a single buffer */
/* S/n equal to buffer is not zero-terminated. */
memset(iov[1].iov_base, 0, iov[1].iov_len);
strncpy(iov[1].iov_base, sc->vbsc_ident,
MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
pci_vtblk_done(&io->io_req, 0);
return;
default:
pci_vtblk_done(&io->io_req, EOPNOTSUPP);
return;
}
assert(err == 0);
}
static void
pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
{
struct pci_vtblk_softc *sc = vsc;
while (vq_has_descs(vq))
pci_vtblk_proc(sc, vq);
}
static int
pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
char bident[sizeof("XX:X:X")];
struct blockif_ctxt *bctxt;
MD5_CTX mdctx;
u_char digest[16];
struct pci_vtblk_softc *sc;
off_t size;
int i, sectsz, sts, sto;
if (opts == NULL) {
printf("virtio-block: backing device required\n");
return (1);
}
/*
* The supplied backing file has to exist
*/
snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
bctxt = blockif_open(opts, bident);
if (bctxt == NULL) {
perror("Could not open backing file");
return (1);
}
size = blockif_size(bctxt);
sectsz = blockif_sectsz(bctxt);
blockif_psectsz(bctxt, &sts, &sto);
sc = calloc(1, sizeof(struct pci_vtblk_softc));
sc->bc = bctxt;
for (i = 0; i < VTBLK_RINGSZ; i++) {
struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i];
io->io_req.br_callback = pci_vtblk_done;
io->io_req.br_param = io;
io->io_sc = sc;
io->io_idx = i;
}
pthread_mutex_init(&sc->vsc_mtx, NULL);
/* init virtio softc and virtqueues */
vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
/* sc->vbsc_vq.vq_notify = we have no per-queue notify */
/*
* Create an identifier for the backing file. Use parts of the
* md5 sum of the filename
*/
MD5Init(&mdctx);
MD5Update(&mdctx, opts, strlen(opts));
MD5Final(digest, &mdctx);
sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X",
digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
/* setup virtio block config space */
sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
sc->vbsc_cfg.vbc_seg_max = BLOCKIF_IOV_MAX;
sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */
sc->vbsc_cfg.vbc_geometry.heads = 0;
sc->vbsc_cfg.vbc_geometry.sectors = 0;
sc->vbsc_cfg.vbc_blk_size = sectsz;
sc->vbsc_cfg.vbc_topology.physical_block_exp =
(sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
sc->vbsc_cfg.vbc_topology.alignment_offset =
(sto != 0) ? ((sts - sto) / sectsz) : 0;
sc->vbsc_cfg.vbc_topology.min_io_size = 0;
sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
sc->vbsc_cfg.vbc_writeback = 0;
/*
* Should we move some of this into virtio.c? Could
* have the device, class, and subdev_0 as fields in
* the virtio constants structure.
*/
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) {
blockif_close(sc->bc);
free(sc);
return (1);
}
vi_set_io_bar(&sc->vbsc_vs, 0);
return (0);
}
static int
pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
return (1);
}
static int
pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
struct pci_vtblk_softc *sc = vsc;
void *ptr;
/* our caller has already verified offset and size */
ptr = (uint8_t *)&sc->vbsc_cfg + offset;
memcpy(retval, ptr, size);
return (0);
}
struct pci_devemu pci_de_vblk = {
.pe_emu = "virtio-blk",
.pe_init = pci_vtblk_init,
.pe_barwrite = vi_pci_write,
.pe_barread = vi_pci_read
};
PCI_EMUL_SET(pci_de_vblk);

976
pci_virtio_net.c Normal file
View File

@ -0,0 +1,976 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <machine/atomic.h>
#include <net/ethernet.h>
#ifndef NETMAP_WITH_LIBS
#define NETMAP_WITH_LIBS
#endif
#include <net/netmap_user.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
#define VTNET_RINGSZ 1024
#define VTNET_MAXSEGS 256
/*
* Host capabilities. Note that we only offer a few of these.
*/
#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
#define VIRTIO_NET_F_GUEST_ANNOUNCE \
(1 << 21) /* guest can send gratuitous pkts */
#define VTNET_S_HOSTCAPS \
( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
/*
* PCI config-space "registers"
*/
struct virtio_net_config {
uint8_t mac[6];
uint16_t status;
} __packed;
/*
* Queue definitions.
*/
#define VTNET_RXQ 0
#define VTNET_TXQ 1
#define VTNET_CTLQ 2 /* NB: not yet supported */
#define VTNET_MAXQ 3
/*
* Fixed network header size
*/
struct virtio_net_rxhdr {
uint8_t vrh_flags;
uint8_t vrh_gso_type;
uint16_t vrh_hdr_len;
uint16_t vrh_gso_size;
uint16_t vrh_csum_start;
uint16_t vrh_csum_offset;
uint16_t vrh_bufs;
} __packed;
/*
* Debug printf
*/
static int pci_vtnet_debug;
#define DPRINTF(params) if (pci_vtnet_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtnet_softc {
struct virtio_softc vsc_vs;
struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
pthread_mutex_t vsc_mtx;
struct mevent *vsc_mevp;
int vsc_tapfd;
struct nm_desc *vsc_nmd;
int vsc_rx_ready;
volatile int resetting; /* set and checked outside lock */
uint64_t vsc_features; /* negotiated features */
struct virtio_net_config vsc_config;
pthread_mutex_t rx_mtx;
int rx_in_progress;
int rx_vhdrlen;
int rx_merge; /* merged rx bufs in use */
pthread_t tx_tid;
pthread_mutex_t tx_mtx;
pthread_cond_t tx_cond;
int tx_in_progress;
void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
int iovcnt, int len);
};
static void pci_vtnet_reset(void *);
/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);
static struct virtio_consts vtnet_vi_consts = {
"vtnet", /* our name */
VTNET_MAXQ - 1, /* we currently support 2 virtqueues */
sizeof(struct virtio_net_config), /* config reg size */
pci_vtnet_reset, /* reset */
NULL, /* device-wide qnotify -- not used */
pci_vtnet_cfgread, /* read PCI config */
pci_vtnet_cfgwrite, /* write PCI config */
pci_vtnet_neg_features, /* apply negotiated features */
VTNET_S_HOSTCAPS, /* our capabilities */
};
/*
* If the transmit thread is active then stall until it is done.
*/
static void
pci_vtnet_txwait(struct pci_vtnet_softc *sc)
{
pthread_mutex_lock(&sc->tx_mtx);
while (sc->tx_in_progress) {
pthread_mutex_unlock(&sc->tx_mtx);
usleep(10000);
pthread_mutex_lock(&sc->tx_mtx);
}
pthread_mutex_unlock(&sc->tx_mtx);
}
/*
* If the receive thread is active then stall until it is done.
*/
static void
pci_vtnet_rxwait(struct pci_vtnet_softc *sc)
{
pthread_mutex_lock(&sc->rx_mtx);
while (sc->rx_in_progress) {
pthread_mutex_unlock(&sc->rx_mtx);
usleep(10000);
pthread_mutex_lock(&sc->rx_mtx);
}
pthread_mutex_unlock(&sc->rx_mtx);
}
static void
pci_vtnet_reset(void *vsc)
{
struct pci_vtnet_softc *sc = vsc;
DPRINTF(("vtnet: device reset requested !\n"));
sc->resetting = 1;
/*
* Wait for the transmit and receive threads to finish their
* processing.
*/
pci_vtnet_txwait(sc);
pci_vtnet_rxwait(sc);
sc->vsc_rx_ready = 0;
sc->rx_merge = 1;
sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
/* now reset rings, MSI-X vectors, and negotiated capabilities */
vi_reset_dev(&sc->vsc_vs);
sc->resetting = 0;
}
/*
* Called to send a buffer chain out to the tap device
*/
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
int len)
{
static char pad[60]; /* all zero bytes */
if (sc->vsc_tapfd == -1)
return;
/*
* If the length is < 60, pad out to that and add the
* extra zero'd segment to the iov. It is guaranteed that
* there is always an extra iov available by the caller.
*/
if (len < 60) {
iov[iovcnt].iov_base = pad;
iov[iovcnt].iov_len = 60 - len;
iovcnt++;
}
(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
/*
* Called when there is read activity on the tap file descriptor.
* Each buffer posted by the guest is assumed to be able to contain
* an entire ethernet frame + rx header.
* MP note: the dummybuf is only used for discarding frames, so there
* is no need for it to be per-vtnet or locked.
*/
static uint8_t dummybuf[2048];
static __inline struct iovec *
rx_iov_trim(struct iovec *iov, int *niov, int tlen)
{
struct iovec *riov;
/* XXX short-cut: assume first segment is >= tlen */
assert(iov[0].iov_len >= tlen);
iov[0].iov_len -= tlen;
if (iov[0].iov_len == 0) {
assert(*niov > 1);
*niov -= 1;
riov = &iov[1];
} else {
iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
riov = &iov[0];
}
return (riov);
}
static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
struct iovec iov[VTNET_MAXSEGS], *riov;
struct vqueue_info *vq;
void *vrx;
int len, n;
uint16_t idx;
/*
* Should never be called without a valid tap fd
*/
assert(sc->vsc_tapfd != -1);
/*
* But, will be called when the rx ring hasn't yet
* been set up or the guest is resetting the device.
*/
if (!sc->vsc_rx_ready || sc->resetting) {
/*
* Drop the packet and try later.
*/
(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
return;
}
/*
* Check for available rx buffers
*/
vq = &sc->vsc_queues[VTNET_RXQ];
if (!vq_has_descs(vq)) {
/*
* Drop the packet and try later. Interrupt on
* empty, if that's negotiated.
*/
(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
vq_endchains(vq, 1);
return;
}
do {
/*
* Get descriptor chain.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
/*
* Get a pointer to the rx header, and use the
* data immediately following it for the packet buffer.
*/
vrx = iov[0].iov_base;
riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
len = readv(sc->vsc_tapfd, riov, n);
if (len < 0 && errno == EWOULDBLOCK) {
/*
* No more packets, but still some avail ring
* entries. Interrupt if needed/appropriate.
*/
vq_retchain(vq);
vq_endchains(vq, 0);
return;
}
/*
* The only valid field in the rx packet header is the
* number of buffers if merged rx bufs were negotiated.
*/
memset(vrx, 0, sc->rx_vhdrlen);
if (sc->rx_merge) {
struct virtio_net_rxhdr *vrxh;
vrxh = vrx;
vrxh->vrh_bufs = 1;
}
/*
* Release this chain and handle more chains.
*/
vq_relchain(vq, idx, len + sc->rx_vhdrlen);
} while (vq_has_descs(vq));
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
vq_endchains(vq, 1);
}
static __inline int
pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
int r, i;
int len = 0;
for (r = nmd->cur_tx_ring; ; ) {
struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
uint32_t cur, idx;
char *buf;
if (nm_ring_empty(ring)) {
r++;
if (r > nmd->last_tx_ring)
r = nmd->first_tx_ring;
if (r == nmd->cur_tx_ring)
break;
continue;
}
cur = ring->cur;
idx = ring->slot[cur].buf_idx;
buf = NETMAP_BUF(ring, idx);
for (i = 0; i < iovcnt; i++) {
if (len + iov[i].iov_len > 2048)
break;
memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
len += iov[i].iov_len;
}
ring->slot[cur].len = len;
ring->head = ring->cur = nm_ring_next(ring, cur);
nmd->cur_tx_ring = r;
ioctl(nmd->fd, NIOCTXSYNC, NULL);
break;
}
return (len);
}
static __inline int
pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
int len = 0;
int i = 0;
int r;
for (r = nmd->cur_rx_ring; ; ) {
struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
uint32_t cur, idx;
char *buf;
size_t left;
if (nm_ring_empty(ring)) {
r++;
if (r > nmd->last_rx_ring)
r = nmd->first_rx_ring;
if (r == nmd->cur_rx_ring)
break;
continue;
}
cur = ring->cur;
idx = ring->slot[cur].buf_idx;
buf = NETMAP_BUF(ring, idx);
left = ring->slot[cur].len;
for (i = 0; i < iovcnt && left > 0; i++) {
if (iov[i].iov_len > left)
iov[i].iov_len = left;
memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
len += iov[i].iov_len;
left -= iov[i].iov_len;
}
ring->head = ring->cur = nm_ring_next(ring, cur);
nmd->cur_rx_ring = r;
ioctl(nmd->fd, NIOCRXSYNC, NULL);
break;
}
for (; i < iovcnt; i++)
iov[i].iov_len = 0;
return (len);
}
/*
* Called to send a buffer chain out to the vale port
*/
static void
pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
int len)
{
static char pad[60]; /* all zero bytes */
if (sc->vsc_nmd == NULL)
return;
/*
* If the length is < 60, pad out to that and add the
* extra zero'd segment to the iov. It is guaranteed that
* there is always an extra iov available by the caller.
*/
if (len < 60) {
iov[iovcnt].iov_base = pad;
iov[iovcnt].iov_len = 60 - len;
iovcnt++;
}
(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
}
static void
pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
{
struct iovec iov[VTNET_MAXSEGS], *riov;
struct vqueue_info *vq;
void *vrx;
int len, n;
uint16_t idx;
/*
* Should never be called without a valid netmap descriptor
*/
assert(sc->vsc_nmd != NULL);
/*
* But, will be called when the rx ring hasn't yet
* been set up or the guest is resetting the device.
*/
if (!sc->vsc_rx_ready || sc->resetting) {
/*
* Drop the packet and try later.
*/
(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
return;
}
/*
* Check for available rx buffers
*/
vq = &sc->vsc_queues[VTNET_RXQ];
if (!vq_has_descs(vq)) {
/*
* Drop the packet and try later. Interrupt on
* empty, if that's negotiated.
*/
(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
vq_endchains(vq, 1);
return;
}
do {
/*
* Get descriptor chain.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
/*
* Get a pointer to the rx header, and use the
* data immediately following it for the packet buffer.
*/
vrx = iov[0].iov_base;
riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
if (len == 0) {
/*
* No more packets, but still some avail ring
* entries. Interrupt if needed/appropriate.
*/
vq_retchain(vq);
vq_endchains(vq, 0);
return;
}
/*
* The only valid field in the rx packet header is the
* number of buffers if merged rx bufs were negotiated.
*/
memset(vrx, 0, sc->rx_vhdrlen);
if (sc->rx_merge) {
struct virtio_net_rxhdr *vrxh;
vrxh = vrx;
vrxh->vrh_bufs = 1;
}
/*
* Release this chain and handle more chains.
*/
vq_relchain(vq, idx, len + sc->rx_vhdrlen);
} while (vq_has_descs(vq));
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
vq_endchains(vq, 1);
}
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->rx_mtx);
sc->rx_in_progress = 1;
sc->pci_vtnet_rx(sc);
sc->rx_in_progress = 0;
pthread_mutex_unlock(&sc->rx_mtx);
}
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
struct pci_vtnet_softc *sc = vsc;
/*
* A qnotify means that the rx process can now begin
*/
if (sc->vsc_rx_ready == 0) {
sc->vsc_rx_ready = 1;
vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
}
}
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
int i, n;
int plen, tlen;
uint16_t idx;
/*
* Obtain chain of descriptors. The first one is
* really the header descriptor, so we need to sum
* up two lengths: packet length and transfer length.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
plen = 0;
tlen = iov[0].iov_len;
for (i = 1; i < n; i++) {
plen += iov[i].iov_len;
tlen += iov[i].iov_len;
}
DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);
/* chain is processed, release it and set tlen */
vq_relchain(vq, idx, tlen);
}
static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
struct pci_vtnet_softc *sc = vsc;
/*
* Any ring entries to process?
*/
if (!vq_has_descs(vq))
return;
/* Signal the tx thread for processing */
pthread_mutex_lock(&sc->tx_mtx);
vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
if (sc->tx_in_progress == 0)
pthread_cond_signal(&sc->tx_cond);
pthread_mutex_unlock(&sc->tx_mtx);
}
/*
* Thread which will handle processing of TX desc
*/
static void *
pci_vtnet_tx_thread(void *param)
{
struct pci_vtnet_softc *sc = param;
struct vqueue_info *vq;
int error;
vq = &sc->vsc_queues[VTNET_TXQ];
/*
* Let us wait till the tx queue pointers get initialised &
* first tx signaled
*/
pthread_mutex_lock(&sc->tx_mtx);
error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
assert(error == 0);
for (;;) {
/* note - tx mutex is locked here */
while (sc->resetting || !vq_has_descs(vq)) {
vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY;
mb();
if (!sc->resetting && vq_has_descs(vq))
break;
sc->tx_in_progress = 0;
error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
assert(error == 0);
}
vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
sc->tx_in_progress = 1;
pthread_mutex_unlock(&sc->tx_mtx);
do {
/*
* Run through entries, placing them into
* iovecs and sending when an end-of-packet
* is found
*/
pci_vtnet_proctx(sc, vq);
} while (vq_has_descs(vq));
/*
* Generate an interrupt if needed.
*/
vq_endchains(vq, 1);
pthread_mutex_lock(&sc->tx_mtx);
}
}
#ifdef notyet
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{
DPRINTF(("vtnet: control qnotify!\n\r"));
}
#endif
static int
pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
{
struct ether_addr *ea;
char *tmpstr;
char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
tmpstr = strsep(&mac_str,"=");
if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) {
ea = ether_aton(mac_str);
if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
fprintf(stderr, "Invalid MAC %s\n", mac_str);
return (EINVAL);
} else
memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
}
return (0);
}
static void
pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
{
char tbuf[80];
strcpy(tbuf, "/dev/");
strlcat(tbuf, devname, sizeof(tbuf));
sc->pci_vtnet_rx = pci_vtnet_tap_rx;
sc->pci_vtnet_tx = pci_vtnet_tap_tx;
sc->vsc_tapfd = open(tbuf, O_RDWR);
if (sc->vsc_tapfd == -1) {
WPRINTF(("open of tap device %s failed\n", tbuf));
return;
}
/*
* Set non-blocking and register for read
* notifications with the event loop
*/
int opt = 1;
if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
WPRINTF(("tap device O_NONBLOCK failed\n"));
close(sc->vsc_tapfd);
sc->vsc_tapfd = -1;
}
sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
EVF_READ,
pci_vtnet_rx_callback,
sc);
if (sc->vsc_mevp == NULL) {
WPRINTF(("Could not register event\n"));
close(sc->vsc_tapfd);
sc->vsc_tapfd = -1;
}
}
static void
pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
{
sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
if (sc->vsc_nmd == NULL) {
WPRINTF(("open of netmap device %s failed\n", ifname));
return;
}
sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
EVF_READ,
pci_vtnet_rx_callback,
sc);
if (sc->vsc_mevp == NULL) {
WPRINTF(("Could not register event\n"));
nm_close(sc->vsc_nmd);
sc->vsc_nmd = NULL;
}
}
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
MD5_CTX mdctx;
unsigned char digest[16];
char nstr[80];
char tname[MAXCOMLEN + 1];
struct pci_vtnet_softc *sc;
char *devname;
char *vtopts;
int mac_provided;
sc = calloc(1, sizeof(struct pci_vtnet_softc));
pthread_mutex_init(&sc->vsc_mtx, NULL);
vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
#ifdef notyet
sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
#endif
/*
* Attempt to open the tap device and read the MAC address
* if specified
*/
mac_provided = 0;
sc->vsc_tapfd = -1;
sc->vsc_nmd = NULL;
if (opts != NULL) {
int err;
devname = vtopts = strdup(opts);
(void) strsep(&vtopts, ",");
if (vtopts != NULL) {
err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac);
if (err != 0) {
free(devname);
return (err);
}
mac_provided = 1;
}
if (strncmp(devname, "vale", 4) == 0)
pci_vtnet_netmap_setup(sc, devname);
if (strncmp(devname, "tap", 3) == 0 ||
strncmp(devname, "vmnet", 5) == 0)
pci_vtnet_tap_setup(sc, devname);
free(devname);
}
/*
* The default MAC address is the standard NetApp OUI of 00-a0-98,
* followed by an MD5 of the PCI slot/func number and dev name
*/
if (!mac_provided) {
snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
pi->pi_func, vmname);
MD5Init(&mdctx);
MD5Update(&mdctx, nstr, strlen(nstr));
MD5Final(digest, &mdctx);
sc->vsc_config.mac[0] = 0x00;
sc->vsc_config.mac[1] = 0xa0;
sc->vsc_config.mac[2] = 0x98;
sc->vsc_config.mac[3] = digest[0];
sc->vsc_config.mac[4] = digest[1];
sc->vsc_config.mac[5] = digest[2];
}
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
/* Link is up if we managed to open tap device or vale port. */
sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
sc->vsc_nmd != NULL);
/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
/* use BAR 0 to map config regs in IO space */
vi_set_io_bar(&sc->vsc_vs, 0);
sc->resetting = 0;
sc->rx_merge = 1;
sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
sc->rx_in_progress = 0;
pthread_mutex_init(&sc->rx_mtx, NULL);
/*
* Initialize tx semaphore & spawn TX processing thread.
* As of now, only one thread for TX desc processing is
* spawned.
*/
sc->tx_in_progress = 0;
pthread_mutex_init(&sc->tx_mtx, NULL);
pthread_cond_init(&sc->tx_cond, NULL);
pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
pi->pi_func);
pthread_set_name_np(sc->tx_tid, tname);
return (0);
}
static int
pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
struct pci_vtnet_softc *sc = vsc;
void *ptr;
if (offset < 6) {
assert(offset + size <= 6);
/*
* The driver is allowed to change the MAC address
*/
ptr = &sc->vsc_config.mac[offset];
memcpy(ptr, &value, size);
} else {
/* silently ignore other writes */
DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
}
return (0);
}
static int
pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
struct pci_vtnet_softc *sc = vsc;
void *ptr;
ptr = (uint8_t *)&sc->vsc_config + offset;
memcpy(retval, ptr, size);
return (0);
}
static void
pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
{
struct pci_vtnet_softc *sc = vsc;
sc->vsc_features = negotiated_features;
if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
sc->rx_merge = 0;
/* non-merge rx header is 2 bytes shorter */
sc->rx_vhdrlen -= 2;
}
}
struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = vi_pci_write,
.pe_barread = vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);

189
pci_virtio_rnd.c Normal file
View File

@ -0,0 +1,189 @@
/*-
* Copyright (c) 2014 Nahanni Systems Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* virtio entropy device emulation.
* Randomness is sourced from /dev/random which does not block
* once it has been seeded at bootup.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/uio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "virtio.h"
#define VTRND_RINGSZ 64
static int pci_vtrnd_debug;
#define DPRINTF(params) if (pci_vtrnd_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtrnd_softc {
struct virtio_softc vrsc_vs;
struct vqueue_info vrsc_vq;
pthread_mutex_t vrsc_mtx;
uint64_t vrsc_cfg;
int vrsc_fd;
};
static void pci_vtrnd_reset(void *);
static void pci_vtrnd_notify(void *, struct vqueue_info *);
static struct virtio_consts vtrnd_vi_consts = {
"vtrnd", /* our name */
1, /* we support 1 virtqueue */
0, /* config reg size */
pci_vtrnd_reset, /* reset */
pci_vtrnd_notify, /* device-wide qnotify */
NULL, /* read virtio config */
NULL, /* write virtio config */
NULL, /* apply negotiated features */
0, /* our capabilities */
};
static void
pci_vtrnd_reset(void *vsc)
{
struct pci_vtrnd_softc *sc;
sc = vsc;
DPRINTF(("vtrnd: device reset requested !\n"));
vi_reset_dev(&sc->vrsc_vs);
}
static void
pci_vtrnd_notify(void *vsc, struct vqueue_info *vq)
{
struct iovec iov;
struct pci_vtrnd_softc *sc;
int len;
uint16_t idx;
sc = vsc;
if (sc->vrsc_fd < 0) {
vq_endchains(vq, 0);
return;
}
while (vq_has_descs(vq)) {
vq_getchain(vq, &idx, &iov, 1, NULL);
len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len);
DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len));
/* Catastrophe if unable to read from /dev/random */
assert(len > 0);
/*
* Release this chain and handle more
*/
vq_relchain(vq, idx, len);
}
vq_endchains(vq, 1); /* Generate interrupt if appropriate. */
}
static int
pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
struct pci_vtrnd_softc *sc;
int fd;
int len;
uint8_t v;
/*
* Should always be able to open /dev/random.
*/
fd = open("/dev/random", O_RDONLY | O_NONBLOCK);
assert(fd >= 0);
/*
* Check that device is seeded and non-blocking.
*/
len = read(fd, &v, sizeof(v));
if (len <= 0) {
WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len));
return (1);
}
sc = calloc(1, sizeof(struct pci_vtrnd_softc));
vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq);
sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx;
sc->vrsc_vq.vq_qsize = VTRND_RINGSZ;
/* keep /dev/random opened while emulating */
sc->vrsc_fd = fd;
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
vi_set_io_bar(&sc->vrsc_vs, 0);
return (0);
}
struct pci_devemu pci_de_vrnd = {
.pe_emu = "virtio-rnd",
.pe_init = pci_vtrnd_init,
.pe_barwrite = vi_pci_write,
.pe_barread = vi_pci_read
};
PCI_EMUL_SET(pci_de_vrnd);

312
pm.c Normal file
View File

@ -0,0 +1,312 @@
/*-
* Copyright (c) 2013 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <vmmapi.h>
#include "acpi.h"
#include "inout.h"
#include "mevent.h"
#include "pci_irq.h"
#include "pci_lpc.h"
static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mevent *power_button;
static sig_t old_power_handler;
/*
* Reset Control register at I/O port 0xcf9. Bit 2 forces a system
* reset when it transitions from 0 to 1. Bit 1 selects the type of
* reset to attempt: 0 selects a "soft" reset, and 1 selects a "hard"
* reset.
*/
static int
reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int error;
static uint8_t reset_control;
if (bytes != 1)
return (-1);
if (in)
*eax = reset_control;
else {
reset_control = *eax;
/* Treat hard and soft resets the same. */
if (reset_control & 0x4) {
error = vm_suspend(ctx, VM_SUSPEND_RESET);
assert(error == 0 || errno == EALREADY);
}
}
return (0);
}
INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler);
/*
* ACPI's SCI is a level-triggered interrupt.
*/
static int sci_active;
static void
sci_assert(struct vmctx *ctx)
{
if (sci_active)
return;
vm_isa_assert_irq(ctx, SCI_INT, SCI_INT);
sci_active = 1;
}
static void
sci_deassert(struct vmctx *ctx)
{
if (!sci_active)
return;
vm_isa_deassert_irq(ctx, SCI_INT, SCI_INT);
sci_active = 0;
}
/*
* Power Management 1 Event Registers
*
* The only power management event supported is a power button upon
* receiving SIGTERM.
*/
static uint16_t pm1_enable, pm1_status;
#define PM1_TMR_STS 0x0001
#define PM1_BM_STS 0x0010
#define PM1_GBL_STS 0x0020
#define PM1_PWRBTN_STS 0x0100
#define PM1_SLPBTN_STS 0x0200
#define PM1_RTC_STS 0x0400
#define PM1_WAK_STS 0x8000
#define PM1_TMR_EN 0x0001
#define PM1_GBL_EN 0x0020
#define PM1_PWRBTN_EN 0x0100
#define PM1_SLPBTN_EN 0x0200
#define PM1_RTC_EN 0x0400
static void
sci_update(struct vmctx *ctx)
{
int need_sci;
/* See if the SCI should be active or not. */
need_sci = 0;
if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS))
need_sci = 1;
if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS))
need_sci = 1;
if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS))
need_sci = 1;
if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS))
need_sci = 1;
if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS))
need_sci = 1;
if (need_sci)
sci_assert(ctx);
else
sci_deassert(ctx);
}
static int
pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 2)
return (-1);
pthread_mutex_lock(&pm_lock);
if (in)
*eax = pm1_status;
else {
/*
* Writes are only permitted to clear certain bits by
* writing 1 to those flags.
*/
pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS |
PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS));
sci_update(ctx);
}
pthread_mutex_unlock(&pm_lock);
return (0);
}
static int
pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 2)
return (-1);
pthread_mutex_lock(&pm_lock);
if (in)
*eax = pm1_enable;
else {
/*
* Only permit certain bits to be set. We never use
* the global lock, but ACPI-CA whines profusely if it
* can't set GBL_EN.
*/
pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN);
sci_update(ctx);
}
pthread_mutex_unlock(&pm_lock);
return (0);
}
INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler);
INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler);
static void
power_button_handler(int signal, enum ev_type type, void *arg)
{
struct vmctx *ctx;
ctx = arg;
pthread_mutex_lock(&pm_lock);
if (!(pm1_status & PM1_PWRBTN_STS)) {
pm1_status |= PM1_PWRBTN_STS;
sci_update(ctx);
}
pthread_mutex_unlock(&pm_lock);
}
/*
* Power Management 1 Control Register
*
* This is mostly unimplemented except that we wish to handle writes that
* set SPL_EN to handle S5 (soft power off).
*/
static uint16_t pm1_control;
#define PM1_SCI_EN 0x0001
#define PM1_SLP_TYP 0x1c00
#define PM1_SLP_EN 0x2000
#define PM1_ALWAYS_ZERO 0xc003
static int
pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int error;
if (bytes != 2)
return (-1);
if (in)
*eax = pm1_control;
else {
/*
* Various bits are write-only or reserved, so force them
* to zero in pm1_control. Always preserve SCI_EN as OSPM
* can never change it.
*/
pm1_control = (pm1_control & PM1_SCI_EN) |
(*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO));
/*
* If SLP_EN is set, check for S5. Bhyve's _S5_ method
* says that '5' should be stored in SLP_TYP for S5.
*/
if (*eax & PM1_SLP_EN) {
if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) {
error = vm_suspend(ctx, VM_SUSPEND_POWEROFF);
assert(error == 0 || errno == EALREADY);
}
}
}
return (0);
}
INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler);
SYSRES_IO(PM1A_EVT_ADDR, 8);
/*
* ACPI SMI Command Register
*
* This write-only register is used to enable and disable ACPI.
*/
static int
smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(!in);
if (bytes != 1)
return (-1);
pthread_mutex_lock(&pm_lock);
switch (*eax) {
case BHYVE_ACPI_ENABLE:
pm1_control |= PM1_SCI_EN;
if (power_button == NULL) {
power_button = mevent_add(SIGTERM, EVF_SIGNAL,
power_button_handler, ctx);
old_power_handler = signal(SIGTERM, SIG_IGN);
}
break;
case BHYVE_ACPI_DISABLE:
pm1_control &= ~PM1_SCI_EN;
if (power_button != NULL) {
mevent_delete(power_button);
power_button = NULL;
signal(SIGTERM, old_power_handler);
}
break;
}
pthread_mutex_unlock(&pm_lock);
return (0);
}
INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler);
SYSRES_IO(SMI_CMD, 1);
void
sci_init(struct vmctx *ctx)
{
/*
* Mark ACPI's SCI as level trigger and bump its use count
* in the PIRQ router.
*/
pci_irq_use(SCI_INT);
vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER);
}

53
post.c Normal file
View File

@ -0,0 +1,53 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include "inout.h"
#include "pci_lpc.h"
static int
post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in == 1);
if (bytes != 1)
return (-1);
*eax = 0xff; /* return some garbage */
return (0);
}
INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
SYSRES_IO(0x84, 1);

129
rtc.c Normal file
View File

@ -0,0 +1,129 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <time.h>
#include <assert.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "acpi.h"
#include "pci_lpc.h"
#include "rtc.h"
#define IO_RTC 0x70
#define RTC_LMEM_LSB 0x34
#define RTC_LMEM_MSB 0x35
#define RTC_HMEM_LSB 0x5b
#define RTC_HMEM_SB 0x5c
#define RTC_HMEM_MSB 0x5d
#define m_64KB (64*1024)
#define m_16MB (16*1024*1024)
#define m_4GB (4ULL*1024*1024*1024)
/*
* Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970
*/
static time_t
rtc_time(struct vmctx *ctx, int use_localtime)
{
struct tm tm;
time_t t;
time(&t);
if (use_localtime) {
localtime_r(&t, &tm);
t = timegm(&tm);
}
return (t);
}
void
rtc_init(struct vmctx *ctx, int use_localtime)
{
size_t himem;
size_t lomem;
int err;
/* XXX init diag/reset code/equipment/checksum ? */
/*
* Report guest memory size in nvram cells as required by UEFI.
* Little-endian encoding.
* 0x34/0x35 - 64KB chunks above 16MB, below 4GB
* 0x5b/0x5c/0x5d - 64KB chunks above 4GB
*/
lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB;
err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem);
assert(err == 0);
err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8);
assert(err == 0);
himem = vm_get_highmem_size(ctx) / m_64KB;
err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem);
assert(err == 0);
err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8);
assert(err == 0);
err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16);
assert(err == 0);
err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime));
assert(err == 0);
}
static void
rtc_dsdt(void)
{
dsdt_line("");
dsdt_line("Device (RTC)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(IO_RTC, 2);
dsdt_fixed_irq(8);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
}
LPC_DSDT(rtc_dsdt);
/*
* Reserve the extended RTC I/O ports although they are not emulated at this
* time.
*/
SYSRES_IO(0x72, 6);

34
rtc.h Normal file
View File

@ -0,0 +1,34 @@
/*
* Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _RTC_H_
#define _RTC_H_
void rtc_init(struct vmctx *ctx, int use_localtime);
#endif /* _RTC_H_ */

827
smbiostbl.c Normal file
View File

@ -0,0 +1,827 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <assert.h>
#include <errno.h>
#include <md5.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <uuid.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "bhyverun.h"
#include "smbiostbl.h"
#define MB (1024*1024)
#define GB (1024ULL*1024*1024)
#define SMBIOS_BASE 0xF1000
/* BHYVE_ACPI_BASE - SMBIOS_BASE) */
#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000)
#define SMBIOS_TYPE_BIOS 0
#define SMBIOS_TYPE_SYSTEM 1
#define SMBIOS_TYPE_CHASSIS 3
#define SMBIOS_TYPE_PROCESSOR 4
#define SMBIOS_TYPE_MEMARRAY 16
#define SMBIOS_TYPE_MEMDEVICE 17
#define SMBIOS_TYPE_MEMARRAYMAP 19
#define SMBIOS_TYPE_BOOT 32
#define SMBIOS_TYPE_EOT 127
struct smbios_structure {
uint8_t type;
uint8_t length;
uint16_t handle;
} __packed;
typedef int (*initializer_func_t)(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_template_entry {
struct smbios_structure *entry;
const char **strings;
initializer_func_t initializer;
};
/*
* SMBIOS Structure Table Entry Point
*/
#define SMBIOS_ENTRY_EANCHOR "_SM_"
#define SMBIOS_ENTRY_EANCHORLEN 4
#define SMBIOS_ENTRY_IANCHOR "_DMI_"
#define SMBIOS_ENTRY_IANCHORLEN 5
struct smbios_entry_point {
char eanchor[4]; /* anchor tag */
uint8_t echecksum; /* checksum of entry point structure */
uint8_t eplen; /* length in bytes of entry point */
uint8_t major; /* major version of the SMBIOS spec */
uint8_t minor; /* minor version of the SMBIOS spec */
uint16_t maxssize; /* maximum size in bytes of a struct */
uint8_t revision; /* entry point structure revision */
uint8_t format[5]; /* entry point rev-specific data */
char ianchor[5]; /* intermediate anchor tag */
uint8_t ichecksum; /* intermediate checksum */
uint16_t stlen; /* len in bytes of structure table */
uint32_t staddr; /* physical addr of structure table */
uint16_t stnum; /* number of structure table entries */
uint8_t bcdrev; /* BCD value representing DMI ver */
} __packed;
/*
* BIOS Information
*/
#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */
#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */
#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */
#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */
#define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */
#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */
#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */
#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */
#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */
struct smbios_table_type0 {
struct smbios_structure header;
uint8_t vendor; /* vendor string */
uint8_t version; /* version string */
uint16_t segment; /* address segment location */
uint8_t rel_date; /* release date */
uint8_t size; /* rom size */
uint64_t cflags; /* characteristics */
uint8_t xc_bytes[2]; /* characteristics ext bytes */
uint8_t sb_major_rel; /* system bios version */
uint8_t sb_minor_rele;
uint8_t ecfw_major_rel; /* embedded ctrl fw version */
uint8_t ecfw_minor_rel;
} __packed;
/*
* System Information
*/
#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */
struct smbios_table_type1 {
struct smbios_structure header;
uint8_t manufacturer; /* manufacturer string */
uint8_t product; /* product name string */
uint8_t version; /* version string */
uint8_t serial; /* serial number string */
uint8_t uuid[16]; /* uuid byte array */
uint8_t wakeup; /* wake-up event */
uint8_t sku; /* sku number string */
uint8_t family; /* family name string */
} __packed;
/*
* System Enclosure or Chassis
*/
#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */
#define SMBIOS_CHST_SAFE 0x03 /* safe */
#define SMBIOS_CHSC_NONE 0x03 /* none */
struct smbios_table_type3 {
struct smbios_structure header;
uint8_t manufacturer; /* manufacturer string */
uint8_t type; /* type */
uint8_t version; /* version string */
uint8_t serial; /* serial number string */
uint8_t asset; /* asset tag string */
uint8_t bustate; /* boot-up state */
uint8_t psstate; /* power supply state */
uint8_t tstate; /* thermal state */
uint8_t security; /* security status */
uint8_t uheight; /* height in 'u's */
uint8_t cords; /* number of power cords */
uint8_t elems; /* number of element records */
uint8_t elemlen; /* length of records */
uint8_t sku; /* sku number string */
} __packed;
/*
* Processor Information
*/
#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */
#define SMBIOS_PRF_OTHER 0x01 /* other */
#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */
#define SMBIOS_PRS_ENABLED 0x1 /* enabled */
#define SMBIOS_PRU_NONE 0x06 /* none */
#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */
struct smbios_table_type4 {
struct smbios_structure header;
uint8_t socket; /* socket designation string */
uint8_t type; /* processor type */
uint8_t family; /* processor family */
uint8_t manufacturer; /* manufacturer string */
uint64_t cpuid; /* processor cpuid */
uint8_t version; /* version string */
uint8_t voltage; /* voltage */
uint16_t clkspeed; /* ext clock speed in mhz */
uint16_t maxspeed; /* maximum speed in mhz */
uint16_t curspeed; /* current speed in mhz */
uint8_t status; /* status */
uint8_t upgrade; /* upgrade */
uint16_t l1handle; /* l1 cache handle */
uint16_t l2handle; /* l2 cache handle */
uint16_t l3handle; /* l3 cache handle */
uint8_t serial; /* serial number string */
uint8_t asset; /* asset tag string */
uint8_t part; /* part number string */
uint8_t cores; /* cores per socket */
uint8_t ecores; /* enabled cores */
uint8_t threads; /* threads per socket */
uint16_t cflags; /* processor characteristics */
uint16_t family2; /* processor family 2 */
} __packed;
/*
* Physical Memory Array
*/
#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */
#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */
#define SMBIOS_MAE_NONE 0x03 /* none */
struct smbios_table_type16 {
struct smbios_structure header;
uint8_t location; /* physical device location */
uint8_t use; /* device functional purpose */
uint8_t ecc; /* err detect/correct method */
uint32_t size; /* max mem capacity in kb */
uint16_t errhand; /* handle of error (if any) */
uint16_t ndevs; /* num of slots or sockets */
uint64_t xsize; /* max mem capacity in bytes */
} __packed;
/*
* Memory Device
*/
#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */
#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */
#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */
struct smbios_table_type17 {
struct smbios_structure header;
uint16_t arrayhand; /* handle of physl mem array */
uint16_t errhand; /* handle of mem error data */
uint16_t twidth; /* total width in bits */
uint16_t dwidth; /* data width in bits */
uint16_t size; /* size in bytes */
uint8_t form; /* form factor */
uint8_t set; /* set */
uint8_t dloc; /* device locator string */
uint8_t bloc; /* phys bank locator string */
uint8_t type; /* memory type */
uint16_t flags; /* memory characteristics */
uint16_t maxspeed; /* maximum speed in mhz */
uint8_t manufacturer; /* manufacturer string */
uint8_t serial; /* serial number string */
uint8_t asset; /* asset tag string */
uint8_t part; /* part number string */
uint8_t attributes; /* attributes */
uint32_t xsize; /* extended size in mbs */
uint16_t curspeed; /* current speed in mhz */
uint16_t minvoltage; /* minimum voltage */
uint16_t maxvoltage; /* maximum voltage */
uint16_t curvoltage; /* configured voltage */
} __packed;
/*
* Memory Array Mapped Address
*/
struct smbios_table_type19 {
struct smbios_structure header;
uint32_t saddr; /* start phys addr in kb */
uint32_t eaddr; /* end phys addr in kb */
uint16_t arrayhand; /* physical mem array handle */
uint8_t width; /* num of dev in row */
uint64_t xsaddr; /* start phys addr in bytes */
uint64_t xeaddr; /* end phys addr in bytes */
} __packed;
/*
* System Boot Information
*/
#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */
struct smbios_table_type32 {
struct smbios_structure header;
uint8_t reserved[6];
uint8_t status; /* boot status */
} __packed;
/*
* End-of-Table
*/
struct smbios_table_type127 {
struct smbios_structure header;
} __packed;
struct smbios_table_type0 smbios_type0_template = {
{ SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 },
1, /* bios vendor string */
2, /* bios version string */
0xF000, /* bios address segment location */
3, /* bios release date */
0x0, /* bios size (64k * (n + 1) is the size in bytes) */
SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW |
SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD,
{ SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM },
0x0, /* bios major release */
0x0, /* bios minor release */
0xff, /* embedded controller firmware major release */
0xff /* embedded controller firmware minor release */
};
const char *smbios_type0_strings[] = {
"BHYVE", /* vendor string */
"1.00", /* bios version string */
"03/14/2014", /* bios release date string */
NULL
};
struct smbios_table_type1 smbios_type1_template = {
{ SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 },
1, /* manufacturer string */
2, /* product string */
3, /* version string */
4, /* serial number string */
{ 0 },
SMBIOS_WAKEUP_SWITCH,
5, /* sku string */
6 /* family string */
};
static int smbios_type1_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
const char *smbios_type1_strings[] = {
" ", /* manufacturer string */
"BHYVE", /* product name string */
"1.0", /* version string */
"None", /* serial number string */
"None", /* sku string */
" ", /* family name string */
NULL
};
struct smbios_table_type3 smbios_type3_template = {
{ SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 },
1, /* manufacturer string */
SMBIOS_CHT_UNKNOWN,
2, /* version string */
3, /* serial number string */
4, /* asset tag string */
SMBIOS_CHST_SAFE,
SMBIOS_CHST_SAFE,
SMBIOS_CHST_SAFE,
SMBIOS_CHSC_NONE,
0, /* height in 'u's (0=enclosure height unspecified) */
0, /* number of power cords (0=number unspecified) */
0, /* number of contained element records */
0, /* length of records */
5 /* sku number string */
};
const char *smbios_type3_strings[] = {
" ", /* manufacturer string */
"1.0", /* version string */
"None", /* serial number string */
"None", /* asset tag string */
"None", /* sku number string */
NULL
};
struct smbios_table_type4 smbios_type4_template = {
{ SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 },
1, /* socket designation string */
SMBIOS_PRT_CENTRAL,
SMBIOS_PRF_OTHER,
2, /* manufacturer string */
0, /* cpuid */
3, /* version string */
0, /* voltage */
0, /* external clock frequency in mhz (0=unknown) */
0, /* maximum frequency in mhz (0=unknown) */
0, /* current frequency in mhz (0=unknown) */
SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED,
SMBIOS_PRU_NONE,
-1, /* l1 cache handle */
-1, /* l2 cache handle */
-1, /* l3 cache handle */
4, /* serial number string */
5, /* asset tag string */
6, /* part number string */
0, /* cores per socket (0=unknown) */
0, /* enabled cores per socket (0=unknown) */
0, /* threads per socket (0=unknown) */
SMBIOS_PFL_64B,
SMBIOS_PRF_OTHER
};
const char *smbios_type4_strings[] = {
" ", /* socket designation string */
" ", /* manufacturer string */
" ", /* version string */
"None", /* serial number string */
"None", /* asset tag string */
"None", /* part number string */
NULL
};
static int smbios_type4_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_table_type16 smbios_type16_template = {
{ SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 },
SMBIOS_MAL_SYSMB,
SMBIOS_MAU_SYSTEM,
SMBIOS_MAE_NONE,
0x80000000, /* max mem capacity in kb (0x80000000=use extended) */
-1, /* handle of error (if any) */
0, /* number of slots or sockets (TBD) */
0 /* extended maximum memory capacity in bytes (TBD) */
};
static int smbios_type16_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_table_type17 smbios_type17_template = {
{ SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 },
-1, /* handle of physical memory array */
-1, /* handle of memory error data */
64, /* total width in bits including ecc */
64, /* data width in bits */
0x7fff, /* size in bytes (0x7fff=use extended)*/
SMBIOS_MDFF_UNKNOWN,
0, /* set (0x00=none, 0xff=unknown) */
1, /* device locator string */
2, /* physical bank locator string */
SMBIOS_MDT_UNKNOWN,
SMBIOS_MDF_UNKNOWN,
0, /* maximum memory speed in mhz (0=unknown) */
3, /* manufacturer string */
4, /* serial number string */
5, /* asset tag string */
6, /* part number string */
0, /* attributes (0=unknown rank information) */
0, /* extended size in mb (TBD) */
0, /* current speed in mhz (0=unknown) */
0, /* minimum voltage in mv (0=unknown) */
0, /* maximum voltage in mv (0=unknown) */
0 /* configured voltage in mv (0=unknown) */
};
const char *smbios_type17_strings[] = {
" ", /* device locator string */
" ", /* physical bank locator string */
" ", /* manufacturer string */
"None", /* serial number string */
"None", /* asset tag string */
"None", /* part number string */
NULL
};
static int smbios_type17_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_table_type19 smbios_type19_template = {
{ SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 },
0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */
0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */
-1, /* physical memory array handle */
1, /* number of devices that form a row */
0, /* extended starting phys addr in bytes (TDB) */
0 /* extended ending phys addr in bytes (TDB) */
};
static int smbios_type19_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_table_type32 smbios_type32_template = {
{ SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 },
{ 0, 0, 0, 0, 0, 0 },
SMBIOS_BOOT_NORMAL
};
struct smbios_table_type127 smbios_type127_template = {
{ SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 }
};
static int smbios_generic_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
static struct smbios_template_entry smbios_template[] = {
{ (struct smbios_structure *)&smbios_type0_template,
smbios_type0_strings,
smbios_generic_initializer },
{ (struct smbios_structure *)&smbios_type1_template,
smbios_type1_strings,
smbios_type1_initializer },
{ (struct smbios_structure *)&smbios_type3_template,
smbios_type3_strings,
smbios_generic_initializer },
{ (struct smbios_structure *)&smbios_type4_template,
smbios_type4_strings,
smbios_type4_initializer },
{ (struct smbios_structure *)&smbios_type16_template,
NULL,
smbios_type16_initializer },
{ (struct smbios_structure *)&smbios_type17_template,
smbios_type17_strings,
smbios_type17_initializer },
{ (struct smbios_structure *)&smbios_type19_template,
NULL,
smbios_type19_initializer },
{ (struct smbios_structure *)&smbios_type32_template,
NULL,
smbios_generic_initializer },
{ (struct smbios_structure *)&smbios_type127_template,
NULL,
smbios_generic_initializer },
{ NULL,NULL, NULL }
};
static uint64_t guest_lomem, guest_himem;
static uint16_t type16_handle;
static int
smbios_generic_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_structure *entry;
memcpy(curaddr, template_entry, template_entry->length);
entry = (struct smbios_structure *)curaddr;
entry->handle = *n + 1;
curaddr += entry->length;
if (template_strings != NULL) {
int i;
for (i = 0; template_strings[i] != NULL; i++) {
const char *string;
int len;
string = template_strings[i];
len = strlen(string) + 1;
memcpy(curaddr, string, len);
curaddr += len;
}
*curaddr = '\0';
curaddr++;
} else {
/* Minimum string section is double nul */
*curaddr = '\0';
curaddr++;
*curaddr = '\0';
curaddr++;
}
(*n)++;
*endaddr = curaddr;
return (0);
}
static int
smbios_type1_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_table_type1 *type1;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type1 = (struct smbios_table_type1 *)curaddr;
if (guest_uuid_str != NULL) {
uuid_t uuid;
uint32_t status;
uuid_from_string(guest_uuid_str, &uuid, &status);
if (status != uuid_s_ok)
return (-1);
uuid_enc_le(&type1->uuid, &uuid);
} else {
MD5_CTX mdctx;
u_char digest[16];
char hostname[MAXHOSTNAMELEN];
/*
* Universally unique and yet reproducible are an
* oxymoron, however reproducible is desirable in
* this case.
*/
if (gethostname(hostname, sizeof(hostname)))
return (-1);
MD5Init(&mdctx);
MD5Update(&mdctx, vmname, strlen(vmname));
MD5Update(&mdctx, hostname, sizeof(hostname));
MD5Final(digest, &mdctx);
/*
* Set the variant and version number.
*/
digest[6] &= 0x0F;
digest[6] |= 0x30; /* version 3 */
digest[8] &= 0x3F;
digest[8] |= 0x80;
memcpy(&type1->uuid, digest, sizeof (digest));
}
return (0);
}
static int
smbios_type4_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
int i;
for (i = 0; i < guest_ncpus; i++) {
struct smbios_table_type4 *type4;
char *p;
int nstrings, len;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type4 = (struct smbios_table_type4 *)curaddr;
p = curaddr + sizeof (struct smbios_table_type4);
nstrings = 0;
while (p < *endaddr - 1) {
if (*p++ == '\0')
nstrings++;
}
len = sprintf(*endaddr - 1, "CPU #%d", i) + 1;
*endaddr += len - 1;
*(*endaddr) = '\0';
(*endaddr)++;
type4->socket = nstrings + 1;
curaddr = *endaddr;
}
return (0);
}
static int
smbios_type16_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_table_type16 *type16;
type16_handle = *n;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type16 = (struct smbios_table_type16 *)curaddr;
type16->xsize = guest_lomem + guest_himem;
type16->ndevs = guest_himem > 0 ? 2 : 1;
return (0);
}
static int
smbios_type17_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_table_type17 *type17;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type17 = (struct smbios_table_type17 *)curaddr;
type17->arrayhand = type16_handle;
type17->xsize = guest_lomem;
if (guest_himem > 0) {
curaddr = *endaddr;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type17 = (struct smbios_table_type17 *)curaddr;
type17->arrayhand = type16_handle;
type17->xsize = guest_himem;
}
return (0);
}
static int
smbios_type19_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_table_type19 *type19;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type19 = (struct smbios_table_type19 *)curaddr;
type19->arrayhand = type16_handle;
type19->xsaddr = 0;
type19->xeaddr = guest_lomem;
if (guest_himem > 0) {
curaddr = *endaddr;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type19 = (struct smbios_table_type19 *)curaddr;
type19->arrayhand = type16_handle;
type19->xsaddr = 4*GB;
type19->xeaddr = guest_himem;
}
return (0);
}
static void
smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr)
{
memset(smbios_ep, 0, sizeof(*smbios_ep));
memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR,
SMBIOS_ENTRY_EANCHORLEN);
smbios_ep->eplen = 0x1F;
assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen);
smbios_ep->major = 2;
smbios_ep->minor = 6;
smbios_ep->revision = 0;
memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR,
SMBIOS_ENTRY_IANCHORLEN);
smbios_ep->staddr = staddr;
smbios_ep->bcdrev = 0x24;
}
static void
smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len,
uint16_t num, uint16_t maxssize)
{
uint8_t checksum;
int i;
smbios_ep->maxssize = maxssize;
smbios_ep->stlen = len;
smbios_ep->stnum = num;
checksum = 0;
for (i = 0x10; i < 0x1f; i++) {
checksum -= ((uint8_t *)smbios_ep)[i];
}
smbios_ep->ichecksum = checksum;
checksum = 0;
for (i = 0; i < 0x1f; i++) {
checksum -= ((uint8_t *)smbios_ep)[i];
}
smbios_ep->echecksum = checksum;
}
int
smbios_build(struct vmctx *ctx)
{
struct smbios_entry_point *smbios_ep;
uint16_t n;
uint16_t maxssize;
char *curaddr, *startaddr, *ststartaddr;
int i;
int err;
guest_lomem = vm_get_lowmem_size(ctx);
guest_himem = vm_get_highmem_size(ctx);
startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH);
if (startaddr == NULL) {
fprintf(stderr, "smbios table requires mapped mem\n");
return (ENOMEM);
}
curaddr = startaddr;
smbios_ep = (struct smbios_entry_point *)curaddr;
smbios_ep_initializer(smbios_ep, SMBIOS_BASE +
sizeof(struct smbios_entry_point));
curaddr += sizeof(struct smbios_entry_point);
ststartaddr = curaddr;
n = 0;
maxssize = 0;
for (i = 0; smbios_template[i].entry != NULL; i++) {
struct smbios_structure *entry;
const char **strings;
initializer_func_t initializer;
char *endaddr;
uint16_t size;
entry = smbios_template[i].entry;
strings = smbios_template[i].strings;
initializer = smbios_template[i].initializer;
err = (*initializer)(entry, strings, curaddr, &endaddr,
&n, &size);
if (err != 0)
return (err);
if (size > maxssize)
maxssize = size;
curaddr = endaddr;
}
assert(curaddr - startaddr < SMBIOS_MAX_LENGTH);
smbios_ep_finalizer(smbios_ep, curaddr - ststartaddr, n, maxssize);
return (0);
}

36
smbiostbl.h Normal file
View File

@ -0,0 +1,36 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SMBIOSTBL_H_
#define _SMBIOSTBL_H_
struct vmctx;
int smbios_build(struct vmctx *ctx);
#endif /* _SMBIOSTBL_H_ */

104
spinup_ap.c Normal file
View File

@ -0,0 +1,104 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "bhyverun.h"
#include "spinup_ap.h"
static void
spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
{
int vector, error;
uint16_t cs;
uint64_t desc_base;
uint32_t desc_limit, desc_access;
vector = *rip >> PAGE_SHIFT;
*rip = 0;
/*
* Update the %cs and %rip of the guest so that it starts
* executing real mode code at at 'vector << 12'.
*/
error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
assert(error == 0);
error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
&desc_limit, &desc_access);
assert(error == 0);
desc_base = vector << PAGE_SHIFT;
error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
assert(error == 0);
cs = (vector << PAGE_SHIFT) >> 4;
error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
assert(error == 0);
}
int
spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip)
{
int error;
assert(newcpu != 0);
assert(newcpu < guest_ncpus);
error = vcpu_reset(ctx, newcpu);
assert(error == 0);
fbsdrun_set_capabilities(ctx, newcpu);
/*
* Enable the 'unrestricted guest' mode for 'newcpu'.
*
* Set up the processor state in power-on 16-bit mode, with the CS:IP
* init'd to the specified low-mem 4K page.
*/
error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
assert(error == 0);
spinup_ap_realmode(ctx, newcpu, &rip);
fbsdrun_addcpu(ctx, vcpu, newcpu, rip);
return (newcpu);
}

34
spinup_ap.h Normal file
View File

@ -0,0 +1,34 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SPINUP_AP_H_
#define _SPINUP_AP_H_
int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
#endif

939
task_switch.c Normal file
View File

@ -0,0 +1,939 @@
/*-
* Copyright (c) 2014 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>
#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>
#include <vmmapi.h>
#include "bhyverun.h"
/*
* Using 'struct i386tss' is tempting but causes myriad sign extension
* issues because all of its fields are defined as signed integers.
*/
struct tss32 {
uint16_t tss_link;
uint16_t rsvd1;
uint32_t tss_esp0;
uint16_t tss_ss0;
uint16_t rsvd2;
uint32_t tss_esp1;
uint16_t tss_ss1;
uint16_t rsvd3;
uint32_t tss_esp2;
uint16_t tss_ss2;
uint16_t rsvd4;
uint32_t tss_cr3;
uint32_t tss_eip;
uint32_t tss_eflags;
uint32_t tss_eax;
uint32_t tss_ecx;
uint32_t tss_edx;
uint32_t tss_ebx;
uint32_t tss_esp;
uint32_t tss_ebp;
uint32_t tss_esi;
uint32_t tss_edi;
uint16_t tss_es;
uint16_t rsvd5;
uint16_t tss_cs;
uint16_t rsvd6;
uint16_t tss_ss;
uint16_t rsvd7;
uint16_t tss_ds;
uint16_t rsvd8;
uint16_t tss_fs;
uint16_t rsvd9;
uint16_t tss_gs;
uint16_t rsvd10;
uint16_t tss_ldt;
uint16_t rsvd11;
uint16_t tss_trap;
uint16_t tss_iomap;
};
CTASSERT(sizeof(struct tss32) == 104);
#define SEL_START(sel) (((sel) & ~0x7))
#define SEL_LIMIT(sel) (((sel) | 0x7))
#define TSS_BUSY(type) (((type) & 0x2) != 0)
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
uint64_t val;
int error;
error = vm_get_register(ctx, vcpu, reg, &val);
assert(error == 0);
return (val);
}
static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
int error;
error = vm_set_register(ctx, vcpu, reg, val);
assert(error == 0);
}
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
struct seg_desc seg_desc;
seg_desc.base = (u_int)USD_GETBASE(usd);
if (usd->sd_gran)
seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
else
seg_desc.limit = (u_int)USD_GETLIMIT(usd);
seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
seg_desc.access |= usd->sd_xx << 12;
seg_desc.access |= usd->sd_def32 << 14;
seg_desc.access |= usd->sd_gran << 15;
return (seg_desc);
}
/*
* Inject an exception with an error code that is a segment selector.
* The format of the error code is described in section 6.13, "Error Code",
* Intel SDM volume 3.
*
* Bit 0 (EXT) denotes whether the exception occurred during delivery
* of an external event like an interrupt.
*
* Bit 1 (IDT) indicates whether the selector points to a gate descriptor
* in the IDT.
*
* Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI).
*/
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
/*
* Bit 2 from the selector is retained as-is in the error code.
*
* Bit 1 can be safely cleared because none of the selectors
* encountered during task switch emulation refer to a task
* gate in the IDT.
*
* Bit 0 is set depending on the value of 'ext'.
*/
sel &= ~0x3;
if (ext)
sel |= 0x1;
vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
/*
* Return 0 if the selector 'sel' in within the limits of the GDT/LDT
* and non-zero otherwise.
*/
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
uint64_t base;
uint32_t limit, access;
int error, reg;
reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
assert(error == 0);
if (reg == VM_REG_GUEST_LDTR) {
if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
return (-1);
}
if (limit < SEL_LIMIT(sel))
return (-1);
else
return (0);
}
/*
* Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
* by the selector 'sel'.
*
* Returns 0 on success.
* Returns 1 if an exception was injected into the guest.
* Returns -1 otherwise.
*/
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint16_t sel, struct user_segment_descriptor *desc, bool doread,
int *faultptr)
{
struct iovec iov[2];
uint64_t base;
uint32_t limit, access;
int error, reg;
reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
assert(error == 0);
assert(limit >= SEL_LIMIT(sel));
error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
faultptr);
if (error || *faultptr)
return (error);
if (doread)
vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
else
vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
return (0);
}
static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}
static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}
/*
* Read the TSS descriptor referenced by 'sel' into 'desc'.
*
* Returns 0 on success.
* Returns 1 if an exception was injected into the guest.
* Returns -1 otherwise.
*/
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
struct vm_guest_paging sup_paging;
int error;
assert(!ISLDT(sel));
assert(IDXSEL(sel) != 0);
/* Fetch the new TSS descriptor */
if (desc_table_limit_check(ctx, vcpu, sel)) {
if (ts->reason == TSR_IRET)
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
else
sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
return (1);
}
sup_paging = ts->paging;
sup_paging.cpl = 0; /* implicit supervisor mode */
error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
return (error);
}
static bool
code_desc(int sd_type)
{
/* code descriptor */
return ((sd_type & 0x18) == 0x18);
}
static bool
stack_desc(int sd_type)
{
/* writable data descriptor */
return ((sd_type & 0x1A) == 0x12);
}
static bool
data_desc(int sd_type)
{
/* data descriptor or a readable code descriptor */
return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}
static bool
ldt_desc(int sd_type)
{
return (sd_type == SDT_SYSLDT);
}
/*
* Validate the descriptor 'seg_desc' associated with 'segment'.
*/
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
int segment, struct seg_desc *seg_desc, int *faultptr)
{
struct vm_guest_paging sup_paging;
struct user_segment_descriptor usd;
int error, idtvec;
int cpl, dpl, rpl;
uint16_t sel, cs;
bool ldtseg, codeseg, stackseg, dataseg, conforming;
ldtseg = codeseg = stackseg = dataseg = false;
switch (segment) {
case VM_REG_GUEST_LDTR:
ldtseg = true;
break;
case VM_REG_GUEST_CS:
codeseg = true;
break;
case VM_REG_GUEST_SS:
stackseg = true;
break;
case VM_REG_GUEST_DS:
case VM_REG_GUEST_ES:
case VM_REG_GUEST_FS:
case VM_REG_GUEST_GS:
dataseg = true;
break;
default:
assert(0);
}
/* Get the segment selector */
sel = GETREG(ctx, vcpu, segment);
/* LDT selector must point into the GDT */
if (ldtseg && ISLDT(sel)) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
/* Descriptor table limit check */
if (desc_table_limit_check(ctx, vcpu, sel)) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
/* NULL selector */
if (IDXSEL(sel) == 0) {
/* Code and stack segment selectors cannot be NULL */
if (codeseg || stackseg) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
seg_desc->base = 0;
seg_desc->limit = 0;
seg_desc->access = 0x10000; /* unusable */
return (0);
}
/* Read the descriptor from the GDT/LDT */
sup_paging = ts->paging;
sup_paging.cpl = 0; /* implicit supervisor mode */
error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
if (error || *faultptr)
return (error);
/* Verify that the descriptor type is compatible with the segment */
if ((ldtseg && !ldt_desc(usd.sd_type)) ||
(codeseg && !code_desc(usd.sd_type)) ||
(dataseg && !data_desc(usd.sd_type)) ||
(stackseg && !stack_desc(usd.sd_type))) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
/* Segment must be marked present */
if (!usd.sd_p) {
if (ldtseg)
idtvec = IDT_TS;
else if (stackseg)
idtvec = IDT_SS;
else
idtvec = IDT_NP;
sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
return (1);
}
cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
cpl = cs & SEL_RPL_MASK;
rpl = sel & SEL_RPL_MASK;
dpl = usd.sd_dpl;
if (stackseg && (rpl != cpl || dpl != cpl)) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
if (codeseg) {
conforming = (usd.sd_type & 0x4) ? true : false;
if ((conforming && (cpl < dpl)) ||
(!conforming && (cpl != dpl))) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
}
if (dataseg) {
/*
* A data segment is always non-conforming except when it's
* descriptor is a readable, conforming code segment.
*/
if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
conforming = true;
else
conforming = false;
if (!conforming && (rpl > dpl || cpl > dpl)) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
}
*seg_desc = usd_to_seg_desc(&usd);
return (0);
}
static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
uint32_t eip, struct tss32 *tss, struct iovec *iov)
{
/* General purpose registers */
tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
/* Segment selectors */
tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
/* eflags and eip */
tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
if (task_switch->reason == TSR_IRET)
tss->tss_eflags &= ~PSL_NT;
tss->tss_eip = eip;
/* Copy updated old TSS into guest memory */
vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}
static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
int error;
error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
assert(error == 0);
}
/*
* Update the vcpu registers to reflect the state of the new task.
*/
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
struct seg_desc seg_desc, seg_desc2;
uint64_t *pdpte, maxphyaddr, reserved;
uint32_t eflags;
int error, i;
bool nested;
nested = false;
if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
tss->tss_link = ot_sel;
nested = true;
}
eflags = tss->tss_eflags;
if (nested)
eflags |= PSL_NT;
/* LDTR */
SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
/* PBDR */
if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
if (ts->paging.paging_mode == PAGING_MODE_PAE) {
/*
* XXX Assuming 36-bit MAXPHYADDR.
*/
maxphyaddr = (1UL << 36) - 1;
pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
for (i = 0; i < 4; i++) {
/* Check reserved bits if the PDPTE is valid */
if (!(pdpte[i] & 0x1))
continue;
/*
* Bits 2:1, 8:5 and bits above the processor's
* maximum physical address are reserved.
*/
reserved = ~maxphyaddr | 0x1E6;
if (pdpte[i] & reserved) {
vm_inject_gp(ctx, vcpu);
return (1);
}
}
SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
}
SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
ts->paging.cr3 = tss->tss_cr3;
}
/* eflags and eip */
SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
/* General purpose registers */
SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
/* Segment selectors */
SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);
/*
* If this is a nested task then write out the new TSS to update
* the previous link field.
*/
if (nested)
vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
/* Validate segment descriptors */
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
/*
* Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
*
* The SS and CS attribute checks on VM-entry are inter-dependent so
* we need to make sure that both segments are valid before updating
* either of them. This ensures that the VMCS state can pass the
* VM-entry checks so the guest can handle any exception injected
* during task switch emulation.
*/
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
return (0);
}
/*
* Push an error code on the stack of the new task. This is needed if the
* task switch was triggered by a hardware exception that causes an error
* code to be saved (e.g. #PF).
*/
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
int task_type, uint32_t errcode, int *faultptr)
{
struct iovec iov[2];
struct seg_desc seg_desc;
int stacksize, bytes, error;
uint64_t gla, cr0, rflags;
uint32_t esp;
uint16_t stacksel;
*faultptr = 0;
cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
&seg_desc.limit, &seg_desc.access);
assert(error == 0);
/*
* Section "Error Code" in the Intel SDM vol 3: the error code is
* pushed on the stack as a doubleword or word (depending on the
* default interrupt, trap or task gate size).
*/
if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
bytes = 4;
else
bytes = 2;
/*
* PUSH instruction from Intel SDM vol 2: the 'B' flag in the
* stack-segment descriptor determines the size of the stack
* pointer outside of 64-bit mode.
*/
if (SEG_DESC_DEF32(seg_desc.access))
stacksize = 4;
else
stacksize = 2;
esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
esp -= bytes;
if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
&seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
*faultptr = 1;
return (0);
}
if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
vm_inject_ac(ctx, vcpu, 1);
*faultptr = 1;
return (0);
}
error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
iov, nitems(iov), faultptr);
if (error || *faultptr)
return (error);
vm_copyout(ctx, vcpu, &errcode, iov, bytes);
SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
return (0);
}
/*
* Evaluate return value from helper functions and potentially return to
* the VM run loop.
*/
#define CHKERR(error,fault) \
do { \
assert((error == 0) || (error == EFAULT)); \
if (error) \
return (VMEXIT_ABORT); \
else if (fault) \
return (VMEXIT_CONTINUE); \
} while (0)
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
struct seg_desc nt;
struct tss32 oldtss, newtss;
struct vm_task_switch *task_switch;
struct vm_guest_paging *paging, sup_paging;
struct user_segment_descriptor nt_desc, ot_desc;
struct iovec nt_iov[2], ot_iov[2];
uint64_t cr0, ot_base;
uint32_t eip, ot_lim, access;
int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
enum task_switch_reason reason;
uint16_t nt_sel, ot_sel;
task_switch = &vmexit->u.task_switch;
nt_sel = task_switch->tsssel;
ext = vmexit->u.task_switch.ext;
reason = vmexit->u.task_switch.reason;
paging = &vmexit->u.task_switch.paging;
vcpu = *pvcpu;
assert(paging->cpu_mode == CPU_MODE_PROTECTED);
/*
* Calculate the instruction pointer to store in the old TSS.
*/
eip = vmexit->rip + vmexit->inst_length;
/*
* Section 4.6, "Access Rights" in Intel SDM Vol 3.
* The following page table accesses are implicitly supervisor mode:
* - accesses to GDT or LDT to load segment descriptors
* - accesses to the task state segment during task switch
*/
sup_paging = *paging;
sup_paging.cpl = 0; /* implicit supervisor mode */
/* Fetch the new TSS descriptor */
error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
&fault);
CHKERR(error, fault);
nt = usd_to_seg_desc(&nt_desc);
/* Verify the type of the new TSS */
nt_type = SEG_DESC_TYPE(nt.access);
if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
goto done;
}
/* TSS descriptor must have present bit set */
if (!SEG_DESC_PRESENT(nt.access)) {
sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
goto done;
}
/*
* TSS must have a minimum length of 104 bytes for a 32-bit TSS and
* 44 bytes for a 16-bit TSS.
*/
if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
minlimit = 104 - 1;
else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
minlimit = 44 - 1;
else
minlimit = 0;
assert(minlimit > 0);
if (nt.limit < minlimit) {
sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
goto done;
}
/* TSS must be busy if task switch is due to IRET */
if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
goto done;
}
/*
* TSS must be available (not busy) if task switch reason is
* CALL, JMP, exception or interrupt.
*/
if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
goto done;
}
/* Fetch the new TSS */
error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
CHKERR(error, fault);
vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
/* Get the old TSS selector from the guest's task register */
ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
/*
* This might happen if a task switch was attempted without
* ever loading the task register with LTR. In this case the
* TR would contain the values from power-on:
* (sel = 0, base = 0, limit = 0xffff).
*/
sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
goto done;
}
/* Get the old TSS base and limit from the guest's task register */
error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
&access);
assert(error == 0);
assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
ot_type = SEG_DESC_TYPE(access);
assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
/* Fetch the old TSS descriptor */
error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
&fault);
CHKERR(error, fault);
/* Get the old TSS */
error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
CHKERR(error, fault);
vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
/*
* Clear the busy bit in the old TSS descriptor if the task switch
* due to an IRET or JMP instruction.
*/
if (reason == TSR_IRET || reason == TSR_JMP) {
ot_desc.sd_type &= ~0x2;
error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
&ot_desc, &fault);
CHKERR(error, fault);
}
if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
return (VMEXIT_ABORT);
}
/* Save processor state in old TSS */
tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
/*
* If the task switch was triggered for any reason other than IRET
* then set the busy bit in the new TSS descriptor.
*/
if (reason != TSR_IRET) {
nt_desc.sd_type |= 0x2;
error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
&nt_desc, &fault);
CHKERR(error, fault);
}
/* Update task register to point at the new TSS */
SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);
/* Update the hidden descriptor state of the task register */
nt = usd_to_seg_desc(&nt_desc);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
/* Set CR0.TS */
cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
/*
* We are now committed to the task switch. Any exceptions encountered
* after this point will be handled in the context of the new task and
* the saved instruction pointer will belong to the new task.
*/
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
assert(error == 0);
/* Load processor state from new TSS */
error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
&fault);
CHKERR(error, fault);
/*
* Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
* caused an error code to be generated, this error code is copied
* to the stack of the new task.
*/
if (task_switch->errcode_valid) {
assert(task_switch->ext);
assert(task_switch->reason == TSR_IDT_GATE);
error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
task_switch->errcode, &fault);
CHKERR(error, fault);
}
/*
* Treatment of virtual-NMI blocking if NMI is delivered through
* a task gate.
*
* Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
* If the virtual NMIs VM-execution control is 1, VM entry injects
* an NMI, and delivery of the NMI causes a task switch that causes
* a VM exit, virtual-NMI blocking is in effect before the VM exit
* commences.
*
* Thus, virtual-NMI blocking is in effect at the time of the task
* switch VM exit.
*/
/*
* Treatment of virtual-NMI unblocking on IRET from NMI handler task.
*
* Section "Changes to Instruction Behavior in VMX Non-Root Operation"
* If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
* This unblocking of virtual-NMI occurs even if IRET causes a fault.
*
* Thus, virtual-NMI blocking is cleared at the time of the task switch
* VM exit.
*/
/*
* If the task switch was triggered by an event delivered through
* the IDT then extinguish the pending event from the vcpu's
* exitintinfo.
*/
if (task_switch->reason == TSR_IDT_GATE) {
error = vm_set_intinfo(ctx, vcpu, 0);
assert(error == 0);
}
/*
* XXX should inject debug exception if 'T' bit is 1
*/
done:
return (VMEXIT_CONTINUE);
}

674
uart_emul.c Normal file
View File

@ -0,0 +1,674 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <dev/ic/ns16550.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <fcntl.h>
#include <termios.h>
#include <unistd.h>
#include <stdbool.h>
#include <string.h>
#include <pthread.h>
#include "mevent.h"
#include "uart_emul.h"
#define COM1_BASE 0x3F8
#define COM1_IRQ 4
#define COM2_BASE 0x2F8
#define COM2_IRQ 3
#define DEFAULT_RCLK 1843200
#define DEFAULT_BAUD 9600
#define FCR_RX_MASK 0xC0
#define MCR_OUT1 0x04
#define MCR_OUT2 0x08
#define MSR_DELTA_MASK 0x0f
#ifndef REG_SCR
#define REG_SCR com_scr
#endif
#define FIFOSZ 16
static bool uart_stdio; /* stdio in use for i/o */
static struct termios tio_stdio_orig;
static struct {
int baseaddr;
int irq;
bool inuse;
} uart_lres[] = {
{ COM1_BASE, COM1_IRQ, false},
{ COM2_BASE, COM2_IRQ, false},
};
#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0]))
struct fifo {
uint8_t buf[FIFOSZ];
int rindex; /* index to read from */
int windex; /* index to write to */
int num; /* number of characters in the fifo */
int size; /* size of the fifo */
};
struct ttyfd {
bool opened;
int fd; /* tty device file descriptor */
struct termios tio_orig, tio_new; /* I/O Terminals */
};
struct uart_softc {
pthread_mutex_t mtx; /* protects all softc elements */
uint8_t data; /* Data register (R/W) */
uint8_t ier; /* Interrupt enable register (R/W) */
uint8_t lcr; /* Line control register (R/W) */
uint8_t mcr; /* Modem control register (R/W) */
uint8_t lsr; /* Line status register (R/W) */
uint8_t msr; /* Modem status register (R/W) */
uint8_t fcr; /* FIFO control register (W) */
uint8_t scr; /* Scratch register (R/W) */
uint8_t dll; /* Baudrate divisor latch LSB */
uint8_t dlh; /* Baudrate divisor latch MSB */
struct fifo rxfifo;
struct mevent *mev;
struct ttyfd tty;
bool thre_int_pending; /* THRE interrupt pending */
void *arg;
uart_intr_func_t intr_assert;
uart_intr_func_t intr_deassert;
};
static void uart_drain(int fd, enum ev_type ev, void *arg);
static void
ttyclose(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig);
}
static void
ttyopen(struct ttyfd *tf)
{
tcgetattr(tf->fd, &tf->tio_orig);
tf->tio_new = tf->tio_orig;
cfmakeraw(&tf->tio_new);
tf->tio_new.c_cflag |= CLOCAL;
tcsetattr(tf->fd, TCSANOW, &tf->tio_new);
if (tf->fd == STDIN_FILENO) {
tio_stdio_orig = tf->tio_orig;
atexit(ttyclose);
}
}
static int
ttyread(struct ttyfd *tf)
{
unsigned char rb;
if (read(tf->fd, &rb, 1) == 1)
return (rb);
else
return (-1);
}
static void
ttywrite(struct ttyfd *tf, unsigned char wb)
{
(void)write(tf->fd, &wb, 1);
}
static void
rxfifo_reset(struct uart_softc *sc, int size)
{
char flushbuf[32];
struct fifo *fifo;
ssize_t nread;
int error;
fifo = &sc->rxfifo;
bzero(fifo, sizeof(struct fifo));
fifo->size = size;
if (sc->tty.opened) {
/*
* Flush any unread input from the tty buffer.
*/
while (1) {
nread = read(sc->tty.fd, flushbuf, sizeof(flushbuf));
if (nread != sizeof(flushbuf))
break;
}
/*
* Enable mevent to trigger when new characters are available
* on the tty fd.
*/
error = mevent_enable(sc->mev);
assert(error == 0);
}
}
static int
rxfifo_available(struct uart_softc *sc)
{
struct fifo *fifo;
fifo = &sc->rxfifo;
return (fifo->num < fifo->size);
}
static int
rxfifo_putchar(struct uart_softc *sc, uint8_t ch)
{
struct fifo *fifo;
int error;
fifo = &sc->rxfifo;
if (fifo->num < fifo->size) {
fifo->buf[fifo->windex] = ch;
fifo->windex = (fifo->windex + 1) % fifo->size;
fifo->num++;
if (!rxfifo_available(sc)) {
if (sc->tty.opened) {
/*
* Disable mevent callback if the FIFO is full.
*/
error = mevent_disable(sc->mev);
assert(error == 0);
}
}
return (0);
} else
return (-1);
}
static int
rxfifo_getchar(struct uart_softc *sc)
{
struct fifo *fifo;
int c, error, wasfull;
wasfull = 0;
fifo = &sc->rxfifo;
if (fifo->num > 0) {
if (!rxfifo_available(sc))
wasfull = 1;
c = fifo->buf[fifo->rindex];
fifo->rindex = (fifo->rindex + 1) % fifo->size;
fifo->num--;
if (wasfull) {
if (sc->tty.opened) {
error = mevent_enable(sc->mev);
assert(error == 0);
}
}
return (c);
} else
return (-1);
}
static int
rxfifo_numchars(struct uart_softc *sc)
{
struct fifo *fifo = &sc->rxfifo;
return (fifo->num);
}
static void
uart_opentty(struct uart_softc *sc)
{
ttyopen(&sc->tty);
sc->mev = mevent_add(sc->tty.fd, EVF_READ, uart_drain, sc);
assert(sc->mev != NULL);
}
static uint8_t
modem_status(uint8_t mcr)
{
uint8_t msr;
if (mcr & MCR_LOOPBACK) {
/*
* In the loopback mode certain bits from the MCR are
* reflected back into MSR.
*/
msr = 0;
if (mcr & MCR_RTS)
msr |= MSR_CTS;
if (mcr & MCR_DTR)
msr |= MSR_DSR;
if (mcr & MCR_OUT1)
msr |= MSR_RI;
if (mcr & MCR_OUT2)
msr |= MSR_DCD;
} else {
/*
* Always assert DCD and DSR so tty open doesn't block
* even if CLOCAL is turned off.
*/
msr = MSR_DCD | MSR_DSR;
}
assert((msr & MSR_DELTA_MASK) == 0);
return (msr);
}
/*
* The IIR returns a prioritized interrupt reason:
* - receive data available
* - transmit holding register empty
* - modem status change
*
* Return an interrupt reason if one is available.
*/
static int
uart_intr_reason(struct uart_softc *sc)
{
if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0)
return (IIR_RLS);
else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0)
return (IIR_RXTOUT);
else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0)
return (IIR_TXRDY);
else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0)
return (IIR_MLSC);
else
return (IIR_NOPEND);
}
static void
uart_reset(struct uart_softc *sc)
{
uint16_t divisor;
divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
sc->dll = divisor;
sc->dlh = divisor >> 16;
sc->msr = modem_status(sc->mcr);
rxfifo_reset(sc, 1); /* no fifo until enabled by software */
}
/*
* Toggle the COM port's intr pin depending on whether or not we have an
* interrupt condition to report to the processor.
*/
static void
uart_toggle_intr(struct uart_softc *sc)
{
uint8_t intr_reason;
intr_reason = uart_intr_reason(sc);
if (intr_reason == IIR_NOPEND)
(*sc->intr_deassert)(sc->arg);
else
(*sc->intr_assert)(sc->arg);
}
static void
uart_drain(int fd, enum ev_type ev, void *arg)
{
struct uart_softc *sc;
int ch;
sc = arg;
assert(fd == sc->tty.fd);
assert(ev == EVF_READ);
/*
* This routine is called in the context of the mevent thread
* to take out the softc lock to protect against concurrent
* access from a vCPU i/o exit
*/
pthread_mutex_lock(&sc->mtx);
if ((sc->mcr & MCR_LOOPBACK) != 0) {
(void) ttyread(&sc->tty);
} else {
while (rxfifo_available(sc) &&
((ch = ttyread(&sc->tty)) != -1)) {
rxfifo_putchar(sc, ch);
}
uart_toggle_intr(sc);
}
pthread_mutex_unlock(&sc->mtx);
}
void
uart_write(struct uart_softc *sc, int offset, uint8_t value)
{
int fifosz;
uint8_t msr;
pthread_mutex_lock(&sc->mtx);
/*
* Take care of the special case DLAB accesses first
*/
if ((sc->lcr & LCR_DLAB) != 0) {
if (offset == REG_DLL) {
sc->dll = value;
goto done;
}
if (offset == REG_DLH) {
sc->dlh = value;
goto done;
}
}
switch (offset) {
case REG_DATA:
if (sc->mcr & MCR_LOOPBACK) {
if (rxfifo_putchar(sc, value) != 0)
sc->lsr |= LSR_OE;
} else if (sc->tty.opened) {
ttywrite(&sc->tty, value);
} /* else drop on floor */
sc->thre_int_pending = true;
break;
case REG_IER:
/*
* Apply mask so that bits 4-7 are 0
* Also enables bits 0-3 only if they're 1
*/
sc->ier = value & 0x0F;
break;
case REG_FCR:
/*
* When moving from FIFO and 16450 mode and vice versa,
* the FIFO contents are reset.
*/
if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) {
fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1;
rxfifo_reset(sc, fifosz);
}
/*
* The FCR_ENABLE bit must be '1' for the programming
* of other FCR bits to be effective.
*/
if ((value & FCR_ENABLE) == 0) {
sc->fcr = 0;
} else {
if ((value & FCR_RCV_RST) != 0)
rxfifo_reset(sc, FIFOSZ);
sc->fcr = value &
(FCR_ENABLE | FCR_DMA | FCR_RX_MASK);
}
break;
case REG_LCR:
sc->lcr = value;
break;
case REG_MCR:
/* Apply mask so that bits 5-7 are 0 */
sc->mcr = value & 0x1F;
msr = modem_status(sc->mcr);
/*
* Detect if there has been any change between the
* previous and the new value of MSR. If there is
* then assert the appropriate MSR delta bit.
*/
if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS))
sc->msr |= MSR_DCTS;
if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR))
sc->msr |= MSR_DDSR;
if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD))
sc->msr |= MSR_DDCD;
if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0)
sc->msr |= MSR_TERI;
/*
* Update the value of MSR while retaining the delta
* bits.
*/
sc->msr &= MSR_DELTA_MASK;
sc->msr |= msr;
break;
case REG_LSR:
/*
* Line status register is not meant to be written to
* during normal operation.
*/
break;
case REG_MSR:
/*
* As far as I can tell MSR is a read-only register.
*/
break;
case REG_SCR:
sc->scr = value;
break;
default:
break;
}
done:
uart_toggle_intr(sc);
pthread_mutex_unlock(&sc->mtx);
}
uint8_t
uart_read(struct uart_softc *sc, int offset)
{
uint8_t iir, intr_reason, reg;
pthread_mutex_lock(&sc->mtx);
/*
* Take care of the special case DLAB accesses first
*/
if ((sc->lcr & LCR_DLAB) != 0) {
if (offset == REG_DLL) {
reg = sc->dll;
goto done;
}
if (offset == REG_DLH) {
reg = sc->dlh;
goto done;
}
}
switch (offset) {
case REG_DATA:
reg = rxfifo_getchar(sc);
break;
case REG_IER:
reg = sc->ier;
break;
case REG_IIR:
iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0;
intr_reason = uart_intr_reason(sc);
/*
* Deal with side effects of reading the IIR register
*/
if (intr_reason == IIR_TXRDY)
sc->thre_int_pending = false;
iir |= intr_reason;
reg = iir;
break;
case REG_LCR:
reg = sc->lcr;
break;
case REG_MCR:
reg = sc->mcr;
break;
case REG_LSR:
/* Transmitter is always ready for more data */
sc->lsr |= LSR_TEMT | LSR_THRE;
/* Check for new receive data */
if (rxfifo_numchars(sc) > 0)
sc->lsr |= LSR_RXRDY;
else
sc->lsr &= ~LSR_RXRDY;
reg = sc->lsr;
/* The LSR_OE bit is cleared on LSR read */
sc->lsr &= ~LSR_OE;
break;
case REG_MSR:
/*
* MSR delta bits are cleared on read
*/
reg = sc->msr;
sc->msr &= ~MSR_DELTA_MASK;
break;
case REG_SCR:
reg = sc->scr;
break;
default:
reg = 0xFF;
break;
}
done:
uart_toggle_intr(sc);
pthread_mutex_unlock(&sc->mtx);
return (reg);
}
int
uart_legacy_alloc(int which, int *baseaddr, int *irq)
{
if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse)
return (-1);
uart_lres[which].inuse = true;
*baseaddr = uart_lres[which].baseaddr;
*irq = uart_lres[which].irq;
return (0);
}
struct uart_softc *
uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert,
void *arg)
{
struct uart_softc *sc;
sc = calloc(1, sizeof(struct uart_softc));
sc->arg = arg;
sc->intr_assert = intr_assert;
sc->intr_deassert = intr_deassert;
pthread_mutex_init(&sc->mtx, NULL);
uart_reset(sc);
return (sc);
}
static int
uart_tty_backend(struct uart_softc *sc, const char *opts)
{
int fd;
int retval;
retval = -1;
fd = open(opts, O_RDWR | O_NONBLOCK);
if (fd > 0 && isatty(fd)) {
sc->tty.fd = fd;
sc->tty.opened = true;
retval = 0;
}
return (retval);
}
int
uart_set_backend(struct uart_softc *sc, const char *opts)
{
int retval;
retval = -1;
if (opts == NULL)
return (0);
if (strcmp("stdio", opts) == 0) {
if (!uart_stdio) {
sc->tty.fd = STDIN_FILENO;
sc->tty.opened = true;
uart_stdio = true;
retval = 0;
}
} else if (uart_tty_backend(sc, opts) == 0) {
retval = 0;
}
/* Make the backend file descriptor non-blocking */
if (retval == 0)
retval = fcntl(sc->tty.fd, F_SETFL, O_NONBLOCK);
if (retval == 0)
uart_opentty(sc);
return (retval);
}

45
uart_emul.h Normal file
View File

@ -0,0 +1,45 @@
/*-
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _UART_EMUL_H_
#define _UART_EMUL_H_
#define UART_IO_BAR_SIZE 8
struct uart_softc;
typedef void (*uart_intr_func_t)(void *arg);
struct uart_softc *uart_init(uart_intr_func_t intr_assert,
uart_intr_func_t intr_deassert, void *arg);
int uart_legacy_alloc(int unit, int *ioaddr, int *irq);
uint8_t uart_read(struct uart_softc *sc, int offset);
void uart_write(struct uart_softc *sc, int offset, uint8_t value);
int uart_set_backend(struct uart_softc *sc, const char *opt);
#endif

777
virtio.c Normal file
View File

@ -0,0 +1,777 @@
/*-
* Copyright (c) 2013 Chris Torek <torek @ torek net>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/uio.h>
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>
#include <pthread_np.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "virtio.h"
/*
* Functions for dealing with generalized "virtual devices" as
* defined by <https://www.google.com/#output=search&q=virtio+spec>
*/
/*
* In case we decide to relax the "virtio softc comes at the
* front of virtio-based device softc" constraint, let's use
* this to convert.
*/
#define DEV_SOFTC(vs) ((void *)(vs))
/*
* Link a virtio_softc to its constants, the device softc, and
* the PCI emulation.
*/
void
vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
void *dev_softc, struct pci_devinst *pi,
struct vqueue_info *queues)
{
int i;
/* vs and dev_softc addresses must match */
assert((void *)vs == dev_softc);
vs->vs_vc = vc;
vs->vs_pi = pi;
pi->pi_arg = vs;
vs->vs_queues = queues;
for (i = 0; i < vc->vc_nvq; i++) {
queues[i].vq_vs = vs;
queues[i].vq_num = i;
}
}
/*
* Reset device (device-wide). This erases all queues, i.e.,
* all the queues become invalid (though we don't wipe out the
* internal pointers, we just clear the VQ_ALLOC flag).
*
* It resets negotiated features to "none".
*
* If MSI-X is enabled, this also resets all the vectors to NO_VECTOR.
*/
void
vi_reset_dev(struct virtio_softc *vs)
{
struct vqueue_info *vq;
int i, nvq;
if (vs->vs_mtx)
assert(pthread_mutex_isowned_np(vs->vs_mtx));
nvq = vs->vs_vc->vc_nvq;
for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
vq->vq_flags = 0;
vq->vq_last_avail = 0;
vq->vq_save_used = 0;
vq->vq_pfn = 0;
vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
}
vs->vs_negotiated_caps = 0;
vs->vs_curq = 0;
/* vs->vs_status = 0; -- redundant */
if (vs->vs_isr)
pci_lintr_deassert(vs->vs_pi);
vs->vs_isr = 0;
vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
}
/*
* Set I/O BAR (usually 0) to map PCI config registers.
*/
void
vi_set_io_bar(struct virtio_softc *vs, int barnum)
{
size_t size;
/*
* ??? should we use CFG0 if MSI-X is disabled?
* Existing code did not...
*/
size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize;
pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size);
}
/*
* Initialize MSI-X vector capabilities if we're to use MSI-X,
* or MSI capabilities if not.
*
* We assume we want one MSI-X vector per queue, here, plus one
* for the config vec.
*/
int
vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
{
int nvec;
if (use_msix) {
vs->vs_flags |= VIRTIO_USE_MSIX;
VS_LOCK(vs);
vi_reset_dev(vs); /* set all vectors to NO_VECTOR */
VS_UNLOCK(vs);
nvec = vs->vs_vc->vc_nvq + 1;
if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum))
return (1);
} else
vs->vs_flags &= ~VIRTIO_USE_MSIX;
/* Only 1 MSI vector for bhyve */
pci_emul_add_msicap(vs->vs_pi, 1);
/* Legacy interrupts are mandatory for virtio devices */
pci_lintr_request(vs->vs_pi);
return (0);
}
/*
* Initialize the currently-selected virtio queue (vs->vs_curq).
* The guest just gave us a page frame number, from which we can
* calculate the addresses of the queue.
*/
void
vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
{
struct vqueue_info *vq;
uint64_t phys;
size_t size;
char *base;
vq = &vs->vs_queues[vs->vs_curq];
vq->vq_pfn = pfn;
phys = (uint64_t)pfn << VRING_PFN;
size = vring_size(vq->vq_qsize);
base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size);
/* First page(s) are descriptors... */
vq->vq_desc = (struct virtio_desc *)base;
base += vq->vq_qsize * sizeof(struct virtio_desc);
/* ... immediately followed by "avail" ring (entirely uint16_t's) */
vq->vq_avail = (struct vring_avail *)base;
base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
/* Then it's rounded up to the next page... */
base = (char *)roundup2((uintptr_t)base, VRING_ALIGN);
/* ... and the last page(s) are the used ring. */
vq->vq_used = (struct vring_used *)base;
/* Mark queue as allocated, and start at 0 when we use it. */
vq->vq_flags = VQ_ALLOC;
vq->vq_last_avail = 0;
vq->vq_save_used = 0;
}
/*
* Helper inline for vq_getchain(): record the i'th "real"
* descriptor.
*/
static inline void
_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx,
struct iovec *iov, int n_iov, uint16_t *flags) {
if (i >= n_iov)
return;
iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len);
iov[i].iov_len = vd->vd_len;
if (flags != NULL)
flags[i] = vd->vd_flags;
}
#define VQ_MAX_DESCRIPTORS 512 /* see below */
/*
* Examine the chain of descriptors starting at the "next one" to
* make sure that they describe a sensible request. If so, return
* the number of "real" descriptors that would be needed/used in
* acting on this request. This may be smaller than the number of
* available descriptors, e.g., if there are two available but
* they are two separate requests, this just returns 1. Or, it
* may be larger: if there are indirect descriptors involved,
* there may only be one descriptor available but it may be an
* indirect pointing to eight more. We return 8 in this case,
* i.e., we do not count the indirect descriptors, only the "real"
* ones.
*
* Basically, this vets the vd_flags and vd_next field of each
* descriptor and tells you how many are involved. Since some may
* be indirect, this also needs the vmctx (in the pci_devinst
* at vs->vs_pi) so that it can find indirect descriptors.
*
* As we process each descriptor, we copy and adjust it (guest to
* host address wise, also using the vmtctx) into the given iov[]
* array (of the given size). If the array overflows, we stop
* placing values into the array but keep processing descriptors,
* up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
* So you, the caller, must not assume that iov[] is as big as the
* return value (you can process the same thing twice to allocate
* a larger iov array if needed, or supply a zero length to find
* out how much space is needed).
*
* If you want to verify the WRITE flag on each descriptor, pass a
* non-NULL "flags" pointer to an array of "uint16_t" of the same size
* as n_iov and we'll copy each vd_flags field after unwinding any
* indirects.
*
* If some descriptor(s) are invalid, this prints a diagnostic message
* and returns -1. If no descriptors are ready now it simply returns 0.
*
* You are assumed to have done a vq_ring_ready() if needed (note
* that vq_has_descs() does one).
*/
int
vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
struct iovec *iov, int n_iov, uint16_t *flags)
{
int i;
u_int ndesc, n_indir;
u_int idx, next;
volatile struct virtio_desc *vdir, *vindir, *vp;
struct vmctx *ctx;
struct virtio_softc *vs;
const char *name;
vs = vq->vq_vs;
name = vs->vs_vc->vc_name;
/*
* Note: it's the responsibility of the guest not to
* update vq->vq_avail->va_idx until all of the descriptors
* the guest has written are valid (including all their
* vd_next fields and vd_flags).
*
* Compute (last_avail - va_idx) in integers mod 2**16. This is
* the number of descriptors the device has made available
* since the last time we updated vq->vq_last_avail.
*
* We just need to do the subtraction as an unsigned int,
* then trim off excess bits.
*/
idx = vq->vq_last_avail;
ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx);
if (ndesc == 0)
return (0);
if (ndesc > vq->vq_qsize) {
/* XXX need better way to diagnose issues */
fprintf(stderr,
"%s: ndesc (%u) out of range, driver confused?\r\n",
name, (u_int)ndesc);
return (-1);
}
/*
* Now count/parse "involved" descriptors starting from
* the head of the chain.
*
* To prevent loops, we could be more complicated and
* check whether we're re-visiting a previously visited
* index, but we just abort if the count gets excessive.
*/
ctx = vs->vs_pi->pi_vmctx;
*pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)];
vq->vq_last_avail++;
for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
if (next >= vq->vq_qsize) {
fprintf(stderr,
"%s: descriptor index %u out of range, "
"driver confused?\r\n",
name, next);
return (-1);
}
vdir = &vq->vq_desc[next];
if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
_vq_record(i, vdir, ctx, iov, n_iov, flags);
i++;
} else if ((vs->vs_vc->vc_hv_caps &
VIRTIO_RING_F_INDIRECT_DESC) == 0) {
fprintf(stderr,
"%s: descriptor has forbidden INDIRECT flag, "
"driver confused?\r\n",
name);
return (-1);
} else {
n_indir = vdir->vd_len / 16;
if ((vdir->vd_len & 0xf) || n_indir == 0) {
fprintf(stderr,
"%s: invalid indir len 0x%x, "
"driver confused?\r\n",
name, (u_int)vdir->vd_len);
return (-1);
}
vindir = paddr_guest2host(ctx,
vdir->vd_addr, vdir->vd_len);
/*
* Indirects start at the 0th, then follow
* their own embedded "next"s until those run
* out. Each one's indirect flag must be off
* (we don't really have to check, could just
* ignore errors...).
*/
next = 0;
for (;;) {
vp = &vindir[next];
if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
fprintf(stderr,
"%s: indirect desc has INDIR flag,"
" driver confused?\r\n",
name);
return (-1);
}
_vq_record(i, vp, ctx, iov, n_iov, flags);
if (++i > VQ_MAX_DESCRIPTORS)
goto loopy;
if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
break;
next = vp->vd_next;
if (next >= n_indir) {
fprintf(stderr,
"%s: invalid next %u > %u, "
"driver confused?\r\n",
name, (u_int)next, n_indir);
return (-1);
}
}
}
if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0)
return (i);
}
loopy:
fprintf(stderr,
"%s: descriptor loop? count > %d - driver confused?\r\n",
name, i);
return (-1);
}
/*
* Return the currently-first request chain back to the available queue.
*
* (This chain is the one you handled when you called vq_getchain()
* and used its positive return value.)
*/
void
vq_retchain(struct vqueue_info *vq)
{
vq->vq_last_avail--;
}
/*
* Return specified request chain to the guest, setting its I/O length
* to the provided value.
*
* (This chain is the one you handled when you called vq_getchain()
* and used its positive return value.)
*/
void
vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
{
uint16_t uidx, mask;
volatile struct vring_used *vuh;
volatile struct virtio_used *vue;
/*
* Notes:
* - mask is N-1 where N is a power of 2 so computes x % N
* - vuh points to the "used" data shared with guest
* - vue points to the "used" ring entry we want to update
* - head is the same value we compute in vq_iovecs().
*
* (I apologize for the two fields named vu_idx; the
* virtio spec calls the one that vue points to, "id"...)
*/
mask = vq->vq_qsize - 1;
vuh = vq->vq_used;
uidx = vuh->vu_idx;
vue = &vuh->vu_ring[uidx++ & mask];
vue->vu_idx = idx;
vue->vu_tlen = iolen;
vuh->vu_idx = uidx;
}
/*
* Driver has finished processing "available" chains and calling
* vq_relchain on each one. If driver used all the available
* chains, used_all should be set.
*
* If the "used" index moved we may need to inform the guest, i.e.,
* deliver an interrupt. Even if the used index did NOT move we
* may need to deliver an interrupt, if the avail ring is empty and
* we are supposed to interrupt on empty.
*
* Note that used_all_avail is provided by the caller because it's
* a snapshot of the ring state when he decided to finish interrupt
* processing -- it's possible that descriptors became available after
* that point. (It's also typically a constant 1/True as well.)
*/
void
vq_endchains(struct vqueue_info *vq, int used_all_avail)
{
struct virtio_softc *vs;
uint16_t event_idx, new_idx, old_idx;
int intr;
/*
* Interrupt generation: if we're using EVENT_IDX,
* interrupt if we've crossed the event threshold.
* Otherwise interrupt is generated if we added "used" entries,
* but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
*
* In any case, though, if NOTIFY_ON_EMPTY is set and the
* entire avail was processed, we need to interrupt always.
*/
vs = vq->vq_vs;
old_idx = vq->vq_save_used;
vq->vq_save_used = new_idx = vq->vq_used->vu_idx;
if (used_all_avail &&
(vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
intr = 1;
else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
event_idx = VQ_USED_EVENT_IDX(vq);
/*
* This calculation is per docs and the kernel
* (see src/sys/dev/virtio/virtio_ring.h).
*/
intr = (uint16_t)(new_idx - event_idx - 1) <
(uint16_t)(new_idx - old_idx);
} else {
intr = new_idx != old_idx &&
!(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT);
}
if (intr)
vq_interrupt(vs, vq);
}
/* Note: these are in sorted order to make for a fast search */
static struct config_reg {
uint16_t cr_offset; /* register offset */
uint8_t cr_size; /* size (bytes) */
uint8_t cr_ro; /* true => reg is read only */
const char *cr_name; /* name of reg */
} config_regs[] = {
{ VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" },
{ VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" },
{ VTCFG_R_PFN, 4, 0, "PFN" },
{ VTCFG_R_QNUM, 2, 1, "QNUM" },
{ VTCFG_R_QSEL, 2, 0, "QSEL" },
{ VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" },
{ VTCFG_R_STATUS, 1, 0, "STATUS" },
{ VTCFG_R_ISR, 1, 0, "ISR" },
{ VTCFG_R_CFGVEC, 2, 0, "CFGVEC" },
{ VTCFG_R_QVEC, 2, 0, "QVEC" },
};
static inline struct config_reg *
vi_find_cr(int offset) {
u_int hi, lo, mid;
struct config_reg *cr;
lo = 0;
hi = sizeof(config_regs) / sizeof(*config_regs) - 1;
while (hi >= lo) {
mid = (hi + lo) >> 1;
cr = &config_regs[mid];
if (cr->cr_offset == offset)
return (cr);
if (cr->cr_offset < offset)
lo = mid + 1;
else
hi = mid - 1;
}
return (NULL);
}
/*
* Handle pci config space reads.
* If it's to the MSI-X info, do that.
* If it's part of the virtio standard stuff, do that.
* Otherwise dispatch to the actual driver.
*/
uint64_t
vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
struct virtio_softc *vs = pi->pi_arg;
struct virtio_consts *vc;
struct config_reg *cr;
uint64_t virtio_config_size, max;
const char *name;
uint32_t newoff;
uint32_t value;
int error;
if (vs->vs_flags & VIRTIO_USE_MSIX) {
if (baridx == pci_msix_table_bar(pi) ||
baridx == pci_msix_pba_bar(pi)) {
return (pci_emul_msix_tread(pi, offset, size));
}
}
/* XXX probably should do something better than just assert() */
assert(baridx == 0);
if (vs->vs_mtx)
pthread_mutex_lock(vs->vs_mtx);
vc = vs->vs_vc;
name = vc->vc_name;
value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff;
if (size != 1 && size != 2 && size != 4)
goto bad;
if (pci_msix_enabled(pi))
virtio_config_size = VTCFG_R_CFG1;
else
virtio_config_size = VTCFG_R_CFG0;
if (offset >= virtio_config_size) {
/*
* Subtract off the standard size (including MSI-X
* registers if enabled) and dispatch to underlying driver.
* If that fails, fall into general code.
*/
newoff = offset - virtio_config_size;
max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
if (newoff + size > max)
goto bad;
error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
if (!error)
goto done;
}
bad:
cr = vi_find_cr(offset);
if (cr == NULL || cr->cr_size != size) {
if (cr != NULL) {
/* offset must be OK, so size must be bad */
fprintf(stderr,
"%s: read from %s: bad size %d\r\n",
name, cr->cr_name, size);
} else {
fprintf(stderr,
"%s: read from bad offset/size %jd/%d\r\n",
name, (uintmax_t)offset, size);
}
goto done;
}
switch (offset) {
case VTCFG_R_HOSTCAP:
value = vc->vc_hv_caps;
break;
case VTCFG_R_GUESTCAP:
value = vs->vs_negotiated_caps;
break;
case VTCFG_R_PFN:
if (vs->vs_curq < vc->vc_nvq)
value = vs->vs_queues[vs->vs_curq].vq_pfn;
break;
case VTCFG_R_QNUM:
value = vs->vs_curq < vc->vc_nvq ?
vs->vs_queues[vs->vs_curq].vq_qsize : 0;
break;
case VTCFG_R_QSEL:
value = vs->vs_curq;
break;
case VTCFG_R_QNOTIFY:
value = 0; /* XXX */
break;
case VTCFG_R_STATUS:
value = vs->vs_status;
break;
case VTCFG_R_ISR:
value = vs->vs_isr;
vs->vs_isr = 0; /* a read clears this flag */
if (value)
pci_lintr_deassert(pi);
break;
case VTCFG_R_CFGVEC:
value = vs->vs_msix_cfg_idx;
break;
case VTCFG_R_QVEC:
value = vs->vs_curq < vc->vc_nvq ?
vs->vs_queues[vs->vs_curq].vq_msix_idx :
VIRTIO_MSI_NO_VECTOR;
break;
}
done:
if (vs->vs_mtx)
pthread_mutex_unlock(vs->vs_mtx);
return (value);
}
/*
* Handle pci config space writes.
* If it's to the MSI-X info, do that.
* If it's part of the virtio standard stuff, do that.
* Otherwise dispatch to the actual driver.
*/
void
vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
struct virtio_softc *vs = pi->pi_arg;
struct vqueue_info *vq;
struct virtio_consts *vc;
struct config_reg *cr;
uint64_t virtio_config_size, max;
const char *name;
uint32_t newoff;
int error;
if (vs->vs_flags & VIRTIO_USE_MSIX) {
if (baridx == pci_msix_table_bar(pi) ||
baridx == pci_msix_pba_bar(pi)) {
pci_emul_msix_twrite(pi, offset, size, value);
return;
}
}
/* XXX probably should do something better than just assert() */
assert(baridx == 0);
if (vs->vs_mtx)
pthread_mutex_lock(vs->vs_mtx);
vc = vs->vs_vc;
name = vc->vc_name;
if (size != 1 && size != 2 && size != 4)
goto bad;
if (pci_msix_enabled(pi))
virtio_config_size = VTCFG_R_CFG1;
else
virtio_config_size = VTCFG_R_CFG0;
if (offset >= virtio_config_size) {
/*
* Subtract off the standard size (including MSI-X
* registers if enabled) and dispatch to underlying driver.
*/
newoff = offset - virtio_config_size;
max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
if (newoff + size > max)
goto bad;
error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
if (!error)
goto done;
}
bad:
cr = vi_find_cr(offset);
if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
if (cr != NULL) {
/* offset must be OK, wrong size and/or reg is R/O */
if (cr->cr_size != size)
fprintf(stderr,
"%s: write to %s: bad size %d\r\n",
name, cr->cr_name, size);
if (cr->cr_ro)
fprintf(stderr,
"%s: write to read-only reg %s\r\n",
name, cr->cr_name);
} else {
fprintf(stderr,
"%s: write to bad offset/size %jd/%d\r\n",
name, (uintmax_t)offset, size);
}
goto done;
}
switch (offset) {
case VTCFG_R_GUESTCAP:
vs->vs_negotiated_caps = value & vc->vc_hv_caps;
if (vc->vc_apply_features)
(*vc->vc_apply_features)(DEV_SOFTC(vs),
vs->vs_negotiated_caps);
break;
case VTCFG_R_PFN:
if (vs->vs_curq >= vc->vc_nvq)
goto bad_qindex;
vi_vq_init(vs, value);
break;
case VTCFG_R_QSEL:
/*
* Note that the guest is allowed to select an
* invalid queue; we just need to return a QNUM
* of 0 while the bad queue is selected.
*/
vs->vs_curq = value;
break;
case VTCFG_R_QNOTIFY:
if (value >= vc->vc_nvq) {
fprintf(stderr, "%s: queue %d notify out of range\r\n",
name, (int)value);
goto done;
}
vq = &vs->vs_queues[value];
if (vq->vq_notify)
(*vq->vq_notify)(DEV_SOFTC(vs), vq);
else if (vc->vc_qnotify)
(*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
else
fprintf(stderr,
"%s: qnotify queue %d: missing vq/vc notify\r\n",
name, (int)value);
break;
case VTCFG_R_STATUS:
vs->vs_status = value;
if (value == 0)
(*vc->vc_reset)(DEV_SOFTC(vs));
break;
case VTCFG_R_CFGVEC:
vs->vs_msix_cfg_idx = value;
break;
case VTCFG_R_QVEC:
if (vs->vs_curq >= vc->vc_nvq)
goto bad_qindex;
vq = &vs->vs_queues[vs->vs_curq];
vq->vq_msix_idx = value;
break;
}
goto done;
bad_qindex:
fprintf(stderr,
"%s: write config reg %s: curq %d >= max %d\r\n",
name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
done:
if (vs->vs_mtx)
pthread_mutex_unlock(vs->vs_mtx);
}

464
virtio.h Normal file
View File

@ -0,0 +1,464 @@
/*-
* Copyright (c) 2013 Chris Torek <torek @ torek net>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VIRTIO_H_
#define _VIRTIO_H_
/*
* These are derived from several virtio specifications.
*
* Some useful links:
* https://github.com/rustyrussell/virtio-spec
* http://people.redhat.com/pbonzini/virtio-spec.pdf
*/
/*
* A virtual device has zero or more "virtual queues" (virtqueue).
* Each virtqueue uses at least two 4096-byte pages, laid out thus:
*
* +-----------------------------------------------+
* | "desc": <N> descriptors, 16 bytes each |
* | ----------------------------------------- |
* | "avail": 2 uint16; <N> uint16; 1 uint16 |
* | ----------------------------------------- |
* | pad to 4k boundary |
* +-----------------------------------------------+
* | "used": 2 x uint16; <N> elems; 1 uint16 |
* | ----------------------------------------- |
* | pad to 4k boundary |
* +-----------------------------------------------+
*
* The number <N> that appears here is always a power of two and is
* limited to no more than 32768 (as it must fit in a 16-bit field).
* If <N> is sufficiently large, the above will occupy more than
* two pages. In any case, all pages must be physically contiguous
* within the guest's physical address space.
*
* The <N> 16-byte "desc" descriptors consist of a 64-bit guest
* physical address <addr>, a 32-bit length <len>, a 16-bit
* <flags>, and a 16-bit <next> field (all in guest byte order).
*
* There are three flags that may be set :
* NEXT descriptor is chained, so use its "next" field
* WRITE descriptor is for host to write into guest RAM
* (else host is to read from guest RAM)
* INDIRECT descriptor address field is (guest physical)
* address of a linear array of descriptors
*
* Unless INDIRECT is set, <len> is the number of bytes that may
* be read/written from guest physical address <addr>. If
* INDIRECT is set, WRITE is ignored and <len> provides the length
* of the indirect descriptors (and <len> must be a multiple of
* 16). Note that NEXT may still be set in the main descriptor
* pointing to the indirect, and should be set in each indirect
* descriptor that uses the next descriptor (these should generally
* be numbered sequentially). However, INDIRECT must not be set
* in the indirect descriptors. Upon reaching an indirect descriptor
* without a NEXT bit, control returns to the direct descriptors.
*
* Except inside an indirect, each <next> value must be in the
* range [0 .. N) (i.e., the half-open interval). (Inside an
* indirect, each <next> must be in the range [0 .. <len>/16).)
*
* The "avail" data structures reside in the same pages as the
* "desc" structures since both together are used by the device to
* pass information to the hypervisor's virtual driver. These
* begin with a 16-bit <flags> field and 16-bit index <idx>, then
* have <N> 16-bit <ring> values, followed by one final 16-bit
* field <used_event>. The <N> <ring> entries are simply indices
* indices into the descriptor ring (and thus must meet the same
* constraints as each <next> value). However, <idx> is counted
* up from 0 (initially) and simply wraps around after 65535; it
* is taken mod <N> to find the next available entry.
*
* The "used" ring occupies a separate page or pages, and contains
* values written from the virtual driver back to the guest OS.
* This begins with a 16-bit <flags> and 16-bit <idx>, then there
* are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
* The <N> "vring_used" elements consist of a 32-bit <id> and a
* 32-bit <len> (vu_tlen below). The <id> is simply the index of
* the head of a descriptor chain the guest made available
* earlier, and the <len> is the number of bytes actually written,
* e.g., in the case of a network driver that provided a large
* receive buffer but received only a small amount of data.
*
* The two event fields, <used_event> and <avail_event>, in the
* avail and used rings (respectively -- note the reversal!), are
* always provided, but are used only if the virtual device
* negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
* negotiation. Similarly, both rings provide a flag --
* VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
* their <flags> field, indicating that the guest does not need an
* interrupt, or that the hypervisor driver does not need a
* notify, when descriptors are added to the corresponding ring.
* (These are provided only for interrupt optimization and need
* not be implemented.)
*/
#define VRING_ALIGN 4096
#define VRING_DESC_F_NEXT (1 << 0)
#define VRING_DESC_F_WRITE (1 << 1)
#define VRING_DESC_F_INDIRECT (1 << 2)
struct virtio_desc { /* AKA vring_desc */
uint64_t vd_addr; /* guest physical address */
uint32_t vd_len; /* length of scatter/gather seg */
uint16_t vd_flags; /* VRING_F_DESC_* */
uint16_t vd_next; /* next desc if F_NEXT */
} __packed;
struct virtio_used { /* AKA vring_used_elem */
uint32_t vu_idx; /* head of used descriptor chain */
uint32_t vu_tlen; /* length written-to */
} __packed;
#define VRING_AVAIL_F_NO_INTERRUPT 1
struct vring_avail {
uint16_t va_flags; /* VRING_AVAIL_F_* */
uint16_t va_idx; /* counts to 65535, then cycles */
uint16_t va_ring[]; /* size N, reported in QNUM value */
/* uint16_t va_used_event; -- after N ring entries */
} __packed;
#define VRING_USED_F_NO_NOTIFY 1
struct vring_used {
uint16_t vu_flags; /* VRING_USED_F_* */
uint16_t vu_idx; /* counts to 65535, then cycles */
struct virtio_used vu_ring[]; /* size N */
/* uint16_t vu_avail_event; -- after N ring entries */
} __packed;
/*
* The address of any given virtual queue is determined by a single
* Page Frame Number register. The guest writes the PFN into the
* PCI config space. However, a device that has two or more
* virtqueues can have a different PFN, and size, for each queue.
* The number of queues is determinable via the PCI config space
* VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means
* queue #0, 1 means queue#1, etc. Once a queue is selected, the
* remaining PFN and QNUM registers refer to that queue.
*
* QNUM is a read-only register containing a nonzero power of two
* that indicates the (hypervisor's) queue size. Or, if reading it
* produces zero, the hypervisor does not have a corresponding
* queue. (The number of possible queues depends on the virtual
* device. The block device has just one; the network device
* provides either two -- 0 = receive, 1 = transmit -- or three,
* with 2 = control.)
*
* PFN is a read/write register giving the physical page address of
* the virtqueue in guest memory (the guest must allocate enough space
* based on the hypervisor's provided QNUM).
*
* QNOTIFY is effectively write-only: when the guest writes a queue
* number to the register, the hypervisor should scan the specified
* virtqueue. (Reading QNOTIFY currently always gets 0).
*/
/*
* PFN register shift amount
*/
#define VRING_PFN 12
/*
* Virtio device types
*
* XXX Should really be merged with <dev/virtio/virtio.h> defines
*/
#define VIRTIO_TYPE_NET 1
#define VIRTIO_TYPE_BLOCK 2
#define VIRTIO_TYPE_CONSOLE 3
#define VIRTIO_TYPE_ENTROPY 4
#define VIRTIO_TYPE_BALLOON 5
#define VIRTIO_TYPE_IOMEMORY 6
#define VIRTIO_TYPE_RPMSG 7
#define VIRTIO_TYPE_SCSI 8
#define VIRTIO_TYPE_9P 9
/* experimental IDs start at 65535 and work down */
/*
* PCI vendor/device IDs
*/
#define VIRTIO_VENDOR 0x1AF4
#define VIRTIO_DEV_NET 0x1000
#define VIRTIO_DEV_BLOCK 0x1001
#define VIRTIO_DEV_RANDOM 0x1002
/*
* PCI config space constants.
*
* If MSI-X is enabled, the ISR register is generally not used,
* and the configuration vector and queue vector appear at offsets
* 20 and 22 with the remaining configuration registers at 24.
* If MSI-X is not enabled, those two registers disappear and
* the remaining configuration registers start at offset 20.
*/
#define VTCFG_R_HOSTCAP 0
#define VTCFG_R_GUESTCAP 4
#define VTCFG_R_PFN 8
#define VTCFG_R_QNUM 12
#define VTCFG_R_QSEL 14
#define VTCFG_R_QNOTIFY 16
#define VTCFG_R_STATUS 18
#define VTCFG_R_ISR 19
#define VTCFG_R_CFGVEC 20
#define VTCFG_R_QVEC 22
#define VTCFG_R_CFG0 20 /* No MSI-X */
#define VTCFG_R_CFG1 24 /* With MSI-X */
#define VTCFG_R_MSIX 20
/*
* Bits in VTCFG_R_STATUS. Guests need not actually set any of these,
* but a guest writing 0 to this register means "please reset".
*/
#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */
#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */
#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */
#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */
/*
* Bits in VTCFG_R_ISR. These apply only if not using MSI-X.
*
* (We don't [yet?] ever use CONF_CHANGED.)
*/
#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */
#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */
#define VIRTIO_MSI_NO_VECTOR 0xFFFF
/*
* Feature flags.
* Note: bits 0 through 23 are reserved to each device type.
*/
#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24)
#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28)
#define VIRTIO_RING_F_EVENT_IDX (1 << 29)
/* From section 2.3, "Virtqueue Configuration", of the virtio specification */
static inline size_t
vring_size(u_int qsz)
{
size_t size;
/* constant 3 below = va_flags, va_idx, va_used_event */
size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz);
size = roundup2(size, VRING_ALIGN);
/* constant 3 below = vu_flags, vu_idx, vu_avail_event */
size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz;
size = roundup2(size, VRING_ALIGN);
return (size);
}
struct vmctx;
struct pci_devinst;
struct vqueue_info;
/*
* A virtual device, with some number (possibly 0) of virtual
* queues and some size (possibly 0) of configuration-space
* registers private to the device. The virtio_softc should come
* at the front of each "derived class", so that a pointer to the
* virtio_softc is also a pointer to the more specific, derived-
* from-virtio driver's softc.
*
* Note: inside each hypervisor virtio driver, changes to these
* data structures must be locked against other threads, if any.
* Except for PCI config space register read/write, we assume each
* driver does the required locking, but we need a pointer to the
* lock (if there is one) for PCI config space read/write ops.
*
* When the guest reads or writes the device's config space, the
* generic layer checks for operations on the special registers
* described above. If the offset of the register(s) being read
* or written is past the CFG area (CFG0 or CFG1), the request is
* passed on to the virtual device, after subtracting off the
* generic-layer size. (So, drivers can just use the offset as
* an offset into "struct config", for instance.)
*
* (The virtio layer also makes sure that the read or write is to/
* from a "good" config offset, hence vc_cfgsize, and on BAR #0.
* However, the driver must verify the read or write size and offset
* and that no one is writing a readonly register.)
*
* The BROKED flag ("this thing done gone and broked") is for future
* use.
*/
#define VIRTIO_USE_MSIX 0x01
#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */
#define VIRTIO_BROKED 0x08 /* ??? */
struct virtio_softc {
struct virtio_consts *vs_vc; /* constants (see below) */
int vs_flags; /* VIRTIO_* flags from above */
pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */
struct pci_devinst *vs_pi; /* PCI device instance */
uint32_t vs_negotiated_caps; /* negotiated capabilities */
struct vqueue_info *vs_queues; /* one per vc_nvq */
int vs_curq; /* current queue */
uint8_t vs_status; /* value from last status write */
uint8_t vs_isr; /* ISR flags, if not MSI-X */
uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */
};
#define VS_LOCK(vs) \
do { \
if (vs->vs_mtx) \
pthread_mutex_lock(vs->vs_mtx); \
} while (0)
#define VS_UNLOCK(vs) \
do { \
if (vs->vs_mtx) \
pthread_mutex_unlock(vs->vs_mtx); \
} while (0)
struct virtio_consts {
const char *vc_name; /* name of driver (for diagnostics) */
int vc_nvq; /* number of virtual queues */
size_t vc_cfgsize; /* size of dev-specific config regs */
void (*vc_reset)(void *); /* called on virtual device reset */
void (*vc_qnotify)(void *, struct vqueue_info *);
/* called on QNOTIFY if no VQ notify */
int (*vc_cfgread)(void *, int, int, uint32_t *);
/* called to read config regs */
int (*vc_cfgwrite)(void *, int, int, uint32_t);
/* called to write config regs */
void (*vc_apply_features)(void *, uint64_t);
/* called to apply negotiated features */
uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
};
/*
* Data structure allocated (statically) per virtual queue.
*
* Drivers may change vq_qsize after a reset. When the guest OS
* requests a device reset, the hypervisor first calls
* vs->vs_vc->vc_reset(); then the data structure below is
* reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
*
* The remaining fields should only be fussed-with by the generic
* code.
*
* Note: the addresses of vq_desc, vq_avail, and vq_used are all
* computable from each other, but it's a lot simpler if we just
* keep a pointer to each one. The event indices are similarly
* (but more easily) computable, and this time we'll compute them:
* they're just XX_ring[N].
*/
#define VQ_ALLOC 0x01 /* set once we have a pfn */
#define VQ_BROKED 0x02 /* ??? */
struct vqueue_info {
uint16_t vq_qsize; /* size of this queue (a power of 2) */
void (*vq_notify)(void *, struct vqueue_info *);
/* called instead of vc_notify, if not NULL */
struct virtio_softc *vq_vs; /* backpointer to softc */
uint16_t vq_num; /* we're the num'th queue in the softc */
uint16_t vq_flags; /* flags (see above) */
uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */
uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */
uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */
uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */
volatile struct virtio_desc *vq_desc; /* descriptor array */
volatile struct vring_avail *vq_avail; /* the "avail" ring */
volatile struct vring_used *vq_used; /* the "used" ring */
};
/* as noted above, these are sort of backwards, name-wise */
#define VQ_AVAIL_EVENT_IDX(vq) \
(*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize])
#define VQ_USED_EVENT_IDX(vq) \
((vq)->vq_avail->va_ring[(vq)->vq_qsize])
/*
* Is this ring ready for I/O?
*/
static inline int
vq_ring_ready(struct vqueue_info *vq)
{
return (vq->vq_flags & VQ_ALLOC);
}
/*
* Are there "available" descriptors? (This does not count
* how many, just returns True if there are some.)
*/
static inline int
vq_has_descs(struct vqueue_info *vq)
{
return (vq_ring_ready(vq) && vq->vq_last_avail !=
vq->vq_avail->va_idx);
}
/*
* Deliver an interrupt to guest on the given virtual queue
* (if possible, or a generic MSI interrupt if not using MSI-X).
*/
static inline void
vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq)
{
if (pci_msix_enabled(vs->vs_pi))
pci_generate_msix(vs->vs_pi, vq->vq_msix_idx);
else {
VS_LOCK(vs);
vs->vs_isr |= VTCFG_ISR_QUEUES;
pci_generate_msi(vs->vs_pi, 0);
pci_lintr_assert(vs->vs_pi);
VS_UNLOCK(vs);
}
}
struct iovec;
void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
void *dev_softc, struct pci_devinst *pi,
struct vqueue_info *queues);
int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
void vi_reset_dev(struct virtio_softc *);
void vi_set_io_bar(struct virtio_softc *, int);
int vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
struct iovec *iov, int n_iov, uint16_t *flags);
void vq_retchain(struct vqueue_info *vq);
void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
void vq_endchains(struct vqueue_info *vq, int used_all_avail);
uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size);
void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value);
#endif /* _VIRTIO_H_ */

230
xmsr.c Normal file
View File

@ -0,0 +1,230 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/cpufunc.h>
#include <machine/vmm.h>
#include <machine/specialreg.h>
#include <vmmapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "xmsr.h"
static int cpu_vendor_intel, cpu_vendor_amd;
int
emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val)
{
if (cpu_vendor_intel) {
switch (num) {
case 0xd04: /* Sandy Bridge uncore PMCs */
case 0xc24:
return (0);
case MSR_BIOS_UPDT_TRIG:
return (0);
case MSR_BIOS_SIGN:
return (0);
default:
break;
}
} else if (cpu_vendor_amd) {
switch (num) {
case MSR_HWCR:
/*
* Ignore writes to hardware configuration MSR.
*/
return (0);
case MSR_NB_CFG1:
case MSR_IC_CFG:
return (0); /* Ignore writes */
case MSR_PERFEVSEL0:
case MSR_PERFEVSEL1:
case MSR_PERFEVSEL2:
case MSR_PERFEVSEL3:
/* Ignore writes to the PerfEvtSel MSRs */
return (0);
case MSR_K7_PERFCTR0:
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
case MSR_K7_PERFCTR3:
/* Ignore writes to the PerfCtr MSRs */
return (0);
case MSR_P_STATE_CONTROL:
/* Ignore write to change the P-state */
return (0);
default:
break;
}
}
return (-1);
}
int
emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val)
{
int error = 0;
if (cpu_vendor_intel) {
switch (num) {
case MSR_BIOS_SIGN:
case MSR_IA32_PLATFORM_ID:
case MSR_PKG_ENERGY_STATUS:
case MSR_PP0_ENERGY_STATUS:
case MSR_PP1_ENERGY_STATUS:
case MSR_DRAM_ENERGY_STATUS:
*val = 0;
break;
case MSR_RAPL_POWER_UNIT:
/*
* Use the default value documented in section
* "RAPL Interfaces" in Intel SDM vol3.
*/
*val = 0x000a1003;
break;
default:
error = -1;
break;
}
} else if (cpu_vendor_amd) {
switch (num) {
case MSR_BIOS_SIGN:
*val = 0;
break;
case MSR_HWCR:
/*
* Bios and Kernel Developer's Guides for AMD Families
* 12H, 14H, 15H and 16H.
*/
*val = 0x01000010; /* Reset value */
*val |= 1 << 9; /* MONITOR/MWAIT disable */
break;
case MSR_NB_CFG1:
case MSR_IC_CFG:
/*
* The reset value is processor family dependent so
* just return 0.
*/
*val = 0;
break;
case MSR_PERFEVSEL0:
case MSR_PERFEVSEL1:
case MSR_PERFEVSEL2:
case MSR_PERFEVSEL3:
/*
* PerfEvtSel MSRs are not properly virtualized so just
* return zero.
*/
*val = 0;
break;
case MSR_K7_PERFCTR0:
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
case MSR_K7_PERFCTR3:
/*
* PerfCtr MSRs are not properly virtualized so just
* return zero.
*/
*val = 0;
break;
case MSR_SMM_ADDR:
case MSR_SMM_MASK:
/*
* Return the reset value defined in the AMD Bios and
* Kernel Developer's Guide.
*/
*val = 0;
break;
case MSR_P_STATE_LIMIT:
case MSR_P_STATE_CONTROL:
case MSR_P_STATE_STATUS:
case MSR_P_STATE_CONFIG(0): /* P0 configuration */
*val = 0;
break;
/*
* OpenBSD guests test bit 0 of this MSR to detect if the
* workaround for erratum 721 is already applied.
* http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf
*/
case 0xC0011029:
*val = 1;
break;
default:
error = -1;
break;
}
} else {
error = -1;
}
return (error);
}
int
init_msr(void)
{
int error;
u_int regs[4];
char cpu_vendor[13];
do_cpuid(0, regs);
((u_int *)&cpu_vendor)[0] = regs[1];
((u_int *)&cpu_vendor)[1] = regs[3];
((u_int *)&cpu_vendor)[2] = regs[2];
cpu_vendor[12] = '\0';
error = 0;
if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
cpu_vendor_amd = 1;
} else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
cpu_vendor_intel = 1;
} else {
fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor);
error = -1;
}
return (error);
}

36
xmsr.h Normal file
View File

@ -0,0 +1,36 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _XMSR_H_
#define _XMSR_H_
int init_msr(void);
int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val);
#endif