freebsd-dev/sys/amd64/vmm/vmm.c

2718 lines
62 KiB
C
Raw Normal View History

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/smp.h>
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include "vmm_ioport.h"
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"
#include "io/ppt.h"
#include "io/iommu.h"
struct vlapic;
/*
* Initialization:
* (a) allocated when vcpu is created
* (i) initialized when vcpu is created and when it is reinitialized
* (o) initialized the first time the vcpu is created
* (x) initialized before use
*/
struct vcpu {
struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
enum vcpu_state state; /* (o) vcpu state */
int hostcpu; /* (o) vcpu's host cpu */
int reqidle; /* (i) request vcpu to idle */
struct vlapic *vlapic; /* (i) APIC device model */
enum x2apic_state x2apic_state; /* (i) APIC mode */
uint64_t exitintinfo; /* (i) events pending at VM exit */
int nmi_pending; /* (i) NMI pending */
int extint_pending; /* (i) INTR pending */
int exception_pending; /* (i) exception pending */
int exc_vector; /* (x) exception collateral */
int exc_errcode_valid;
uint32_t exc_errcode;
struct savefpu *guestfpu; /* (a,i) guest fpu state */
uint64_t guest_xcr0; /* (i) guest %xcr0 register */
void *stats; /* (a,i) statistics */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
uint64_t nextrip; /* (x) next instruction to execute */
};
#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
struct mem_seg {
size_t len;
bool sysmem;
struct vm_object *object;
};
#define VM_MAX_MEMSEGS 3
struct mem_map {
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vm_paddr_t gpa;
size_t len;
vm_ooffset_t segoff;
int segid;
int prot;
int flags;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
};
#define VM_MAX_MEMMAPS 4
/*
* Initialization:
* (o) initialized the first time the VM is created
* (i) initialized when VM is created and when it is reinitialized
* (x) initialized before use
*/
struct vm {
void *cookie; /* (i) cpu-specific data */
void *iommu; /* (x) iommu-specific data */
struct vhpet *vhpet; /* (i) virtual HPET */
struct vioapic *vioapic; /* (i) virtual ioapic */
struct vatpic *vatpic; /* (i) virtual atpic */
struct vatpit *vatpit; /* (i) virtual atpit */
struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
struct vrtc *vrtc; /* (o) virtual RTC */
volatile cpuset_t active_cpus; /* (i) active vcpus */
volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */
int suspend; /* (i) stop VM execution */
volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */
cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */
void *rendezvous_arg; /* (x) rendezvous func/arg */
vm_rendezvous_func_t rendezvous_func;
struct mtx rendezvous_mtx; /* (o) rendezvous lock */
struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
struct vmspace *vmspace; /* (o) guest's address space */
char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
/* The following describe the vm cpu topology */
uint16_t sockets; /* (o) num of sockets */
uint16_t cores; /* (o) num of cores/socket */
uint16_t threads; /* (o) num of threads/core */
uint16_t maxcpus; /* (o) max pluggable cpus */
};
static int vmm_initialized;
static struct vmm_ops *ops;
2014-01-09 03:25:54 +00:00
#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define VMSPACE_FREE(vmspace) \
(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define VMGETDESC(vmi, vcpu, num, desc) \
(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMSETDESC(vmi, vcpu, num, desc) \
(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMGETCAP(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETCAP(vmi, vcpu, num, val) \
(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define VLAPIC_INIT(vmi, vcpu) \
(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define VLAPIC_CLEANUP(vmi, vlapic) \
(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
#define fpu_stop_emulating() clts()
SDT_PROVIDER_DEFINE(vmm);
static MALLOC_DEFINE(M_VM, "vm", "vm");
/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
2014-01-09 03:25:54 +00:00
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
/*
* Halt the guest if all vcpus are executing a HLT instruction with
* interrupts disabled.
*/
static int halt_detection_enabled = 1;
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
&halt_detection_enabled, 0,
"Halt VM if all vcpus execute HLT with interrupts disabled");
static int vmm_ipinum;
2014-01-09 03:25:54 +00:00
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
"IPI vector used for vcpu notifications");
static int trace_guest_exceptions;
SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
&trace_guest_exceptions, 0,
"Trap into hypervisor on all guest exceptions and reflect them back");
static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{
switch (state) {
case VCPU_IDLE:
return ("idle");
case VCPU_FROZEN:
return ("frozen");
case VCPU_RUNNING:
return ("running");
case VCPU_SLEEPING:
return ("sleeping");
default:
return ("unknown");
}
}
#endif
static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
struct vcpu *vcpu = &vm->vcpu[i];
VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
if (destroy) {
vmm_stat_free(vcpu->stats);
fpu_save_area_free(vcpu->guestfpu);
}
}
static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
struct vcpu *vcpu;
KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
("vcpu_init: invalid vcpu %d", vcpu_id));
vcpu = &vm->vcpu[vcpu_id];
if (create) {
KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
"initialized", vcpu_id));
vcpu_lock_init(vcpu);
vcpu->state = VCPU_IDLE;
vcpu->hostcpu = NOCPU;
vcpu->guestfpu = fpu_save_area_alloc();
vcpu->stats = vmm_stat_alloc();
}
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
vcpu->reqidle = 0;
vcpu->exitintinfo = 0;
vcpu->nmi_pending = 0;
vcpu->extint_pending = 0;
vcpu->exception_pending = 0;
vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
fpu_save_area_reset(vcpu->guestfpu);
vmm_stat_init(vcpu->stats);
}
int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{
return (trace_guest_exceptions);
}
struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
struct vcpu *vcpu;
if (cpuid < 0 || cpuid >= vm->maxcpus)
panic("vm_exitinfo: invalid cpuid %d", cpuid);
vcpu = &vm->vcpu[cpuid];
return (&vcpu->exitinfo);
}
static void
vmm_resume(void)
{
VMM_RESUME();
}
static int
vmm_init(void)
{
int error;
vmm_host_state_init();
2014-01-09 03:25:54 +00:00
PTI for amd64. The implementation of the Kernel Page Table Isolation (KPTI) for amd64, first version. It provides a workaround for the 'meltdown' vulnerability. PTI is turned off by default for now, enable with the loader tunable vm.pmap.pti=1. The pmap page table is split into kernel-mode table and user-mode table. Kernel-mode table is identical to the non-PTI table, while usermode table is obtained from kernel table by leaving userspace mappings intact, but only leaving the following parts of the kernel mapped: kernel text (but not modules text) PCPU GDT/IDT/user LDT/task structures IST stacks for NMI and doublefault handlers. Kernel switches to user page table before returning to usermode, and restores full kernel page table on the entry. Initial kernel-mode stack for PTI trampoline is allocated in PCPU, it is only 16 qwords. Kernel entry trampoline switches page tables. then the hardware trap frame is copied to the normal kstack, and execution continues. IST stacks are kept mapped and no trampoline is needed for NMI/doublefault, but of course page table switch is performed. On return to usermode, the trampoline is used again, iret frame is copied to the trampoline stack, page tables are switched and iretq is executed. The case of iretq faulting due to the invalid usermode context is tricky, since the frame for fault is appended to the trampoline frame. Besides copying the fault frame and original (corrupted) frame to kstack, the fault frame must be patched to make it look as if the fault occured on the kstack, see the comment in doret_iret detection code in trap(). Currently kernel pages which are mapped during trampoline operation are identical for all pmaps. They are registered using pmap_pti_add_kva(). Besides initial registrations done during boot, LDT and non-common TSS segments are registered if user requested their use. In principle, they can be installed into kernel page table per pmap with some work. Similarly, PCPU can be hidden from userspace mapping using trampoline PCPU page, but again I do not see much benefits besides complexity. PDPE pages for the kernel half of the user page tables are pre-allocated during boot because we need to know pml4 entries which are copied to the top-level paging structure page, in advance on a new pmap creation. I enforce this to avoid iterating over the all existing pmaps if a new PDPE page is needed for PTI kernel mappings. The iteration is a known problematic operation on i386. The need to flush hidden kernel translations on the switch to user mode make global tables (PG_G) meaningless and even harming, so PG_G use is disabled for PTI case. Our existing use of PCID is incompatible with PTI and is automatically disabled if PTI is enabled. PCID can be forced on only for developer's benefit. MCE is known to be broken, it requires IST stack to operate completely correctly even for non-PTI case, and absolutely needs dedicated IST stack because MCE delivery while trampoline did not switched from PTI stack is fatal. The fix is pending. Reviewed by: markj (partially) Tested by: pho (previous version) Discussed with: jeff, jhb Sponsored by: The FreeBSD Foundation MFC after: 2 weeks
2018-01-17 11:44:21 +00:00
vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
&IDTVEC(justreturn));
if (vmm_ipinum < 0)
2014-01-09 03:25:54 +00:00
vmm_ipinum = IPI_AST;
error = vmm_mem_init();
if (error)
return (error);
if (vmm_is_intel())
ops = &vmm_ops_intel;
else if (vmm_is_amd())
ops = &vmm_ops_amd;
else
return (ENXIO);
vmm_resume_p = vmm_resume;
2014-01-09 03:25:54 +00:00
return (VMM_INIT(vmm_ipinum));
}
static int
vmm_handler(module_t mod, int what, void *arg)
{
int error;
switch (what) {
case MOD_LOAD:
vmmdev_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
break;
case MOD_UNLOAD:
error = vmmdev_cleanup();
if (error == 0) {
vmm_resume_p = NULL;
iommu_cleanup();
2014-01-09 03:25:54 +00:00
if (vmm_ipinum != IPI_AST)
lapic_ipi_free(vmm_ipinum);
error = VMM_CLEANUP();
/*
* Something bad happened - prevent new
* VMs from being created
*/
if (error)
vmm_initialized = 0;
}
break;
default:
error = 0;
break;
}
return (error);
}
static moduledata_t vmm_kmod = {
"vmm",
vmm_handler,
NULL
};
/*
* vmm initialization has the following dependencies:
*
* - VT-x initialization requires smp_rendezvous() and therefore must happen
* after SMP is fully functional (after SI_SUB_SMP).
*/
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
static void
vm_init(struct vm *vm, bool create)
{
int i;
vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
vm->iommu = NULL;
vm->vioapic = vioapic_init(vm);
vm->vhpet = vhpet_init(vm);
vm->vatpic = vatpic_init(vm);
vm->vatpit = vatpit_init(vm);
vm->vpmtmr = vpmtmr_init(vm);
if (create)
vm->vrtc = vrtc_init(vm);
CPU_ZERO(&vm->active_cpus);
CPU_ZERO(&vm->debug_cpus);
vm->suspend = 0;
CPU_ZERO(&vm->suspended_cpus);
for (i = 0; i < vm->maxcpus; i++)
vcpu_init(vm, i, create);
}
/*
* The default CPU topology is a single thread per package.
*/
u_int cores_per_package = 1;
u_int threads_per_core = 1;
int
vm_create(const char *name, struct vm **retvm)
{
struct vm *vm;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
struct vmspace *vmspace;
/*
* If vmm.ko could not be successfully initialized then don't attempt
* to create the virtual machine.
*/
if (!vmm_initialized)
return (ENXIO);
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (vmspace == NULL)
return (ENOMEM);
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->vmspace = vmspace;
mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
vm->sockets = 1;
vm->cores = cores_per_package; /* XXX backwards compatibility */
vm->threads = threads_per_core; /* XXX backwards compatibility */
vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
vm_init(vm, true);
*retvm = vm;
return (0);
}
void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
uint16_t *threads, uint16_t *maxcpus)
{
*sockets = vm->sockets;
*cores = vm->cores;
*threads = vm->threads;
*maxcpus = vm->maxcpus;
}
uint16_t
vm_get_maxcpus(struct vm *vm)
{
return (vm->maxcpus);
}
int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
uint16_t threads, uint16_t maxcpus)
{
if (maxcpus != 0)
return (EINVAL); /* XXX remove when supported */
if ((sockets * cores * threads) > vm->maxcpus)
return (EINVAL);
/* XXX need to check sockets * cores * threads == vCPU, how? */
vm->sockets = sockets;
vm->cores = cores;
vm->threads = threads;
vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */
return(0);
}
static void
vm_cleanup(struct vm *vm, bool destroy)
{
struct mem_map *mm;
int i;
ppt_unassign_all(vm);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (vm->iommu != NULL)
iommu_destroy_domain(vm->iommu);
if (destroy)
vrtc_cleanup(vm->vrtc);
else
vrtc_reset(vm->vrtc);
vpmtmr_cleanup(vm->vpmtmr);
vatpit_cleanup(vm->vatpit);
vhpet_cleanup(vm->vhpet);
vatpic_cleanup(vm->vatpic);
vioapic_cleanup(vm->vioapic);
for (i = 0; i < vm->maxcpus; i++)
vcpu_cleanup(vm, i, destroy);
VMCLEANUP(vm->cookie);
/*
* System memory is removed from the guest address space only when
* the VM is destroyed. This is because the mapping remains the same
* across VM reset.
*
* Device memory can be relocated by the guest (e.g. using PCI BARs)
* so those mappings are removed on a VM reset.
*/
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (destroy || !sysmem_mapping(vm, mm))
vm_free_memmap(vm, i);
}
if (destroy) {
for (i = 0; i < VM_MAX_MEMSEGS; i++)
vm_free_memseg(vm, i);
VMSPACE_FREE(vm->vmspace);
vm->vmspace = NULL;
}
}
void
vm_destroy(struct vm *vm)
{
vm_cleanup(vm, true);
free(vm, M_VM);
}
int
vm_reinit(struct vm *vm)
{
int error;
/*
* A virtual machine can be reset only if all vcpus are suspended.
*/
if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
vm_cleanup(vm, false);
vm_init(vm, false);
error = 0;
} else {
error = EBUSY;
}
return (error);
}
const char *
vm_name(struct vm *vm)
{
return (vm->name);
}
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vm_object_t obj;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
return (ENOMEM);
else
return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vmm_mmio_free(vm->vmspace, gpa, len);
return (0);
}
/*
* Return 'true' if 'gpa' is allocated in the guest address space.
*
* This function is called in the context of a running vcpu which acts as
* an implicit lock on 'vm->mem_maps[]'.
*/
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
struct mem_map *mm;
int i;
#ifdef INVARIANTS
int hostcpu, state;
state = vcpu_get_state(vm, vcpuid, &hostcpu);
KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
return (true); /* 'gpa' is sysmem or devmem */
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (ppt_is_mmio(vm, gpa))
return (true); /* 'gpa' is pci passthru mmio */
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
return (false);
}
int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
struct mem_seg *seg;
vm_object_t obj;
if (ident < 0 || ident >= VM_MAX_MEMSEGS)
return (EINVAL);
if (len == 0 || (len & PAGE_MASK))
return (EINVAL);
seg = &vm->mem_segs[ident];
if (seg->object != NULL) {
if (seg->len == len && seg->sysmem == sysmem)
return (EEXIST);
else
return (EINVAL);
}
obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
if (obj == NULL)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
return (ENOMEM);
seg->len = len;
seg->object = obj;
seg->sysmem = sysmem;
return (0);
}
int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
vm_object_t *objptr)
{
struct mem_seg *seg;
if (ident < 0 || ident >= VM_MAX_MEMSEGS)
return (EINVAL);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
seg = &vm->mem_segs[ident];
if (len)
*len = seg->len;
if (sysmem)
*sysmem = seg->sysmem;
if (objptr)
*objptr = seg->object;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
return (0);
}
void
vm_free_memseg(struct vm *vm, int ident)
{
struct mem_seg *seg;
KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
("%s: invalid memseg ident %d", __func__, ident));
seg = &vm->mem_segs[ident];
if (seg->object != NULL) {
vm_object_deallocate(seg->object);
bzero(seg, sizeof(struct mem_seg));
}
}
int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
size_t len, int prot, int flags)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
{
struct mem_seg *seg;
struct mem_map *m, *map;
vm_ooffset_t last;
int i, error;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
return (EINVAL);
if (flags & ~VM_MEMMAP_F_WIRED)
return (EINVAL);
if (segid < 0 || segid >= VM_MAX_MEMSEGS)
return (EINVAL);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
seg = &vm->mem_segs[segid];
if (seg->object == NULL)
return (EINVAL);
last = first + len;
if (first < 0 || first >= last || last > seg->len)
return (EINVAL);
if ((gpa | first | last) & PAGE_MASK)
return (EINVAL);
map = NULL;
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
m = &vm->mem_maps[i];
if (m->len == 0) {
map = m;
break;
}
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (map == NULL)
return (ENOSPC);
error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
len, 0, VMFS_NO_SPACE, prot, prot, 0);
if (error != KERN_SUCCESS)
return (EFAULT);
vm_object_reference(seg->object);
if (flags & VM_MEMMAP_F_WIRED) {
error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
if (error != KERN_SUCCESS) {
vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
Provide separate accounting for user-wired pages. Historically we have not distinguished between kernel wirings and user wirings for accounting purposes. User wirings (via mlock(2)) were subject to a global limit on the number of wired pages, so if large swaths of physical memory were wired by the kernel, as happens with the ZFS ARC among other things, the limit could be exceeded, causing user wirings to fail. The change adds a new counter, v_user_wire_count, which counts the number of virtual pages wired by user processes via mlock(2) and mlockall(2). Only user-wired pages are subject to the system-wide limit which helps provide some safety against deadlocks. In particular, while sources of kernel wirings typically support some backpressure mechanism, there is no way to reclaim user-wired pages shorting of killing the wiring process. The limit is exported as vm.max_user_wired, renamed from vm.max_wired, and changed from u_int to u_long. The choice to count virtual user-wired pages rather than physical pages was done for simplicity. There are mechanisms that can cause user-wired mappings to be destroyed while maintaining a wiring of the backing physical page; these make it difficult to accurately track user wirings at the physical page layer. The change also closes some holes which allowed user wirings to succeed even when they would cause the system limit to be exceeded. For instance, mmap() may now fail with ENOMEM in a process that has called mlockall(MCL_FUTURE) if the new mapping would cause the user wiring limit to be exceeded. Note that bhyve -S is subject to the user wiring limit, which defaults to 1/3 of physical RAM. Users that wish to exceed the limit must tune vm.max_user_wired. Reviewed by: kib, ngie (mlock() test changes) Tested by: pho (earlier version) MFC after: 45 days Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D19908
2019-05-13 16:38:48 +00:00
return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
EFAULT);
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
}
map->gpa = gpa;
map->len = len;
map->segoff = first;
map->segid = segid;
map->prot = prot;
map->flags = flags;
return (0);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
}
int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
{
struct mem_map *mm, *mmnext;
int i;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
mmnext = NULL;
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (mm->len == 0 || mm->gpa < *gpa)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
continue;
if (mmnext == NULL || mm->gpa < mmnext->gpa)
mmnext = mm;
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (mmnext != NULL) {
*gpa = mmnext->gpa;
if (segid)
*segid = mmnext->segid;
if (segoff)
*segoff = mmnext->segoff;
if (len)
*len = mmnext->len;
if (prot)
*prot = mmnext->prot;
if (flags)
*flags = mmnext->flags;
return (0);
} else {
return (ENOENT);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
}
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
static void
vm_free_memmap(struct vm *vm, int ident)
{
struct mem_map *mm;
int error;
mm = &vm->mem_maps[ident];
if (mm->len) {
error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
mm->gpa + mm->len);
KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
__func__, error));
bzero(mm, sizeof(struct mem_map));
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
}
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{
if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
return (true);
else
return (false);
}
vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
struct mem_map *mm;
vm_paddr_t maxaddr;
int i;
maxaddr = 0;
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (sysmem_mapping(vm, mm)) {
if (maxaddr < mm->gpa + mm->len)
maxaddr = mm->gpa + mm->len;
}
}
return (maxaddr);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
}
static void
vm_iommu_modify(struct vm *vm, bool map)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
{
int i, sz;
vm_paddr_t gpa, hpa;
struct mem_map *mm;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
void *vp, *cookie, *host_domain;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
sz = PAGE_SIZE;
host_domain = iommu_host_domain();
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (!sysmem_mapping(vm, mm))
continue;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (map) {
KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
("iommu map found invalid memmap %#lx/%#lx/%#x",
mm->gpa, mm->len, mm->flags));
if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
continue;
mm->flags |= VM_MEMMAP_F_IOMMU;
} else {
if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
continue;
mm->flags &= ~VM_MEMMAP_F_IOMMU;
KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
("iommu unmap found invalid memmap %#lx/%#lx/%#x",
mm->gpa, mm->len, mm->flags));
}
gpa = mm->gpa;
while (gpa < mm->gpa + mm->len) {
vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
&cookie);
KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
vm_name(vm), gpa));
vm_gpa_release(cookie);
hpa = DMAP_TO_PHYS((uintptr_t)vp);
if (map) {
iommu_create_mapping(vm->iommu, gpa, hpa, sz);
iommu_remove_mapping(host_domain, hpa, sz);
} else {
iommu_remove_mapping(vm->iommu, gpa, sz);
iommu_create_mapping(host_domain, hpa, hpa, sz);
}
gpa += PAGE_SIZE;
}
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
/*
* Invalidate the cached translations associated with the domain
* from which pages were removed.
*/
if (map)
iommu_invalidate_tlb(host_domain);
else
iommu_invalidate_tlb(vm->iommu);
}
#define vm_iommu_unmap(vm) vm_iommu_modify((vm), false)
#define vm_iommu_map(vm) vm_iommu_modify((vm), true)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
error = ppt_unassign_device(vm, bus, slot, func);
if (error)
return (error);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (ppt_assigned_devices(vm) == 0)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vm_iommu_unmap(vm);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
return (0);
}
int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
vm_paddr_t maxaddr;
/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
if (ppt_assigned_devices(vm) == 0) {
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
maxaddr = vmm_sysmem_maxaddr(vm);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vm->iommu = iommu_create_domain(maxaddr);
if (vm->iommu == NULL)
return (ENXIO);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vm_iommu_map(vm);
}
error = ppt_assign_device(vm, bus, slot, func);
return (error);
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
void *
vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
void **cookie)
{
int i, count, pageoff;
struct mem_map *mm;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vm_page_t m;
#ifdef INVARIANTS
/*
* All vcpus are frozen by ioctls that modify the memory map
* (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
* guaranteed if at least one vcpu is in the VCPU_FROZEN state.
*/
int state;
KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
__func__, vcpuid));
for (i = 0; i < vm->maxcpus; i++) {
if (vcpuid != -1 && vcpuid != i)
continue;
state = vcpu_get_state(vm, i, NULL);
KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
__func__, state));
}
#endif
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
pageoff = gpa & PAGE_MASK;
if (len > PAGE_SIZE - pageoff)
panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
count = 0;
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
gpa < mm->gpa + mm->len) {
count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
break;
}
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (count == 1) {
*cookie = m;
return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
} else {
*cookie = NULL;
return (NULL);
}
}
void
vm_gpa_release(void *cookie)
{
vm_page_t m = cookie;
vm_page_unwire(m, PQ_ACTIVE);
}
int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
if (vcpu < 0 || vcpu >= vm->maxcpus)
return (EINVAL);
if (reg >= VM_REG_LAST)
return (EINVAL);
return (VMGETREG(vm->cookie, vcpu, reg, retval));
}
int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
struct vcpu *vcpu;
int error;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
if (reg >= VM_REG_LAST)
return (EINVAL);
error = VMSETREG(vm->cookie, vcpuid, reg, val);
if (error || reg != VM_REG_GUEST_RIP)
return (error);
/* Set 'nextrip' to match the value of %rip */
VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
vcpu = &vm->vcpu[vcpuid];
vcpu->nextrip = val;
return (0);
}
static bool
is_descriptor_table(int reg)
{
switch (reg) {
case VM_REG_GUEST_IDTR:
case VM_REG_GUEST_GDTR:
return (true);
default:
return (false);
}
}
static bool
is_segment_register(int reg)
{
switch (reg) {
case VM_REG_GUEST_ES:
case VM_REG_GUEST_CS:
case VM_REG_GUEST_SS:
case VM_REG_GUEST_DS:
case VM_REG_GUEST_FS:
case VM_REG_GUEST_GS:
case VM_REG_GUEST_TR:
case VM_REG_GUEST_LDTR:
return (true);
default:
return (false);
}
}
int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc)
{
if (vcpu < 0 || vcpu >= vm->maxcpus)
return (EINVAL);
if (!is_segment_register(reg) && !is_descriptor_table(reg))
return (EINVAL);
return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}
int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc)
{
if (vcpu < 0 || vcpu >= vm->maxcpus)
return (EINVAL);
if (!is_segment_register(reg) && !is_descriptor_table(reg))
return (EINVAL);
return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}
static void
restore_guest_fpustate(struct vcpu *vcpu)
{
/* flush host state to the pcb */
fpuexit(curthread);
/* restore guest FPU state */
fpu_stop_emulating();
fpurestore(vcpu->guestfpu);
/* restore guest XCR0 if XSAVE is enabled in the host */
if (rcr4() & CR4_XSAVE)
load_xcr(0, vcpu->guest_xcr0);
/*
* The FPU is now "dirty" with the guest's state so turn on emulation
* to trap any access to the FPU by the host.
*/
fpu_start_emulating();
}
static void
save_guest_fpustate(struct vcpu *vcpu)
{
if ((rcr0() & CR0_TS) == 0)
panic("fpu emulation not enabled in host!");
/* save guest XCR0 and restore host XCR0 */
if (rcr4() & CR4_XSAVE) {
vcpu->guest_xcr0 = rxcr(0);
load_xcr(0, vmm_get_host_xcr0());
}
/* save guest FPU state */
fpu_stop_emulating();
fpusave(vcpu->guestfpu);
fpu_start_emulating();
}
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
bool from_idle)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
{
struct vcpu *vcpu;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
int error;
vcpu = &vm->vcpu[vcpuid];
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vcpu_assert_locked(vcpu);
/*
* State transitions from the vmmdev_ioctl() must always begin from
* the VCPU_IDLE state. This guarantees that there is only a single
* ioctl() operating on a vcpu at any point.
*/
if (from_idle) {
while (vcpu->state != VCPU_IDLE) {
vcpu->reqidle = 1;
vcpu_notify_event_locked(vcpu, false);
VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
"idle requested", vcpu_state2str(vcpu->state));
msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
}
} else {
KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
"vcpu idle state"));
}
if (vcpu->state == VCPU_RUNNING) {
KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
"mismatch for running vcpu", curcpu, vcpu->hostcpu));
} else {
KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
"vcpu that is not running", vcpu->hostcpu));
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
/*
* The following state transitions are allowed:
* IDLE -> FROZEN -> IDLE
* FROZEN -> RUNNING -> FROZEN
* FROZEN -> SLEEPING -> FROZEN
*/
switch (vcpu->state) {
case VCPU_IDLE:
case VCPU_RUNNING:
case VCPU_SLEEPING:
error = (newstate != VCPU_FROZEN);
break;
case VCPU_FROZEN:
error = (newstate == VCPU_FROZEN);
break;
default:
error = 1;
break;
}
if (error)
return (EBUSY);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
vcpu->state = newstate;
if (newstate == VCPU_RUNNING)
vcpu->hostcpu = curcpu;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
else
vcpu->hostcpu = NOCPU;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (newstate == VCPU_IDLE)
wakeup(&vcpu->state);
return (0);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
}
static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
panic("Error %d setting state to %d\n", error, newstate);
}
static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
{
int error;
if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
panic("Error %d setting state to %d", error, newstate);
}
static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{
KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
/*
* Update 'rendezvous_func' and execute a write memory barrier to
* ensure that it is visible across all host cpus. This is not needed
* for correctness but it does ensure that all the vcpus will notice
* that the rendezvous is requested immediately.
*/
vm->rendezvous_func = func;
wmb();
}
#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \
do { \
if (vcpuid >= 0) \
VCPU_CTR0(vm, vcpuid, fmt); \
else \
VM_CTR0(vm, fmt); \
} while (0)
static void
vm_handle_rendezvous(struct vm *vm, int vcpuid)
{
KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
mtx_lock(&vm->rendezvous_mtx);
while (vm->rendezvous_func != NULL) {
/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
if (vcpuid != -1 &&
CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
!CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
}
if (CPU_CMP(&vm->rendezvous_req_cpus,
&vm->rendezvous_done_cpus) == 0) {
VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
vm_set_rendezvous_func(vm, NULL);
wakeup(&vm->rendezvous_func);
break;
}
RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
"vmrndv", 0);
}
mtx_unlock(&vm->rendezvous_mtx);
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
{
struct vcpu *vcpu;
const char *wmesg;
int t, vcpu_halted, vm_halted;
KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vcpu = &vm->vcpu[vcpuid];
vcpu_halted = 0;
vm_halted = 0;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vcpu_lock(vcpu);
while (1) {
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep. Also check for
* software events that would cause this vcpu to wakeup.
*
* These interrupts/events could have happened after the
* vcpu returned from VMRUN() and before it acquired the
* vcpu lock above.
*/
if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
break;
if (vm_nmi_pending(vm, vcpuid))
break;
if (!intr_disabled) {
if (vm_extint_pending(vm, vcpuid) ||
vlapic_pending_intr(vcpu->vlapic, NULL)) {
break;
}
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
/* Don't go to sleep if the vcpu thread needs to yield */
if (vcpu_should_yield(vm, vcpuid))
break;
if (vcpu_debugged(vm, vcpuid))
break;
/*
* Some Linux guests implement "halt" by having all vcpus
* execute HLT with interrupts disabled. 'halted_cpus' keeps
* track of the vcpus that have entered this state. When all
* vcpus enter the halted state the virtual machine is halted.
*/
if (intr_disabled) {
wmesg = "vmhalt";
VCPU_CTR0(vm, vcpuid, "Halted");
if (!vcpu_halted && halt_detection_enabled) {
vcpu_halted = 1;
CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
}
if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
vm_halted = 1;
break;
}
} else {
wmesg = "vmidle";
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
t = ticks;
vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
/*
* XXX msleep_spin() cannot be interrupted by signals so
* wake up periodically to check pending signals.
*/
msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
if (vcpu_halted)
CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vcpu_unlock(vcpu);
if (vm_halted)
vm_suspend(vm, VM_SUSPEND_HALT);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
return (0);
}
static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
{
int rv, ftype;
struct vm_map *map;
struct vcpu *vcpu;
struct vm_exit *vme;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
__func__, vme->inst_length));
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
("vm_handle_paging: invalid fault_type %d", ftype));
if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
vme->u.paging.gpa, ftype);
if (rv == 0) {
VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
ftype == VM_PROT_READ ? "accessed" : "dirty",
vme->u.paging.gpa);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
goto done;
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
}
map = &vm->vmspace->vm_map;
Improve MD page fault handlers. Centralize calculation of signal and ucode delivered on unhandled page fault in new function vm_fault_trap(). MD trap_pfault() now almost always uses the signal numbers and error codes calculated in consistent MI way. This introduces the protection fault compatibility sysctls to all non-x86 architectures which did not have that bug, but apparently they were already much more wrong in selecting delivered signals on protection violations. Change the delivered signal for accesses to mapped area after the backing object was truncated. According to POSIX description for mmap(2): The system shall always zero-fill any partial page at the end of an object. Further, the system shall never write out any modified portions of the last page of an object which are beyond its end. References within the address range starting at pa and continuing for len bytes to whole pages following the end of an object shall result in delivery of a SIGBUS signal. An implementation may generate SIGBUS signals when a reference would cause an error in the mapped object, such as out-of-space condition. Adjust according to the description, keeping the existing compatibility code for SIGSEGV/SIGBUS on protection failures. For situations where kernel cannot handle page fault due to resource limit enforcement, SIGBUS with a new error code BUS_OBJERR is delivered. Also, provide a new error code SEGV_PKUERR for SIGSEGV on amd64 due to protection key access violation. vm_fault_hold() is renamed to vm_fault(). Fixed some nits in trap_pfault()s like mis-interpreting Mach errors as errnos. Removed unneeded truncations of the fault addresses reported by hardware. PR: 211924 Reviewed by: alc Discussed with: jilles, markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D21566
2019-09-27 18:43:36 +00:00
rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
"ftype = %d", rv, vme->u.paging.gpa, ftype);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
return (0);
}
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
{
struct vie *vie;
struct vcpu *vcpu;
struct vm_exit *vme;
uint64_t gla, gpa, cs_base;
struct vm_guest_paging *paging;
mem_region_read_t mread;
mem_region_write_t mwrite;
enum vm_cpu_mode cpu_mode;
int cs_d, error, fault;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
__func__, vme->inst_length));
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
cs_base = vme->u.inst_emul.cs_base;
cs_d = vme->u.inst_emul.cs_d;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vie = &vme->u.inst_emul.vie;
paging = &vme->u.inst_emul.paging;
cpu_mode = paging->cpu_mode;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
/* Fetch, decode and emulate the faulting instruction */
if (vie->num_valid == 0) {
error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
cs_base, VIE_INST_SIZE, vie, &fault);
} else {
/*
* The instruction bytes have already been copied into 'vie'
*/
error = fault = 0;
}
if (error || fault)
return (error);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
vme->rip + cs_base);
*retu = true; /* dump instruction bytes in userspace */
return (0);
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
/*
* Update 'nextrip' based on the length of the emulated instruction.
*/
vme->inst_length = vie->num_processed;
vcpu->nextrip += vie->num_processed;
VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
"decoding", vcpu->nextrip);
/* return to userland unless this is an in-kernel emulated device */
if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
mread = lapic_mmio_read;
mwrite = lapic_mmio_write;
} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
mread = vioapic_mmio_read;
mwrite = vioapic_mmio_write;
} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
mread = vhpet_mmio_read;
mwrite = vhpet_mmio_write;
} else {
*retu = true;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
return (0);
}
error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
mread, mwrite, retu);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
return (error);
}
static int
vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
{
int i, done;
struct vcpu *vcpu;
done = 0;
vcpu = &vm->vcpu[vcpuid];
CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
/*
* Wait until all 'active_cpus' have suspended themselves.
*
* Since a VM may be suspended at any time including when one or
* more vcpus are doing a rendezvous we need to call the rendezvous
* handler while we are waiting to prevent a deadlock.
*/
vcpu_lock(vcpu);
while (1) {
if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
break;
}
if (vm->rendezvous_func == NULL) {
VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
} else {
VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
vcpu_unlock(vcpu);
vm_handle_rendezvous(vm, vcpuid);
vcpu_lock(vcpu);
}
}
vcpu_unlock(vcpu);
/*
* Wakeup the other sleeping vcpus and return to userspace.
*/
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->suspended_cpus)) {
vcpu_notify_event(vm, i, false);
}
}
*retu = true;
return (0);
}
static int
vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
{
struct vcpu *vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
vcpu->reqidle = 0;
vcpu_unlock(vcpu);
*retu = true;
return (0);
}
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
int i;
if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
return (EINVAL);
if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
VM_CTR2(vm, "virtual machine already suspended %d/%d",
vm->suspend, how);
return (EALREADY);
}
VM_CTR1(vm, "virtual machine successfully suspended %d", how);
/*
* Notify all active vcpus that they are now suspended.
*/
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->active_cpus))
vcpu_notify_event(vm, i, false);
}
return (0);
}
void
vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
("vm_exit_suspended: invalid suspend type %d", vm->suspend));
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_SUSPENDED;
vmexit->u.suspended.how = vm->suspend;
}
void
vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_DEBUG;
}
void
vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
}
void
vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_REQIDLE;
vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
}
void
vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
}
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
struct vm_eventinfo evinfo;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval;
struct vm_exit *vme;
bool retu, intr_disabled;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
if (!CPU_ISSET(vcpuid, &vm->active_cpus))
return (EINVAL);
if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
return (EINVAL);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vme = &vcpu->exitinfo;
evinfo.rptr = &vm->rendezvous_func;
evinfo.sptr = &vm->suspend;
evinfo.iptr = &vcpu->reqidle;
restart:
critical_enter();
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
("vm_run: absurd pm_active"));
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
set_pcb_flags(pcb, PCB_FULL_IRET);
restore_guest_fpustate(vcpu);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
critical_exit();
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (error == 0) {
retu = false;
vcpu->nextrip = vme->rip + vme->inst_length;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
switch (vme->exitcode) {
case VM_EXITCODE_REQIDLE:
error = vm_handle_reqidle(vm, vcpuid, &retu);
break;
case VM_EXITCODE_SUSPENDED:
error = vm_handle_suspend(vm, vcpuid, &retu);
break;
case VM_EXITCODE_IOAPIC_EOI:
vioapic_process_eoi(vm, vcpuid,
vme->u.ioapic_eoi.vector);
break;
case VM_EXITCODE_RENDEZVOUS:
vm_handle_rendezvous(vm, vcpuid);
error = 0;
break;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
case VM_EXITCODE_HLT:
intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
break;
case VM_EXITCODE_PAGING:
error = vm_handle_paging(vm, vcpuid, &retu);
break;
case VM_EXITCODE_INST_EMUL:
error = vm_handle_inst_emul(vm, vcpuid, &retu);
break;
case VM_EXITCODE_INOUT:
case VM_EXITCODE_INOUT_STR:
error = vm_handle_inout(vm, vcpuid, vme, &retu);
break;
case VM_EXITCODE_MONITOR:
case VM_EXITCODE_MWAIT:
case VM_EXITCODE_VMINSN:
vm_inject_ud(vm, vcpuid);
break;
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
default:
retu = true; /* handled in userland */
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
break;
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
}
if (error == 0 && retu == false)
goto restart;
VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
/* copy the exit information */
bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
int
vm_restart_instruction(void *arg, int vcpuid)
{
struct vm *vm;
struct vcpu *vcpu;
enum vcpu_state state;
uint64_t rip;
int error;
vm = arg;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
state = vcpu_get_state(vm, vcpuid, NULL);
if (state == VCPU_RUNNING) {
/*
* When a vcpu is "running" the next instruction is determined
* by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
* Thus setting 'inst_length' to zero will cause the current
* instruction to be restarted.
*/
vcpu->exitinfo.inst_length = 0;
VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
"setting inst_length to zero", vcpu->exitinfo.rip);
} else if (state == VCPU_FROZEN) {
/*
* When a vcpu is "frozen" it is outside the critical section
* around VMRUN() and 'nextrip' points to the next instruction.
* Thus instruction restart is achieved by setting 'nextrip'
* to the vcpu's %rip.
*/
error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
KASSERT(!error, ("%s: error %d getting rip", __func__, error));
VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
"nextrip from %#lx to %#lx", vcpu->nextrip, rip);
vcpu->nextrip = rip;
} else {
panic("%s: invalid state %d", __func__, state);
}
return (0);
}
int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
struct vcpu *vcpu;
int type, vector;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
if (info & VM_INTINFO_VALID) {
type = info & VM_INTINFO_TYPE;
vector = info & 0xff;
if (type == VM_INTINFO_NMI && vector != IDT_NMI)
return (EINVAL);
if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
return (EINVAL);
if (info & VM_INTINFO_RSVD)
return (EINVAL);
} else {
info = 0;
}
VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
vcpu->exitintinfo = info;
return (0);
}
enum exc_class {
EXC_BENIGN,
EXC_CONTRIBUTORY,
EXC_PAGEFAULT
};
#define IDT_VE 20 /* Virtualization Exception (Intel specific) */
static enum exc_class
exception_class(uint64_t info)
{
int type, vector;
KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
type = info & VM_INTINFO_TYPE;
vector = info & 0xff;
/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
switch (type) {
case VM_INTINFO_HWINTR:
case VM_INTINFO_SWINTR:
case VM_INTINFO_NMI:
return (EXC_BENIGN);
default:
/*
* Hardware exception.
*
* SVM and VT-x use identical type values to represent NMI,
* hardware interrupt and software interrupt.
*
* SVM uses type '3' for all exceptions. VT-x uses type '3'
* for exceptions except #BP and #OF. #BP and #OF use a type
* value of '5' or '6'. Therefore we don't check for explicit
* values of 'type' to classify 'intinfo' into a hardware
* exception.
*/
break;
}
switch (vector) {
case IDT_PF:
case IDT_VE:
return (EXC_PAGEFAULT);
case IDT_DE:
case IDT_TS:
case IDT_NP:
case IDT_SS:
case IDT_GP:
return (EXC_CONTRIBUTORY);
default:
return (EXC_BENIGN);
}
}
static int
nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
uint64_t *retinfo)
{
enum exc_class exc1, exc2;
int type1, vector1;
KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
/*
* If an exception occurs while attempting to call the double-fault
* handler the processor enters shutdown mode (aka triple fault).
*/
type1 = info1 & VM_INTINFO_TYPE;
vector1 = info1 & 0xff;
if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
info1, info2);
vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
*retinfo = 0;
return (0);
}
/*
* Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
*/
exc1 = exception_class(info1);
exc2 = exception_class(info2);
if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
(exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
/* Convert nested fault into a double fault. */
*retinfo = IDT_DF;
*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
*retinfo |= VM_INTINFO_DEL_ERRCODE;
} else {
/* Handle exceptions serially */
*retinfo = info2;
}
return (1);
}
static uint64_t
vcpu_exception_intinfo(struct vcpu *vcpu)
{
uint64_t info = 0;
if (vcpu->exception_pending) {
info = vcpu->exc_vector & 0xff;
info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
if (vcpu->exc_errcode_valid) {
info |= VM_INTINFO_DEL_ERRCODE;
info |= (uint64_t)vcpu->exc_errcode << 32;
}
}
return (info);
}
int
vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
{
struct vcpu *vcpu;
uint64_t info1, info2;
int valid;
KASSERT(vcpuid >= 0 &&
vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
vcpu = &vm->vcpu[vcpuid];
info1 = vcpu->exitintinfo;
vcpu->exitintinfo = 0;
info2 = 0;
if (vcpu->exception_pending) {
info2 = vcpu_exception_intinfo(vcpu);
vcpu->exception_pending = 0;
VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
vcpu->exc_vector, info2);
}
if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
} else if (info1 & VM_INTINFO_VALID) {
*retinfo = info1;
valid = 1;
} else if (info2 & VM_INTINFO_VALID) {
*retinfo = info2;
valid = 1;
} else {
valid = 0;
}
if (valid) {
VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
"retinfo(%#lx)", __func__, info1, info2, *retinfo);
}
return (valid);
}
int
vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
*info1 = vcpu->exitintinfo;
*info2 = vcpu_exception_intinfo(vcpu);
return (0);
}
int
vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
uint32_t errcode, int restart_instruction)
{
struct vcpu *vcpu;
uint64_t regval;
int error;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
if (vector < 0 || vector >= 32)
return (EINVAL);
/*
* A double fault exception should never be injected directly into
* the guest. It is a derived exception that results from specific
* combinations of nested faults.
*/
if (vector == IDT_DF)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->exception_pending) {
VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
"pending exception %d", vector, vcpu->exc_vector);
return (EBUSY);
}
if (errcode_valid) {
/*
* Exceptions don't deliver an error code in real mode.
*/
error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
if (!(regval & CR0_PE))
errcode_valid = 0;
}
/*
* From section 26.6.1 "Interruptibility State" in Intel SDM:
*
* Event blocking by "STI" or "MOV SS" is cleared after guest executes
* one instruction or incurs an exception.
*/
error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
__func__, error));
if (restart_instruction)
vm_restart_instruction(vm, vcpuid);
vcpu->exception_pending = 1;
vcpu->exc_vector = vector;
vcpu->exc_errcode = errcode;
vcpu->exc_errcode_valid = errcode_valid;
VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
return (0);
}
void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
int errcode)
{
struct vm *vm;
int error, restart_instruction;
vm = vmarg;
restart_instruction = 1;
error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
errcode, restart_instruction);
KASSERT(error == 0, ("vm_inject_exception error %d", error));
}
void
vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
struct vm *vm;
int error;
vm = vmarg;
VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
error_code, cr2);
error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
vcpu->nmi_pending = 1;
vcpu_notify_event(vm, vcpuid, false);
return (0);
}
int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
return (vcpu->nmi_pending);
}
void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->nmi_pending == 0)
panic("vm_nmi_clear: inconsistent nmi_pending state");
vcpu->nmi_pending = 0;
vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}
static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
int
vm_inject_extint(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
vcpu->extint_pending = 1;
vcpu_notify_event(vm, vcpuid, false);
return (0);
}
int
vm_extint_pending(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
return (vcpu->extint_pending);
}
void
vm_extint_clear(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->extint_pending == 0)
panic("vm_extint_clear: inconsistent extint_pending state");
vcpu->extint_pending = 0;
vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}
int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
if (vcpu < 0 || vcpu >= vm->maxcpus)
return (EINVAL);
if (type < 0 || type >= VM_CAP_MAX)
return (EINVAL);
return (VMGETCAP(vm->cookie, vcpu, type, retval));
}
int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
if (vcpu < 0 || vcpu >= vm->maxcpus)
return (EINVAL);
if (type < 0 || type >= VM_CAP_MAX)
return (EINVAL);
return (VMSETCAP(vm->cookie, vcpu, type, val));
}
struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
return (vm->vcpu[cpu].vlapic);
}
struct vioapic *
vm_ioapic(struct vm *vm)
{
return (vm->vioapic);
}
struct vhpet *
vm_hpet(struct vm *vm)
{
return (vm->vhpet);
}
bool
vmm_is_pptdev(int bus, int slot, int func)
{
int b, f, i, n, s;
char *val, *cp, *cp2;
bool found;
/*
* XXX
* The length of an environment variable is limited to 128 bytes which
* puts an upper limit on the number of passthru devices that may be
* specified using a single environment variable.
*
* Work around this by scanning multiple environment variable
* names instead of a single one - yuck!
*/
const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
found = false;
for (i = 0; names[i] != NULL && !found; i++) {
cp = val = kern_getenv(names[i]);
while (cp != NULL && *cp != '\0') {
if ((cp2 = strchr(cp, ' ')) != NULL)
*cp2 = '\0';
n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
if (n == 3 && bus == b && slot == s && func == f) {
found = true;
break;
}
if (cp2 != NULL)
*cp2++ = ' ';
cp = cp2;
}
freeenv(val);
}
return (found);
}
void *
vm_iommu_domain(struct vm *vm)
{
return (vm->iommu);
}
int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
bool from_idle)
{
int error;
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
vcpu_unlock(vcpu);
return (error);
}
enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
struct vcpu *vcpu;
enum vcpu_state state;
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
state = vcpu->state;
if (hostcpu != NULL)
*hostcpu = vcpu->hostcpu;
vcpu_unlock(vcpu);
return (state);
}
int
vm_activate_cpu(struct vm *vm, int vcpuid)
{
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
if (CPU_ISSET(vcpuid, &vm->active_cpus))
return (EBUSY);
VCPU_CTR0(vm, vcpuid, "activated");
CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
return (0);
}
int
vm_suspend_cpu(struct vm *vm, int vcpuid)
{
int i;
if (vcpuid < -1 || vcpuid >= vm->maxcpus)
return (EINVAL);
if (vcpuid == -1) {
vm->debug_cpus = vm->active_cpus;
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &vm->active_cpus))
vcpu_notify_event(vm, i, false);
}
} else {
if (!CPU_ISSET(vcpuid, &vm->active_cpus))
return (EINVAL);
CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
vcpu_notify_event(vm, vcpuid, false);
}
return (0);
}
int
vm_resume_cpu(struct vm *vm, int vcpuid)
{
if (vcpuid < -1 || vcpuid >= vm->maxcpus)
return (EINVAL);
if (vcpuid == -1) {
CPU_ZERO(&vm->debug_cpus);
} else {
if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
return (EINVAL);
CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
}
return (0);
}
int
vcpu_debugged(struct vm *vm, int vcpuid)
{
return (CPU_ISSET(vcpuid, &vm->debug_cpus));
}
2011-06-28 06:26:03 +00:00
cpuset_t
vm_active_cpus(struct vm *vm)
{
return (vm->active_cpus);
}
cpuset_t
vm_debug_cpus(struct vm *vm)
{
return (vm->debug_cpus);
}
cpuset_t
vm_suspended_cpus(struct vm *vm)
{
return (vm->suspended_cpus);
}
void *
vcpu_stats(struct vm *vm, int vcpuid)
{
return (vm->vcpu[vcpuid].stats);
}
int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
*state = vm->vcpu[vcpuid].x2apic_state;
return (0);
}
int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
if (vcpuid < 0 || vcpuid >= vm->maxcpus)
return (EINVAL);
if (state >= X2APIC_STATE_LAST)
return (EINVAL);
vm->vcpu[vcpuid].x2apic_state = state;
vlapic_set_x2apic_state(vm, vcpuid, state);
return (0);
}
/*
* This function is called to ensure that a vcpu "sees" a pending event
* as soon as possible:
* - If the vcpu thread is sleeping then it is woken up.
* - If the vcpu is running on a different host_cpu then an IPI will be directed
* to the host_cpu to cause the vcpu to trap into the hypervisor.
*/
static void
vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
{
int hostcpu;
hostcpu = vcpu->hostcpu;
if (vcpu->state == VCPU_RUNNING) {
KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
if (hostcpu != curcpu) {
if (lapic_intr) {
2014-01-09 03:25:54 +00:00
vlapic_post_intr(vcpu->vlapic, hostcpu,
vmm_ipinum);
} else {
ipi_cpu(hostcpu, vmm_ipinum);
}
} else {
/*
* If the 'vcpu' is running on 'curcpu' then it must
* be sending a notification to itself (e.g. SELF_IPI).
* The pending event will be picked up when the vcpu
* transitions back to guest context.
*/
}
} else {
KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
"with hostcpu %d", vcpu->state, hostcpu));
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
}
}
void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
struct vcpu *vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
vcpu_notify_event_locked(vcpu, lapic_intr);
vcpu_unlock(vcpu);
}
Merge projects/bhyve_npt_pmap into head. Make the amd64/pmap code aware of nested page table mappings used by bhyve guests. This allows bhyve to associate each guest with its own vmspace and deal with nested page faults in the context of that vmspace. This also enables features like accessed/dirty bit tracking, swapping to disk and transparent superpage promotions of guest memory. Guest vmspace: Each bhyve guest has a unique vmspace to represent the physical memory allocated to the guest. Each memory segment allocated by the guest is mapped into the guest's address space via the 'vmspace->vm_map' and is backed by an object of type OBJT_DEFAULT. pmap types: The amd64/pmap now understands two types of pmaps: PT_X86 and PT_EPT. The PT_X86 pmap type is used by the vmspace associated with the host kernel as well as user processes executing on the host. The PT_EPT pmap is used by the vmspace associated with a bhyve guest. Page Table Entries: The EPT page table entries as mostly similar in functionality to regular page table entries although there are some differences in terms of what bits are used to express that functionality. For e.g. the dirty bit is represented by bit 9 in the nested PTE as opposed to bit 6 in the regular x86 PTE. Therefore the bitmask representing the dirty bit is now computed at runtime based on the type of the pmap. Thus PG_M that was previously a macro now becomes a local variable that is initialized at runtime using 'pmap_modified_bit(pmap)'. An additional wrinkle associated with EPT mappings is that older Intel processors don't have hardware support for tracking accessed/dirty bits in the PTE. This means that the amd64/pmap code needs to emulate these bits to provide proper accounting to the VM subsystem. This is achieved by using the following mapping for EPT entries that need emulation of A/D bits: Bit Position Interpreted By PG_V 52 software (accessed bit emulation handler) PG_RW 53 software (dirty bit emulation handler) PG_A 0 hardware (aka EPT_PG_RD) PG_M 1 hardware (aka EPT_PG_WR) The idea to use the mapping listed above for A/D bit emulation came from Alan Cox (alc@). The final difference with respect to x86 PTEs is that some EPT implementations do not support superpage mappings. This is recorded in the 'pm_flags' field of the pmap. TLB invalidation: The amd64/pmap code has a number of ways to do invalidation of mappings that may be cached in the TLB: single page, multiple pages in a range or the entire TLB. All of these funnel into a single EPT invalidation routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT generation number and sends an IPI to the host cpus that are executing the guest's vcpus. On a subsequent entry into the guest it will detect that the EPT has changed and invalidate the mappings from the TLB. Guest memory access: Since the guest memory is no longer wired we need to hold the host physical page that backs the guest physical page before we can access it. The helper functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose. PCI passthru: Guest's with PCI passthru devices will wire the entire guest physical address space. The MMIO BAR associated with the passthru device is backed by a vm_object of type OBJT_SG. An IOMMU domain is created only for guest's that have one or more PCI passthru devices attached to them. Limitations: There isn't a way to map a guest physical page without execute permissions. This is because the amd64/pmap code interprets the guest physical mappings as user mappings since they are numerically below VM_MAXUSER_ADDRESS. Since PG_U shares the same bit position as EPT_PG_EXECUTE all guest mappings become automatically executable. Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews as well as their support and encouragement. Thanks for John Baldwin for reviewing the use of OBJT_SG as the backing object for pci passthru mmio regions. Special thanks to Peter Holm for testing the patch on short notice. Approved by: re Discussed with: grehan Reviewed by: alc, kib Tested by: pho
2013-10-05 21:22:35 +00:00
struct vmspace *
vm_get_vmspace(struct vm *vm)
{
return (vm->vmspace);
}
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
/*
* XXX apic id is assumed to be numerically identical to vcpu id
*/
return (apicid);
}
void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
vm_rendezvous_func_t func, void *arg)
{
int i;
/*
* Enforce that this function is called without any locks
*/
WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
restart:
mtx_lock(&vm->rendezvous_mtx);
if (vm->rendezvous_func != NULL) {
/*
* If a rendezvous is already in progress then we need to
* call the rendezvous handler in case this 'vcpuid' is one
* of the targets of the rendezvous.
*/
RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
mtx_unlock(&vm->rendezvous_mtx);
vm_handle_rendezvous(vm, vcpuid);
goto restart;
}
KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
"rendezvous is still in progress"));
RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
vm->rendezvous_req_cpus = dest;
CPU_ZERO(&vm->rendezvous_done_cpus);
vm->rendezvous_arg = arg;
vm_set_rendezvous_func(vm, func);
mtx_unlock(&vm->rendezvous_mtx);
/*
* Wake up any sleeping vcpus and trigger a VM-exit in any running
* vcpus so they handle the rendezvous as soon as possible.
*/
for (i = 0; i < vm->maxcpus; i++) {
if (CPU_ISSET(i, &dest))
vcpu_notify_event(vm, i, false);
}
vm_handle_rendezvous(vm, vcpuid);
}
struct vatpic *
vm_atpic(struct vm *vm)
{
return (vm->vatpic);
}
struct vatpit *
vm_atpit(struct vm *vm)
{
return (vm->vatpit);
}
struct vpmtmr *
vm_pmtmr(struct vm *vm)
{
return (vm->vpmtmr);
}
struct vrtc *
vm_rtc(struct vm *vm)
{
return (vm->vrtc);
}
enum vm_reg_name
vm_segment_name(int seg)
{
static enum vm_reg_name seg_names[] = {
VM_REG_GUEST_ES,
VM_REG_GUEST_CS,
VM_REG_GUEST_SS,
VM_REG_GUEST_DS,
VM_REG_GUEST_FS,
VM_REG_GUEST_GS
};
KASSERT(seg >= 0 && seg < nitems(seg_names),
("%s: invalid segment encoding %d", __func__, seg));
return (seg_names[seg]);
}
void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
int num_copyinfo)
{
int idx;
for (idx = 0; idx < num_copyinfo; idx++) {
if (copyinfo[idx].cookie != NULL)
vm_gpa_release(copyinfo[idx].cookie);
}
bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}
int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
int num_copyinfo, int *fault)
{
int error, idx, nused;
size_t n, off, remaining;
void *hva, *cookie;
uint64_t gpa;
bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
nused = 0;
remaining = len;
while (remaining > 0) {
KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
if (error || *fault)
return (error);
off = gpa & PAGE_MASK;
n = min(remaining, PAGE_SIZE - off);
copyinfo[nused].gpa = gpa;
copyinfo[nused].len = n;
remaining -= n;
gla += n;
nused++;
}
for (idx = 0; idx < nused; idx++) {
hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
copyinfo[idx].len, prot, &cookie);
if (hva == NULL)
break;
copyinfo[idx].hva = hva;
copyinfo[idx].cookie = cookie;
}
if (idx != nused) {
vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
return (EFAULT);
} else {
*fault = 0;
return (0);
}
}
void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
size_t len)
{
char *dst;
int idx;
dst = kaddr;
idx = 0;
while (len > 0) {
bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
len -= copyinfo[idx].len;
dst += copyinfo[idx].len;
idx++;
}
}
void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
struct vm_copyinfo *copyinfo, size_t len)
{
const char *src;
int idx;
src = kaddr;
idx = 0;
while (len > 0) {
bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
len -= copyinfo[idx].len;
src += copyinfo[idx].len;
idx++;
}
}
/*
* Return the amount of in-use and wired memory for the VM. Since
* these are global stats, only return the values with for vCPU 0
*/
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);
static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{
if (vcpu == 0) {
vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
PAGE_SIZE * vmspace_resident_count(vm->vmspace));
}
}
static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{
if (vcpu == 0) {
vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
}
}
VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);