kboot: Improve amd64 booting

Copy more of the necessary state for FreeBSD to boot:
o Copy EFI memory tables
o Create custom page tables needed for the kernel to find itself
o Simplify the passing of args to the trampoline by putting them
  on the stack rather than in dedicated memory.

This is only partially successful... we get only part way through the
amd64 startup code before dying. However, it's much further than before
the changes.

Sponsored by:		Netflix
Reviewed by:		tsoome, kevans
Differential Revision:	https://reviews.freebsd.org/D38259
Warner Losh 2023-02-03 08:40:13 -07:00
parent dfcca21075
commit 2069a2a08f
2 changed files with 263 additions and 111 deletions
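The heart of the change is visible in the layout below: the loader now embeds a small, pre-filled stack inside the trampoline itself. As a minimal sketch (the struct mirrors the trampoline_data introduced by this commit; the standalone checks are illustrative, not part of the change), the offsets work out so the kernel's btext sees its usual 32-bit stack arguments:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Same layout as the trampoline_data introduced by this commit. */
struct trampoline_data {
	uint64_t pt4;		/* popped into %cr3 by the trampoline */
	uint64_t entry;		/* consumed by retq to enter the kernel */
	uint32_t fill1;		/* sits where btext's return address would */
	uint32_t modulep;	/* module metadata, 32-bit by design */
	uint32_t kernend;	/* end of the loaded kernel */
	uint32_t fill2;		/* trailing alignment */
};

int
main(void)
{
	/*
	 * After pt4 and entry are popped, %rsp points at fill1, so btext
	 * finds modulep at 4(%rsp) and kernend at 8(%rsp), matching the
	 * i386-style stack handoff it has always expected.
	 */
	assert(sizeof(struct trampoline_data) == 32);
	assert(offsetof(struct trampoline_data, modulep) -
	    offsetof(struct trampoline_data, fill1) == 4);
	assert(offsetof(struct trampoline_data, kernend) -
	    offsetof(struct trampoline_data, fill1) == 8);
	return (0);
}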

@@ -1,9 +1,6 @@
/*-
* Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
* Copyright (c) 2022 Netflix, Inc
*
* This software was developed by Benno Rice under sponsorship from
* the FreeBSD Foundation.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -24,53 +21,87 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <machine/asmacros.h>
#define ASM_FILE
#include "multiboot2.h"
.text
.globl amd64_tramp
/*
* void amd64_tramp(uint64_t stack, void *copy_finish, uint64_t kernend,
* uint64_t modulep, uint64_t pagetable, uint64_t entry)
* This is the trampoline that starts the FreeBSD kernel. Since the Linux kernel
* calls this routine with no args, and its environment differs from both the
* one the boot loader provides and the one the kernel expects, this code is
* responsible for setting all of that up and calling the normal kernel entry
* point. It's analogous to the "purgatory" code in the Linux kernel. Details
* about these operations are in the comments below. On amd64, the kernel
* starts all the APs itself, so we don't have to worry about them here.
*/
amd64_tramp:
cli /* Make sure we don't get interrupted. */
movq %rdi,%rsp /* Switch to our temporary stack. */
movq %rdx,%r12 /* Stash the kernel values for later. */
movq %rcx,%r13
movq %r8,%r14
movq %r9,%r15
callq *%rsi /* Call copy_finish so we're all ready to go. */
pushq %r12 /* Push kernend. */
salq $32,%r13 /* Shift modulep and push it. */
pushq %r13
pushq %r15 /* Push the entry address. */
movq %r14,%cr3 /* Switch page tables. */
ret /* "Return" to kernel entry. */
ALIGN_TEXT
amd64_tramp_end:
/* void multiboot2_exec(uint64_t entry, uint64_t multiboot_info, uint64_t stack) */
.globl multiboot2_exec
multiboot2_exec:
movq %rdx,%rsp
pushq %rdi
movq %rsi,%rbx
movq $MULTIBOOT2_BOOTLOADER_MAGIC,%rax
ret
/*
* Keep in sync with elf64_freebsd.c. Kexec starts tramp w/o any parameters, so
* store them here. This is constructed to be a useful stack:
*
* struct trampoline_data {
* uint64_t pt4; // Page table address to pop
* uint64_t entry; // return address to jump to kernel
* uint32_t fill1; // 0
* uint32_t modulep; // 4 module metadata
* uint32_t kernend; // 8 kernel end
* uint32_t fill2; // 12
* };
*
* loader.kboot will construct the stack that btext expects: arguments on the
* stack, not in registers, and these args are 32-bit, not 64-bit.
*
* The processor is already in long mode when we're called, paging is enabled,
* and the boot loader has loaded things such that:
* - the kernel is mapped at KERNBASE, aligned to 2MB, below 4GB, in contiguous memory
* - there is a 2MB hole at KERNBASE (KERNSTART = KERNBASE + 2MB)
* - the kernel is mapped with 2MB superpages
* - the kernel, modules and metadata are in the first 4GB, which is unity mapped
* - there is additional memory after the loader-provided data for early allocations
*
* Unlike EFI, we don't support copying the staging area. We tell Linux to land
* the kernel in its final location with the needed alignment, etc. We copy the
* trampoline code to 1MB above the kernel's physical load base since that
* memory is otherwise free and safely above the lower 1MB swamp we inherited
* from the IBM PC, though this code makes no assumptions about where that
* might be.
*
* Thus, the trampoline just needs to set %rsp to this stack, pop the %cr3
* value, set it, and then retq to jump to the kernel with its stack args
* filled in. Since the handoff to this code used to be from 32-bit code, it
* uses the i386 calling convention, which puts the arguments on the stack.
* The kernel's btext routine expects this setup.
*/
.text
.globl tramp
tramp:
cli /* Make sure we don't get interrupted. */
leaq tramp_pt4(%rip), %rsp /* Set up our pre-filled stack */
popq %rax /* Pop off the PT4 ptr for %cr3 */
movq %rax, %cr3 /* set the page table */
retq /* Return addr and args already on stack */
/*
* The following is the stack for the above code. The stack pointer moves to
* higher addresses as things are popped off, so we start with it pointing
* at tramp_pt4.
*/
.p2align 3 /* Stack has to be 8 byte aligned */
trampoline_data:
tramp_pt4: .quad 0 /* New %cr3 value */
tramp_entry: .quad 0 /* Entry to kernel (btext) */
/* %rsp points here on entry to amd64 kernel's btext */
.long 0 /* 0 filler, ignored (current loaders set to 0) */
tramp_modulep: .long 0 /* 4 modulep */
tramp_kernend: .long 0 /* 8 kernend */
.long 0 /* 12 alignment filler (also 0) */
tramp_end:
.data
.globl amd64_tramp_size
amd64_tramp_size:
.long amd64_tramp_end-amd64_tramp
.type tramp_size,@object
.globl tramp_size
tramp_size:
.long tramp_end-tramp
.size tramp_size, 4
.type tramp_data_offset,@object
.globl tramp_data_offset
tramp_data_offset:
.long trampoline_data-tramp
.size tramp_data_offset, 4
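
The second file shows how the loader consumes the three symbols exported above. A hedged sketch of that consumption, assuming the trampoline_data layout shown earlier (stage_trampoline is an illustrative helper, not a function from this commit):

#include <stdint.h>
#include <string.h>

struct trampoline_data {	/* must match the layout in the asm above */
	uint64_t pt4, entry;
	uint32_t fill1, modulep, kernend, fill2;
};

extern uintptr_t tramp;			/* trampoline text linked into the loader */
extern uint32_t tramp_size;		/* tramp_end - tramp */
extern uint32_t tramp_data_offset;	/* trampoline_data - tramp */

/*
 * Illustrative helper: copy the trampoline into a scratch buffer and patch
 * its embedded stack before the whole thing is copied to its final physical
 * home. elf64_exec() below does the same work inline.
 */
static void *
stage_trampoline(void *scratch, uint64_t pt4_pa, uint64_t entry,
    uint32_t modulep, uint32_t kernend)
{
	struct trampoline_data *td;

	memcpy(scratch, &tramp, tramp_size);
	td = (struct trampoline_data *)((char *)scratch + tramp_data_offset);
	td->pt4 = pt4_pa;	/* physical address of the new PML4 */
	td->entry = entry;	/* kernel entry point from the ELF header */
	td->modulep = modulep;
	td->kernend = kernend;
	td->fill1 = td->fill2 = 0;
	return (scratch);
}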

@@ -41,9 +41,12 @@ __FBSDID("$FreeBSD$");
#ifdef EFI
#include <efi.h>
#include <efilib.h>
#else
#include "host_syscall.h"
#endif
#include "bootstrap.h"
#include "kboot.h"
#include "platform/acfreebsd.h"
#include "acconfig.h"
@@ -53,9 +56,7 @@ __FBSDID("$FreeBSD$");
#ifdef EFI
#include "loader_efi.h"
#endif
#ifdef EFI
static EFI_GUID acpi_guid = ACPI_TABLE_GUID;
static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID;
#endif
@@ -63,9 +64,11 @@ static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID;
#ifdef EFI
#define LOADER_PAGE_SIZE EFI_PAGE_SIZE
#else
#define LOADER_PAGE_SIZE 8192
#define LOADER_PAGE_SIZE PAGE_SIZE
#endif
extern vm_offset_t kboot_get_phys_load_segment(void);
extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp,
bool exit_bs);
@@ -81,13 +84,13 @@ static struct file_format amd64_elf_obj = {
.l_exec = elf64_obj_exec,
};
#if 0
#ifdef EFI
extern struct file_format multiboot2;
extern struct file_format multiboot2_obj;
#endif
struct file_format *file_formats[] = {
#if 0
#ifdef EFI
&multiboot2,
&multiboot2_obj,
#endif
@@ -96,21 +99,44 @@ struct file_format *file_formats[] = {
NULL
};
#ifdef EFI
#ifndef EFI
/*
* We create the stack that we want. The address of the page tables we build
* is on top (we pop that off and set %cr3). Next is the entry point to the
* kernel (which retq pops off). This leaves the stack that btext wants:
* offset 4 is modulep and offset 8 is kernend, with the filler bytes keeping
* everything aligned. This makes the trampoline very simple.
*/
struct trampoline_data {
uint64_t pt4; // Page table address to pop
uint64_t entry; // return address to jump to kernel
uint32_t fill1; // 0
uint32_t modulep; // 4 module metadata
uint32_t kernend; // 8 kernel end
uint32_t fill2; // 12
};
_Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data");
#endif
static pml4_entry_t *PT4;
static pdp_entry_t *PT3;
static pdp_entry_t *PT3_l, *PT3_u;
static pd_entry_t *PT2;
static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;
#ifdef EFI
static pdp_entry_t *PT3;
static pd_entry_t *PT2;
extern EFI_PHYSICAL_ADDRESS staging;
static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
#endif
extern uintptr_t amd64_tramp;
extern uint32_t amd64_tramp_size;
extern uintptr_t tramp;
extern uint32_t tramp_size;
#ifndef EFI
extern uint32_t tramp_data_offset;
#endif
/*
* There is an ELF kernel and one or more ELF modules loaded.
@@ -120,15 +146,27 @@ extern uint32_t amd64_tramp_size;
static int
elf64_exec(struct preloaded_file *fp)
{
#ifdef EFI
struct file_metadata *md;
Elf_Ehdr *ehdr;
vm_offset_t modulep, kernend, trampcode, trampstack;
vm_offset_t modulep, kernend;
int err, i;
ACPI_TABLE_RSDP *rsdp;
char buf[24];
#ifdef EFI
ACPI_TABLE_RSDP *rsdp = NULL;
int revision;
bool copy_auto;
int copy_auto;
vm_offset_t trampstack, trampcode;
#else
vm_offset_t rsdp = 0;
void *trampcode;
int nseg;
void *kseg;
vm_offset_t trampolinebase;
uint64_t *trampoline;
struct trampoline_data *trampoline_data;
vm_offset_t staging;
int error;
#endif
#ifdef EFI
copy_auto = copy_staging == COPY_STAGING_AUTO;
@@ -136,66 +174,49 @@ elf64_exec(struct preloaded_file *fp)
copy_staging = fp->f_kernphys_relocatable ?
COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
#else
copy_auto = COPY_STAGING_DISABLE; /* XXX */
/*
* Figure out where to put it.
*
* Linux does not allow kexec_load into arbitrary parts of memory. Ask
* arch_loadaddr to resolve the first available chunk of physical memory
* where loading is possible (load_addr).
*
* The kernel is loaded at the 'base' address in contiguous physical
* pages (using 2MB super pages). The first such page is unused by the
* kernel and serves as a good place to put not only the trampoline, but
* the page table pages that the trampoline needs to set up the proper
* kernel starting environment.
*/
staging = trampolinebase = kboot_get_phys_load_segment();
trampolinebase += 1ULL << 20; /* Copy trampoline to base + 1MB, kernel will wind up at 2MB */
printf("Load address at %#jx\n", (uintmax_t)trampolinebase);
printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset);
#endif
/*
* Report the RSDP to the kernel. While this can be found with
* a BIOS boot, the RSDP may be elsewhere when booted from UEFI.
* The old code used the 'hints' method to communicate this to
* the kernel. However, while convenient, the 'hints' method
* is fragile and does not work when static hints are compiled
* into the kernel. Instead, move to setting different tunables
* that start with acpi. The old 'hints' can be removed before
* we branch for FreeBSD 12.
*/
#ifdef EFI
rsdp = efi_get_table(&acpi20_guid);
if (rsdp == NULL) {
rsdp = efi_get_table(&acpi_guid);
}
#else
rsdp = NULL;
#warning "write me"
rsdp = acpi_rsdp();
#endif
if (rsdp != NULL) {
if (rsdp != 0) {
sprintf(buf, "0x%016llx", (unsigned long long)rsdp);
setenv("hint.acpi.0.rsdp", buf, 1);
setenv("acpi.rsdp", buf, 1);
revision = rsdp->Revision;
if (revision == 0)
revision = 1;
sprintf(buf, "%d", revision);
setenv("hint.acpi.0.revision", buf, 1);
setenv("acpi.revision", buf, 1);
strncpy(buf, rsdp->OemId, sizeof(rsdp->OemId));
buf[sizeof(rsdp->OemId)] = '\0';
setenv("hint.acpi.0.oem", buf, 1);
setenv("acpi.oem", buf, 1);
sprintf(buf, "0x%016x", rsdp->RsdtPhysicalAddress);
setenv("hint.acpi.0.rsdt", buf, 1);
setenv("acpi.rsdt", buf, 1);
if (revision >= 2) {
/* XXX extended checksum? */
sprintf(buf, "0x%016llx",
(unsigned long long)rsdp->XsdtPhysicalAddress);
setenv("hint.acpi.0.xsdt", buf, 1);
setenv("acpi.xsdt", buf, 1);
sprintf(buf, "%d", rsdp->Length);
setenv("hint.acpi.0.xsdt_length", buf, 1);
setenv("acpi.xsdt_length", buf, 1);
}
}
if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
return (EFTYPE);
ehdr = (Elf_Ehdr *)&(md->md_data);
#ifdef EFI
trampcode = copy_staging == COPY_STAGING_ENABLE ?
(vm_offset_t)0x0000000040000000 /* 1G */ :
(vm_offset_t)0x0000000100000000; /* 4G */
#ifdef EFI
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
(EFI_PHYSICAL_ADDRESS *)&trampcode);
if (EFI_ERROR(err)) {
@@ -204,17 +225,22 @@ elf64_exec(struct preloaded_file *fp)
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
trampstack = trampcode + LOADER_PAGE_SIZE - 8;
#else
#warning "Write me"
// XXX Question: why not just use malloc?
trampcode = host_getmem(LOADER_PAGE_SIZE);
if (trampcode == NULL) {
printf("Unable to allocate trampoline\n");
return (ENOMEM);
}
#endif
bzero((void *)trampcode, LOADER_PAGE_SIZE);
trampstack = trampcode + LOADER_PAGE_SIZE - 8;
bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size);
bcopy((void *)&tramp, (void *)trampcode, tramp_size);
trampoline = (void *)trampcode;
#ifdef EFI
if (copy_staging == COPY_STAGING_ENABLE) {
PT4 = (pml4_entry_t *)0x0000000040000000;
#ifdef EFI
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
(EFI_PHYSICAL_ADDRESS *)&PT4);
if (EFI_ERROR(err)) {
@@ -224,9 +250,6 @@ elf64_exec(struct preloaded_file *fp)
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
#else
#warning "Write me"
#endif
bzero(PT4, 3 * LOADER_PAGE_SIZE);
PT3 = &PT4[512];
PT2 = &PT3[512];
@@ -259,7 +282,6 @@ elf64_exec(struct preloaded_file *fp)
}
} else {
PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
#ifdef EFI
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
(EFI_PHYSICAL_ADDRESS *)&PT4);
if (EFI_ERROR(err)) {
@@ -269,10 +291,6 @@ elf64_exec(struct preloaded_file *fp)
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
#else
#warning "Write me"
#endif
bzero(PT4, 9 * LOADER_PAGE_SIZE);
PT3_l = &PT4[NPML4EPG * 1];
@@ -308,10 +326,84 @@ elf64_exec(struct preloaded_file *fp)
PG_V | PG_RW | PG_PS;
}
}
#else
{
vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1,
    pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1;
/* We'll find a place for these later */
PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE);
bzero(PT4, 9 * LOADER_PAGE_SIZE);
PT3_l = &PT4[NPML4EPG * 1];
PT3_u = &PT4[NPML4EPG * 2];
PT2_l0 = &PT4[NPML4EPG * 3];
PT2_l1 = &PT4[NPML4EPG * 4];
PT2_l2 = &PT4[NPML4EPG * 5];
PT2_l3 = &PT4[NPML4EPG * 6];
PT2_u0 = &PT4[NPML4EPG * 7];
PT2_u1 = &PT4[NPML4EPG * 8];
pabase = trampolinebase + LOADER_PAGE_SIZE;
pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1;
pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2;
pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3;
pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4;
pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5;
pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6;
pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7;
pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8;
/* 1:1 mapping of lower 4G */
PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW;
PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW;
PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW;
PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW;
PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW;
for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PT2_l0 into _l1, etc */
PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
PG_RW | PG_PS;
}
/* mapping of kernel 2G below top */
PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW;
PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW;
PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW;
/* compat mapping of phys @0 */
PT2_u0[0] = PG_PS | PG_V | PG_RW;
/* this maps past staging area */
/*
* Kernel uses the KERNSTART (== KERNBASE + 2MB) entry to figure
* out where we loaded the kernel. This is PT2_u0[1] (since
* these map 2MB pages). So the PA that this maps has to be
* kboot's staging + 2MB. For UEFI we do 'i - 1' since we load
* the kernel right at staging (and assume the first address we
* load is 2MB in efi_copyin). However for kboot, staging + 1 *
* NBPDR == staging + 2MB which is where the kernel starts. Our
* trampoline need not be mapped into the kernel space since we
* execute PA==VA for that, and the trampoline can just go away
* once the kernel is called.
*
* Staging should likely be as low as possible, though, because
* all the 'early' allocations are at kernend (which the kernel
* calls physfree).
*/
for (i = 1; i < 2 * NPDEPG; i++) { /* we overflow PT2_u0 into _u1 */
PT2_u0[i] = ((pd_entry_t)staging +
((pd_entry_t)i) * NBPDR) |
PG_V | PG_RW | PG_PS;
if (i < 10) printf("Mapping %d to %#lx staging %#lx\n", i, PT2_u0[i], staging);
}
}
#endif
#ifdef EFI
printf("staging %#lx (%scopying) tramp %p PT4 %p\n",
staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
trampoline, PT4);
#else
printf("staging %#lx tramp %p PT4 %p\n", staging, (void *)trampolinebase,
    (void *)(trampolinebase + LOADER_PAGE_SIZE));
#endif
printf("Start @ 0x%lx ...\n", ehdr->e_entry);
#ifdef EFI
@@ -321,17 +413,46 @@ elf64_exec(struct preloaded_file *fp)
if (err != 0) {
#ifdef EFI
efi_time_init();
#endif
if (copy_auto)
copy_staging = COPY_STAGING_AUTO;
#endif
return (err);
}
dev_cleanup();
#ifdef EFI
trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
PT4, ehdr->e_entry);
#else
trampoline_data = (void *)trampoline + tramp_data_offset;
trampoline_data->entry = ehdr->e_entry;
trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE;
/*
* The kernel computes the VA of the module data as modulep + KERNBASE, so
* we need to make sure that address is mapped right. We calculate the
* start of memory available for early allocation via kernend (which the
* kernel computes with a physical address of "kernend + PA(PT2_u0[1])"),
* so we had better make sure we're not overwriting the last 2MB of the
* kernel :).
*/
trampoline_data->modulep = modulep; /* Offset from KERNBASE */
trampoline_data->kernend = kernend; /* Offset from the load address */
trampoline_data->fill1 = trampoline_data->fill2 = 0;
printf("Modulep = %lx kernend %lx\n", modulep, kernend);
/* NOTE: when copying in, it's relative to the start of our 'area', not an abs addr */
/* Copy the trampoline to the ksegs */
archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size);
/* Copy the page table to the ksegs */
archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE);
if (archsw.arch_kexec_kseg_get == NULL)
panic("architecture did not provide kexec segment mapping");
archsw.arch_kexec_kseg_get(&nseg, &kseg);
error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64);
if (error != 0)
panic("kexec_load returned error: %d", error);
host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0);
#endif
panic("exec returned");
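
Finally, the PT2_u0[1] reasoning in the mapping comment above can be sanity-checked with the standard amd64 paging constants (a standalone sketch; the constant values come from the amd64 headers and upper_pd_slot is an illustrative helper):

#include <assert.h>
#include <stdint.h>

#define PDRSHIFT	21			/* 2MB superpages */
#define NBPDR		(1ULL << PDRSHIFT)
#define NPDEPG		512			/* PD entries per page */
#define KERNBASE	0xffffffff80000000ULL
#define KERNSTART	(KERNBASE + NBPDR)	/* 2MB hole at KERNBASE */

/* Which slot in the upper PDs (PT2_u0/PT2_u1) maps a given kernel VA. */
static unsigned
upper_pd_slot(uint64_t va)
{
	return ((unsigned)((va - KERNBASE) >> PDRSHIFT));
}

int
main(void)
{
	/* 4 PDs of 2MB superpages unity-map exactly the low 4GB. */
	assert(4ULL * NPDEPG * NBPDR == 4ULL << 30);
	/*
	 * KERNSTART lands in slot 1, so PT2_u0[1] must translate to
	 * staging + NBPDR -- exactly what the 'i * NBPDR' loop above
	 * produces with i starting at 1.
	 */
	assert(upper_pd_slot(KERNBASE) == 0);
	assert(upper_pd_slot(KERNSTART) == 1);
	return (0);
}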