amd64 UEFI loader: stop copying staging area to 2M physical

On amd64, add the ability to activate the kernel with the staging area
left in place.  Add the 'copy_staging' command to control this.  For now,
the old mode of copying the kernel to 2M physical is retained as the
default.  The default is going to be changed in several weeks.

On amd64, add some slop to the staging area, both to satisfy the
requirements of the kernel startup allocator and to leave room for a
minor increase in staging data after the final size is calculated.
Add a new command, 'staging_slop', to control its size.

Improve staging area resizing; in particular, reallocate it anew if
we can grow it neither down nor up.

Reviewed by:	kevans, markj
Discussed with:	emaste (the delivery plan)
Sponsored by:	The FreeBSD Foundation
MFC after:	1 week
Differential revision:	https://reviews.freebsd.org/D31121
This commit is contained in:
Konstantin Belousov 2021-07-10 22:55:56 +03:00
parent ee62fb2e1e
commit f75caed644
6 changed files with 346 additions and 58 deletions

View File

@ -228,6 +228,9 @@ struct preloaded_file
size_t f_size; /* file size */
struct kernel_module *f_modules; /* list of modules if any */
struct preloaded_file *f_next; /* next file */
#ifdef __amd64__
bool f_kernphys_relocatable;
#endif
};
struct file_format

View File

@ -207,6 +207,18 @@ static int elf_section_header_convert(const Elf_Ehdr *ehdr, Elf_Shdr *shdr)
#undef CONVERT_SWITCH
#undef CONVERT_FIELD
#ifdef __amd64__
/*
 * Report whether the loaded ELF image defines an object symbol named
 * "kernphys" whose size is exactly 8 bytes.  The result is recorded in
 * f_kernphys_relocatable by the ELF loader.
 */
static bool
is_kernphys_relocatable(elf_file_t ef)
{
	Elf_Sym kernphys_sym;

	/* A non-zero return means the symbol lookup did not succeed. */
	if (__elfN(lookup_symbol)(ef, "kernphys", &kernphys_sym,
	    STT_OBJECT) != 0)
		return (false);

	return (kernphys_sym.st_size == 8);
}
#endif
static int
__elfN(load_elf_header)(char *filename, elf_file_t ef)
{
@ -434,6 +446,9 @@ __elfN(loadfile_raw)(char *filename, uint64_t dest,
/* Load OK, return module pointer */
*result = (struct preloaded_file *)fp;
err = 0;
#ifdef __amd64__
fp->f_kernphys_relocatable = is_kernphys_relocatable(&ef);
#endif
goto out;
ioerr:

View File

@ -82,7 +82,11 @@ struct file_format *file_formats[] = {
static pml4_entry_t *PT4;
static pdp_entry_t *PT3;
static pdp_entry_t *PT3_l, *PT3_u;
static pd_entry_t *PT2;
static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;
extern EFI_PHYSICAL_ADDRESS staging;
static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
@ -105,6 +109,12 @@ elf64_exec(struct preloaded_file *fp)
ACPI_TABLE_RSDP *rsdp;
char buf[24];
int revision;
bool copy_auto;
copy_auto = copy_staging == COPY_STAGING_AUTO;
if (copy_auto)
copy_staging = fp->f_kernphys_relocatable ?
COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
/*
* Report the RSDP to the kernel. While this can be found with
@ -151,57 +161,133 @@ elf64_exec(struct preloaded_file *fp)
}
if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
return(EFTYPE);
return (EFTYPE);
ehdr = (Elf_Ehdr *)&(md->md_data);
trampcode = (vm_offset_t)0x0000000040000000;
trampcode = copy_staging == COPY_STAGING_ENABLE ?
(vm_offset_t)0x0000000040000000 /* 1G */ :
(vm_offset_t)0x0000000100000000; /* 4G */;
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
(EFI_PHYSICAL_ADDRESS *)&trampcode);
if (EFI_ERROR(err)) {
printf("Unable to allocate trampoline\n");
if (copy_auto)
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
bzero((void *)trampcode, EFI_PAGE_SIZE);
trampstack = trampcode + EFI_PAGE_SIZE - 8;
bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size);
trampoline = (void *)trampcode;
PT4 = (pml4_entry_t *)0x0000000040000000;
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
(EFI_PHYSICAL_ADDRESS *)&PT4);
bzero(PT4, 3 * EFI_PAGE_SIZE);
if (copy_staging == COPY_STAGING_ENABLE) {
PT4 = (pml4_entry_t *)0x0000000040000000;
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
(EFI_PHYSICAL_ADDRESS *)&PT4);
if (EFI_ERROR(err)) {
printf("Unable to allocate trampoline page table\n");
BS->FreePages(trampcode, 1);
if (copy_auto)
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
bzero(PT4, 3 * EFI_PAGE_SIZE);
PT3 = &PT4[512];
PT2 = &PT3[512];
PT3 = &PT4[512];
PT2 = &PT3[512];
/*
* This is kinda brutal, but every single 1GB VM
* memory segment points to the same first 1GB of
* physical memory. But it is more than adequate.
*/
for (i = 0; i < NPTEPG; i++) {
/*
* Each slot of the L4 pages points to the
* same L3 page.
*/
PT4[i] = (pml4_entry_t)PT3;
PT4[i] |= PG_V | PG_RW;
/*
* This is kinda brutal, but every single 1GB VM memory segment points
* to the same first 1GB of physical memory. But it is more than
* adequate.
*/
for (i = 0; i < 512; i++) {
/* Each slot of the L4 pages points to the same L3 page. */
PT4[i] = (pml4_entry_t)PT3;
PT4[i] |= PG_V | PG_RW;
/*
* Each slot of the L3 pages points to the
* same L2 page.
*/
PT3[i] = (pdp_entry_t)PT2;
PT3[i] |= PG_V | PG_RW;
/* Each slot of the L3 pages points to the same L2 page. */
PT3[i] = (pdp_entry_t)PT2;
PT3[i] |= PG_V | PG_RW;
/*
* The L2 page slots are mapped with 2MB pages for 1GB.
*/
PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024);
PT2[i] |= PG_V | PG_RW | PG_PS;
}
} else {
PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
(EFI_PHYSICAL_ADDRESS *)&PT4);
if (EFI_ERROR(err)) {
printf("Unable to allocate trampoline page table\n");
BS->FreePages(trampcode, 9);
if (copy_auto)
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
/* The L2 page slots are mapped with 2MB pages for 1GB. */
PT2[i] = i * (2 * 1024 * 1024);
PT2[i] |= PG_V | PG_RW | PG_PS;
bzero(PT4, 9 * EFI_PAGE_SIZE);
PT3_l = &PT4[NPML4EPG * 1];
PT3_u = &PT4[NPML4EPG * 2];
PT2_l0 = &PT4[NPML4EPG * 3];
PT2_l1 = &PT4[NPML4EPG * 4];
PT2_l2 = &PT4[NPML4EPG * 5];
PT2_l3 = &PT4[NPML4EPG * 6];
PT2_u0 = &PT4[NPML4EPG * 7];
PT2_u1 = &PT4[NPML4EPG * 8];
/* 1:1 mapping of lower 4G */
PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW;
PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW;
PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW;
PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW;
PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW;
for (i = 0; i < 4 * NPDEPG; i++) {
PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
PG_RW | PG_PS;
}
/* mapping of kernel 2G below top */
PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW;
PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW;
PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW;
/* compat mapping of phys @0 */
PT2_u0[0] = PG_PS | PG_V | PG_RW;
/* this maps past staging area */
for (i = 1; i < 2 * NPDEPG; i++) {
PT2_u0[i] = ((pd_entry_t)staging +
((pd_entry_t)i - 1) * NBPDR) |
PG_V | PG_RW | PG_PS;
}
}
printf("staging %#lx (%scoping) tramp %p PT4 %p\n",
staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
trampoline, PT4);
printf("Start @ 0x%lx ...\n", ehdr->e_entry);
efi_time_fini();
err = bi_load(fp->f_args, &modulep, &kernend, true);
if (err != 0) {
efi_time_init();
return(err);
if (copy_auto)
copy_staging = COPY_STAGING_AUTO;
return (err);
}
dev_cleanup();
trampoline(trampstack, efi_copy_finish, kernend, modulep, PT4,
ehdr->e_entry);
trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
PT4, ehdr->e_entry);
panic("exec returned");
}

View File

@ -65,6 +65,8 @@ int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp,
extern EFI_SYSTEM_TABLE *ST;
int boot_services_gone;
static int
bi_getboothowto(char *kargs)
{
@ -396,8 +398,10 @@ bi_load_efi_data(struct preloaded_file *kfp, bool exit_bs)
if (!exit_bs)
break;
status = BS->ExitBootServices(IH, efi_mapkey);
if (!EFI_ERROR(status))
if (!EFI_ERROR(status)) {
boot_services_gone = 1;
break;
}
}
if (retry == 0) {

View File

@ -39,6 +39,11 @@ __FBSDID("$FreeBSD$");
#include "loader_efi.h"
#define M(x) ((x) * 1024 * 1024)
#define G(x) (1UL * (x) * 1024 * 1024 * 1024)
extern int boot_services_gone;
#if defined(__i386__) || defined(__amd64__)
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
@ -175,24 +180,142 @@ efi_verify_staging_size(unsigned long *nr_pages)
#ifndef EFI_STAGING_SIZE
#if defined(__arm__)
#define EFI_STAGING_SIZE 32
#define EFI_STAGING_SIZE M(32)
#else
#define EFI_STAGING_SIZE 64
#define EFI_STAGING_SIZE M(64)
#endif
#endif
#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
defined(__riscv)
#define EFI_STAGING_2M_ALIGN 1
#else
#define EFI_STAGING_2M_ALIGN 0
#endif
#if defined(__amd64__)
#define EFI_STAGING_SLOP M(8)
#else
#define EFI_STAGING_SLOP 0
#endif
static u_long staging_slop = EFI_STAGING_SLOP;
EFI_PHYSICAL_ADDRESS staging, staging_end, staging_base;
int stage_offset_set = 0;
ssize_t stage_offset;
/*
 * Hand the whole staging allocation, [staging_base, staging_end), back to
 * the firmware and reset the stage offset bookkeeping.
 */
static void
efi_copy_free(void)
{
	unsigned long npages;

	npages = (staging_end - staging_base) / EFI_PAGE_SIZE;
	BS->FreePages(staging_base, npages);

	stage_offset = 0;
	stage_offset_set = 0;
}
#ifdef __amd64__
int copy_staging = COPY_STAGING_ENABLE;
/*
 * "copy_staging [enable|disable|auto]" loader command.
 *
 * With no argument, print the current mode.  With one argument, select
 * the named mode; if the mode actually changes, the loaded kernel is
 * unloaded and the staging area is freed and allocated again.  More than
 * one argument, or an unknown keyword, yields CMD_ERROR.
 */
static int
command_copy_staging(int argc, char *argv[])
{
	static const char *const mode[3] = {
		[COPY_STAGING_ENABLE] = "enable",
		[COPY_STAGING_DISABLE] = "disable",
		[COPY_STAGING_AUTO] = "auto",
	};
	unsigned int idx;
	int old_mode;

	if (argc > 2)
		return (CMD_ERROR);

	/* No argument: just report the current setting. */
	if (argc != 2) {
		printf("copy staging: %s\n", mode[copy_staging]);
		return (CMD_OK);
	}

	/* The table is indexed by mode value, so a name match gives the mode. */
	for (idx = 0; idx < sizeof(mode) / sizeof(mode[0]); idx++) {
		if (strcmp(argv[1], mode[idx]) == 0)
			break;
	}
	if (idx == sizeof(mode) / sizeof(mode[0])) {
		printf("usage: copy_staging enable|disable|auto\n");
		return (CMD_ERROR);
	}

	old_mode = copy_staging;
	copy_staging = (int)idx;
	if (old_mode != copy_staging) {
		printf("changed copy_staging, unloading kernel\n");
		unload();
		efi_copy_free();
		efi_copy_init();
	}
	return (CMD_OK);
}
COMMAND_SET(copy_staging, "copy_staging", "copy staging", command_copy_staging);
#endif
/*
 * "staging_slop [size]" loader command.
 *
 * With no argument, print the current slop in hex.  With one argument,
 * parse it with strtoul() (base auto-detected) and, if it differs from
 * the current value, store it in staging_slop, unload the kernel and
 * free and re-allocate the staging area.  efi_check_space() adds
 * staging_slop to the requested end when it sizes the staging area.
 *
 * Returns CMD_OK on success, CMD_ERROR for too many arguments or a value
 * that is empty or has trailing garbage.
 */
static int
command_staging_slop(int argc, char *argv[])
{
	char *endp;
	u_long new;

	if (argc > 2)
		return (CMD_ERROR);

	/* No argument: just report the current setting. */
	if (argc != 2) {
		printf("staging slop %#lx\n", staging_slop);
		return (CMD_OK);
	}

	new = strtoul(argv[1], &endp, 0);
	/* Reject an empty string as well as trailing non-numeric text. */
	if (endp == argv[1] || *endp != '\0') {
		printf("invalid slop value\n");
		return (CMD_ERROR);
	}

	if (staging_slop != new) {
		printf("changed slop, unloading kernel\n");
		/*
		 * Record the new value before rebuilding the staging area;
		 * without this store the command had no lasting effect.
		 */
		staging_slop = new;
		unload();
		efi_copy_free();
		efi_copy_init();
	}
	return (CMD_OK);
}
COMMAND_SET(staging_slop, "staging_slop", "set staging slop",
command_staging_slop);
#if defined(__i386__) || defined(__amd64__)
/*
 * Upper bound for the physical address of the staging area.
 *
 * The staging area must reside in the first 1GB or 4GB of physical
 * memory: see elf64_exec() in
 * boot/efi/loader/arch/amd64/elf64_freebsd.c.
 * i386 always uses 1GB; amd64 uses 1GB when copy_staging is
 * COPY_STAGING_ENABLE and 4GB otherwise.
 */
static EFI_PHYSICAL_ADDRESS
get_staging_max(void)
{
#if defined(__i386__)
	return (G(1));
#elif defined(__amd64__)
	if (copy_staging == COPY_STAGING_ENABLE)
		return (G(1));
	return (G(4));
#endif
}
#define EFI_ALLOC_METHOD AllocateMaxAddress
#else
#define EFI_ALLOC_METHOD AllocateAnyPages
#endif
int
efi_copy_init(void)
{
EFI_STATUS status;
unsigned long nr_pages;
nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE) * 1024 * 1024);
nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE));
#if defined(__i386__) || defined(__amd64__)
/*
@ -203,18 +326,10 @@ efi_copy_init(void)
if (running_on_hyperv())
efi_verify_staging_size(&nr_pages);
/*
* The staging area must reside in the the first 1GB physical
* memory: see elf64_exec() in
* boot/efi/loader/arch/amd64/elf64_freebsd.c.
*/
staging = 1024*1024*1024;
status = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData,
nr_pages, &staging);
#else
status = BS->AllocatePages(AllocateAnyPages, EfiLoaderData,
nr_pages, &staging);
staging = get_staging_max();
#endif
status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData,
nr_pages, &staging);
if (EFI_ERROR(status)) {
printf("failed to allocate staging area: %lu\n",
EFI_ERROR_CODE(status));
@ -223,7 +338,7 @@ efi_copy_init(void)
staging_base = staging;
staging_end = staging + nr_pages * EFI_PAGE_SIZE;
#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
#if EFI_STAGING_2M_ALIGN
/*
* Round the kernel load address to a 2MiB value. This is needed
* because the kernel builds a page table based on where it has
@ -231,7 +346,7 @@ efi_copy_init(void)
* either a 1MiB or 2MiB page for this we need to make sure it
* is correctly aligned for both cases.
*/
staging = roundup2(staging, 2 * 1024 * 1024);
staging = roundup2(staging, M(2));
#endif
return (0);
@ -240,20 +355,42 @@ efi_copy_init(void)
static bool
efi_check_space(vm_offset_t end)
{
EFI_PHYSICAL_ADDRESS addr;
EFI_PHYSICAL_ADDRESS addr, new_base, new_staging;
EFI_STATUS status;
unsigned long nr_pages;
end = roundup2(end, EFI_PAGE_SIZE);
/* There is already enough space */
if (end <= staging_end)
if (end + staging_slop <= staging_end)
return (true);
end = roundup2(end, EFI_PAGE_SIZE);
nr_pages = EFI_SIZE_TO_PAGES(end - staging_end);
if (boot_services_gone) {
if (end <= staging_end)
return (true);
panic("efi_check_space: cannot expand staging area "
"after boot services were exited\n");
}
/*
* Add slop at the end:
* 1. amd64 kernel expects to do some very early allocations
* by carving out memory after kernend. Slop guarantees
* that it does not overwrite anything useful.
* 2. It seems that initial calculation of the staging size
* could be somewhat smaller than actually copying in after
* boot services are exited. Slop avoids calling
* BS->AllocatePages() when it cannot work.
*/
end += staging_slop;
nr_pages = EFI_SIZE_TO_PAGES(end - staging_end);
#if defined(__i386__) || defined(__amd64__)
/* X86 needs all memory to be allocated under the 1G boundary */
if (end > 1024*1024*1024)
/*
* i386 needs all memory to be allocated under the 1G boundary.
* amd64 needs all memory to be allocated under the 1G or 4G boundary.
*/
if (end > get_staging_max())
goto before_staging;
#endif
@ -268,14 +405,12 @@ efi_check_space(vm_offset_t end)
before_staging:
/* Try allocating space before the previous allocation */
if (staging < nr_pages * EFI_PAGE_SIZE) {
printf("Not enough space before allocation\n");
return (false);
}
if (staging < nr_pages * EFI_PAGE_SIZE)
goto expand;
addr = staging - nr_pages * EFI_PAGE_SIZE;
#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
#if EFI_STAGING_2M_ALIGN
/* See efi_copy_init for why this is needed */
addr = rounddown2(addr, 2 * 1024 * 1024);
addr = rounddown2(addr, M(2));
#endif
nr_pages = EFI_SIZE_TO_PAGES(staging_base - addr);
status = BS->AllocatePages(AllocateAddress, EfiLoaderData, nr_pages,
@ -288,11 +423,42 @@ efi_check_space(vm_offset_t end)
staging_base = addr;
memmove((void *)(uintptr_t)staging_base,
(void *)(uintptr_t)staging, staging_end - staging);
stage_offset -= (staging - staging_base);
stage_offset -= staging - staging_base;
staging = staging_base;
return (true);
}
expand:
nr_pages = EFI_SIZE_TO_PAGES(end - (vm_offset_t)staging);
#if EFI_STAGING_2M_ALIGN
nr_pages += M(2) / EFI_PAGE_SIZE;
#endif
#if defined(__i386__) || defined(__amd64__)
new_base = get_staging_max();
#endif
status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData,
nr_pages, &new_base);
if (!EFI_ERROR(status)) {
#if EFI_STAGING_2M_ALIGN
new_staging = roundup2(new_base, M(2));
#else
new_staging = new_base;
#endif
/*
* Move the old allocation and update the state so
* translation still works.
*/
memcpy((void *)(uintptr_t)new_staging,
(void *)(uintptr_t)staging, staging_end - staging);
BS->FreePages(staging_base, (staging_end - staging_base) /
EFI_PAGE_SIZE);
stage_offset -= staging - new_staging;
staging = new_staging;
staging_end = new_base + nr_pages * EFI_PAGE_SIZE;
staging_base = new_base;
return (true);
}
printf("efi_check_space: Unable to expand staging area\n");
return (false);
}
@ -335,7 +501,6 @@ efi_copyout(const vm_offset_t src, void *dest, const size_t len)
return (len);
}
ssize_t
efi_readin(readin_handle_t fd, vm_offset_t dest, const size_t len)
{
@ -364,3 +529,8 @@ efi_copy_finish(void)
while (src < last)
*dst++ = *src++;
}
/*
 * Do-nothing counterpart of efi_copy_finish().  elf64_exec() passes this
 * to the trampoline as the copy_finish callback when copy_staging is not
 * COPY_STAGING_ENABLE.
 */
void
efi_copy_finish_nop(void)
{
}

View File

@ -34,6 +34,15 @@
#include <stand.h>
#include <readin.h>
#ifdef __amd64__
/*
 * Values of copy_staging.  command_copy_staging() uses these as indices
 * into its table of mode names, so they must stay small and dense.
 */
enum {
COPY_STAGING_ENABLE, /* elf64_exec() uses efi_copy_finish; staging limit is 1G */
COPY_STAGING_DISABLE, /* elf64_exec() uses efi_copy_finish_nop; staging limit is 4G */
COPY_STAGING_AUTO, /* elf64_exec() picks one of the above from f_kernphys_relocatable */
};
extern int copy_staging;
#endif
int efi_autoload(void);
int efi_copy_init(void);
@ -44,5 +53,6 @@ ssize_t efi_readin(readin_handle_t fd, vm_offset_t dest, const size_t len);
void * efi_translate(vm_offset_t ptr);
void efi_copy_finish(void);
void efi_copy_finish_nop(void);
#endif /* _LOADER_EFI_COPY_H_ */