amd64: rework AP startup
Stop using a temporary page table with 1:1 mapping of low 1G populated over the whole VA. Use 1:1 mapping of low 4G temporarily installed in the normal kernel page table. The features are: - now there is one less step for startup asm to perform - the startup code still needs to be at lower 1G because CPU starts in real mode. But everything else can be located anywhere in low 4G because it is accessed by non-paged 32bit protected mode. Note that kernel page table root page is at low 4G, as well as the kernel itself. - the page table pages can be allocated by the normal allocator, there is no need to carve them from the phys_avail segments at a very early time. The allocation of the page for startup code still requires some magic. Pages are freed after APs are ignited. - la57 startup for APs is less tricky, we directly load the final page table and do not need to tweak the paging mode. Reviewed by: markj Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D31121
This commit is contained in:
parent
d96f55bc71
commit
d6717f8778
@ -1279,7 +1279,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
|
||||
* in real mode mode (e.g. SMP bare metal).
|
||||
*/
|
||||
#ifdef SMP
|
||||
mp_bootaddress(physmap, &physmap_idx);
|
||||
alloc_ap_trampoline(physmap, &physmap_idx);
|
||||
#endif
|
||||
|
||||
/* call pmap initialization to make new kernel address space */
|
||||
|
@ -105,6 +105,7 @@ static char *nmi_stack;
|
||||
static char *dbg_stack;
|
||||
|
||||
extern u_int mptramp_la57;
|
||||
extern u_int mptramp_nx;
|
||||
|
||||
/*
|
||||
* Local data and functions.
|
||||
@ -112,86 +113,6 @@ extern u_int mptramp_la57;
|
||||
|
||||
static int start_ap(int apic_id);
|
||||
|
||||
static bool
|
||||
is_kernel_paddr(vm_paddr_t pa)
|
||||
{
|
||||
|
||||
return (pa >= trunc_2mpage(btext - KERNBASE) &&
|
||||
pa < round_page(_end - KERNBASE));
|
||||
}
|
||||
|
||||
static bool
|
||||
is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
|
||||
{
|
||||
|
||||
return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate usable address in base memory for AP trampoline code.
|
||||
*/
|
||||
void
|
||||
mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
|
||||
{
|
||||
vm_paddr_t start, end;
|
||||
unsigned int i;
|
||||
bool allocated;
|
||||
|
||||
alloc_ap_trampoline(physmap, physmap_idx);
|
||||
|
||||
/*
|
||||
* Find a memory region big enough below the 4GB boundary to
|
||||
* store the initial page tables. Region must be mapped by
|
||||
* the direct map.
|
||||
*
|
||||
* Note that it needs to be aligned to a page boundary.
|
||||
*/
|
||||
allocated = false;
|
||||
for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
|
||||
/*
|
||||
* First, try to chomp at the start of the physmap region.
|
||||
* Kernel binary might claim it already.
|
||||
*/
|
||||
start = round_page(physmap[i]);
|
||||
end = start + AP_BOOTPT_SZ;
|
||||
if (start < end && end <= physmap[i + 1] &&
|
||||
is_mpboot_good(start, end) &&
|
||||
!is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
|
||||
allocated = true;
|
||||
physmap[i] = end;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Second, try to chomp at the end. Again, check
|
||||
* against kernel.
|
||||
*/
|
||||
end = trunc_page(physmap[i + 1]);
|
||||
start = end - AP_BOOTPT_SZ;
|
||||
if (start < end && start >= physmap[i] &&
|
||||
is_mpboot_good(start, end) &&
|
||||
!is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
|
||||
allocated = true;
|
||||
physmap[i + 1] = start;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allocated) {
|
||||
mptramp_pagetables = start;
|
||||
if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
|
||||
memmove(&physmap[i], &physmap[i + 2],
|
||||
sizeof(*physmap) * (*physmap_idx - i + 2));
|
||||
*physmap_idx -= 2;
|
||||
}
|
||||
} else {
|
||||
mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
|
||||
if (bootverbose)
|
||||
printf(
|
||||
"Cannot find enough space for the initial AP page tables, placing them at %#x",
|
||||
mptramp_pagetables);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the IPI handlers and start up the AP's.
|
||||
*/
|
||||
@ -243,6 +164,9 @@ cpu_mp_start(void)
|
||||
assign_cpu_ids();
|
||||
|
||||
mptramp_la57 = la57;
|
||||
mptramp_nx = pg_nx != 0;
|
||||
MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
|
||||
mptramp_pagetables = kernel_pmap->pm_cr3;
|
||||
|
||||
/* Start each Application Processor */
|
||||
start_all_aps();
|
||||
@ -399,55 +323,67 @@ mp_realloc_pcpu(int cpuid, int domain)
|
||||
int
|
||||
start_all_aps(void)
|
||||
{
|
||||
u_int64_t *pt5, *pt4, *pt3, *pt2;
|
||||
vm_page_t m_pml4, m_pdp, m_pd[4];
|
||||
pml5_entry_t old_pml45;
|
||||
pml4_entry_t *v_pml4;
|
||||
pdp_entry_t *v_pdp;
|
||||
pd_entry_t *v_pd;
|
||||
u_int32_t mpbioswarmvec;
|
||||
int apic_id, cpu, domain, i, xo;
|
||||
int apic_id, cpu, domain, i;
|
||||
u_char mpbiosreason;
|
||||
|
||||
mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
|
||||
|
||||
/* Create a transient 1:1 mapping of low 4G */
|
||||
if (la57) {
|
||||
m_pml4 = pmap_page_alloc_below_4g(true);
|
||||
v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
|
||||
} else {
|
||||
v_pml4 = &kernel_pmap->pm_pmltop[0];
|
||||
}
|
||||
m_pdp = pmap_page_alloc_below_4g(true);
|
||||
v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
|
||||
m_pd[0] = pmap_page_alloc_below_4g(false);
|
||||
v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0]));
|
||||
for (i = 0; i < NPDEPG; i++)
|
||||
v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A |
|
||||
X86_PG_M | PG_PS;
|
||||
m_pd[1] = pmap_page_alloc_below_4g(false);
|
||||
v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1]));
|
||||
for (i = 0; i < NPDEPG; i++)
|
||||
v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW |
|
||||
X86_PG_A | X86_PG_M | PG_PS;
|
||||
m_pd[2] = pmap_page_alloc_below_4g(false);
|
||||
v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2]));
|
||||
for (i = 0; i < NPDEPG; i++)
|
||||
v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
|
||||
X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
|
||||
m_pd[3] = pmap_page_alloc_below_4g(false);
|
||||
v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3]));
|
||||
for (i = 0; i < NPDEPG; i++)
|
||||
v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
|
||||
X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
|
||||
v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V |
|
||||
X86_PG_RW | X86_PG_A | X86_PG_M;
|
||||
v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V |
|
||||
X86_PG_RW | X86_PG_A | X86_PG_M;
|
||||
v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V |
|
||||
X86_PG_RW | X86_PG_A | X86_PG_M;
|
||||
v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V |
|
||||
X86_PG_RW | X86_PG_A | X86_PG_M;
|
||||
old_pml45 = kernel_pmap->pm_pmltop[0];
|
||||
if (la57) {
|
||||
kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) |
|
||||
X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
|
||||
}
|
||||
v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
|
||||
X86_PG_RW | X86_PG_A | X86_PG_M;
|
||||
pmap_invalidate_all(kernel_pmap);
|
||||
|
||||
/* copy the AP 1st level boot code */
|
||||
bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
|
||||
|
||||
/* Locate the page tables, they'll be below the trampoline */
|
||||
if (la57) {
|
||||
pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
|
||||
xo = 1;
|
||||
} else {
|
||||
xo = 0;
|
||||
}
|
||||
pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
|
||||
pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
|
||||
pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
|
||||
|
||||
/* Create the initial 1GB replicated page tables */
|
||||
for (i = 0; i < 512; i++) {
|
||||
if (la57) {
|
||||
pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
|
||||
PAGE_SIZE);
|
||||
pt5[i] |= PG_V | PG_RW | PG_U;
|
||||
}
|
||||
|
||||
/*
|
||||
* Each slot of the level 4 pages points to the same
|
||||
* level 3 page.
|
||||
*/
|
||||
pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
|
||||
(xo + 1) * PAGE_SIZE);
|
||||
pt4[i] |= PG_V | PG_RW | PG_U;
|
||||
|
||||
/*
|
||||
* Each slot of the level 3 pages points to the same
|
||||
* level 2 page.
|
||||
*/
|
||||
pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
|
||||
((xo + 2) * PAGE_SIZE));
|
||||
pt3[i] |= PG_V | PG_RW | PG_U;
|
||||
|
||||
/* The level 2 page slots are mapped with 2MB pages for 1GB. */
|
||||
pt2[i] = i * (2 * 1024 * 1024);
|
||||
pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
|
||||
}
|
||||
if (bootverbose)
|
||||
printf("AP boot address %#x\n", boot_address);
|
||||
|
||||
/* save the current value of the warm-start vector */
|
||||
if (!efi_boot)
|
||||
@ -515,6 +451,17 @@ start_all_aps(void)
|
||||
outb(CMOS_REG, BIOS_RESET);
|
||||
outb(CMOS_DATA, mpbiosreason);
|
||||
|
||||
/* Destroy transient 1:1 mapping */
|
||||
kernel_pmap->pm_pmltop[0] = old_pml45;
|
||||
invlpg(0);
|
||||
if (la57)
|
||||
vm_page_free(m_pml4);
|
||||
vm_page_free(m_pd[3]);
|
||||
vm_page_free(m_pd[2]);
|
||||
vm_page_free(m_pd[1]);
|
||||
vm_page_free(m_pd[0]);
|
||||
vm_page_free(m_pdp);
|
||||
|
||||
/* number of APs actually started */
|
||||
return (mp_naps);
|
||||
}
|
||||
|
@ -95,12 +95,25 @@ protmode:
|
||||
* is later enabled.
|
||||
*/
|
||||
mov %cr4, %eax
|
||||
orl $CR4_PAE, %eax
|
||||
orl $(CR4_PAE | CR4_PGE), %eax
|
||||
cmpb $0, mptramp_la57-mptramp_start(%ebx)
|
||||
je 1f
|
||||
orl $CR4_LA57, %eax
|
||||
1: mov %eax, %cr4
|
||||
|
||||
/*
|
||||
* If the BSP reported NXE support, enable EFER.NXE for all APs
|
||||
* prior to loading %cr3. This avoids page faults if the AP
|
||||
* encounters memory marked with the NX bit prior to detecting and
|
||||
* enabling NXE support.
|
||||
*/
|
||||
cmpb $0,mptramp_nx-mptramp_start(%ebx)
|
||||
je 2f
|
||||
movl $MSR_EFER, %ecx
|
||||
rdmsr
|
||||
orl $EFER_NXE, %eax
|
||||
wrmsr
|
||||
2:
|
||||
/*
|
||||
* Enable EFER.LME so that we get long mode when all the prereqs are
|
||||
* in place. In this case, it turns on when CR0_PG is finally enabled.
|
||||
@ -112,12 +125,13 @@ protmode:
|
||||
wrmsr
|
||||
|
||||
/*
|
||||
* Point to the embedded page tables for startup. Note that this
|
||||
* only gets accessed after we're actually in 64 bit mode, however
|
||||
* we can only set the bottom 32 bits of %cr3 in this state. This
|
||||
* means we are required to use a temporary page table that is below
|
||||
* the 4GB limit. %ebx is still our relocation base. We could just
|
||||
* subtract 3 * PAGE_SIZE, but that would be too easy.
|
||||
* Load kernel page table pointer into %cr3.
|
||||
* %ebx is still our relocation base.
|
||||
*
|
||||
* Note that this only gets accessed after we're actually in 64 bit
|
||||
* mode, however we can only set the bottom 32 bits of %cr3 in this
|
||||
* state. This means we depend on the kernel page table being
|
||||
* allocated from the low 4G.
|
||||
*/
|
||||
leal mptramp_pagetables-mptramp_start(%ebx),%eax
|
||||
movl (%eax), %eax
|
||||
@ -155,10 +169,8 @@ jmp_64:
|
||||
/*
|
||||
* Yeehar! We're running in 64 bit mode! We can mostly ignore our
|
||||
* segment registers, and get on with it.
|
||||
* Note that we are running at the correct virtual address, but with
|
||||
* a 1:1 1GB mirrored mapping over entire address space. We had better
|
||||
* switch to a real %cr3 promptly so that we can get to the direct map
|
||||
* space. Remember that jmp is relative and that we've been relocated,
|
||||
* We are running at the correct virtual address space.
|
||||
* Note that the jmp is relative and that we've been relocated,
|
||||
* so use an indirect jump.
|
||||
*/
|
||||
.code64
|
||||
@ -220,6 +232,10 @@ mptramp_pagetables:
|
||||
mptramp_la57:
|
||||
.long 0
|
||||
|
||||
.globl mptramp_nx
|
||||
mptramp_nx:
|
||||
.long 0
|
||||
|
||||
/*
|
||||
* The pseudo descriptor for lgdt to use.
|
||||
*/
|
||||
@ -243,31 +259,5 @@ bootMP_size:
|
||||
.code64
|
||||
.p2align 4,0
|
||||
entry_64:
|
||||
/*
|
||||
* If the BSP reported NXE support, enable EFER.NXE for all APs
|
||||
* prior to loading %cr3. This avoids page faults if the AP
|
||||
* encounters memory marked with the NX bit prior to detecting and
|
||||
* enabling NXE support.
|
||||
*/
|
||||
movq pg_nx, %rbx
|
||||
testq %rbx, %rbx
|
||||
je 1f
|
||||
movl $MSR_EFER, %ecx
|
||||
rdmsr
|
||||
orl $EFER_NXE, %eax
|
||||
wrmsr
|
||||
|
||||
1:
|
||||
/*
|
||||
* Load a real %cr3 that has all the direct map stuff and switches
|
||||
* off the 1GB replicated mirror. Load a stack pointer and jump
|
||||
* into AP startup code in C.
|
||||
*/
|
||||
cmpl $0, la57
|
||||
jne 2f
|
||||
movq KPML4phys, %rax
|
||||
jmp 3f
|
||||
2: movq KPML5phys, %rax
|
||||
3: movq %rax, %cr3
|
||||
movq bootSTK, %rsp
|
||||
jmp init_secondary
|
||||
|
@ -39,7 +39,6 @@ inthand_t
|
||||
|
||||
void invlop_handler(void);
|
||||
int start_all_aps(void);
|
||||
void mp_bootaddress(vm_paddr_t *, unsigned int *);
|
||||
|
||||
#endif /* !LOCORE */
|
||||
#endif /* SMP */
|
||||
|
@ -1065,11 +1065,6 @@ init_secondary_tail(void)
|
||||
}
|
||||
|
||||
#ifdef __amd64__
|
||||
/*
|
||||
* Enable global pages TLB extension
|
||||
* This also implicitly flushes the TLB
|
||||
*/
|
||||
load_cr4(rcr4() | CR4_PGE);
|
||||
if (pmap_pcid_enabled)
|
||||
load_cr4(rcr4() | CR4_PCIDE);
|
||||
load_ds(_udatasel);
|
||||
|
Loading…
Reference in New Issue
Block a user