e2d6c417e3
This change adds support for transparent superpages for PowerPC64 systems using Hashed Page Tables (HPT). All pmap operations are supported. The changes were inspired by the RISC-V implementation of superpages, by @markj (r344106), but heavily adapted to fit the PPC64 HPT architecture and existing MMU OEA64 code. Until these changes are better tested, superpages support is disabled by default. To enable it, use vm.pmap.superpages_enabled=1. In this initial implementation, when superpages are disabled, system performance stays at the same level as without these changes. When superpages are enabled, buildworld time increases a bit (~2%). However, for workloads that put heavy pressure on the TLB the performance boost is much bigger (see HPC Challenge and pgbench on D25237). Reviewed by: jhibbits Sponsored by: Eldorado Research Institute (eldorado.org.br) Differential Revision: https://reviews.freebsd.org/D25237
576 lines
14 KiB
C
576 lines
14 KiB
C
/*-
|
|
* Copyright (c) 2015 Nathan Whitehorn
|
|
* Copyright (c) 2017-2018 Semihalf
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/bus.h>
|
|
#include <sys/pcpu.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/smp.h>
|
|
#include <vm/vm.h>
|
|
#include <vm/pmap.h>
|
|
|
|
#include <machine/bus.h>
|
|
#include <machine/cpu.h>
|
|
#include <machine/hid.h>
|
|
#include <machine/platformvar.h>
|
|
#include <machine/pmap.h>
|
|
#include <machine/rtas.h>
|
|
#include <machine/smp.h>
|
|
#include <machine/spr.h>
|
|
#include <machine/trap.h>
|
|
|
|
#include <dev/ofw/openfirm.h>
|
|
#include <dev/ofw/ofw_bus.h>
|
|
#include <dev/ofw/ofw_bus_subr.h>
|
|
#include <machine/ofw_machdep.h>
|
|
#include <powerpc/aim/mmu_oea64.h>
|
|
|
|
#include "platform_if.h"
|
|
#include "opal.h"
|
|
|
|
#ifdef SMP
/*
 * Defined outside this file; powernv_smp_start_cpu() stores the pcpu of the
 * AP it is about to start here.
 */
extern void *ap_pcpu;
#endif

/*
 * Optional hook called from powernv_smp_ap_init().  It is not assigned in
 * this file, so it stays NULL unless other code installs a handler.
 */
void (*powernv_smp_ap_extra_init)(void);
|
|
|
|
static int powernv_probe(platform_t);
|
|
static int powernv_attach(platform_t);
|
|
void powernv_mem_regions(platform_t, struct mem_region *phys, int *physsz,
|
|
struct mem_region *avail, int *availsz);
|
|
static void powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz);
|
|
static u_long powernv_timebase_freq(platform_t, struct cpuref *cpuref);
|
|
static int powernv_smp_first_cpu(platform_t, struct cpuref *cpuref);
|
|
static int powernv_smp_next_cpu(platform_t, struct cpuref *cpuref);
|
|
static int powernv_smp_get_bsp(platform_t, struct cpuref *cpuref);
|
|
static void powernv_smp_ap_init(platform_t);
|
|
#ifdef SMP
|
|
static int powernv_smp_start_cpu(platform_t, struct pcpu *cpu);
|
|
static void powernv_smp_probe_threads(platform_t);
|
|
static struct cpu_group *powernv_smp_topo(platform_t plat);
|
|
#endif
|
|
static void powernv_reset(platform_t);
|
|
static void powernv_cpu_idle(sbintime_t sbt);
|
|
static int powernv_cpuref_init(void);
|
|
static int powernv_node_numa_domain(platform_t platform, phandle_t node);
|
|
|
|
static platform_method_t powernv_methods[] = {
|
|
PLATFORMMETHOD(platform_probe, powernv_probe),
|
|
PLATFORMMETHOD(platform_attach, powernv_attach),
|
|
PLATFORMMETHOD(platform_mem_regions, powernv_mem_regions),
|
|
PLATFORMMETHOD(platform_numa_mem_regions, powernv_numa_mem_regions),
|
|
PLATFORMMETHOD(platform_timebase_freq, powernv_timebase_freq),
|
|
|
|
PLATFORMMETHOD(platform_smp_ap_init, powernv_smp_ap_init),
|
|
PLATFORMMETHOD(platform_smp_first_cpu, powernv_smp_first_cpu),
|
|
PLATFORMMETHOD(platform_smp_next_cpu, powernv_smp_next_cpu),
|
|
PLATFORMMETHOD(platform_smp_get_bsp, powernv_smp_get_bsp),
|
|
#ifdef SMP
|
|
PLATFORMMETHOD(platform_smp_start_cpu, powernv_smp_start_cpu),
|
|
PLATFORMMETHOD(platform_smp_probe_threads, powernv_smp_probe_threads),
|
|
PLATFORMMETHOD(platform_smp_topo, powernv_smp_topo),
|
|
#endif
|
|
PLATFORMMETHOD(platform_node_numa_domain, powernv_node_numa_domain),
|
|
|
|
PLATFORMMETHOD(platform_reset, powernv_reset),
|
|
{ 0, 0 }
|
|
};
|
|
|
|
static platform_def_t powernv_platform = {
|
|
"powernv",
|
|
powernv_methods,
|
|
0
|
|
};
|
|
|
|
static struct cpuref platform_cpuref[MAXCPU];
|
|
static int platform_cpuref_cnt;
|
|
static int platform_cpuref_valid;
|
|
static int platform_associativity;
|
|
|
|
PLATFORM_DEF(powernv_platform);
|
|
|
|
static uint64_t powernv_boot_pir;
|
|
|
|
static int
|
|
powernv_probe(platform_t plat)
|
|
{
|
|
if (opal_check() == 0)
|
|
return (BUS_PROBE_SPECIFIC);
|
|
|
|
return (ENXIO);
|
|
}
|
|
|
|
static int
|
|
powernv_attach(platform_t plat)
|
|
{
|
|
uint32_t nptlp, shift = 0, slb_encoding = 0;
|
|
int32_t lp_size, lp_encoding;
|
|
char buf[255];
|
|
pcell_t refpoints[3];
|
|
pcell_t prop;
|
|
phandle_t cpu;
|
|
phandle_t opal;
|
|
int res, len, idx;
|
|
register_t msr;
|
|
bool has_lp;
|
|
|
|
/* Ping OPAL again just to make sure */
|
|
opal_check();
|
|
|
|
#if BYTE_ORDER == LITTLE_ENDIAN
|
|
opal_call(OPAL_REINIT_CPUS, 2 /* Little endian */);
|
|
#else
|
|
opal_call(OPAL_REINIT_CPUS, 1 /* Big endian */);
|
|
#endif
|
|
opal = OF_finddevice("/ibm,opal");
|
|
|
|
platform_associativity = 4; /* Skiboot default. */
|
|
if (OF_getencprop(opal, "ibm,associativity-reference-points", refpoints,
|
|
sizeof(refpoints)) > 0) {
|
|
platform_associativity = refpoints[0];
|
|
}
|
|
|
|
if (cpu_idle_hook == NULL)
|
|
cpu_idle_hook = powernv_cpu_idle;
|
|
|
|
powernv_boot_pir = mfspr(SPR_PIR);
|
|
|
|
/* LPID must not be altered when PSL_DR or PSL_IR is set */
|
|
msr = mfmsr();
|
|
mtmsr(msr & ~(PSL_DR | PSL_IR));
|
|
|
|
/* Direct interrupts to SRR instead of HSRR and reset LPCR otherwise */
|
|
mtspr(SPR_LPID, 0);
|
|
isync();
|
|
|
|
if (cpu_features2 & PPC_FEATURE2_ARCH_3_00)
|
|
lpcr |= LPCR_HVICE;
|
|
|
|
#if BYTE_ORDER == LITTLE_ENDIAN
|
|
lpcr |= LPCR_ILE;
|
|
#endif
|
|
|
|
mtspr(SPR_LPCR, lpcr);
|
|
isync();
|
|
|
|
mtmsr(msr);
|
|
|
|
powernv_cpuref_init();
|
|
|
|
/* Set SLB count from device tree */
|
|
cpu = OF_peer(0);
|
|
cpu = OF_child(cpu);
|
|
while (cpu != 0) {
|
|
res = OF_getprop(cpu, "name", buf, sizeof(buf));
|
|
if (res > 0 && strcmp(buf, "cpus") == 0)
|
|
break;
|
|
cpu = OF_peer(cpu);
|
|
}
|
|
if (cpu == 0)
|
|
goto out;
|
|
|
|
cpu = OF_child(cpu);
|
|
while (cpu != 0) {
|
|
res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
|
|
if (res > 0 && strcmp(buf, "cpu") == 0)
|
|
break;
|
|
cpu = OF_peer(cpu);
|
|
}
|
|
if (cpu == 0)
|
|
goto out;
|
|
|
|
res = OF_getencprop(cpu, "ibm,slb-size", &prop, sizeof(prop));
|
|
if (res > 0)
|
|
n_slbs = prop;
|
|
|
|
/*
|
|
* Scan the large page size property for PAPR compatible machines.
|
|
* See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
|
|
* for the encoding of the property.
|
|
*/
|
|
|
|
len = OF_getproplen(cpu, "ibm,segment-page-sizes");
|
|
if (len > 0) {
|
|
/*
|
|
* We have to use a variable length array on the stack
|
|
* since we have very limited stack space.
|
|
*/
|
|
pcell_t arr[len/sizeof(cell_t)];
|
|
res = OF_getencprop(cpu, "ibm,segment-page-sizes", arr,
|
|
sizeof(arr));
|
|
len /= 4;
|
|
idx = 0;
|
|
has_lp = false;
|
|
while (len > 0) {
|
|
shift = arr[idx];
|
|
slb_encoding = arr[idx + 1];
|
|
nptlp = arr[idx + 2];
|
|
idx += 3;
|
|
len -= 3;
|
|
while (len > 0 && nptlp) {
|
|
lp_size = arr[idx];
|
|
lp_encoding = arr[idx+1];
|
|
if (slb_encoding == SLBV_L && lp_encoding == 0)
|
|
has_lp = true;
|
|
|
|
if (slb_encoding == SLB_PGSZ_4K_4K &&
|
|
lp_encoding == LP_4K_16M)
|
|
moea64_has_lp_4k_16m = true;
|
|
|
|
idx += 2;
|
|
len -= 2;
|
|
nptlp--;
|
|
}
|
|
if (has_lp && moea64_has_lp_4k_16m)
|
|
break;
|
|
}
|
|
|
|
if (!has_lp)
|
|
panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
|
|
"not supported by this system.");
|
|
|
|
moea64_large_page_shift = shift;
|
|
moea64_large_page_size = 1ULL << lp_size;
|
|
}
|
|
|
|
out:
|
|
return (0);
|
|
}
|
|
|
|
void
|
|
powernv_mem_regions(platform_t plat, struct mem_region *phys, int *physsz,
|
|
struct mem_region *avail, int *availsz)
|
|
{
|
|
|
|
ofw_mem_regions(phys, physsz, avail, availsz);
|
|
}
|
|
|
|
static void
|
|
powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys, int *physsz)
|
|
{
|
|
|
|
ofw_numa_mem_regions(phys, physsz);
|
|
}
|
|
|
|
static u_long
|
|
powernv_timebase_freq(platform_t plat, struct cpuref *cpuref)
|
|
{
|
|
char buf[8];
|
|
phandle_t cpu, dev, root;
|
|
int res;
|
|
int32_t ticks = -1;
|
|
|
|
root = OF_peer(0);
|
|
dev = OF_child(root);
|
|
while (dev != 0) {
|
|
res = OF_getprop(dev, "name", buf, sizeof(buf));
|
|
if (res > 0 && strcmp(buf, "cpus") == 0)
|
|
break;
|
|
dev = OF_peer(dev);
|
|
}
|
|
|
|
for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
|
|
res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
|
|
if (res > 0 && strcmp(buf, "cpu") == 0)
|
|
break;
|
|
}
|
|
if (cpu == 0)
|
|
return (512000000);
|
|
|
|
OF_getencprop(cpu, "timebase-frequency", &ticks, sizeof(ticks));
|
|
|
|
if (ticks <= 0)
|
|
panic("Unable to determine timebase frequency!");
|
|
|
|
return (ticks);
|
|
|
|
}
|
|
|
|
static int
|
|
powernv_cpuref_init(void)
|
|
{
|
|
phandle_t cpu, dev;
|
|
char buf[32];
|
|
int a, res, tmp_cpuref_cnt;
|
|
static struct cpuref tmp_cpuref[MAXCPU];
|
|
cell_t interrupt_servers[32];
|
|
uint64_t bsp;
|
|
|
|
if (platform_cpuref_valid)
|
|
return (0);
|
|
|
|
dev = OF_peer(0);
|
|
dev = OF_child(dev);
|
|
while (dev != 0) {
|
|
res = OF_getprop(dev, "name", buf, sizeof(buf));
|
|
if (res > 0 && strcmp(buf, "cpus") == 0)
|
|
break;
|
|
dev = OF_peer(dev);
|
|
}
|
|
|
|
bsp = 0;
|
|
tmp_cpuref_cnt = 0;
|
|
for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
|
|
res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
|
|
if (res > 0 && strcmp(buf, "cpu") == 0) {
|
|
if (!ofw_bus_node_status_okay(cpu))
|
|
continue;
|
|
res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
|
|
if (res > 0) {
|
|
OF_getencprop(cpu, "ibm,ppc-interrupt-server#s",
|
|
interrupt_servers, res);
|
|
|
|
for (a = 0; a < res/sizeof(cell_t); a++) {
|
|
tmp_cpuref[tmp_cpuref_cnt].cr_hwref = interrupt_servers[a];
|
|
tmp_cpuref[tmp_cpuref_cnt].cr_cpuid = tmp_cpuref_cnt;
|
|
tmp_cpuref[tmp_cpuref_cnt].cr_domain =
|
|
powernv_node_numa_domain(NULL, cpu);
|
|
if (interrupt_servers[a] == (uint32_t)powernv_boot_pir)
|
|
bsp = tmp_cpuref_cnt;
|
|
|
|
tmp_cpuref_cnt++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Map IDs, so BSP has CPUID 0 regardless of hwref */
|
|
for (a = bsp; a < tmp_cpuref_cnt; a++) {
|
|
platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref;
|
|
platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt;
|
|
platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain;
|
|
platform_cpuref_cnt++;
|
|
}
|
|
for (a = 0; a < bsp; a++) {
|
|
platform_cpuref[platform_cpuref_cnt].cr_hwref = tmp_cpuref[a].cr_hwref;
|
|
platform_cpuref[platform_cpuref_cnt].cr_cpuid = platform_cpuref_cnt;
|
|
platform_cpuref[platform_cpuref_cnt].cr_domain = tmp_cpuref[a].cr_domain;
|
|
platform_cpuref_cnt++;
|
|
}
|
|
|
|
platform_cpuref_valid = 1;
|
|
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
powernv_smp_first_cpu(platform_t plat, struct cpuref *cpuref)
|
|
{
|
|
if (platform_cpuref_valid == 0)
|
|
return (EINVAL);
|
|
|
|
cpuref->cr_cpuid = 0;
|
|
cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
|
|
cpuref->cr_domain = platform_cpuref[0].cr_domain;
|
|
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
powernv_smp_next_cpu(platform_t plat, struct cpuref *cpuref)
|
|
{
|
|
int id;
|
|
|
|
if (platform_cpuref_valid == 0)
|
|
return (EINVAL);
|
|
|
|
id = cpuref->cr_cpuid + 1;
|
|
if (id >= platform_cpuref_cnt)
|
|
return (ENOENT);
|
|
|
|
cpuref->cr_cpuid = platform_cpuref[id].cr_cpuid;
|
|
cpuref->cr_hwref = platform_cpuref[id].cr_hwref;
|
|
cpuref->cr_domain = platform_cpuref[id].cr_domain;
|
|
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
powernv_smp_get_bsp(platform_t plat, struct cpuref *cpuref)
|
|
{
|
|
|
|
cpuref->cr_cpuid = platform_cpuref[0].cr_cpuid;
|
|
cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
|
|
cpuref->cr_domain = platform_cpuref[0].cr_domain;
|
|
return (0);
|
|
}
|
|
|
|
#ifdef SMP
|
|
static int
|
|
powernv_smp_start_cpu(platform_t plat, struct pcpu *pc)
|
|
{
|
|
int result;
|
|
|
|
ap_pcpu = pc;
|
|
powerpc_sync();
|
|
|
|
result = opal_call(OPAL_START_CPU, pc->pc_hwref, EXC_RST);
|
|
if (result != OPAL_SUCCESS) {
|
|
printf("OPAL error (%d): unable to start AP %d\n",
|
|
result, (int)pc->pc_hwref);
|
|
return (ENXIO);
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
powernv_smp_probe_threads(platform_t plat)
|
|
{
|
|
char buf[8];
|
|
phandle_t cpu, dev, root;
|
|
int res, nthreads;
|
|
|
|
root = OF_peer(0);
|
|
|
|
dev = OF_child(root);
|
|
while (dev != 0) {
|
|
res = OF_getprop(dev, "name", buf, sizeof(buf));
|
|
if (res > 0 && strcmp(buf, "cpus") == 0)
|
|
break;
|
|
dev = OF_peer(dev);
|
|
}
|
|
|
|
nthreads = 1;
|
|
for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
|
|
res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
|
|
if (res <= 0 || strcmp(buf, "cpu") != 0)
|
|
continue;
|
|
|
|
res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
|
|
|
|
if (res >= 0)
|
|
nthreads = res / sizeof(cell_t);
|
|
else
|
|
nthreads = 1;
|
|
break;
|
|
}
|
|
|
|
smp_threads_per_core = nthreads;
|
|
if (mp_ncpus % nthreads == 0)
|
|
mp_ncores = mp_ncpus / nthreads;
|
|
}
|
|
|
|
static struct cpu_group *
|
|
powernv_smp_topo(platform_t plat)
|
|
{
|
|
if (mp_ncpus % smp_threads_per_core != 0) {
|
|
printf("WARNING: Irregular SMP topology. Performance may be "
|
|
"suboptimal (%d threads, %d on first core)\n",
|
|
mp_ncpus, smp_threads_per_core);
|
|
return (smp_topo_none());
|
|
}
|
|
|
|
/* Don't do anything fancier for non-threaded SMP */
|
|
if (smp_threads_per_core == 1)
|
|
return (smp_topo_none());
|
|
|
|
return (smp_topo_1level(CG_SHARE_L1, smp_threads_per_core,
|
|
CG_FLAG_SMT));
|
|
}
|
|
|
|
#endif
|
|
|
|
static void
|
|
powernv_reset(platform_t platform)
|
|
{
|
|
|
|
opal_call(OPAL_CEC_REBOOT);
|
|
}
|
|
|
|
static void
|
|
powernv_smp_ap_init(platform_t platform)
|
|
{
|
|
|
|
if (powernv_smp_ap_extra_init != NULL)
|
|
powernv_smp_ap_extra_init();
|
|
}
|
|
|
|
static void
|
|
powernv_cpu_idle(sbintime_t sbt)
|
|
{
|
|
}
|
|
|
|
static int
|
|
powernv_node_numa_domain(platform_t platform, phandle_t node)
|
|
{
|
|
/* XXX: Is locking necessary in here? */
|
|
static int numa_domains[MAXMEMDOM];
|
|
static int numa_max_domain;
|
|
cell_t associativity[5];
|
|
int i, res;
|
|
|
|
#ifndef NUMA
|
|
return (0);
|
|
#endif
|
|
if (vm_ndomains == 1)
|
|
return (0);
|
|
|
|
res = OF_getencprop(node, "ibm,associativity",
|
|
associativity, sizeof(associativity));
|
|
|
|
/*
|
|
* If this node doesn't have associativity, or if there are not
|
|
* enough elements in it, check its parent.
|
|
*/
|
|
if (res < (int)(sizeof(cell_t) * (platform_associativity + 1))) {
|
|
node = OF_parent(node);
|
|
/* If already at the root, use default domain. */
|
|
if (node == 0)
|
|
return (0);
|
|
return (powernv_node_numa_domain(platform, node));
|
|
}
|
|
|
|
for (i = 0; i < numa_max_domain; i++) {
|
|
if (numa_domains[i] == associativity[platform_associativity])
|
|
return (i);
|
|
}
|
|
if (i < MAXMEMDOM)
|
|
numa_domains[numa_max_domain++] =
|
|
associativity[platform_associativity];
|
|
else
|
|
i = 0;
|
|
|
|
return (i);
|
|
}
|
|
|
|
/* Set up the Nest MMU on POWER9 relatively early, but after pmap is setup. */
|
|
static void
|
|
powernv_setup_nmmu(void *unused)
|
|
{
|
|
if (opal_check() != 0)
|
|
return;
|
|
opal_call(OPAL_NMMU_SET_PTCR, -1, mfspr(SPR_PTCR));
|
|
}
|
|
|
|
SYSINIT(powernv_setup_nmmu, SI_SUB_CPU, SI_ORDER_ANY, powernv_setup_nmmu, NULL);
|