dune: remove (unused)

Amy Ousterhout 2019-02-21 16:48:46 -05:00 committed by Josh Fried
parent decab1ef6a
commit 9f7f74699d
18 changed files with 4 additions and 3104 deletions


@@ -11,8 +11,6 @@ bindings - language bindings (C++ and rust) for the runtime.
dpdk - [DPDK](https://www.dpdk.org/) library for accessing NIC queues
from userspace.
dune - a better implementation of libdune based on the base library.
iokernel - dedicated core that steers packets and reallocates cores
across applications.


@@ -32,11 +32,6 @@ print-% : ; @echo $* = $($*)
base_src = $(wildcard base/*.c)
base_obj = $(base_src:.c=.o)
# libdune.a - the dune library
dune_src = $(wildcard dune/*.c)
dune_asm = $(wildcard dune/*.S)
dune_obj = $(dune_src:.c=.o) $(dune_asm:.S=.o)
#libnet.a - a packet/networking utility library
net_src = $(wildcard net/*.c) $(wildcard net/ixgbe/*.c)
net_obj = $(net_src:.c=.o)
@@ -76,14 +71,11 @@ DPDK_LIBS += -lrte_pmd_mlx4 -libverbs -lmlx4
endif
# must be first
all: libbase.a libdune.a libnet.a libruntime.a iokerneld iokerneld-noht $(test_targets)
all: libbase.a libnet.a libruntime.a iokerneld iokerneld-noht $(test_targets)
libbase.a: $(base_obj)
$(AR) rcs $@ $^
libdune.a: $(dune_obj)
$(AR) rcs $@ $^
libnet.a: $(net_obj)
$(AR) rcs $@ $^
@@ -102,8 +94,8 @@ $(test_targets): $(test_obj) libbase.a libruntime.a libnet.a base/base.ld
$(LD) $(LDFLAGS) -o $@ $@.o libruntime.a libnet.a libbase.a -lpthread
# general build rules for all targets
src = $(base_src) $(dune_src) $(net_src) $(runtime_src) $(iokernel_src) $(test_src)
asm = $(dune_asm) $(runtime_asm)
src = $(base_src) $(net_src) $(runtime_src) $(iokernel_src) $(test_src)
asm = $(runtime_asm)
obj = $(src:.c=.o) $(asm:.S=.o) $(iokernel_src:.c=-noht.o)
dep = $(obj:.o=.d)
@@ -132,5 +124,5 @@ sparse: $(src)
.PHONY: clean
clean:
rm -f $(obj) $(dep) libbase.a libdune.a libnet.a libruntime.a \
rm -f $(obj) $(dep) libbase.a libnet.a libruntime.a \
iokerneld iokerneld-noht $(test_targets)


@@ -6,7 +6,6 @@ use std::path::PathBuf;
fn main() {
// Tell cargo to tell rustc to link the library.
println!("cargo:rustc-link-lib=static=base");
println!("cargo:rustc-link-lib=static=dune");
println!("cargo:rustc-link-lib=static=net");
println!("cargo:rustc-link-lib=static=runtime");
println!("cargo:rustc-flags=-L ../..");


@@ -1,51 +0,0 @@
/**
* dune.h - public header for Dune support
*/
#pragma once
#include <base/types.h>
#include <asm/ioctl.h>
/*
* IOCTL interface
*/
/* FIXME: this must be reserved in miscdevice.h */
#define DUNE_MINOR 233
#define DUNE_ENTER _IOR(DUNE_MINOR, 0x01, struct dune_config)
#define DUNE_GET_SYSCALL _IO(DUNE_MINOR, 0x02)
#define DUNE_GET_LAYOUT _IOW(DUNE_MINOR, 0x03, struct dune_layout)
#define DUNE_SIGNAL_INTR_BASE 200
struct dune_config {
uintptr_t rip;
uintptr_t rsp;
uintptr_t cr3;
long ret;
} __attribute__((packed));
extern int __dune_enter(int fd, struct dune_config *cfg);
extern int __dune_ret(void);
struct dune_layout {
uintptr_t phys_limit;
uintptr_t base_map;
uintptr_t base_stack;
} __attribute__((packed));
#define GPA_STACK_SIZE ((unsigned long)1 << 28) /* 256 megabytes */
#define GPA_MAP_SIZE (((unsigned long)1 << 32) - GPA_STACK_SIZE) /* 3.75 gigabytes */
static inline physaddr_t gpa_stack_base(const struct dune_layout *layout)
{
return layout->phys_limit - GPA_STACK_SIZE;
}
static inline physaddr_t gpa_map_base(const struct dune_layout *layout)
{
return layout->phys_limit - GPA_STACK_SIZE - GPA_MAP_SIZE;
}
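For context, the helpers above pin the layout down: a 256 MB stack region sits at the very top of guest-physical memory, with the 3.75 GB map region directly beneath it. A minimal sketch (hypothetical helper, not part of the original header) of translating a host virtual address in the stack region to its guest-physical address, mirroring the arithmetic entry.c performs:

/* Hypothetical example: a host VA in [base_stack, base_stack + GPA_STACK_SIZE)
 * maps to guest-physical memory at the same offset within the stack region. */
static inline physaddr_t gpa_of_stack_va(const struct dune_layout *layout,
                                         uintptr_t va)
{
        return gpa_stack_base(layout) + (va - layout->base_stack);
}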


@@ -1,465 +0,0 @@
/*
* entry.c - routines for managing Dune, user-kernel mode transitions,
* and CPU initialization
*/
#include <fcntl.h>
#include <sys/ioctl.h>
#include <asm/prctl.h>
#include <base/stddef.h>
#include <base/page.h>
#include <base/log.h>
#include <base/thread.h>
#include <base/lock.h>
#include <dune/vm.h>
#include <dune/procmap.h>
#include <dune/entry.h>
#include <dune/mmu.h>
#include <dune/trap.h>
#include <dune/fpu.h>
#include <dune/msr.h>
#include "dune.h"
/*
* kern_pgtbl contains all the mappings necessary to run the kernel.
* After initialization, it is immutable, and therefore does not
* require locking.
*/
ptent_t *kern_pgtbl;
/* the per-cpu kernel context stack pointer */
__thread uintptr_t kern_sp;
uintptr_t entry_vdso_base;
static int dune_fd;
static bool linux_has_vvar;
static struct idtd idt_template[IDT_ENTRIES];
static uint64_t gdt_template[GDT_ENTRIES] = {
0,
0,
SEG64(SEG_X | SEG_R, 0),
SEG64(SEG_W, 0),
0,
SEG64(SEG_W, 3),
SEG64(SEG_X | SEG_R, 3),
0,
0,
};
static inline void set_idt_addr(struct idtd *id, physaddr_t addr)
{
id->low = addr & 0xFFFF;
id->middle = (addr >> 16) & 0xFFFF;
id->high = (addr >> 32) & 0xFFFFFFFF;
}
static void entry_init_idt(void)
{
int i;
for (i = 0; i < IDT_ENTRIES; i++) {
struct idtd *id = &idt_template[i];
uintptr_t isr = (uintptr_t)&trap_entry_tbl[TRAP_ENTRY_SIZE * i];
memset(id, 0, sizeof(*id));
id->selector = GD_KT;
id->type = IDTD_P | IDTD_TRAP_GATE;
switch (i) {
case T_BRKPT:
id->type |= IDTD_CPL3;
/* fallthrough */
case T_DBLFLT:
case T_NMI:
case T_MCHK:
id->ist = 1;
break;
}
set_idt_addr(id, isr);
}
}
static int entry_init_layout(struct dune_layout *layout)
{
int ret = ioctl(dune_fd, DUNE_GET_LAYOUT, layout);
if (ret)
return -EIO;
log_info("entry: dune mem layout\n");
log_info("\tphys_limit\t0x%016lx\n", layout->phys_limit);
log_info("\tmap_base\t0x%016lx\n", layout->base_map);
log_info("\tstack_back\t0x%016lx\n", layout->base_stack);
return 0;
}
static ptent_t procmap_entry_to_flags(const struct procmap_entry *e)
{
ptent_t flags = PTE_P | PTE_G;
if (e->w)
flags |= PTE_W;
if (!e->x)
flags |= PTE_NX;
return flags;
}
static int entry_procmap_cb(const struct procmap_entry *e, unsigned long data)
{
struct dune_layout *layout = (struct dune_layout *) data;
if (e->type == PROCMAP_TYPE_VDSO || e->type == PROCMAP_TYPE_VVAR) {
off_t off = e->begin - layout->base_stack;
size_t len = e->end - e->begin;
ptent_t flags = procmap_entry_to_flags(e);
if (e->type == PROCMAP_TYPE_VVAR)
linux_has_vvar = true;
else
entry_vdso_base = e->begin;
if (off + len > GPA_STACK_SIZE)
panic("entry: dune stack region does not contain vsdo\n");
if (flags & PTE_W) {
log_err("entry: can't support writable vdso regions\n");
return -EINVAL;
}
return vm_map_phys(kern_pgtbl, gpa_stack_base(layout) + off,
(void *)e->begin, len, PGSIZE_4KB, flags | PTE_U);
}
if (e->type == PROCMAP_TYPE_VSYSCALL) {
return vm_map_copy(kern_pgtbl, vsyscall_page, (void *)e->begin,
PGSIZE_4KB, PGSIZE_4KB, PTE_P | PTE_G | PTE_U);
}
if (e->type == PROCMAP_TYPE_STACK) {
off_t off = e->begin - layout->base_stack;
return vm_map_phys(kern_pgtbl, gpa_stack_base(layout) + off,
(void *)e->begin, e->end - e->begin, PGSIZE_4KB,
PTE_P | PTE_W | PTE_G | PTE_NX);
}
/* ignore entries inside the dune map region */
if (e->end >= gpa_map_base(layout)) {
if (e->begin < layout->base_map ||
e->end > layout->base_map + GPA_MAP_SIZE) {
log_err("entry: procmap entry is out of range - "
"0x%016lx-0x%016lx %c%c%c%c %08lx %s\n",
e->begin, e->end,
e->r ? 'R' : '-',
e->w ? 'W' : '-',
e->x ? 'X' : '-',
e->p ? 'P' : 'S',
e->offset, e->path);
return -EINVAL;
}
return 0;
}
/* skip regions mapped by the page allocator */
if (e->begin >= PAGE_BASE_ADDR && e->end <= PAGE_END_ADDR)
return 0;
return vm_map_phys(kern_pgtbl, (physaddr_t)e->begin, (void *)e->begin,
e->end - e->begin, PGSIZE_4KB,
procmap_entry_to_flags(e));
}
static int entry_setup_oldstyle_vvar(void)
{
log_info("entry: didn't find [vvar] section, creating one manually\n");
#define VVAR_ADDR 0xffffffffff5ff000UL
return vm_map_copy(kern_pgtbl, (void *)VVAR_ADDR, (void *)VVAR_ADDR,
PGSIZE_4KB, PGSIZE_4KB, PTE_P | PTE_G | PTE_U);
}
static int entry_setup_syscall(void)
{
int ret;
uintptr_t lstar, aligned_lstar;
struct page *pg;
size_t total_len = (size_t)syscall_enter_end -
(size_t)syscall_enter;
size_t part_len;
void *buf;
BUG_ON(total_len > PGSIZE_4KB);
lstar = ioctl(dune_fd, DUNE_GET_SYSCALL);
if (lstar == -1)
return -EIO;
aligned_lstar = PGADDR_4KB(lstar);
pg = page_alloc(PGSIZE_4KB);
if (!pg)
return -ENOMEM;
ret = vm_insert_page(kern_pgtbl, (void *)aligned_lstar,
pg, PTE_P | PTE_G);
if (ret)
return ret;
part_len = min(total_len, PGSIZE_4KB - PGOFF_4KB(lstar));
buf = (char *)page_to_addr(pg) + PGOFF_4KB(lstar);
memcpy(buf, syscall_enter, part_len);
total_len -= part_len;
/* did the handler spill over to a second page boundary? */
if (total_len) {
pg = page_alloc(PGSIZE_4KB);
if (!pg)
return -ENOMEM;
aligned_lstar += PGSIZE_4KB;
ret = vm_insert_page(kern_pgtbl, (void *)aligned_lstar,
pg, PTE_P | PTE_G);
if (ret)
return ret;
buf = page_to_addr(pg);
memcpy(buf, &syscall_enter[part_len], total_len);
}
return 0;
}
static int entry_init_pgtbl(const struct dune_layout *layout)
{
int ret;
kern_pgtbl = vm_create_pt();
if (!kern_pgtbl)
return -ENOMEM;
/* step 1: bulk map the dune map region */
ret = vm_map_phys(kern_pgtbl, gpa_map_base(layout),
(void *)layout->base_map, GPA_MAP_SIZE,
PGSIZE_2MB, PTE_P | PTE_W | PTE_G);
if (ret)
goto fail;
/* step 2: identity map the base library page-map region */
ret = vm_map_phys(kern_pgtbl, (physaddr_t)PAGE_BASE_ADDR,
(void *)PAGE_BASE_ADDR, PAGE_END_ADDR - PAGE_BASE_ADDR,
PGSIZE_2MB, PTE_P | PTE_W | PTE_G | PTE_NX);
if (ret)
goto fail;
/* step 3: precision map phdr, heap, stack, vdso, and vvar sections */
ret = procmap_iterate(&entry_procmap_cb, (unsigned long)layout);
if (ret)
goto fail;
if(!linux_has_vvar) {
ret = entry_setup_oldstyle_vvar();
if (ret)
goto fail;
}
/* step 4: map the system call handler page */
ret = entry_setup_syscall();
if (ret)
goto fail;
return 0;
fail:
vm_destroy_pt(kern_pgtbl);
return ret;
}
/**
* entry_init - initialization for entry
*/
int entry_init(void)
{
int ret;
struct dune_layout layout;
dune_fd = open("/dev/dune", O_RDWR);
if (dune_fd < 0) {
log_err("entry: failed to open dune device\n");
return -EIO;
}
entry_init_idt();
ret = entry_init_layout(&layout);
if (ret) {
log_err("entry: unable to get dune memory layout\n");
return ret;
}
ret = entry_init_pgtbl(&layout);
if (ret) {
log_err("entry: failed to create kernel page table\n");
return ret;
}
return 0;
}
static __thread uint64_t gdt[GDT_ENTRIES] __aligned(CACHE_LINE_SIZE);
static __thread struct tssd tss __aligned(CACHE_LINE_SIZE);
static __thread struct idtd idt __aligned(CACHE_LINE_SIZE);
static __thread struct entry_percpu cpu_entry;
/* FIXME: protect the stacks with guard pages */
static int entry_setup_stacks(struct tssd *tss)
{
int i;
struct page *safe_stack_pg, *intr_stack_pg;
char *safe_stack, *intr_stack;
safe_stack_pg = page_alloc(PGSIZE_4KB);
if (!safe_stack_pg)
return -ENOMEM;
safe_stack = page_to_addr(safe_stack_pg);
safe_stack += PGSIZE_4KB;
tss->iomb = offsetof(struct tssd, iopb);
for (i = 0; i < 8; i++)
tss->ist[i] = (uintptr_t) safe_stack;
intr_stack_pg = page_alloc(PGSIZE_4KB);
if (!intr_stack_pg) {
page_put_addr(safe_stack_pg);
return -ENOMEM;
}
intr_stack = page_to_addr(intr_stack_pg);
intr_stack += PGSIZE_4KB;
tss->rsp[0] = (uintptr_t)intr_stack;
kern_sp = (uintptr_t)intr_stack;
return 0;
}
static int entry_start_dune(void)
{
struct dune_config conf;
int ret;
conf.rip = (uintptr_t)&__dune_ret;
conf.rsp = 0;
conf.cr3 = (uintptr_t)kern_pgtbl;
ret = __dune_enter(dune_fd, &conf);
if (ret) {
log_err("entry: failed to enter dune mode\n");
return ret;
}
return 0;
}
static int entry_boot_cpu(struct entry_percpu *ent,
uintptr_t gdt_addr, uintptr_t idt_addr)
{
struct tptr _idtr, _gdtr;
_gdtr.base = gdt_addr;
_gdtr.limit = sizeof(gdt_template) - 1;
_idtr.base = idt_addr;
_idtr.limit = sizeof(idt_template) - 1;
asm volatile(
/* STEP 1: load the new GDT */
"lgdt %0\n"
/* STEP 2: initialize data segments */
"mov $" __cstr(GD_KD) ", %%ax\n"
"mov %%ax, %%ds\n"
"mov %%ax, %%es\n"
"mov %%ax, %%ss\n"
/* STEP 3: long jump into the new code segment */
"mov $" __cstr(GD_KT) ", %%rax\n"
"pushq %%rax\n"
"pushq $1f\n"
"lretq\n"
"1: nop\n"
/* STEP 4: load the task register (for safe stack switching) */
"mov $" __cstr(GD_TSS) ", %%ax\n"
"ltr %%ax\n"
/* STEP 5: load the new IDT */
"lidt %1\n"
: : "m" (_gdtr), "m" (_idtr) : "rax");
/* STEP 6: FS and GS require special initialization on 64-bit */
setfsbase(ent->kfs_base);
setgsbase((uintptr_t)ent);
setgskernbase((uintptr_t)ent);
irq_enable();
return 0;
}
extern int arch_prctl(int code, unsigned long *addr);
/*
* entry_init_one - per-cpu initialization for entry
*/
int entry_init_one(void)
{
int ret;
struct entry_percpu *ent = &cpu_entry;
unsigned long fs_base;
/* step 1: set up the TSS */
ret = entry_setup_stacks(&tss);
if (ret)
return ret;
/* step 2: set up the GDT */
memcpy(gdt, gdt_template, sizeof(gdt_template));
gdt[GD_TSS >> 3] = (SEG_TSSA | SEG_P | SEG_A |
SEG_BASELO(&tss) |
SEG_LIM(sizeof(struct tssd) - 1));
gdt[GD_TSS2 >> 3] = SEG_BASEHI(&tss);
/* step 3: set up the IDT */
memcpy(&idt, idt_template, sizeof(idt));
/* step 4: setup the entry per-cpu structure */
if (arch_prctl(ARCH_GET_FS, &fs_base) == -1) {
log_err("entry: failed to get current FS.base\n");
return -EIO;
}
ent->kfs_base = fs_base;
ent->ugs_base = 0;
/* step 5: enter dune mode */
ret = entry_start_dune();
if (ret)
return ret;
/* step 6: set up architectural state */
ret = entry_boot_cpu(ent, (uintptr_t)gdt, (uintptr_t)&idt);
if (ret)
return ret;
return 0;
}
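Taken together, the functions above imply a two-phase boot: entry_init() runs once per process to open /dev/dune and build the shared kernel page table, and entry_init_one() runs on each core to set up its per-CPU GDT/IDT/TSS and enter Dune mode. A hypothetical caller (assuming both declarations are visible) might look like:

/* Hypothetical startup path: process-wide init first, then per-core init. */
static int dune_start(bool first_core)
{
        int ret;

        if (first_core) {
                ret = entry_init();     /* opens /dev/dune, builds kern_pgtbl */
                if (ret)
                        return ret;
        }
        return entry_init_one();        /* per-CPU GDT/IDT/TSS + Dune entry */
}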


@@ -1,136 +0,0 @@
/*
* procmap.c - Parse linux process map information.
*/
/*
* Format:
* start addr-end addr perms offset dev(xx:yy) inode path
*
* Permissions:
* rwxp
* ||||
* Readable ---------+|||
* (r or -) |||
* Writable ----------+||
* (w or -) ||
* Executable ---------+|
* (x or -) |
* Private/Shared ------+
* (p or s)
*
* Special Paths:
* - <filename>
* - anonymous
* - [heap]
* - [stack]
* - [vsyscall]
* - [vdso]
*
* Example /proc/self/maps:
* 00400000-0040b000 r-xp 00000000 fe:00 917797 /bin/cat
* 0060a000-0060b000 r--p 0000a000 fe:00 917797 /bin/cat
* 0060b000-0060c000 rw-p 0000b000 fe:00 917797 /bin/cat
* 022cf000-022f0000 rw-p 00000000 00:00 0 [heap]
* 7fe598687000-7fe59881e000 r-xp 00000000 fe:00 917523 /lib/libc-2.15.so
* 7fe59881e000-7fe598a1e000 ---p 00197000 fe:00 917523 /lib/libc-2.15.so
* 7fe598a1e000-7fe598a22000 r--p 00197000 fe:00 917523 /lib/libc-2.15.so
* 7fe598a22000-7fe598a24000 rw-p 0019b000 fe:00 917523 /lib/libc-2.15.so
* 7fe598a24000-7fe598a28000 rw-p 00000000 00:00 0
* 7fe598a28000-7fe598a49000 r-xp 00000000 fe:00 917531 /lib/ld-2.15.so
* 7fe598c37000-7fe598c3a000 rw-p 00000000 00:00 0
* 7fe598c47000-7fe598c48000 rw-p 00000000 00:00 0
* 7fe598c48000-7fe598c49000 r--p 00020000 fe:00 917531 /lib/ld-2.15.so
* 7fe598c49000-7fe598c4a000 rw-p 00021000 fe:00 917531 /lib/ld-2.15.so
* 7fe598c4a000-7fe598c4b000 rw-p 00000000 00:00 0
* 7fff601ca000-7fff601eb000 rw-p 00000000 00:00 0 [stack]
* 7fff601ff000-7fff60200000 r-xp 00000000 00:00 0 [vdso]
* ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
*/
#include <stdio.h>
#include <string.h>
#include <base/stddef.h>
#include <base/log.h>
#include <dune/procmap.h>
static int get_type(const char *path)
{
if (path[0] != '[' && path[0] != '\0')
return PROCMAP_TYPE_FILE;
if (path[0] == '\0')
return PROCMAP_TYPE_ANONYMOUS;
if (strcmp(path, "[heap]") == 0)
return PROCMAP_TYPE_HEAP;
if (strncmp(path, "[stack", 6) == 0)
return PROCMAP_TYPE_STACK;
if (strcmp(path, "[vsyscall]") == 0)
return PROCMAP_TYPE_VSYSCALL;
if (strcmp(path, "[vdso]") == 0)
return PROCMAP_TYPE_VDSO;
if (strcmp(path, "[vvar]") == 0)
return PROCMAP_TYPE_VVAR;
return PROCMAP_TYPE_UNKNOWN;
}
int procmap_iterate(procmap_cb_t cb, unsigned long data)
{
struct procmap_entry e;
FILE *map;
unsigned int dev1, dev2, inode;
char read, write, execute, private;
char line[512];
char path[256];
int ret = 0;
map = fopen("/proc/self/maps", "r");
if (map == NULL) {
log_err("procmap: could not open /proc/self/maps!\n");
return -EIO;
}
setvbuf(map, NULL, _IOFBF, 8192);
while (!feof(map)) {
path[0] = '\0';
if (fgets(line, 512, map) == NULL)
break;
sscanf((char *)&line, "%lx-%lx %c%c%c%c %lx %x:%x %d %s",
&e.begin, &e.end,
&read, &write, &execute, &private, &e.offset,
&dev1, &dev2, &inode, path);
e.r = (read == 'r');
e.w = (write == 'w');
e.x = (execute == 'x');
e.p = (private == 'p');
e.path = path;
e.type = get_type(path);
ret = cb(&e, data);
if (ret)
break;
}
fclose(map);
return ret;
}
static int
procmap_dump_helper(const struct procmap_entry *e, unsigned long data)
{
log_info("0x%016lx-0x%016lx %c%c%c%c %08lx %s\n",
e->begin, e->end,
e->r ? 'R' : '-',
e->w ? 'W' : '-',
e->x ? 'X' : '-',
e->p ? 'P' : 'S',
e->offset, e->path);
return 0;
}
void procmap_dump(void)
{
log_info("--- Process Map Dump ---\n");
procmap_iterate(&procmap_dump_helper, 0);
}
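procmap_dump() above doubles as a usage example for the iterator: any callback matching procmap_cb_t can be driven over /proc/self/maps, with the unsigned long argument threading caller state through. A hypothetical callback that counts executable mappings:

/* Hypothetical example: count executable mappings via procmap_iterate(). */
static int count_exec_cb(const struct procmap_entry *e, unsigned long data)
{
        if (e->x)
                (*(int *)data)++;
        return 0; /* returning nonzero would stop the iteration early */
}

static int count_exec_mappings(void)
{
        int n = 0;
        procmap_iterate(&count_exec_cb, (unsigned long)&n);
        return n;
}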


@@ -1,137 +0,0 @@
/*
* trap.c - x86 exception and interrupt support
*/
#include <base/stddef.h>
#include <base/log.h>
#include <dune/vm.h>
#include <dune/entry.h>
#include <dune/trap.h>
#include <dune/msr.h>
#define STACK_DUMP_DEPTH 16
#define NUM_CODES 20
static const char *trap_codes[NUM_CODES] = {
"divide error",
"debug exception",
"non-maskable interrupt",
"breakpoint",
"overflow",
"bounds check",
"illegal opcode",
"dev not available",
"double fault",
"reserved",
"invalid TSS",
"segment not present",
"stack exception",
"general protection fault",
"page fault",
"reserved",
"floating point error",
"alignment check",
"machine check",
"SIMD error",
};
static int safe_peekq(uint64_t *addr, uint64_t *val)
{
int ret, level;
ptent_t *pte;
ret = vm_lookup_pte(kern_pgtbl, addr, &level, &pte);
if (ret)
return ret;
if (!(*pte & PTE_P))
return -EINVAL;
if (*pte & PTE_PAGE) {
uint64_t *direct_ptr = (uint64_t *)((char *)PTE_ADDR(*pte) +
((off_t)addr & (PGLEVEL_TO_SIZE(level) - 1)));
*val = *direct_ptr;
} else {
*val = *(uint64_t *)addr;
}
return 0;
}
static void dump_stack(uintptr_t rsp)
{
int i;
uint64_t *sp = (uint64_t *)rsp;
log_info("dumping stack contents:\n");
if (rsp & (sizeof(uint64_t) - 1)) {
log_err("misaligned stack\n");
return;
}
for (i = 0; i < STACK_DUMP_DEPTH; i++) {
uint64_t val;
if (!safe_peekq(&sp[i], &val)) {
log_info("*(RSP+%03d) 0x%016lx\n",
(int)(i * sizeof(uint64_t)), val);
} else {
log_info("*(RSP+%03d) <unmapped>\n",
(int)(i * sizeof(uint64_t)));
break;
}
}
}
void dump_trap_frame(struct env_tf *tf)
{
log_info("--- Begin Frame Dump ---\n");
log_info("RIP 0x%016lx\n", tf->rip);
log_info("CS 0x%02x SS 0x%02x\n", tf->cs, tf->ss);
log_info("ERR 0x%08x RFLAGS 0x%08lx\n", tf->err, tf->rflags);
log_info("RAX 0x%016lx RCX 0x%016lx\n", tf->rax, tf->rcx);
log_info("RDX 0x%016lx RBX 0x%016lx\n", tf->rdx, tf->rbx);
log_info("RSP 0x%016lx RBP 0x%016lx\n", tf->rsp, tf->rbp);
log_info("RSI 0x%016lx RDI 0x%016lx\n", tf->rsi, tf->rdi);
log_info("R8 0x%016lx R9 0x%016lx\n", tf->r8, tf->r9);
log_info("R10 0x%016lx R11 0x%016lx\n", tf->r10, tf->r11);
log_info("R12 0x%016lx R13 0x%016lx\n", tf->r12, tf->r13);
log_info("R14 0x%016lx R15 0x%016lx\n", tf->r14, tf->r15);
log_info("FS.base 0x%016lx GS.base 0x%016lx\n",
getfsbase(), getgsbase());
dump_stack(tf->rsp);
log_info("--- End Frame Dump ---\n");
}
static void dump_pgflt(struct env_tf *tf)
{
uint32_t fec = tf->err;
uintptr_t fault_addr;
asm volatile("mov %%cr2, %0" : "=r" (fault_addr));
log_err("trap: %s page fault at ADDR 0x%016lx (%s, %s%s)\n",
(fec & FEC_U) ? "user" : "kernel", fault_addr,
(fec & FEC_P) ? "protection" : "non-present page",
(fec & FEC_RSV) ? "reserved bit error, " : "",
(fec & FEC_I) ? "code" : "data");
if (fault_addr < PGSIZE_4KB)
log_err("trap: likely NULL pointer exception\n");
}
void trap_handler(int num, struct env_tf *tf)
{
bool user = ((tf->cs & 0x3) == 0x3);
if (num == T_PGFLT) {
dump_pgflt(tf);
} else {
log_err("trap: unhandled trap %d (%s) in %s\n", num,
num < NUM_CODES ? trap_codes[num] : "spurious",
user ? "user" : "kernel");
}
dump_trap_frame(tf);
init_shutdown(EXIT_FAILURE);
}


@@ -1,454 +0,0 @@
/*
* trapasm.S - assembly helper routines (e.g. system calls, interrupts, traps)
*/
/*
* Enabling USE_RDWRGSFS can reduce system call overhead but this feature
* is only available on Ivy Bridge and later Intel CPUs.
*
* FIXME: detect this automatically
*/
#define MSR_FS_BASE 0xc0000100
#define MSR_GS_BASE 0xc0000101
#define GD_KT 0x10
#define GD_KD 0x18
#define GD_UD 0x28 | 0x03
#define GD_UT 0x30 | 0x03
/*
* Trap Frame Format
* NOTE: this reflects the layout of struct env_tf in dune/entry.h
*/
/* arguments */
#define RDI (0)
#define RSI (8)
#define RDX (16)
#define RCX (24)
#define R8 (32)
#define R9 (40)
/* other registers */
#define R10 (48)
#define R11 (56)
#define RBX (64)
#define RBP (72)
#define R12 (80)
#define R13 (88)
#define R14 (96)
#define R15 (104)
#define REG_END (112)
/* syscall num / return code */
#define RAX (112)
/* exception frame */
#define ERR (120)
#define RIP (128)
#define CS (136)
#define RFLAGS (144)
#define RSP (152)
#define SS (160)
#define EF_START (128)
#define TF_END (168)
#define TF_ALIGN (176)
/*
* Dune Config Format
* NOTE: this reflects the layout of struct dune_config
*/
#define DUNE_CFG_RIP (0)
#define DUNE_CFG_RSP (8)
#define DUNE_CFG_CR3 (16)
#define DUNE_CFG_RET (24)
/*
* Supervisor Private Area Format
*/
#define TMP (8)
#define KFS_BASE (16)
#define UFS_BASE (24)
#define UGS_BASE (32)
#define FLAGS (40)
#define THREAD_STACK (48)
#define FLAG_IN_USER 0x1
#define FLAG_LOAD_USER 0x2
.text
/*
* macro to save destructible register state
*/
.macro SAVE_REGS save_full=1, include_rax=1
movq %rdi, RDI(%rsp)
movq %rsi, RSI(%rsp)
movq %rdx, RDX(%rsp)
movq %r8, R8(%rsp)
movq %r9, R9(%rsp)
.if \save_full
movq %r10, R10(%rsp)
movq %r11, R11(%rsp)
movq %rcx, RCX(%rsp)
.endif
.if \include_rax
movq %rax, RAX(%rsp)
.endif
.endm
/*
* macro to save the rest of register state
*
* useful for operations that violate AMD64 calling conventions
* by destroying callee-saved state
*/
.macro SAVE_REST
movq %rbx, RBX(%rsp)
movq %rbp, RBP(%rsp)
movq %r12, R12(%rsp)
movq %r13, R13(%rsp)
movq %r14, R14(%rsp)
movq %r15, R15(%rsp)
.endm
/*
* macro to restore destructible register state
*/
.macro RESTORE_REGS rstor_full=1, include_rax=1
.if \include_rax
movq RAX(%rsp), %rax
.endif
.if \rstor_full
movq RCX(%rsp), %rcx
movq R11(%rsp), %r11
movq R10(%rsp), %r10
.endif
movq R9(%rsp), %r9
movq R8(%rsp), %r8
movq RDX(%rsp), %rdx
movq RSI(%rsp), %rsi
movq RDI(%rsp), %rdi
.endm
/*
* macro to restore the rest of register state
*
* useful for operations that violate AMD64 calling conventions
* by destroying callee-saved state
*/
.macro RESTORE_REST
movq R15(%rsp), %r15
movq R14(%rsp), %r14
movq R13(%rsp), %r13
movq R12(%rsp), %r12
movq RBP(%rsp), %rbp
movq RBX(%rsp), %rbx
.endm
/*
* macro to setup FS and GS segments for kernel mode
*/
.macro SETUP_KERNEL_SEGS
movq $0, %gs:FLAGS
.endm
/*
* macro to setup FS and GS segments for user mode
*
* NOTE: clobbers %rax, %rdx, and %rcx
* WARNING: unsafe if interrupts are not disabled
*/
.macro SETUP_USER_SEGS check=1
orq $FLAG_IN_USER, %gs:FLAGS
.if \check
testq $FLAG_LOAD_USER, %gs:FLAGS
jz 1f
.endif
movq %gs:UFS_BASE, %rax
#ifdef USE_RDWRGSFS
wrfsbase %rax
#else
movq %rax, %rdx
shrq $32, %rdx
movl $MSR_FS_BASE, %ecx
wrmsr
#endif /* USE_RDWRGSFS */
movq %gs:UGS_BASE, %rax
swapgs
#ifdef USE_RDWRGSFS
wrgsbase %rax
#else
movq %rax, %rdx
shrq $32, %rdx
movl $MSR_GS_BASE, %ecx
wrmsr
#endif /* USE_RDWRGSFS */
.if \check
jmp 2f
1: swapgs
2:
.endif
.endm
.globl __dune_enter
__dune_enter:
subq $REG_END, %rsp
SAVE_REGS 1, 0
SAVE_REST
movq %rsp, DUNE_CFG_RSP(%rsi)
movq %rsi, %rdx
movq $0x8020e901, %rsi /* XXX DUNE_ENTER */
movq $16, %rax /* __NR_ioctl */
syscall
cmpq $0, %rax
jnz __dune_ret
mov %rdx, %rbx
call init_shutdown_late
movq DUNE_CFG_RET(%rbx), %rdi
movq $231, %rax /* __NR_exit_group */
syscall
.globl __dune_ret
__dune_ret:
RESTORE_REST
RESTORE_REGS 1, 0
addq $REG_END, %rsp
retq
/*
* System Call ABI
* ---------------
*
* User Parameters:
* %rsp - stack pointer
* %rcx - instruction pointer
* %r11 - eflags
* %rax - system call number
*
* Arguments:
* %rdi - arg0, %rsi - arg1, %rdx - arg2
* %r10 - arg3, %r8 - arg4, %r9 - arg5
*
* Return code goes in %rax
*
* XXX: don't use relative jumps - this code is relocated with memcpy()
* XXX: Invoked with interrupts disabled...
*/
.globl syscall_enter
syscall_enter:
/*
* Hack to redirect any syscall instructions executed
* in kernel mode to the hypervisor through vmcall.
*/
swapgs
testq $FLAG_IN_USER, %gs:FLAGS
jnz 1f
pushq %r11
popfq
vmcall
jmp *%rcx
1:
/* first switch to the kernel stack */
movq %rsp, %gs:TMP
movq %gs:THREAD_STACK, %rsp
/* now push the trap frame onto the stack */
subq $TF_END, %rsp
movq %rcx, RIP(%rsp)
movq %r11, RFLAGS(%rsp)
movq %r10, RCX(%rsp) /* fixup to standard 64-bit calling ABI */
SAVE_REGS 0, 1
SAVE_REST
movq %gs:TMP, %rax
movq %rax, RSP(%rsp)
/* configure the segment bases */
SETUP_KERNEL_SEGS
/* then finally re-enable interrupts and jump to the handler */
sti
movq %rsp, %rdi /* argument 0 */
lea syscall_handler, %rax
call *%rax
cli
/* restore the segment bases */
SETUP_USER_SEGS
/* then pop the trap frame off the stack */
RESTORE_REGS 0, 1
RESTORE_REST
movq RCX(%rsp), %r10
movq RFLAGS(%rsp), %r11
movq RIP(%rsp), %rcx
/* switch to the user stack and return to ring 3 */
movq RSP(%rsp), %rsp
sysretq
.globl syscall_enter_end
syscall_enter_end:
nop
.globl pop_tf
pop_tf:
/* restore callee regs */
movq RBX(%rdi), %rbx
movq RBP(%rdi), %rbp
movq R12(%rdi), %r12
movq R13(%rdi), %r13
movq R14(%rdi), %r14
movq R15(%rdi), %r15
/* restore ip and stack */
movq RSP(%rdi), %rsp
movq RIP(%rdi), %rcx
jmpq *%rcx
.globl pop_tf_user
pop_tf_user:
movq %rdi, %rsp /* might not be a stack! */
SETUP_USER_SEGS 0
RESTORE_REGS
RESTORE_REST
addq $EF_START, %rsp
iretq
.globl pop_tf_user_fast
pop_tf_user_fast:
movq %rdi, %rsp /* might not be a stack! */
SETUP_USER_SEGS 0
RESTORE_REGS 0, 1
RESTORE_REST
movq R10(%rsp), %r10
movq RIP(%rsp), %rcx
movq RFLAGS(%rsp), %r11
movq RSP(%rsp), %rsp
sysretq
/**
* switch_tf - saves the current kernel frame and pops
* the next kernel frame
* @cur: the current trap frame
* @next: the next trap frame
*/
.globl switch_tf
switch_tf:
/* save callee regs */
movq %rbx, RBX(%rdi)
movq %rbp, RBP(%rdi)
movq %r12, R12(%rdi)
movq %r13, R13(%rdi)
movq %r14, R14(%rdi)
movq %r15, R15(%rdi)
/* save ip and stack */
movq (%rsp), %rcx
movq %rcx, RIP(%rdi)
leaq 8(%rsp), %rcx
movq %rcx, RSP(%rdi)
/* restore callee regs */
movq RBX(%rsi), %rbx
movq RBP(%rsi), %rbp
movq R12(%rsi), %r12
movq R13(%rsi), %r13
movq R14(%rsi), %r14
movq R15(%rsi), %r15
/* restore ip and stack */
movq RSP(%rsi), %rsp
movq RIP(%rsi), %rcx
/* restore arguments (in case new thread) */
movq RDI(%rsi), %rdi # ARG0
movq RSI(%rsi), %rsi # ARG1
jmpq *%rcx
/*
* NOTE: interrupts start out disabled.
* The macro generates a fixed-sized array of handlers, one for each vector.
*/
.globl trap_entry_tbl
.align 16
trap_entry_tbl:
i = 0
.rept 256
.align 16
.if i <> 8 && (i <= 9 || i >= 15) && i <> 17
pushq %rax /* placeholder for no error code */
.endif
pushq %rax /* save %rax */
mov $i, %rax
jmp 1f
i = i + 1
.endr
1:
/* save the remaining destructible registers */
subq $REG_END, %rsp
SAVE_REGS 1, 0 /* %rax already is pushed */
SAVE_REST
movq %rax, %rdi
/* determine if we were in user mode before the trap */
testq $3, CS(%rsp)
jz 2f
swapgs
SETUP_KERNEL_SEGS
2:
sti
/* setup arguments and call the handler */
movq %rsp, %rsi
call trap_handler
/* determine if we're returning to user mode */
testq $3, CS(%rsp)
jz 3f
/* return to user mode */
cli
SETUP_USER_SEGS
RESTORE_REGS
addq $EF_START, %rsp
iretq
/*
* This is the exception return fast path. It is only
* available when returning to the kernel instead of user
* space. The reason it is faster is that iretq has a
* fair amount of overhead and we can avoid that by using
* a regular retq instead.
*/
3:
movq RIP(%rsp), %rax
movq RSP(%rsp), %rcx
subq $8, %rcx
movq %rax, (%rcx) /* XXX: this overwrites SS in the trap frame */
movq %rcx, RSP(%rsp)
movq RFLAGS(%rsp), %rcx
pushq %rcx
popfq
RESTORE_REGS
/* jump to the frame */
movq RSP(%rsp), %rsp
retq
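The register offsets at the top of this file are duplicated by hand from struct env_tf (see dune/entry.h below), so any change to the struct must be mirrored here. A hypothetical compile-time guard in C would catch drift between the two:

/* Hypothetical guard: verify the asm offsets against struct env_tf. */
#include <stddef.h>
#include <dune/entry.h>

_Static_assert(offsetof(struct env_tf, rdi) == 0,   "RDI offset");
_Static_assert(offsetof(struct env_tf, r15) == 104, "R15 offset");
_Static_assert(offsetof(struct env_tf, rax) == 112, "RAX offset");
_Static_assert(offsetof(struct env_tf, err) == 120, "ERR offset");
_Static_assert(offsetof(struct env_tf, rip) == 128, "RIP offset");
_Static_assert(offsetof(struct env_tf, rsp) == 152, "RSP offset");
_Static_assert(offsetof(struct env_tf, ss)  == 160, "SS offset");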

dune/vm.c

@@ -1,701 +0,0 @@
/*
* vm.c - virtual memory management support
*/
#include <string.h>
#include <base/stddef.h>
#include <base/mem.h>
#include <base/page.h>
#include <dune/vm.h>
#define PTE_DEF_FLAGS CAST64(PTE_P | PTE_W | PTE_U)
#define PTE_PERM_FLAGS CAST64(PTE_P | PTE_W | PTE_NX | PTE_U)
#define PTE_COW_FLAGS CAST64(PTE_P | PTE_NX | PTE_U)
static bool pte_present(ptent_t e)
{
return (PTE_FLAGS(e) & PTE_P) > 0;
}
static bool pte_big(ptent_t e)
{
return (PTE_FLAGS(e) & PTE_PS) > 0;
}
static bool addr_is_aligned(const void *addr, int pgsize)
{
return !((uintptr_t)addr & (pgsize - 1));
}
static bool addr_is_aligned_to_level(const void *addr, int level)
{
return addr_is_aligned(addr, PGLEVEL_TO_SIZE(level));
}
static struct page *vm_alloc_pgdir(void)
{
struct page *pg = page_zalloc(PGSIZE_4KB);
if (unlikely(!pg))
return NULL;
pg->flags |= PAGE_FLAG_PGDIR;
pg->item_count = 0;
return pg;
}
/**
* vm_lookup_pte - looks up a page table entry
* @tbl: the page table
* @va: the virtual address
* @level_out: a pointer to store the page level
* @pte_out: a pointer to store the PTE pointer
*
* WARNING: Synchronization is not provided.
*
* Returns 0 if successful, otherwise an error code.
*/
int vm_lookup_pte(ptent_t *tbl, const void *va, int *level_out,
ptent_t **pte_out)
{
ptent_t *pte = tbl;
int level;
for (level = PGLEVEL_NUM - 1; level >= 0; level--) {
pte = &pte[PDX(level, va)];
if (!*pte)
return -ENOENT;
if (!level || (level <= PGLEVEL_1GB && pte_big(*pte)))
break;
pte = (ptent_t *)PTE_ADDR(*pte);
}
if (!addr_is_aligned_to_level(va, level))
return -EINVAL;
if (level_out)
*level_out = level;
if (pte_out)
*pte_out = pte;
return 0;
}
/**
* vm_insert_pte - inserts an entry into the page table
* @tbl: the page table
* @va: the virtual address
* @level: the level to insert the pte
* @pte_in: the pte to insert
*
* WARNING: Synchronization is not provided.
*
* Returns 0 if successful, otherwise an error code.
*/
int vm_insert_pte(ptent_t *tbl, const void *va, int level, ptent_t pte_in)
{
ptent_t *hist[PGLEVEL_NUM];
ptent_t *pte = tbl;
struct page *pg;
int pos;
if (level < PGLEVEL_4KB || level >= PGLEVEL_NUM)
return -EINVAL;
if (!(pte_in & PTE_PS) && level > PGLEVEL_4KB)
return -EINVAL;
if (!addr_is_aligned_to_level(va, level))
return -EINVAL;
for (pos = PGLEVEL_NUM - 1; pos > level; pos--) {
pte = &pte[PDX(pos, va)];
hist[pos] = pte;
if (!*pte) {
addr_to_smpage(pte)->item_count++;
pg = vm_alloc_pgdir();
if (unlikely(!pg))
goto fail;
*pte = (ptent_t)smpage_to_addr(pg) | PTE_DEF_FLAGS;
} else if (pos <= PGLEVEL_1GB && pte_big(*pte)) {
return -EEXIST;
}
pte = (ptent_t *)PTE_ADDR(*pte);
}
pte = &pte[PDX(level, va)];
if (unlikely(*pte))
return -EEXIST;
addr_to_smpage(pte)->item_count++;
*pte = pte_in;
return 0;
fail:
for (; pos < PGLEVEL_NUM; pos++) {
*hist[pos] = 0;
pg = addr_to_smpage(hist[pos]);
if (!--pg->item_count)
break;
page_put(pg);
}
return -ENOMEM;
}
/**
* vm_remove_pte - removes an entry from the page table
* @tbl: the page table
* @va: the virtual address
* @level_out: a pointer to store the page level
* @pte_out: a pointer to store the pte value
*
* WARNING: Synchronization is not provided.
*
* Returns 0 if successful, or -ENOENT if there was nothing to remove.
*/
int vm_remove_pte(ptent_t *tbl, const void *va,
int *level_out, ptent_t *pte_out)
{
ptent_t *hist[PGLEVEL_NUM];
ptent_t *pte = tbl;
struct page *pg;
int level;
for (level = PGLEVEL_NUM - 1; level >= PGLEVEL_4KB; level--) {
pte = &pte[PDX(level, va)];
hist[level] = pte;
if (!*pte)
return -ENOENT;
if (!level || (level <= PGLEVEL_1GB && pte_big(*pte)))
break;
pte = (ptent_t *)PTE_ADDR(*pte);
}
if (!addr_is_aligned_to_level(va, level))
return -EINVAL;
if (level_out)
*level_out = level;
if (pte_out)
*pte_out = *pte;
for (; level < PGLEVEL_NUM; level++) {
pg = addr_to_smpage(hist[level]);
*hist[level] = 0;
if (!--pg->item_count)
break;
page_put(pg);
}
return 0;
}
/**
* vm_lookup_page - gets the page mapped at a virtual address
* @tbl: the page table
* @va: the virtual address
* @pg_out: a pointer to store the page
*
* WARNING: Synchronization is not provided.
*
* Returns 0 if successful, otherwise an error code.
*/
int vm_lookup_page(ptent_t *tbl, const void *va, struct page **pg_out)
{
int ret;
ptent_t *pte;
ret = vm_lookup_pte(tbl, va, NULL, &pte);
if (ret)
return ret;
assert(*pte & PTE_PAGE);
*pg_out = addr_to_page((void *)PTE_ADDR(*pte));
return 0;
}
/**
* vm_insert_page - inserts a page at a virtual address
* @tbl: the page table
* @va: the virtual address
* @pg: the page to insert
* @flags: the PTE flags
*
* WARNING: Synchronization is not provided.
* The caller is responsible for incrementing the page refcount.
*
* Returns 0 if successful, otherwise an error code.
*/
int vm_insert_page(ptent_t *tbl, const void *va, struct page *pg, ptent_t flags)
{
int ret;
ptent_t pte;
bool large = (pg->flags & PAGE_FLAG_LARGE) > 0;
pte = (ptent_t)smpage_to_addr(pg) | flags | PTE_PAGE;
if (large)
pte |= PTE_PS;
ret = vm_insert_pte(tbl, va, large ? PGLEVEL_2MB : PGLEVEL_4KB, pte);
return ret;
}
/**
* vm_remove_page - removes a page at a virtual address
* @tbl: the page table
* @va: the virtual address
* @pg_out: a pointer to store the removed page (can be NULL)
*
* WARNING: Synchronization is not provided.
* The caller is responsible for dropping the page refcount.
*
* Returns 0 if successful, or -ENOENT if there wasn't a page mapped.
*/
int vm_remove_page(ptent_t *tbl, const void *va, struct page **pg_out)
{
int ret;
ptent_t pte;
ret = vm_remove_pte(tbl, va, NULL, &pte);
if (ret)
return ret;
assert(pte & PTE_PAGE);
if (pg_out)
*pg_out = addr_to_page((void *)PTE_ADDR(pte));
return 0;
}
/**
* vm_map_phys - maps a range of physical memory to a range of virtual addresses
* @tbl: the page table
* @pa: the starting physical address
* @va: the starting virtual address
* @len: the length of the mapping (in bytes)
* @pgsize: the page size to use for the mappings
* @flags: the PTE flags
*
* WARNING: Synchronization is not provided.
*
* Returns 0 if successful, otherwise an error code.
*/
int vm_map_phys(ptent_t *tbl, physaddr_t pa, const void *va,
size_t len, int pgsize, ptent_t flags)
{
intptr_t pos;
int ret;
if (unlikely(!addr_is_aligned(va, pgsize)))
return -EINVAL;
if (pgsize > PGSIZE_4KB)
flags |= PTE_PS;
for (pos = 0; pos < len; pos += pgsize) {
ptent_t pte = PTE_FLAGS(flags) | PTE_ADDR(pa + pos);
ret = vm_insert_pte(tbl, va + pos,
PGSIZE_TO_LEVEL(pgsize), pte);
if (unlikely(ret))
goto fail;
}
return 0;
fail:
for (pos -= pgsize; pos >= 0; pos -= pgsize)
vm_remove_pte(tbl, va + pos, NULL, NULL);
return ret;
}
/**
* vm_map_pages - maps pages to a range of virtual addresses
* @tbl: the page table
* @va: the starting virtual address
* @len: the length of the mapping (in bytes)
* @pgsize: the page size to use for the mappings
* @flags: the PTE flags
*
* WARNING: Synchronization is not provided.
*
* Returns 0 if successful, otherwise an error code.
*/
int vm_map_pages(ptent_t *tbl, const void *va, size_t len,
int pgsize, ptent_t flags)
{
const char *start = (const char *)va;
intptr_t pos;
int ret;
if (unlikely(pgsize != PGSIZE_4KB && pgsize != PGSIZE_2MB))
return -EINVAL;
if (unlikely(!addr_is_aligned(va, pgsize)))
return -EINVAL;
for (pos = 0; pos < len; pos += pgsize) {
struct page *pg = page_zalloc(pgsize);
if (unlikely(!pg))
goto fail;
ret = vm_insert_page(tbl, start + pos, pg, flags);
if (unlikely(ret)) {
page_put(pg);
goto fail;
}
}
return 0;
fail:
for (pos -= pgsize; pos >= 0; pos -= pgsize) {
struct page *pg;
if (!vm_remove_page(tbl, start + pos, &pg))
page_put(pg);
}
return ret;
}
/**
* vm_map_copy - copies memory to new pages for a range of virtual addresses
* @tbl: the page table
* @src_va: the source data (from the current page table)
* @map_va: the destination address (in page table @tbl)
* @len: the length to copy
* @pgsize: the page size
* @flags: the PTE flags
*
* WARNING: Synchronization is not provided.
*
* Returns 0 if successful, otherwise an error code.
*/
int vm_map_copy(ptent_t *tbl, const void *src_va, const void *map_va,
size_t len, int pgsize, ptent_t flags)
{
const char *src_start = (const char *)src_va;
const char *map_start = (const char *)map_va;
intptr_t pos;
int ret;
if (unlikely(pgsize != PGSIZE_4KB && pgsize != PGSIZE_2MB))
return -EINVAL;
if (unlikely(!addr_is_aligned(map_va, pgsize)))
return -EINVAL;
for (pos = 0; pos < len; pos += pgsize) {
struct page *pg = page_alloc(pgsize);
if (unlikely(!pg))
goto fail;
memcpy(page_to_addr(pg), src_start + pos,
min(pgsize, len - pos));
ret = vm_insert_page(tbl, map_start + pos, pg, flags);
if (unlikely(ret)) {
page_put(pg);
goto fail;
}
}
return 0;
fail:
for (pos -= pgsize; pos >= 0; pos -= pgsize) {
struct page *pg;
if (!vm_remove_page(tbl, map_start + pos, &pg))
page_put(pg);
}
return ret;
}
/**
* vm_mod - changes the PTE flags for a range of virtual addresses
* @tbl: the page table
* @va: the starting virtual address
* @len: the length of the range (in bytes)
* @pgsize: the smallest possible page size
* @flags: the new PTE flags
*
* Will silently skip missing mappings.
*
* Returns true if one or more PTE permissions were changed, otherwise false.
*/
bool vm_mod(ptent_t *tbl, const void *va, size_t len, int pgsize, ptent_t flags)
{
const char *start = (const char *) va;
intptr_t pos;
int ret, level;
bool changed = false;
/* check alignment */
assert(addr_is_aligned(va, pgsize));
for (pos = 0; pos < len;) {
ptent_t *pte;
ptent_t old;
ret = vm_lookup_pte(tbl, start + pos, &level, &pte);
if (ret) {
pos += pgsize;
continue;
}
old = *pte;
*pte &= ~(PTE_PERM_FLAGS);
if (old & PTE_COW)
*pte |= (flags & PTE_COW_FLAGS);
else
*pte |= (flags & PTE_PERM_FLAGS);
if (*pte != old)
changed = true;
assert(pgsize <= PGLEVEL_TO_SIZE(level));
pos += PGLEVEL_TO_SIZE(level);
}
return changed;
}
/**
* vm_disable - marks a range of PTEs not present
* @tbl: the page table
* @va: the starting virtual address
* @len: the length of the range (in bytes)
* @pgsize: the smallest possible page size
*
* Will silently skip missing mappings.
*
* Returns true if one or more PTEs were disabled, otherwise false.
*/
bool vm_disable(ptent_t *tbl, const void *va, size_t len, int pgsize)
{
const char *start = (const char *) va;
intptr_t pos;
int ret, level;
bool changed = false;
/* check alignment */
assert(addr_is_aligned(va, pgsize));
for (pos = 0; pos < len;) {
ptent_t *pte;
ret = vm_lookup_pte(tbl, start + pos, &level, &pte);
if (ret) {
pos += pgsize;
continue;
}
*pte &= ~(CAST64(PTE_P));
assert(pgsize <= PGLEVEL_TO_SIZE(level));
pos += PGLEVEL_TO_SIZE(level);
changed = true;
}
return changed;
}
/**
* vm_unmap - removes mappings from a range of virtual addresses
* @tbl: the page table
* @va: the starting virtual address
* @len: the length of the range (in bytes)
* @pgsize: the smallest possible page size
*
* Use this variant for mappings that are not backed by pages.
*
* Cannot fail, but may skip missing mappings.
*/
void vm_unmap(ptent_t *tbl, const void *va, size_t len, int pgsize)
{
uintptr_t pos;
int ret, level;
/* check alignment */
assert(addr_is_aligned(va, pgsize));
for (pos = 0; pos < len;) {
ret = vm_remove_pte(tbl, va + pos, &level, NULL);
if (ret) {
pos += pgsize;
} else {
assert(pgsize <= PGLEVEL_TO_SIZE(level));
pos += PGLEVEL_TO_SIZE(level);
}
}
}
/**
* vm_unmap_pages - removes pages from a range of virtual addresses
* @tbl: the page table
* @va: the starting virtual address
* @len: the length of the range (in bytes)
* @pgsize: the smallest possible page size
*
* Use this variant for mappings backed by pages (does ref counting).
*
* Cannot fail, but may skip missing mappings.
*/
void vm_unmap_pages(ptent_t *tbl, const void *va, size_t len, int pgsize)
{
intptr_t pos;
/* check alignment */
assert(addr_is_aligned(va, pgsize));
for (pos = 0; pos < len;) {
struct page *pg;
if (!vm_remove_page(tbl, va + pos, &pg)) {
assert(pgsize <= page_to_size(pg));
pos += page_to_size(pg);
page_put(pg);
} else
pos += pgsize;
}
}
/**
* vm_create_pt - creates a page table
*
* Returns a page table, or NULL if out of memory.
*/
ptent_t *vm_create_pt(void)
{
struct page *pg = vm_alloc_pgdir();
if (!pg)
return NULL;
return (ptent_t *)smpage_to_addr(pg);
}
/**
* vm_clone_kern_pt - creates a copy of the kernel page table
*
* WARNING: Pages in the kernel page table won't be refcounted. It's assumed
* they are never deallocated for the life of the process.
*
* Returns a page table, or NULL if out of memory.
*/
ptent_t *vm_clone_kern_pt(void)
{
int i, j, k, l;
struct page *pg;
ptent_t *src_pud, *src_pmd, *src_pd;
ptent_t *dst_pud, *dst_pmd, *dst_pd;
ptent_t *pgtbl = vm_create_pt();
if (unlikely(!pgtbl))
return NULL;
for (i = 0; i < NPTENTRIES; i++) {
if (!pte_present(kern_pgtbl[i]))
continue;
pg = vm_alloc_pgdir();
if (unlikely(!pg))
goto err;
src_pud = (ptent_t *)PTE_ADDR(kern_pgtbl[i]);
dst_pud = (ptent_t *)smpage_to_addr(pg);
pgtbl[i] = (ptent_t)dst_pud | PTE_DEF_FLAGS;
addr_to_smpage(pgtbl)->item_count++;
for (j = 0; j < NPTENTRIES; j++) {
if (!src_pud[j])
continue;
if (pte_big(src_pud[j])) {
assert(!(src_pud[j] & PTE_PAGE));
dst_pud[j] = src_pud[j];
pg->item_count++;
continue;
}
pg = vm_alloc_pgdir();
if (unlikely(!pg))
goto err;
src_pmd = (ptent_t *)PTE_ADDR(src_pud[j]);
dst_pmd = (ptent_t *)smpage_to_addr(pg);
dst_pud[j] = (ptent_t)dst_pmd | PTE_DEF_FLAGS;
addr_to_smpage(dst_pud)->item_count++;
for (k = 0; k < NPTENTRIES; k++) {
if (!src_pmd[k])
continue;
if (pte_big(src_pmd[k])) {
dst_pmd[k] = src_pmd[k];
pg->item_count++;
continue;
}
pg = vm_alloc_pgdir();
if (unlikely(!pg))
goto err;
src_pd = (ptent_t *)PTE_ADDR(src_pmd[k]);
dst_pd = (ptent_t *)smpage_to_addr(pg);
dst_pmd[k] = (ptent_t)smpage_to_addr(pg) |
PTE_DEF_FLAGS;
addr_to_smpage(dst_pmd)->item_count++;
for (l = 0; l < NPTENTRIES; l++) {
dst_pd[l] = src_pd[l];
pg->item_count++;
}
}
}
}
return pgtbl;
err:
vm_destroy_pt(pgtbl);
return NULL;
}
/**
* vm_destroy_pt - destroys a page table
* @tbl: the page table
*/
void vm_destroy_pt(ptent_t *tbl)
{
int i, j, k;
ptent_t *pud, *pmd;
for (i = 0; i < NPTENTRIES; i++) {
if (!pte_present(tbl[i]))
continue;
pud = (ptent_t *)PTE_ADDR(tbl[i]);
for (j = 0; j < NPTENTRIES; j++) {
if (!pud[j])
continue;
if (pte_big(pud[j]))
continue;
pmd = (ptent_t *)PTE_ADDR(pud[j]);
for (k = 0; k < NPTENTRIES; k++) {
if (!pmd[k])
continue;
if (pte_big(pmd[k]))
continue;
page_put_addr((ptent_t *)PTE_ADDR(pmd[k]));
}
page_put_addr(pmd);
}
page_put_addr(pud);
}
page_put_addr(tbl);
}
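A hypothetical end-to-end use of this API, combining vm_create_pt(), vm_map_phys(), and vm_destroy_pt() the same way entry.c does when building the kernel page table:

/* Hypothetical example: identity-map 4 MB of physical memory with
 * 2 MB pages in a fresh page table, tearing it down on failure. */
static ptent_t *make_identity_pt(physaddr_t base)
{
        ptent_t *tbl = vm_create_pt();
        if (!tbl)
                return NULL;

        if (vm_map_phys(tbl, base, (void *)base, 4 * 1024 * 1024,
                        PGSIZE_2MB, PTE_P | PTE_W | PTE_G)) {
                vm_destroy_pt(tbl);
                return NULL;
        }
        return tbl;
}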


@@ -1,45 +0,0 @@
#include <asm/unistd_64.h>
.data
.globl vsyscall_page
.balign 4096, 0xcc
.type vsyscall_page, @object
vsyscall_page:
/* handle gettimeofday() */
mov %cs, %rax
test $3, %rax
mov $__NR_gettimeofday, %rax
jnz 1f
vmcall
ret
1:
syscall
ret
/* handle time() */
.balign 1024, 0xcc
mov %cs, %rax
test $3, %rax
mov $__NR_time, %rax
jnz 2f
vmcall
ret
2:
syscall
ret
/* handle getcpu() */
.balign 1024, 0xcc
mov %cs, %rax
test $3, %rax
mov $__NR_getcpu, %rax
jnz 3f
vmcall
ret
3:
syscall
ret
.balign 4096, 0xcc
.size vsyscall_page, 4096
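The .balign 1024 directives reproduce the legacy Linux vsyscall ABI: the page sits at 0xffffffffff600000 with gettimeofday() at offset 0x0, time() at 0x400, and getcpu() at 0x800 (entry.c maps this page wherever it finds [vsyscall] in /proc/self/maps). A hypothetical illustration of how legacy user code reaches one of these fixed entry points:

/* Hypothetical sketch of the legacy vsyscall calling convention this
 * page implements; offsets match the .balign 1024 slots above. */
#define VSYSCALL_ADDR 0xffffffffff600000UL

typedef long (*vsys_time_fn)(long *t);

static long vsys_time(long *t)
{
        vsys_time_fn fn = (vsys_time_fn)(VSYSCALL_ADDR + 0x400);
        return fn(t);
}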


@@ -1,189 +0,0 @@
/*
* entry.h - routines for entering and exiting the kernel
*/
#pragma once
#include <base/stddef.h>
/* the base address of the linux kernel vdso mapping */
extern uintptr_t entry_vdso_base;
/* asm entry routines */
extern const char syscall_enter[];
extern const char syscall_enter_end[];
extern const char trap_entry_tbl[];
extern const char vsyscall_page[];
#define TRAP_ENTRY_SIZE 16
/*
* We use the same general GDT layout as Linux so that we can use
* the same syscall MSR values. In practice only code segments
* matter, since ia-32e mode ignores most segment state anyway,
* but just to be extra careful we match data as well.
*/
#define GD_KT 0x10
#define GD_KD 0x18
#define GD_UD 0x28
#define GD_UT 0x30
#define GD_TSS 0x38
#define GD_TSS2 0x40
#define GDT_ENTRIES 9
struct env_tf {
/* manually saved, arguments */
uint64_t rdi;
uint64_t rsi;
uint64_t rdx;
uint64_t rcx;
uint64_t r8;
uint64_t r9;
uint64_t r10;
uint64_t r11;
/* saved by C calling conventions */
uint64_t rbx;
uint64_t rbp;
uint64_t r12;
uint64_t r13;
uint64_t r14;
uint64_t r15;
/* system call number, ret */
uint64_t rax;
/* exception frame */
uint32_t err;
uint32_t pad1;
uint64_t rip;
uint16_t cs;
uint16_t pad2[3];
uint64_t rflags;
uint64_t rsp;
uint16_t ss;
uint16_t pad3[3];
} __packed;
#define ARG0(tf) ((tf)->rdi)
#define ARG1(tf) ((tf)->rsi)
#define ARG2(tf) ((tf)->rdx)
#define ARG3(tf) ((tf)->rcx)
#define ARG4(tf) ((tf)->r8)
#define ARG5(tf) ((tf)->r9)
extern void pop_tf(struct env_tf *tf) __noreturn;
extern void pop_tf_user(struct env_tf *tf) __noreturn;
extern void pop_tf_user_fast(struct env_tf *tf) __noreturn;
extern void switch_tf(struct env_tf *curtf, struct env_tf *newtf);
struct entry_percpu {
void *percpu_ptr;
uint64_t tmp;
uintptr_t kfs_base;
uintptr_t ufs_base;
uintptr_t ugs_base;
uint64_t flags;
void *thread_stack;
uint32_t preempt_cnt;
uint32_t pad;
} __packed;
#define ENTRY_FLAG_IN_USER 0x1 /* in usermode? */
#define ENTRY_FLAG_LOAD_USER 0x2 /* restore usermode segs? */
static inline void entry_set_thread_stack(uintptr_t val)
{
asm("movq %0, %%gs:%c[thread_stack]"
: /* no outputs */
: "r"(val), [thread_stack]"i"(offsetof(struct entry_percpu, thread_stack))
: "memory");
}
static inline uint64_t entry_get_kfs_base(void)
{
uint64_t val;
asm("movq %%gs:%c[kfs_base], %0"
: "=r"(val)
: [kfs_base]"i"(offsetof(struct entry_percpu, kfs_base))
: "memory");
return val;
}
static inline void entry_set_kfs_base(uint64_t val)
{
asm("movq %0, %%gs:%c[kfs_base]"
: /* no outputs */
: "r"(val), [kfs_base]"i"(offsetof(struct entry_percpu, kfs_base))
: "memory");
}
static inline uint64_t entry_get_ufs_base(void)
{
uint64_t val;
asm("movq %%gs:%c[ufs_base], %0"
: "=r"(val)
: [ufs_base]"i"(offsetof(struct entry_percpu, ufs_base))
: "memory");
return val;
}
static inline void entry_set_ufs_base(uint64_t val)
{
asm("movq %0, %%gs:%c[ufs_base]"
: /* no outputs */
: "r"(val), [ufs_base]"i"(offsetof(struct entry_percpu, ufs_base))
: "memory");
}
static inline uint64_t entry_get_ugs_base(void)
{
uint64_t val;
asm("movq %%gs:%c[ugs_base], %0"
: "=r"(val)
: [ugs_base]"i"(offsetof(struct entry_percpu, ugs_base))
: "memory");
return val;
}
static inline void entry_set_ugs_base(uint64_t val)
{
asm("movq %0, %%gs:%c[ugs_base]"
: /* no outputs */
: "r"(val), [ugs_base]"i"(offsetof(struct entry_percpu, ugs_base))
: "memory");
}
static inline void entry_set_flag_mask(uint64_t val)
{
asm("orq %0, %%gs:%c[flags]"
: /* no outputs */
: "r"(val), [flags]"i"(offsetof(struct entry_percpu, flags))
: "memory", "cc");
}
static inline void entry_clear_flag_mask(uint64_t val)
{
asm("andq %0, %%gs:%c[flags]"
: /* no outputs */
: "r"(~(val)), [flags]"i"(offsetof(struct entry_percpu, flags))
: "memory", "cc");
}
static inline bool entry_test_flag_mask(uint64_t val)
{
asm goto("testq %0, %%gs:%c[flags]\n\t"
"jz %l[no_match]\n\t"
: /* no outputs */
: "r"(val), [flags]"i"(offsetof(struct entry_percpu, flags))
: "memory", "cc"
: no_match);
return true;
no_match:
return false;
}
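The flag accessors above compile down to single %gs-relative instructions, so they are safe even in contexts where the rest of the per-CPU state is not yet set up. A hypothetical use, pairing the mask helpers with the ENTRY_FLAG_* bits:

/* Hypothetical example: track the user/kernel transition in the
 * per-CPU flags word using the accessors above. */
static inline void mark_enter_kernel(void)
{
        entry_clear_flag_mask(ENTRY_FLAG_IN_USER);
}

static inline bool entered_from_user(void)
{
        return entry_test_flag_mask(ENTRY_FLAG_IN_USER);
}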


@@ -1,123 +0,0 @@
/*
* fpu.h - x86 floating point, MMX, SSE, and AVX support for Dune
*/
#pragma once
#include <base/types.h>
struct fxsave_area {
uint16_t cwd;
uint16_t swd;
uint16_t twd;
uint16_t fop;
uint64_t rip;
uint64_t rdp;
uint32_t mxcsr;
uint32_t mxcsr_mask;
uint32_t st_regs[32]; // 8 128-bit FP registers
uint32_t xmm_regs[64]; // 16 128-bit XMM registers
uint32_t padding[24];
} __attribute__((packed));
struct xsave_header {
uint64_t xstate_bv;
uint64_t xcomp_bv;
uint64_t reserved_zero;
uint64_t reserved[5];
} __attribute__((packed));
struct xsave_area {
struct fxsave_area fxsave;
struct xsave_header header;
uint32_t ymm_regs[64]; // extends XMM registers to 256-bit
/* FIXME: check CPUID, could be other extensions in the future */
} __attribute__((packed, aligned(64)));
struct fpu_area {
/* we only support xsave, since it's available in nehalem and later */
struct xsave_area xsave;
};
static inline void fpu_xsave(struct fpu_area *fp, uint64_t mask)
{
uint32_t lmask = mask;
uint32_t umask = mask >> 32;
asm volatile("xsaveq %0\n\t" : "=m"(fp->xsave) :
"a"(lmask), "d"(umask) :
"memory");
}
static inline void fpu_xsaveopt(struct fpu_area *fp, uint64_t mask)
{
uint32_t lmask = mask;
uint32_t umask = mask >> 32;
asm volatile("xsaveoptq %0\n\t" : "=m"(fp->xsave) :
"a"(lmask), "d"(umask) :
"memory");
}
static inline void fpu_xrstor(struct fpu_area *fp, uint64_t mask)
{
uint32_t lmask = mask;
uint32_t umask = mask >> 32;
asm volatile("xrstorq %0\n\t" : : "m"(fp->xsave),
"a"(lmask), "d"(umask) :
"memory");
}
/*
* fpu_init - initializes an fpu area
* @fp: the fpu area
*/
static inline void fpu_init(struct fpu_area *fp)
{
fp->xsave.header.xstate_bv = 0;
fp->xsave.header.xcomp_bv = 0;
fp->xsave.header.reserved_zero = 0;
fp->xsave.fxsave.cwd = 0x37f;
fp->xsave.fxsave.mxcsr = 0x1f80;
}
/*
* fpu_load - loads an fpu area into fpu registers
* @fp: the fpu area
*/
static inline void fpu_load(struct fpu_area *fp)
{
fpu_xrstor(fp, -1);
}
/*
* fpu_save - saves fpu registers to an fpu area
* @fp: the fpu area
*
* WARNING: Do not call this function on a memory region
* that was not previously loaded with fpu_load().
*
* If you do, register state corruption might be possible. See
* "XSAVEOPT Usage Guidlines" under the XSAVEOPT instruction
* description in the Intel Manual Instruction Set Reference
* for more details.
*/
static inline void fpu_save(struct fpu_area *fp)
{
// FIXME: need to check CPUID because only
// sandybridge and later support XSAVEOPT
fpu_xsaveopt(fp, -1);
}
/*
* fpu_save_safe - saves an fpu area from CPU registers
* @fp: the fpu area
*
* Works under all conditions, but may be slower.
*/
static inline void fpu_save_safe(struct fpu_area *fp)
{
fpu_xsave(fp, -1);
}
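A hypothetical FPU context switch built from these helpers; fpu_save_safe() is chosen for the outgoing state because the area may not have been loaded on this CPU, per the XSAVEOPT warning above:

/* Hypothetical example: switch extended register state between threads. */
static inline void fpu_switch(struct fpu_area *prev, struct fpu_area *next)
{
        fpu_save_safe(prev);    /* safe even if prev was loaded elsewhere */
        fpu_load(next);
}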


@@ -1,264 +0,0 @@
/*
* mmu.h - x86 MMU definitions
*
* NOTE: This code is derived from JOS, created by MIT PDOS.
*/
#pragma once
#include <base/types.h>
typedef uint64_t ptent_t;
#define UINT64(x) ((uint64_t) x)
#define CAST64(x) ((uint64_t) x)
#define ONE UINT64 (1)
#define NPTBITS 9 /* log2(NPTENTRIES) */
/*
*
* Part 1. Paging data structures and control registers
*
*/
/* index into:
* n = 0 => page table
* n = 1 => page directory
* n = 2 => page directory pointer
* n = 3 => page map level 4
*/
#define PDXMASK ((1 << NPTBITS) - 1)
#define PDSHIFT(n) (12 + NPTBITS * (n))
#define PDX(n, la) ((((uintptr_t) (la)) >> PDSHIFT(n)) & PDXMASK)
#define NPTENTRIES (1 << NPTBITS)
/* page number field of address */
#define PPN(la) ((la) >> PGSHIFT)
/* page size */
#define PGSHIFT 12 /* log2(PGSIZE) */
#define PGSIZE (1 << PGSHIFT) /* bytes mapped by a page */
#define PGMASK (PGSIZE - 1)
/* offset in page */
#define PGOFF(la) (((uintptr_t) (la)) & PGMASK)
#define PGADDR(la) (((uintptr_t) (la)) & ~CAST64(PGMASK))
/* big page size */
#define BIG_PGSHIFT 21
#define BIG_PGSIZE (1 << BIG_PGSHIFT)
#define BIG_PGMASK (BIG_PGSIZE - 1)
/* offset in big page */
#define BIG_PGOFF(la) (((uintptr_t) (la)) & BIG_PGMASK)
#define BIG_PGADDR(la) (((uintptr_t) (la)) & ~CAST64(BIG_PGMASK))
/* Page table/directory entry flags. */
#define PTE_P 0x0001 /* Present */
#define PTE_W 0x0002 /* Writeable */
#define PTE_U 0x0004 /* User */
#define PTE_PWT 0x0008 /* Write-Through */
#define PTE_PCD 0x0010 /* Cache-Disable */
#define PTE_A 0x0020 /* Accessed */
#define PTE_D 0x0040 /* Dirty */
#define PTE_PS 0x0080 /* Page size, in PD/PDP/PML4 */
#define PTE_PAT 0x0080 /* Page attribute table, in 4KB PTE */
#define PTE_G 0x0100 /* Global */
#define PTE_AVAIL 0x0E00 /* 3 bits not used by hardware */
#define PTE_PAT_PS 0x1000 /* Page attribute table, in 2MB PTE */
#define PTE_AVAIL2 0x7FF0000000000000UL /* 11 bits not used by hardware */
#define PTE_NX 0x8000000000000000UL /* No execute */
/* OS Specific Flags - Using available bits in PTE */
#define PTE_PAGE 0x0200 /* backed by a page */
#define PTE_COW 0x0400 /* copy-on-write */
/* address in page table entry */
#define PTE_ADDR(pte) ((physaddr_t)(pte) & 0xffffffffff000UL)
#define PTE_FLAGS(pte) ((physaddr_t)(pte) & 0xfff0000000000fffUL)
/* Control Register flags */
#define CR0_PE 0x1 /* Protected mode enable */
#define CR0_MP 0x2 /* Monitor coProcessor */
#define CR0_EM 0x4 /* Emulation */
#define CR0_TS 0x8 /* Task Switched */
#define CR0_ET 0x10 /* Extension Type */
#define CR0_NE 0x20 /* Numeric Error */
#define CR0_WP 0x10000 /* Write Protect */
#define CR0_AM 0x40000 /* Alignment Mask */
#define CR0_NW 0x20000000 /* Not Writethrough */
#define CR0_CD 0x40000000 /* Cache Disable */
#define CR0_PG 0x80000000 /* Paging */
#define CR3_PWT 0x8 /* Page-level writethrough */
#define CR3_PCD 0x10 /* Page-level cache disable */
#define CR4_VME 0x1 /* V86 Mode Extensions */
#define CR4_PVI 0x2 /* Protected-Mode Virtual Interrupts */
#define CR4_TSD 0x4 /* Time Stamp Disable */
#define CR4_DE 0x8 /* Debugging Extensions */
#define CR4_PSE 0x10 /* Page Size Extensions */
#define CR4_PAE 0x20 /* Page address extension */
#define CR4_MCE 0x40 /* Machine Check Enable */
#define CR4_PGE 0x80 /* Page-global enable */
#define CR4_PCE 0x100 /* Performance counter enable */
#define CR4_OSFXSR 0x200 /* FXSAVE/FXRSTOR support */
#define CR4_OSX 0x400 /* OS unmasked exception support */
/* MTRR registers */
#define MTRR_CAP 0xfe /* MTRR capabilities */
#define MTRR_CAP_VCNT_MASK 0xff /* Variable-size register count */
#define MTRR_CAP_FIX 0x100 /* Fixed-size register support */
#define MTRR_CAP_WC 0x400 /* Write-combining support */
#define MTRR_BASE(i) (0x200 + 2*(i)) /* Physical address base */
#define MTRR_BASE_UC 0x00 /* Uncacheable */
#define MTRR_BASE_WC 0x01 /* Write-Combining */
#define MTRR_BASE_WT 0x04 /* Writethrough */
#define MTRR_BASE_WP 0x05 /* Write-Protect */
#define MTRR_BASE_WB 0x06 /* Writeback */
#define MTRR_MASK(i) (0x201 + 2*(i)) /* Physical address mask */
#define MTRR_MASK_FULL PGADDR((ONE << 36) - 1)
#define MTRR_MASK_VALID 0x800
/* EFER Register */
#define EFER 0xc0000080 /* MSR number */
#define EFER_SCE 0x1 /* System-call extension */
#define EFER_LME 0x100 /* Long mode enable */
#define EFER_LMA 0x400 /* Long mode active */
#define EFER_NXE 0x800 /* No-execute enable */
#define EFER_FFXSR 0x4000 /* Fast FXSAVE/FXRSTOR */
/* FS/GS base registers */
#define MSR_FS_BASE 0xc0000100
#define MSR_GS_BASE 0xc0000101
/* Debug registers */
#define MSR_DEBUG_CTL 0x1d9 /* MSR number */
#define DEBUG_CTL_LBR (1 << 0) /* Last-Branch Record */
#define MSR_LBR_FROM_IP 0x1db /* Last branch from IP */
#define MSR_LBR_TO_IP 0x1dc /* Last branch to IP */
#define MSR_LEX_FROM_IP 0x1dd /* Last exception from IP */
#define MSR_LEX_TO_IP 0x1de /* Last exception to IP */
#define DR7_L(n) (ONE << ((n)*2)) /* Local breakpoint enable */
#define DR7_G(n) (ONE << ((n)*2+1)) /* Global breakpoint enable */
#define DR7_LE (ONE << 8) /* Local enable */
#define DR7_GE (ONE << 9) /* Global enable */
#define DR7_GD (ONE << 13) /* General-detect enable */
#define DR7_RW_SHIFT(n) ((n) * 4 + 16) /* Breakpoint access mode */
#define DR7_LEN_SHIFT(n) ((n) * 4 + 18) /* Breakpoint addr length */
#define DR7_RW_EXEC 0x0
#define DR7_RW_WRITE 0x1
#define DR7_RW_IO 0x2
#define DR7_RW_RW 0x3
#define DR7_LEN_1 0x0
#define DR7_LEN_2 0x1
#define DR7_LEN_8 0x2
#define DR7_LEN_4 0x3
/* Rflags register */
#define FL_CF 0x00000001 /* Carry Flag */
#define FL_PF 0x00000004 /* Parity Flag */
#define FL_AF 0x00000010 /* Auxiliary carry Flag */
#define FL_ZF 0x00000040 /* Zero Flag */
#define FL_SF 0x00000080 /* Sign Flag */
#define FL_TF 0x00000100 /* Trap Flag */
#define FL_IF 0x00000200 /* Interrupt Flag */
#define FL_DF 0x00000400 /* Direction Flag */
#define FL_OF 0x00000800 /* Overflow Flag */
#define FL_IOPL_MASK 0x00003000 /* I/O Privilege Level bitmask */
#define FL_IOPL_0 0x00000000 /* IOPL == 0 */
#define FL_IOPL_1 0x00001000 /* IOPL == 1 */
#define FL_IOPL_2 0x00002000 /* IOPL == 2 */
#define FL_IOPL_3 0x00003000 /* IOPL == 3 */
#define FL_NT 0x00004000 /* Nested Task */
#define FL_RF 0x00010000 /* Resume Flag */
#define FL_VM 0x00020000 /* Virtual 8086 mode */
#define FL_AC 0x00040000 /* Alignment Check */
#define FL_VIF 0x00080000 /* Virtual Interrupt Flag */
#define FL_VIP 0x00100000 /* Virtual Interrupt Pending */
#define FL_ID 0x00200000 /* ID flag */
/* Page fault error codes */
#define FEC_P 0x1 /* Fault caused by protection violation */
#define FEC_W 0x2 /* Fault caused by a write */
#define FEC_U 0x4 /* Fault occurred in user mode */
#define FEC_RSV 0x8 /* Fault caused by reserved PTE bit */
#define FEC_I 0x10 /* Fault caused by instruction fetch */
/*
*
* Part 2. Segmentation data structures and constants.
*
*/
/* STA_ macros are for segment type values */
#define STA_A (ONE << 0) /* Accessed */
#define STA_W (ONE << 1) /* Writable (for data segments) */
#define STA_E (ONE << 2) /* Expand down (for data segments) */
#define STA_X (ONE << 3) /* 1 = Code segment (executable) */
#define STA_R (ONE << 1) /* Readable (for code segments) */
#define STA_C (ONE << 2) /* Conforming (for code segments) */
/* SEG_ macros specify segment type values shifted into place */
#define SEG_A (STA_A << 40) /* Accessed */
#define SEG_W (STA_W << 40) /* Writable (for data segments) */
#define SEG_E (STA_E << 40) /* Expand down (for data segments) */
#define SEG_X (STA_X << 40) /* 1 = Code segment (executable) */
#define SEG_R (STA_R << 40) /* Readable (for code segments) */
#define SEG_C (STA_C << 40) /* Conforming (for code segments) */
#define SEG_S (ONE << 44) /* 1 = non-system, 0 = system segment */
#define SEG_LDT (UINT64 (0x2) << 40) /* 64-bit local descriptor segment */
#define SEG_TSSA (UINT64 (0x9) << 40) /* Available 64-bit TSS */
#define SEG_TSSB (UINT64 (0xa) << 40) /* Busy 64-bit TSS */
#define SEG_CG (UINT64 (0xc) << 40) /* 64-bit Call Gate */
#define SEG_IG (UINT64 (0xe) << 40) /* 64-bit Interrupt Gate */
#define SEG_TG (UINT64 (0xf) << 40) /* 64-bit Trap Gate */
#define SEG_DPL(x) (((x) & UINT64(3)) << 45) /* Descriptor privilege level */
#define SEG_P (ONE << 47) /* Present */
#define SEG_L (ONE << 53) /* Long mode */
#define SEG_D (ONE << 54) /* 1 = 32-bit in legacy, 0 in long mode */
#define SEG_G (ONE << 55) /* Granularity: 1 = scale limit by 4K */
/* Base and limit for 32-bit or low half of 64-bit segments */
#define SEG_LIM(x) (((x) & 0xffff) | ((x) & UINT64 (0xf0000)) << 32)
#define SEG_BASELO(x) (((CAST64 (x) & 0xffffff) << 16) \
| ((CAST64 (x) & 0xff000000) << 32))
#define SEG_BASEHI(x) (CAST64 (x) >> 32)
#define SEG32_ASM(type, base, lim) \
.word (((lim) >> 12) & 0xffff), ((base) & 0xffff); \
.byte (((base) >> 16) & 0xff), (0x90 | (type)), \
(0xC0 | (((lim) >> 28) & 0xf)), (((base) >> 24) & 0xff)
#define SEG32(type, base, lim, dpl) \
((type) | SEG_S | SEG_P | SEG_D | SEG_G | SEG_A | SEG_DPL (dpl) \
| SEG_BASELO (base) | SEG_LIM ((lim) >> 12))
#define SEG64(type, dpl) \
((type) | SEG_S | SEG_P | SEG_G | SEG_L | SEG_A | SEG_DPL (dpl) \
| SEG_LIM (0xffffffff))
/* Target and segment selector for trap/interrupt gates */
#define SEG_SEL(x) (((x) & 0xffff) << 16)
#define SEG_TARGETLO(x) ((CAST64 (x) & 0xffff) \
| ((CAST64 (x) & 0xffff0000) << 32))
#define SEG_TARGETHI(x) (CAST64 (x) >> 32)
#define GATE32(type, sel, target, dpl) \
((type) | SEG_DPL (dpl) | SEG_P | SEG_SEL (sel) | SEG_TARGETLO (target))
#define SETGATE(gate, type, sel, target, dpl) \
do { \
gate.gd_lo = GATE32 (type, sel, target, dpl); \
gate.gd_hi = SEG_TARGETHI (target); \
} while (0)
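/*
 * Example (illustrative sketch, not part of the original header): building
 * a minimal long-mode GDT with SEG64 and installing an interrupt gate with
 * SETGATE. The names example_gdt/example_idt, the gate64 layout, and the
 * code selector value 0x08 are hypothetical.
 */
struct gate64 {
	uint64_t gd_lo;
	uint64_t gd_hi;
};

static uint64_t example_gdt[] = {
	0,				/* null descriptor */
	SEG64(SEG_X | SEG_R, 0),	/* kernel code, DPL 0 */
	SEG64(SEG_W, 0),		/* kernel data, DPL 0 */
};

static struct gate64 example_idt[256];

static void example_set_isr(int vec, void (*isr)(void))
{
	/* selector 0x08 = kernel code entry above; interrupt gate, DPL 0 */
	SETGATE(example_idt[vec], SEG_IG, 0x08, isr, 0);
}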


@ -1,131 +0,0 @@
/*
* msr.h - x86 Machine-specific Register (MSR) support
*
* Based on code from XV6, created by MIT PDOS.
*/
#pragma once
#include <base/types.h>
static inline uint64_t rdmsr(uint64_t msr)
{
uint32_t low, high;
asm volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr));
return (uint64_t)low | ((uint64_t)high << 32);
}
static inline void wrmsr(uint64_t msr, uint64_t val)
{
uint32_t low = (val & 0xffffffff);
uint32_t high = (val >> 32);
asm volatile("wrmsr" : : "c" (msr), "a" (low), "d" (high) : "memory");
}
// FS/GS base registers
#define MSR_FS_BASE 0xc0000100
#define MSR_GS_BASE 0xc0000101
#define MSR_GS_KERNBASE 0xc0000102
// SYSCALL and SYSRET registers
#define MSR_STAR 0xc0000081
#define MSR_LSTAR 0xc0000082
#define MSR_CSTAR 0xc0000083
#define MSR_SFMASK 0xc0000084
#define MSR_INTEL_MISC_ENABLE 0x1a0
#define MISC_ENABLE_PEBS_UNAVAILABLE (1<<12) // Read-only
// AMD performance event-select registers
#define MSR_AMD_PERF_SEL0 0xC0010000
#define MSR_AMD_PERF_SEL1 0xC0010001
#define MSR_AMD_PERF_SEL2 0xC0010002
#define MSR_AMD_PERF_SEL3 0xC0010003
// AMD performance event-count registers
#define MSR_AMD_PERF_CNT0 0xC0010004
#define MSR_AMD_PERF_CNT1 0xC0010005
#define MSR_AMD_PERF_CNT2 0xC0010006
#define MSR_AMD_PERF_CNT3 0xC0010007
// Intel performance event-select registers
#define MSR_INTEL_PERF_SEL0 0x00000186
// Intel performance event-count registers
#define MSR_INTEL_PERF_CNT0 0x000000c1
#define MSR_INTEL_PERF_GLOBAL_STATUS 0x38e
#define PERF_GLOBAL_STATUS_PEBS (1ull << 62)
#define MSR_INTEL_PERF_GLOBAL_CTRL 0x38f
#define MSR_INTEL_PERF_GLOBAL_OVF_CTRL 0x390
#define MSR_INTEL_PERF_CAPABILITIES 0x345 // RO
#define MSR_INTEL_PEBS_ENABLE 0x3f1
#define MSR_INTEL_PEBS_LD_LAT 0x3f6
#define MSR_INTEL_DS_AREA 0x600
// Common event-select bits
#define PERF_SEL_USR (1ULL << 16)
#define PERF_SEL_OS (1ULL << 17)
#define PERF_SEL_EDGE (1ULL << 18)
#define PERF_SEL_INT (1ULL << 20)
#define PERF_SEL_ENABLE (1ULL << 22)
#define PERF_SEL_INV (1ULL << 23)
#define PERF_SEL_CMASK_SHIFT 24
// APIC Base Address Register MSR
#define MSR_APIC_BAR 0x0000001b
#define APIC_BAR_XAPIC_EN (1 << 11)
#define APIC_BAR_X2APIC_EN (1 << 10)
#define MSR_PKG_ENERGY_STATUS 0x00000611
static inline uintptr_t getfsbase(void)
{
#ifdef USE_RDWRGSFS
uintptr_t base;
asm volatile("rdfsbase %0" : "=r"(base));
return base;
#else
return rdmsr(MSR_FS_BASE);
#endif
}
static inline uintptr_t getgsbase(void)
{
#ifdef USE_RDWRGSFS
uintptr_t base;
asm volatile("rdgsbase %0" : "=r"(base));
return base;
#else
return rdmsr(MSR_GS_BASE);
#endif
}
static inline void setfsbase(uintptr_t base)
{
#ifdef USE_RDWRGSFS
asm volatile("wrfsbase %0" : : "r"(base));
#else
wrmsr(MSR_FS_BASE, base);
#endif
}
static inline void setgsbase(uintptr_t base)
{
#ifdef USE_RDWRGSFS
asm volatile("wrgsbase %0" : : "r"(base));
#else
wrmsr(MSR_GS_BASE, base);
#endif
}
static inline void setgskernbase(uintptr_t base)
{
assert(!is_irq_enabled());
asm volatile("swapgs");
#ifdef USE_RDWRGSFS
asm volatile("wrgsbase %0" : : "r"(base));
#else
wrmsr(MSR_GS_BASE, base);
#endif
asm volatile("swapgs");
}
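/*
 * Example (illustrative sketch, not part of the original header):
 * programming performance counter 0 with rdmsr()/wrmsr(). The event
 * encoding 0x3c (unhalted core cycles on many Intel parts) is an
 * assumption for illustration only.
 */
static void example_enable_cycle_counter(void)
{
	/* select the event and enable counting in user and kernel mode */
	wrmsr(MSR_INTEL_PERF_SEL0,
	      0x3c | PERF_SEL_USR | PERF_SEL_OS | PERF_SEL_ENABLE);
	/* newer CPUs also gate PMC0 behind bit 0 of the global control MSR */
	wrmsr(MSR_INTEL_PERF_GLOBAL_CTRL,
	      rdmsr(MSR_INTEL_PERF_GLOBAL_CTRL) | 1);
}

static uint64_t example_read_cycle_counter(void)
{
	return rdmsr(MSR_INTEL_PERF_CNT0);
}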


@ -1,148 +0,0 @@
/*
* ops.h - useful x86 opcodes
*/
#pragma once
#include <ix/stddef.h>
/* CPUID Flags. */
#define CPUID_FLAG_FPU 0x1 /* Floating Point Unit. */
#define CPUID_FLAG_VME 0x2 /* Virtual Mode Extensions. */
#define CPUID_FLAG_DE 0x4 /* Debugging Extensions. */
#define CPUID_FLAG_PSE 0x8 /* Page Size Extensions. */
#define CPUID_FLAG_TSC 0x10 /* Time Stamp Counter. */
#define CPUID_FLAG_MSR 0x20 /* Model-specific registers. */
#define CPUID_FLAG_PAE 0x40 /* Physical Address Extensions. */
#define CPUID_FLAG_MCE 0x80 /* Machine Check Exceptions. */
#define CPUID_FLAG_CXCHG8 0x100 /* Compare and exchange 8-byte. */
#define CPUID_FLAG_APIC 0x200 /* On-chip APIC. */
#define CPUID_FLAG_SEP 0x800 /* Fast System Calls. */
#define CPUID_FLAG_MTRR 0x1000 /* Memory Type Range Registers. */
#define CPUID_FLAG_PGE 0x2000 /* Page Global Enable. */
#define CPUID_FLAG_MCA 0x4000 /* Machine Check Architecture. */
#define CPUID_FLAG_CMOV 0x8000 /* Conditional move-instruction. */
#define CPUID_FLAG_PAT 0x10000 /* Page Attribute Table. */
#define CPUID_FLAG_PSE36 0x20000 /* 36-bit Page Size Extensions. */
#define CPUID_FLAG_PSN 0x40000 /* Processor Serial Number. */
#define CPUID_FLAG_CLFL 0x80000 /* CLFLUSH - fixme? */
#define CPUID_FLAG_DTES 0x200000 /* Debug Trace and EMON Store MSRs. */
#define CPUID_FLAG_ACPI 0x400000 /* Thermal Control MSR. */
#define CPUID_FLAG_MMX 0x800000 /* MMX instruction set. */
#define CPUID_FLAG_FXSR 0x1000000 /* Fast floating point save/restore. */
#define CPUID_FLAG_SSE 0x2000000 /* SSE (Streaming SIMD Extensions) */
#define CPUID_FLAG_SSE2 0x4000000 /* SSE2 (Streaming SIMD Extensions - #2) */
#define CPUID_FLAG_SS 0x8000000 /* Self-snoop. */
#define CPUID_FLAG_HTT 0x10000000 /* Hyper-Threading Technology. */
#define CPUID_FLAG_TM1 0x20000000 /* Thermal Interrupts, Status MSRs. */
#define CPUID_FLAG_IA64 0x40000000 /* IA-64 (64-bit Intel CPU) */
#define CPUID_FLAG_PBE 0x80000000 /* Pending Break Event. */
/* from xv6, created by MIT PDOS */
static inline void cpuid(uint32_t info, uint32_t *eaxp,
uint32_t *ebxp, uint32_t *ecxp,
uint32_t *edxp)
{
uint32_t eax, ebx, ecx, edx;
asm volatile("cpuid"
: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
: "a" (info));
if (eaxp)
*eaxp = eax;
if (ebxp)
*ebxp = ebx;
if (ecxp)
*ecxp = ecx;
if (edxp)
*edxp = edx;
}
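/*
 * Example (sketch): testing a CPUID leaf-1 feature bit. Leaf 1 reports the
 * CPUID_FLAG_* bits above in EDX.
 */
static inline bool cpu_has_tsc(void)
{
	uint32_t edx;
	cpuid(0x1, NULL, NULL, NULL, &edx);
	return (edx & CPUID_FLAG_TSC) != 0;
}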
static inline uint64_t rdtsc(void)
{
uint32_t a, d;
asm volatile("rdtsc" : "=a" (a), "=d" (d));
return ((uint64_t) a) | (((uint64_t) d) << 32);
}
static inline uint64_t rdtscp(uint32_t *auxp)
{
unsigned int a, d, c;
asm volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
if (auxp)
*auxp = c;
return ((uint64_t) a) | (((uint64_t) d) << 32);
}
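/*
 * Example (sketch): timing a code region in cycles. rdtscp() is used on
 * the trailing edge because it waits for earlier instructions to retire
 * before reading the counter.
 */
static inline uint64_t example_time_cycles(void (*fn)(void))
{
	uint64_t start = rdtsc();
	fn();
	return rdtscp(NULL) - start;
}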
static inline uint64_t read_cr3(void)
{
uint64_t val;
asm volatile("movq %%cr3, %0" : "=r" (val));
return val;
}
static inline void write_cr3(uint64_t val)
{
asm volatile("movq %0, %%cr3" : : "r" (val));
}
#define PCID_COUNT (1 << 12)
#ifdef USE_INVPCID
static inline void invpcid(uint16_t pcid, uint64_t type, uintptr_t la)
{
struct {
uint64_t pcid:12;
uint64_t rsv:52;
uint64_t la;
} desc;
assert(pcid < PCID_COUNT);
desc.pcid = pcid;
desc.rsv = 0;
desc.la = la;
asm volatile("invpcid (%0), %1" : :
"r" (&desc), "r" (type) : "memory");
}
enum {
INVPCID_TYPE_ADDR = 0, /* individual address invalidation */
INVPCID_TYPE_CTX, /* single context invalidation */
INVPCID_TYPE_ALL_GLB, /* all contexts and global translations */
INVPCID_TYPE_ALL, /* all contexts except global translations */
};
#endif /* USE_INVPCID */
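/*
 * Example (sketch, meaningful only when USE_INVPCID is defined): dropping
 * a single stale translation for one PCID without flushing the whole TLB.
 */
#ifdef USE_INVPCID
static inline void example_invalidate_one(uint16_t pcid, const void *va)
{
	invpcid(pcid, INVPCID_TYPE_ADDR, (uintptr_t)va);
}
#endif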
static inline void flush_tlb_addr(const void *va)
{
asm volatile("invlpg (%0)" : : "r" (va) : "memory");
}
static inline void set_pgroot(uint16_t pcid, uintptr_t pa, bool inval)
{
assert(pcid < PCID_COUNT);
if (inval)
write_cr3(pa | (uintptr_t) pcid);
else
write_cr3(pa | (uintptr_t) pcid | (1UL << 63));
}
static inline void monitor(void const *p, unsigned extensions, unsigned hints)
{
asm volatile("monitor" : : "a" (p), "c" (extensions), "d" (hints));
}
static inline void mwait(unsigned idle_state, unsigned flags)
{
asm volatile("mwait" : : "a" (idle_state), "c" (flags));
}
#define IDLE_STATE_C1 0x00 /* ~2 microseconds */
#define IDLE_STATE_C1E 0x01 /* ~10 microseconds */
#define IDLE_STATE_C3 0x10 /* ~33 microseconds */
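/*
 * Example (sketch): waiting for a flag with monitor/mwait instead of
 * spinning. Re-arming monitor and re-checking the flag before each mwait
 * closes the window in which a store could otherwise be missed.
 */
static inline void example_wait_for_flag(volatile int *flag)
{
	while (!*flag) {
		monitor((const void *)flag, 0, 0);
		if (!*flag)
			mwait(IDLE_STATE_C1, 0);
	}
}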


@ -1,33 +0,0 @@
/*
* procmap.h - parses linux process map information
*/
#pragma once
#include <base/stddef.h>
struct procmap_entry {
uintptr_t begin;
uintptr_t end;
uint64_t offset;
bool r; // Readable
bool w; // Writable
bool x; // Executable
bool p; // Private (or shared)
char *path;
int type;
};
#define PROCMAP_TYPE_UNKNOWN 0x00
#define PROCMAP_TYPE_FILE 0x01
#define PROCMAP_TYPE_ANONYMOUS 0x02
#define PROCMAP_TYPE_HEAP 0x03
#define PROCMAP_TYPE_STACK 0x04
#define PROCMAP_TYPE_VSYSCALL 0x05
#define PROCMAP_TYPE_VDSO 0x06
#define PROCMAP_TYPE_VVAR 0x07
typedef int (*procmap_cb_t)(const struct procmap_entry *, unsigned long data);
extern int procmap_iterate(procmap_cb_t cb, unsigned long data);
extern void procmap_dump(void);
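/*
 * Example (sketch): printing every executable mapping via
 * procmap_iterate(). log_info() is assumed to come from the base library,
 * and the convention that a zero return continues iteration is an
 * assumption.
 */
static int example_exec_cb(const struct procmap_entry *e, unsigned long data)
{
	if (e->x)
		log_info("exec map: 0x%lx-0x%lx %s", e->begin, e->end,
			 e->path ? e->path : "[anonymous]");
	return 0;
}

static void example_dump_exec_maps(void)
{
	procmap_iterate(example_exec_cb, 0);
}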


@ -1,128 +0,0 @@
/*
* trap.h - x86 exception and interrupt support
*/
#pragma once
#include <base/stddef.h>
#include <base/cpu.h>
#include <dune/mmu.h>
/* format used by LGDT and LIDT instructions */
struct tptr {
uint16_t limit;
uint64_t base;
} __packed;
/* the interrupt descriptor table (IDT) descriptor format */
struct idtd {
uint16_t low;
uint16_t selector;
uint8_t ist;
uint8_t type;
uint16_t middle;
uint32_t high;
uint32_t zero;
} __packed;
#define IDTD_P (1 << 7)
#define IDTD_CPL3 (3 << 5)
#define IDTD_TRAP_GATE 0xF
#define IDTD_INTERRUPT_GATE 0xE
#define IDT_ENTRIES 256
/* the task-switch segment (TSS) descriptor format */
struct tssd {
char ign1[4];
uint64_t rsp[3];
uint64_t ist[8];
char ign2[10];
uint16_t iomb;
uint8_t iopb[];
} __packed;
/* x86 trap codes */
#define T_DIVIDE 0 // divide error
#define T_DEBUG 1 // debug exception
#define T_NMI 2 // non-maskable interrupt
#define T_BRKPT 3 // breakpoint
#define T_OFLOW 4 // overflow
#define T_BOUND 5 // bounds check
#define T_ILLOP 6 // illegal opcode
#define T_DEVICE 7 // device not available
#define T_DBLFLT 8 // double fault
/* #define T_COPROC 9 */ // reserved (not generated by recent processors)
#define T_TSS 10 // invalid task switch segment
#define T_SEGNP 11 // segment not present
#define T_STACK 12 // stack exception
#define T_GPFLT 13 // general protection fault
#define T_PGFLT 14 // page fault
/* #define T_RES 15 */ // reserved
#define T_FPERR 16 // floating point error
#define T_ALIGN 17 // alignment check
#define T_MCHK 18 // machine check
#define T_SIMDERR 19 // SIMD floating point error
/**
* irq_disable - disables interrupts
*/
static inline void irq_disable(void)
{
asm volatile("cli" : : : "memory");
}
/**
* irq_enable - enables interrupts
*/
static inline void irq_enable(void)
{
asm volatile("sti" : : : "memory");
}
/**
* is_irq_enabled - are interrupts currently enabled?
*
* Returns true if interrupts are enabled.
*/
static inline bool is_irq_enabled(void)
{
unsigned long flags;
asm volatile("pushf\n\t"
"pop %0\n\t"
: "=rm" (flags) : : "memory");
return ((flags & FL_IF) > 0);
}
/**
* irq_save - disables interrupts, saving the current interrupt status
*
* Returns the current FLAGS.
*/
static inline unsigned long irq_save(void)
{
unsigned long flags;
asm volatile("pushf\n\t"
"pop %0\n\t"
: "=rm" (flags) : : "memory");
if (flags & FL_IF)
irq_disable();
return flags;
}
/**
* irq_restore - restores the previous interrupt status
* @flags: the previous FLAGS
*/
static inline void irq_restore(unsigned long flags)
{
asm volatile("push %0\n\t"
"popf\n\t"
: : "g" (flags) : "memory", "cc");
}
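/*
 * Example (sketch): a critical section that must not be interrupted.
 * irq_save() disables interrupts only if they were enabled, and
 * irq_restore() returns to exactly the prior state, so the pair nests
 * safely inside other interrupt-disabled regions.
 */
static inline void example_update_counter(uint64_t *counter)
{
	unsigned long flags = irq_save();
	(*counter)++;	/* cannot be preempted by a local interrupt here */
	irq_restore(flags);
}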


@ -1,84 +0,0 @@
/*
* vm.h - virtual memory management
*/
#pragma once
#include <base/mem.h>
#include <base/page.h>
#include <dune/mmu.h>
#define PGLEVEL_4KB 0
#define PGLEVEL_2MB 1
#define PGLEVEL_1GB 2
#define PGLEVEL_NUM 4
#define PGLEVEL_TO_SIZE(level) (1 << PDSHIFT(level))
#define PGSIZE_TO_LEVEL(size) ((__builtin_ctz(size) - PGSHIFT_4KB) / NPTBITS)
/*
* Raw Operations
*/
extern int
vm_lookup_pte(ptent_t *tbl, const void *va,
int *level_out, ptent_t **pte_out);
extern int
vm_insert_pte(ptent_t *tbl, const void *va,
int level, ptent_t pte_in);
extern int
vm_get_pte(ptent_t *tbl, const void *va,
int level, ptent_t **pte_out);
extern int
vm_remove_pte(ptent_t *tbl, const void *va,
int *level_out, ptent_t *pte_out);
/*
* Page Operations
*/
extern int
vm_lookup_page(ptent_t *tbl, const void *va, struct page **pg_out);
extern int
vm_insert_page(ptent_t *tbl, const void *va,
struct page *pg, ptent_t flags);
extern int
vm_remove_page(ptent_t *tbl, const void *va,
struct page **pg_out);
/*
* Ranged Operations
*/
extern int
vm_map_phys(ptent_t *tbl, physaddr_t pa, const void *va,
size_t len, int pgsize, ptent_t flags);
extern int
vm_map_pages(ptent_t *tbl, const void *va, size_t len,
int pgsize, ptent_t flags);
extern int
vm_map_copy(ptent_t *tbl, const void *src_va, const void *map_va,
size_t len, int pgsize, ptent_t flags);
extern bool
vm_mod(ptent_t *tbl, const void *va, size_t len, int pgsize, ptent_t flags);
extern bool
vm_disable(ptent_t *tbl, const void *va, size_t len, int pgsize);
extern void
vm_unmap(ptent_t *tbl, const void *va, size_t len, int pgsize);
extern void
vm_unmap_pages(ptent_t *tbl, const void *va, size_t len, int pgsize);
/*
* Page Tables
*/
extern ptent_t *vm_create_pt(void);
extern ptent_t *vm_clone_kern_pt(void);
extern void vm_destroy_pt(ptent_t *tbl);
extern ptent_t *kern_pgtbl;
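/*
 * Example (illustrative sketch, not part of the original header): cloning
 * the kernel page table and mapping a fresh region into it with 2MB pages.
 * PTE_W is assumed to be a writable-page flag from dune/mmu.h, and the
 * pgsize argument is assumed to take a size in bytes.
 */
static ptent_t *example_map_scratch(const void *va, size_t len)
{
	ptent_t *tbl = vm_clone_kern_pt();
	if (!tbl)
		return NULL;
	if (vm_map_pages(tbl, va, len, PGLEVEL_TO_SIZE(PGLEVEL_2MB),
			 PTE_W) != 0) {
		vm_destroy_pt(tbl);
		return NULL;
	}
	return tbl;
}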