Compare commits
master...nsdi_final (4 commits)

Commits:
- d1b17da7ae
- 23d54f9f3b
- 9f7f74699d
- decab1ef6a
CONTRIBUTING.md (new file, 36 lines)
@@ -0,0 +1,36 @@
## Code Overview

apps - synthetic and benchmarking applications.

base - an extension to the standard C library that provides tools for managing
lists, memory, bitmaps, initialization, atomics, and several other useful
features.

bindings - language bindings (C++ and Rust) for the runtime.

dpdk - [DPDK](https://www.dpdk.org/) library for accessing NIC queues
from userspace.

iokernel - a dedicated core that steers packets and reallocates cores
across applications.

net - a packet manipulation library.

runtime - a user-level threading and networking runtime.

shim - a shim layer that enables running unmodified
[PARSEC](http://parsec.cs.princeton.edu/) applications atop Shenango.


## Coding Style

Use the following conventions for C code:
https://www.kernel.org/doc/html/v4.10/process/coding-style.html

Use the following conventions for C++ code:
https://google.github.io/styleguide/cppguide.html

For third-party libraries and tools, use their existing coding style.

For some helpful tips on how to write clean code, see:
https://www.lysator.liu.se/c/pikestyle.html
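For a concrete sense of the C conventions referenced above, here is a small, purely hypothetical helper written in that style (tabs for indentation, kernel brace placement); it is illustrative only and not part of the tree.

```c
/* Hypothetical example, not repository code: a helper in the Linux
 * kernel style linked above (tabs, braces on the same line for control
 * statements, a separate line for the function's opening brace). */
#include <stddef.h>

static size_t count_set_bits(const unsigned long *words, size_t n)
{
	size_t i, count = 0;

	for (i = 0; i < n; i++) {
		unsigned long w = words[i];

		while (w) {
			w &= w - 1;	/* clear the lowest set bit */
			count++;
		}
	}

	return count;
}
```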
LICENSE (new file, 201 lines)
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
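As the appendix above describes, the boilerplate is applied by wrapping it in the comment syntax of each file; for a C source file it would look like the sketch below, with the bracketed fields left for the copyright owner to fill in.

```c
/*
 * Copyright [yyyy] [name of copyright owner]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
```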
Makefile (16 lines changed)
@@ -32,11 +32,6 @@ print-% : ; @echo $* = $($*)
base_src = $(wildcard base/*.c)
base_obj = $(base_src:.c=.o)

-# libdune.a - the dune library
-dune_src = $(wildcard dune/*.c)
-dune_asm = $(wildcard dune/*.S)
-dune_obj = $(dune_src:.c=.o) $(dune_asm:.S=.o)
-
#libnet.a - a packet/networking utility library
net_src = $(wildcard net/*.c) $(wildcard net/ixgbe/*.c)
net_obj = $(net_src:.c=.o)
@@ -76,14 +71,11 @@ DPDK_LIBS += -lrte_pmd_mlx4 -libverbs -lmlx4
endif

# must be first
-all: libbase.a libdune.a libnet.a libruntime.a iokerneld iokerneld-noht $(test_targets)
+all: libbase.a libnet.a libruntime.a iokerneld iokerneld-noht $(test_targets)

libbase.a: $(base_obj)
	$(AR) rcs $@ $^

-libdune.a: $(dune_obj)
-	$(AR) rcs $@ $^
-
libnet.a: $(net_obj)
	$(AR) rcs $@ $^

@@ -102,8 +94,8 @@ $(test_targets): $(test_obj) libbase.a libruntime.a libnet.a base/base.ld
	$(LD) $(LDFLAGS) -o $@ $@.o libruntime.a libnet.a libbase.a -lpthread

# general build rules for all targets
-src = $(base_src) $(dune_src) $(net_src) $(runtime_src) $(iokernel_src) $(test_src)
-asm = $(dune_asm) $(runtime_asm)
+src = $(base_src) $(net_src) $(runtime_src) $(iokernel_src) $(test_src)
+asm = $(runtime_asm)
obj = $(src:.c=.o) $(asm:.S=.o) $(iokernel_src:.c=-noht.o)
dep = $(obj:.o=.d)

@@ -132,5 +124,5 @@ sparse: $(src)

.PHONY: clean
clean:
-	rm -f $(obj) $(dep) libbase.a libdune.a libnet.a libruntime.a \
+	rm -f $(obj) $(dep) libbase.a libnet.a libruntime.a \
	iokerneld iokerneld-noht $(test_targets)
README.md (66 lines changed)
@@ -5,7 +5,7 @@ simultaneously provide low tail latency and high CPU efficiency, by
rapidly reallocating cores across applications, at timescales as small
as every 5 microseconds.

-## How do I use it?
+## How to Run Shenango

1) Clone the Shenango repository.

@@ -24,14 +24,7 @@ make clean && make

To enable debugging, build with `make DEBUG=1`.

-3) Setup hugepages. We require at least 64 2MB hugepages. Run the script below
-and follow the instructions for NUMA or non-NUMA systems, as appropriate.
-
-```
-./dpdk/usertools/dpdk-setup.sh
-```
-
-4) Install Rust and build a synthetic client-server application.
+3) Install Rust and build a synthetic client-server application.

```
curl https://sh.rustup.rs -sSf | sh
@@ -44,7 +37,7 @@ cargo update
cargo build --release
```

-5) Run the synthetic application with a client and server. The client
+4) Run the synthetic application with a client and server. The client
sends requests to the server, which performs a specified amount of
fake work (e.g., computing square roots for 10us), before responding.

@@ -60,51 +53,12 @@ sudo ./iokerneld
./apps/synthetic/target/release/synthetic 192.168.1.3:5000 --config client.config --mode runtime-client
```

-### Supported Platforms
+## Supported Platforms

-This code has been tested most thoroughly on Ubuntu 17.10, with kernel
-4.14.0. It has been tested with Intel 82599ES 10 Gbit/s NICs and
-Mellanox ConnectX-3 Pro 10 Gbit/s NICs. If you use Mellanox NICs, you
+This code has been tested most thoroughly on Ubuntu 18.04, with kernel
+4.15.0. It has been tested with Intel 82599ES 10 Gbits/s NICs and
+Mellanox ConnectX-3 Pro 10 Gbits/s NICs. If you use Mellanox NICs, you
should install the Mellanox OFED as described in [DPDK's
-documentation](https://doc.dpdk.org/guides/nics/mlx4.html).
-
-## How do I contribute?
-
-### Code Overview
-
-apps - synthetic and benchmarking applications.
-
-base - a extension to the standard C library that provides tools for managing
-lists, memory, bitmaps, initialization, atomics, and several other useful
-features.
-
-bindings - language bindings (C++ and rust) for the runtime.
-
-dpdk - [DPDK](https://www.dpdk.org/) library for accessing NIC queues
-from userspace.
-
-dune - a better implementation of libdune based on the base library.
-
-iokernel - dedicated core that steers packets and reallocates cores
-across applications.
-
-net - a packet manipulation library.
-
-runtime - a user-level threading and networking runtime.
-
-shim - a shim layer that enables running unmodified
-[PARSEC](http://parsec.cs.princeton.edu/) applications atop Shenango.
-
-
-### Coding Style
-
-Use the following conventions for C code:
-https://www.kernel.org/doc/html/v4.10/process/coding-style.html
-
-Use the following conventions for C++ code:
-https://google.github.io/styleguide/cppguide.html
-
-For third party libraries and tools, use their existing coding style.
-
-For some helpful tips on how to write clean code, see:
-https://www.lysator.liu.se/c/pikestyle.html
+documentation](https://doc.dpdk.org/guides/nics/mlx4.html). If you use
+Intel NICs, you should insert the IGB UIO module and bind your NIC
+interface to it (e.g., using the script `./dpdk/usertools/dpdk-setup.sh`).
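The README above mentions that the server performs "fake work" such as computing square roots for 10us per request. The actual synthetic application is written in Rust; the sketch below is a hypothetical C illustration of the same idea, using a loop calibrated once at startup.

```c
/*
 * Hypothetical sketch only (not the apps/synthetic code): burn a fixed
 * amount of per-request fake work by computing square roots in a
 * calibrated loop.
 */
#include <math.h>
#include <stdint.h>

/* iterations_per_us would be measured once at startup on the target CPU */
static void fake_work(uint64_t us, uint64_t iterations_per_us)
{
	volatile double sink = 0.0;
	uint64_t i, n = us * iterations_per_us;

	for (i = 0; i < n; i++)
		sink += sqrt((double)i);

	(void)sink;	/* keep the compiler from optimizing the loop away */
}
```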
apps/bench/.gitignore (vendored, 2 lines changed)
@@ -4,8 +4,6 @@ efficiency_linux
stress
stress_linux
tbench
-tbench_arachne
-tbench_linux
netbench
netbench2
netbench_udp
@@ -5,7 +5,6 @@ CXXPATH = ../../bindings/cc
INC = -I../../inc -I../../bindings/cc -I./
CXXFLAGS = -g -Wall -std=gnu++11 -D_GNU_SOURCE $(INC) -mssse3
LDFLAGS = -T../../base/base.ld -no-pie
-ARACHNE ?= /home/friedj/memcached-arachne/arachne-all/Arachne

LD = g++
CC = g++
@@ -27,9 +26,6 @@ fake_worker_obj = $(fake_worker_src:.cc=.o)
tbench_src = tbench.cc
tbench_obj = $(tbench_src:.cc=.o)

-tbench_linux_src = tbench_linux.cc
-tbench_linux_obj = $(tbench_linux_src:.cc=.o)
-
callibrate_src = callibrate.cc
callibrate_obj = $(callibrate_src:.cc=.o)

@@ -65,25 +61,14 @@ linux_mech_bench_obj = $(linux_mech_bench_src:.cc=.o)

librt_libs = $(CXXPATH)/librt++.a $(BASEPATH)/libruntime.a $(BASEPATH)/libnet.a $(BASEPATH)/libbase.a

-LIBS_ARACHNE=-I$(ARACHNE)/include -I$(ARACHNE)/../CoreArbiter/include -I$(ARACHNE)/../PerfUtils/include \
-	-L$(ARACHNE)/lib -lArachne -L$(ARACHNE)/../CoreArbiter/lib -lCoreArbiter \
-	$(ARACHNE)/../PerfUtils/lib/libPerfUtils.a -lpcrecpp -pthread
-
# must be first
-all: tbench tbench_linux callibrate stress efficiency efficiency_linux \
+all: tbench callibrate stress efficiency efficiency_linux \
	netbench netbench2 netbench_udp netbench_linux netperf linux_mech_bench \
-	stress_linux tbench_arachne
+	stress_linux

tbench: $(tbench_obj) $(librt_libs)
	$(LD) -o $@ $(LDFLAGS) $(tbench_obj) $(librt_libs) -lpthread

-tbench_linux: $(tbench_linux_obj)
-	$(LD) -o $@ $(LDFLAGS) $(tbench_linux_obj) -lpthread
-
-tbench_arachne: tbench_arachne.cc
-	$(LD) -o $@ $(LDFLAGS) tbench_arachne.cc $(LIBS_ARACHNE)
-

callibrate: $(fake_worker_obj) $(callibrate_obj)
	$(LD) -o $@ $(LDFLAGS) $(fake_worker_obj) $(callibrate_obj) -lpthread

@@ -120,7 +105,7 @@ linux_mech_bench: $(linux_mech_bench_obj) $(librt_libs)
	$(LD) -o $@ $(LDFLAGS) $(linux_mech_bench_obj) $(librt_libs) -lpthread

# general build rules for all targets
-src = $(fake_worker_src) $(tbench_src) $(tbench_linux_src) $(callibrate_src)
+src = $(fake_worker_src) $(tbench_src) $(callibrate_src)
src += $(stress_src) $(efficiency_src) $(efficiency_linux_src) $(netbench_src)
src += $(netbench2_src) $(netbench_udp_src) $(netbench_linux_src) $(netperf_src)
src += $(linux_mech_bench_src)
@@ -140,6 +125,6 @@ endif

.PHONY: clean
clean:
-	rm -f $(obj) $(dep) tbench tbench_linux callibrate stress efficiency \
+	rm -f $(obj) $(dep) tbench callibrate stress efficiency \
	efficiency_linux netbench netbench2 netbench_udp netbench_linux \
-	netperf linux_mech_bench stress_linux tbench_arachne
+	netperf linux_mech_bench stress_linux
@@ -1,33 +1,9 @@
# Threading Benchmarks

-First build Shenango and Arachne, then build the benchmarks in this directory
-with `make clean && make`. Run the benchmarks as described below, restricting
-each to run on a single core.
+First build Shenango and then build the benchmarks in this directory
+with `make clean && make`. Run the main Shenango threading benchmarks
+as follows (benchmarks will use a single runtime core).

-## pthreads
-```
-taskset --cpu-list 2 ./tbench_linux
-```
-
-## Go
-```
-export GOMAXPROCS=1
-cd go
-go test -bench .
-```
-
-## Arachne
-In arachne-all directory:
-```
-sudo ./CoreArbiter/bin/coreArbiterServer
-```
-
-In this directory:
-```
-./tbench_arachne
-```
-
-## Shenango
-In shenango directory:
```
sudo ./iokerneld
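The threading benchmarks above time primitives such as spawn/join, yield, and condvar ping-pong. The deleted pthread version shown further below is C++; the snippet here is a simplified, hypothetical C analogue of just the spawn/join measurement, for illustration.

```c
/*
 * Simplified C analogue of the pthread spawn/join benchmark
 * (the removed tbench_linux.cc below is the C++ original).
 * Hypothetical, for illustration only.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define MEASURE_ROUNDS 100000

static void *empty_thread(void *arg) { return NULL; }

int main(void)
{
	struct timespec start, finish;
	double us;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < MEASURE_ROUNDS; i++) {
		pthread_t th;

		pthread_create(&th, NULL, empty_thread, NULL);
		pthread_join(th, NULL);
	}
	clock_gettime(CLOCK_MONOTONIC, &finish);

	us = (finish.tv_sec - start.tv_sec) * 1e6 +
	     (finish.tv_nsec - start.tv_nsec) / 1e3;
	printf("test 'SpawnJoin' took %f us per round\n", us / MEASURE_ROUNDS);
	return 0;
}
```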
@@ -1,76 +0,0 @@
package main

import (
	"testing"
	"sync"
	"runtime"
)

func BenchmarkSpawnJoin(b *testing.B) {
	c := make(chan int, 1)

	for i := 0; i < b.N; i++ {
		go func() {
			c <- 1
		}()
		<-c
	}
}

func BenchmarkUncontendedMutex(b *testing.B) {
	var m = &sync.Mutex{}

	for i := 0; i < b.N; i++ {
		m.Lock()
		m.Unlock()
	}
}

func BenchmarkYield(b *testing.B) {
	c := make(chan int, 1)

	go func() {
		for i := 0; i < b.N / 2; i++ {
			runtime.Gosched()
		}
		c <- 1
	}()

	for i := 0; i < b.N / 2; i++ {
		runtime.Gosched()
	}

	<-c
}

func BenchmarkCondvarPingPong(b *testing.B) {
	m := &sync.Mutex{}
	cv := sync.NewCond(m)
	c := make(chan int, 1)
	dir := bool(false)

	go func() {
		m.Lock()
		for i := 0; i < b.N / 2; i++ {
			for dir {
				cv.Wait()
			}
			dir = true
			cv.Signal()
		}
		m.Unlock()
		c <- 1
	}()

	m.Lock()
	for i := 0; i < b.N / 2; i++ {
		for !dir {
			cv.Wait()
		}
		dir = false
		cv.Signal()
	}
	m.Unlock()

	<-c
}
@@ -1,127 +0,0 @@
#include <chrono>
#include <iostream>

#include "Arachne/Arachne.h"


namespace {

using us = std::chrono::duration<double, std::micro>;
constexpr int kMeasureRounds = 1000000;

void empty_thread() {;}

void BenchSpawnJoin() {
	for (int i = 0; i < kMeasureRounds; ++i) {
		auto th = Arachne::createThread(empty_thread);
		Arachne::join(th);
	}
}

void BenchUncontendedMutex() {
	Arachne::SpinLock mutex;
	volatile unsigned long foo = 0;

	for (int i = 0; i < kMeasureRounds; ++i) {
		mutex.lock();
		foo++;
		mutex.unlock();
	}
}

void yielder() {
	for (int i = 0; i < kMeasureRounds / 2; ++i)
		Arachne::yield();
}

void BenchYield() {
	auto th = Arachne::createThread(yielder);
	yielder();
	Arachne::join(th);
}

struct pong {
	Arachne::SpinLock mutex;
	Arachne::ConditionVariable cv;
	bool dir = false;
};

void ping_pong_1(struct pong *p)
{
	p->mutex.lock();
	for (int i = 0; i < kMeasureRounds / 2; ++i) {
		while (p->dir)
			p->cv.wait(p->mutex);
		p->dir = true;
		p->cv.notifyOne();
	}
	p->mutex.unlock();
}

void BenchCondvarPingPong() {
	struct pong p;

	auto th = Arachne::createThread(ping_pong_1, &p);

	p.mutex.lock();
	for (int i = 0; i < kMeasureRounds / 2; ++i) {
		while (!p.dir)
			p.cv.wait(p.mutex);
		p.dir = false;
		p.cv.notifyOne();
	}

	Arachne::join(th);
}

void PrintResult(std::string name, us time) {
	time /= kMeasureRounds;
	std::cout << "test '" << name << "' took "<< time.count() << " us."
		  << std::endl;
}

int MainHandler() {
	auto start = std::chrono::steady_clock::now();
	BenchSpawnJoin();
	auto finish = std::chrono::steady_clock::now();
	PrintResult("SpawnJoin",
		    std::chrono::duration_cast<us>(finish - start));

	start = std::chrono::steady_clock::now();
	BenchUncontendedMutex();
	finish = std::chrono::steady_clock::now();
	PrintResult("UncontendedMutex",
		    std::chrono::duration_cast<us>(finish - start));

	start = std::chrono::steady_clock::now();
	BenchYield();
	finish = std::chrono::steady_clock::now();
	PrintResult("Yield",
		    std::chrono::duration_cast<us>(finish - start));

	start = std::chrono::steady_clock::now();
	BenchCondvarPingPong();
	finish = std::chrono::steady_clock::now();
	PrintResult("CondvarPingPong",
		    std::chrono::duration_cast<us>(finish - start));

	Arachne::shutDown();
	return 0;
}

} // anonymous namespace

// Requires coreArbiter: ./coreArbiterServer
int
main(int argc, const char** argv) {
	// Initialize the library
	Arachne::minNumCores = 1;
	Arachne::maxNumCores = 1;
	Arachne::disableLoadEstimation = true;
	Arachne::init(&argc, argv);

	Arachne::createThreadOnCore(2, MainHandler);

	Arachne::waitForTermination();
	return 0;
}
@@ -1,104 +0,0 @@
#include <chrono>
#include <iostream>
#include <thread>
#include <mutex>
#include <condition_variable>

namespace {

using us = std::chrono::duration<double, std::micro>;
constexpr int kMeasureRounds = 1000000;

void BenchSpawnJoin() {
	for (int i = 0; i < kMeasureRounds; ++i) {
		auto th = std::thread([](){;});
		th.join();
	}
}

void BenchUncontendedMutex() {
	std::mutex m;
	volatile unsigned long foo = 0;

	for (int i = 0; i < kMeasureRounds; ++i) {
		std::unique_lock<std::mutex> l(m);
		foo++;
	}
}

void BenchYield() {
	auto th = std::thread([](){
		for (int i = 0; i < kMeasureRounds / 2; ++i)
			std::this_thread::yield();
	});

	for (int i = 0; i < kMeasureRounds / 2; ++i)
		std::this_thread::yield();

	th.join();
}

void BenchCondvarPingPong() {
	std::mutex m;
	std::condition_variable cv;
	bool dir = false; // shared and protected by @m.

	auto th = std::thread([&](){
		std::unique_lock<std::mutex> l(m);
		for (int i = 0; i < kMeasureRounds / 2; ++i) {
			while (dir)
				cv.wait(l);
			dir = true;
			cv.notify_one();
		}
	});

	std::unique_lock<std::mutex> l(m);
	for (int i = 0; i < kMeasureRounds / 2; ++i) {
		while (!dir)
			cv.wait(l);
		dir = false;
		cv.notify_one();
	}

	th.join();
}

void PrintResult(std::string name, us time) {
	time /= kMeasureRounds;
	std::cout << "test '" << name << "' took "<< time.count() << " us."
		  << std::endl;
}

void MainHandler(void *arg) {
	auto start = std::chrono::steady_clock::now();
	BenchSpawnJoin();
	auto finish = std::chrono::steady_clock::now();
	PrintResult("SpawnJoin",
		    std::chrono::duration_cast<us>(finish - start));

	start = std::chrono::steady_clock::now();
	BenchUncontendedMutex();
	finish = std::chrono::steady_clock::now();
	PrintResult("UncontendedMutex",
		    std::chrono::duration_cast<us>(finish - start));

	start = std::chrono::steady_clock::now();
	BenchYield();
	finish = std::chrono::steady_clock::now();
	PrintResult("Yield",
		    std::chrono::duration_cast<us>(finish - start));

	start = std::chrono::steady_clock::now();
	BenchCondvarPingPong();
	finish = std::chrono::steady_clock::now();
	PrintResult("CondvarPingPong",
		    std::chrono::duration_cast<us>(finish - start));
}

} // anonymous namespace

int main(int argc, char *argv[]) {
	MainHandler(NULL);
	return 0;
}
@@ -6,7 +6,6 @@ use std::path::PathBuf;
fn main() {
    // Tell cargo to tell rustc to link the library.
    println!("cargo:rustc-link-lib=static=base");
-    println!("cargo:rustc-link-lib=static=dune");
    println!("cargo:rustc-link-lib=static=net");
    println!("cargo:rustc-link-lib=static=runtime");
    println!("cargo:rustc-flags=-L ../..");
dune/dune.h (deleted, 51 lines)
@@ -1,51 +0,0 @@
/**
 * dune.h - public header for Dune support
 */

#pragma once

#include <base/types.h>
#include <asm/ioctl.h>

/*
 * IOCTL interface
 */

/* FIXME: this must be reserved in miscdevice.h */
#define DUNE_MINOR 233

#define DUNE_ENTER _IOR(DUNE_MINOR, 0x01, struct dune_config)
#define DUNE_GET_SYSCALL _IO(DUNE_MINOR, 0x02)
#define DUNE_GET_LAYOUT _IOW(DUNE_MINOR, 0x03, struct dune_layout)

#define DUNE_SIGNAL_INTR_BASE 200

struct dune_config {
	uintptr_t rip;
	uintptr_t rsp;
	uintptr_t cr3;
	long ret;
} __attribute__((packed));

extern int __dune_enter(int fd, struct dune_config *cfg);
extern int __dune_ret(void);

struct dune_layout {
	uintptr_t phys_limit;
	uintptr_t base_map;
	uintptr_t base_stack;
} __attribute__((packed));

#define GPA_STACK_SIZE ((unsigned long)1 << 28) /* 256 megabytes */
#define GPA_MAP_SIZE (((unsigned long)1 << 32) - GPA_STACK_SIZE) /* 3.75 gigabytes */

static inline physaddr_t gpa_stack_base(const struct dune_layout *layout)
{
	return layout->phys_limit - GPA_STACK_SIZE;
}

static inline physaddr_t gpa_map_base(const struct dune_layout *layout)
{
	return layout->phys_limit - GPA_STACK_SIZE - GPA_MAP_SIZE;
}
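For context on the ioctl interface that dune.h declares, the minimal sketch below shows how a client opens /dev/dune and queries the memory layout; it mirrors what dune/entry.c (the next deleted file) does during initialization, and is a sketch rather than repository code.

```c
/*
 * Minimal sketch of the ioctl interface declared in dune.h above;
 * it mirrors the open() + DUNE_GET_LAYOUT sequence in dune/entry.c.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "dune.h"

int query_dune_layout(void)
{
	struct dune_layout layout;
	int fd = open("/dev/dune", O_RDWR);

	if (fd < 0)
		return -1;

	if (ioctl(fd, DUNE_GET_LAYOUT, &layout) != 0) {
		close(fd);
		return -1;
	}

	printf("phys_limit 0x%016lx base_map 0x%016lx base_stack 0x%016lx\n",
	       (unsigned long)layout.phys_limit,
	       (unsigned long)layout.base_map,
	       (unsigned long)layout.base_stack);
	close(fd);
	return 0;
}
```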
dune/entry.c (deleted, 465 lines)
@@ -1,465 +0,0 @@
/*
 * entry.c - routines for managing Dune, user-kernel mode transitions,
 * and CPU initialization
 */

#include <fcntl.h>
#include <sys/ioctl.h>
#include <asm/prctl.h>

#include <base/stddef.h>
#include <base/page.h>
#include <base/log.h>
#include <base/thread.h>
#include <base/lock.h>

#include <dune/vm.h>
#include <dune/procmap.h>
#include <dune/entry.h>
#include <dune/mmu.h>
#include <dune/trap.h>
#include <dune/fpu.h>
#include <dune/msr.h>

#include "dune.h"

/*
 * kern_pgtbl contains all the mappings necessary to run the kernel.
 * After initialization, it is immutable, and therefore does not
 * require locking.
 */
ptent_t *kern_pgtbl;

/* the per-cpu kernel context stack pointer */
__thread uintptr_t kern_sp;

uintptr_t entry_vdso_base;

static int dune_fd;
static bool linux_has_vvar;

static struct idtd idt_template[IDT_ENTRIES];

static uint64_t gdt_template[GDT_ENTRIES] = {
	0,
	0,
	SEG64(SEG_X | SEG_R, 0),
	SEG64(SEG_W, 0),
	0,
	SEG64(SEG_W, 3),
	SEG64(SEG_X | SEG_R, 3),
	0,
	0,
};

static inline void set_idt_addr(struct idtd *id, physaddr_t addr)
{
	id->low = addr & 0xFFFF;
	id->middle = (addr >> 16) & 0xFFFF;
	id->high = (addr >> 32) & 0xFFFFFFFF;
}

static void entry_init_idt(void)
{
	int i;

	for (i = 0; i < IDT_ENTRIES; i++) {
		struct idtd *id = &idt_template[i];
		uintptr_t isr = (uintptr_t)&trap_entry_tbl[TRAP_ENTRY_SIZE * i];

		memset(id, 0, sizeof(*id));

		id->selector = GD_KT;
		id->type = IDTD_P | IDTD_TRAP_GATE;

		switch (i) {
		case T_BRKPT:
			id->type |= IDTD_CPL3;
			/* fallthrough */
		case T_DBLFLT:
		case T_NMI:
		case T_MCHK:
			id->ist = 1;
			break;
		}

		set_idt_addr(id, isr);
	}
}

static int entry_init_layout(struct dune_layout *layout)
{
	int ret = ioctl(dune_fd, DUNE_GET_LAYOUT, layout);
	if (ret)
		return -EIO;

	log_info("entry: dune mem layout\n");
	log_info("\tphys_limit\t0x%016lx\n", layout->phys_limit);
	log_info("\tmap_base\t0x%016lx\n", layout->base_map);
	log_info("\tstack_back\t0x%016lx\n", layout->base_stack);

	return 0;
}

static ptent_t procmap_entry_to_flags(const struct procmap_entry *e)
{
	ptent_t flags = PTE_P | PTE_G;

	if (e->w)
		flags |= PTE_W;
	if (!e->x)
		flags |= PTE_NX;

	return flags;
}

static int entry_procmap_cb(const struct procmap_entry *e, unsigned long data)
{
	struct dune_layout *layout = (struct dune_layout *) data;

	if (e->type == PROCMAP_TYPE_VDSO || e->type == PROCMAP_TYPE_VVAR) {
		off_t off = e->begin - layout->base_stack;
		size_t len = e->end - e->begin;
		ptent_t flags = procmap_entry_to_flags(e);

		if (e->type == PROCMAP_TYPE_VVAR)
			linux_has_vvar = true;
		else
			entry_vdso_base = e->begin;

		if (off + len > GPA_STACK_SIZE)
			panic("entry: dune stack region does not contain vsdo\n");

		if (flags & PTE_W) {
			log_err("entry: can't support writable vdso regions\n");
			return -EINVAL;
		}

		return vm_map_phys(kern_pgtbl, gpa_stack_base(layout) + off,
				   (void *)e->begin, len, PGSIZE_4KB, flags | PTE_U);
	}

	if (e->type == PROCMAP_TYPE_VSYSCALL) {
		return vm_map_copy(kern_pgtbl, vsyscall_page, (void *)e->begin,
				   PGSIZE_4KB, PGSIZE_4KB, PTE_P | PTE_G | PTE_U);
	}

	if (e->type == PROCMAP_TYPE_STACK) {
		off_t off = e->begin - layout->base_stack;
		return vm_map_phys(kern_pgtbl, gpa_stack_base(layout) + off,
				   (void *)e->begin, e->end - e->begin, PGSIZE_4KB,
				   PTE_P | PTE_W | PTE_G | PTE_NX);
	}

	/* ignore entries inside the dune map region */
	if (e->end >= gpa_map_base(layout)) {
		if (e->begin < layout->base_map ||
		    e->end > layout->base_map + GPA_MAP_SIZE) {
			log_err("entry: procmap entry is out of range - "
				"0x%016lx-0x%016lx %c%c%c%c %08lx %s\n",
				e->begin, e->end,
				e->r ? 'R' : '-',
				e->w ? 'W' : '-',
				e->x ? 'X' : '-',
				e->p ? 'P' : 'S',
				e->offset, e->path);

			return -EINVAL;
		}

		return 0;
	}

	/* skip regions mapped by the page allocator */
	if (e->begin >= PAGE_BASE_ADDR && e->end <= PAGE_END_ADDR)
		return 0;

	return vm_map_phys(kern_pgtbl, (physaddr_t)e->begin, (void *)e->begin,
			   e->end - e->begin, PGSIZE_4KB,
			   procmap_entry_to_flags(e));
}

static int entry_setup_oldstyle_vvar(void)
{
	log_info("entry: didn't find [vvar] section, creating one manually\n");

#define VVAR_ADDR 0xffffffffff5ff000UL
	return vm_map_copy(kern_pgtbl, (void *)VVAR_ADDR, (void *)VVAR_ADDR,
			   PGSIZE_4KB, PGSIZE_4KB, PTE_P | PTE_G | PTE_U);
}

static int entry_setup_syscall(void)
{
	int ret;
	uintptr_t lstar, aligned_lstar;
	struct page *pg;
	size_t total_len = (size_t)syscall_enter_end -
			   (size_t)syscall_enter;
	size_t part_len;
	void *buf;

	BUG_ON(total_len > PGSIZE_4KB);

	lstar = ioctl(dune_fd, DUNE_GET_SYSCALL);
	if (lstar == -1)
		return -EIO;

	aligned_lstar = PGADDR_4KB(lstar);

	pg = page_alloc(PGSIZE_4KB);
	if (!pg)
		return -ENOMEM;

	ret = vm_insert_page(kern_pgtbl, (void *)aligned_lstar,
			     pg, PTE_P | PTE_G);
	if (ret)
		return ret;

	part_len = min(total_len, PGSIZE_4KB - PGOFF_4KB(lstar));
	buf = (char *)page_to_addr(pg) + PGOFF_4KB(lstar);
	memcpy(buf, syscall_enter, part_len);
	total_len -= part_len;

	/* did the handler spill over to a second page boundary? */
	if (total_len) {
		pg = page_alloc(PGSIZE_4KB);
		if (!pg)
			return -ENOMEM;

		aligned_lstar += PGSIZE_4KB;
		ret = vm_insert_page(kern_pgtbl, (void *)aligned_lstar,
				     pg, PTE_P | PTE_G);
		if (ret)
			return ret;

		buf = page_to_addr(pg);
		memcpy(buf, &syscall_enter[part_len], total_len);
	}

	return 0;
}

static int entry_init_pgtbl(const struct dune_layout *layout)
{
	int ret;

	kern_pgtbl = vm_create_pt();
	if (!kern_pgtbl)
		return -ENOMEM;

	/* step 1: bulk map the dune map region */
	ret = vm_map_phys(kern_pgtbl, gpa_map_base(layout),
			  (void *)layout->base_map, GPA_MAP_SIZE,
			  PGSIZE_2MB, PTE_P | PTE_W | PTE_G);
	if (ret)
		goto fail;

	/* step 2: identity map the base library page-map region */
	ret = vm_map_phys(kern_pgtbl, (physaddr_t)PAGE_BASE_ADDR,
			  (void *)PAGE_BASE_ADDR, PAGE_END_ADDR - PAGE_BASE_ADDR,
			  PGSIZE_2MB, PTE_P | PTE_W | PTE_G | PTE_NX);
	if (ret)
		goto fail;

	/* step 3: precision map phdr, heap, stack, vdso, and vvar sections */
	ret = procmap_iterate(&entry_procmap_cb, (unsigned long)layout);
	if (ret)
		goto fail;

	if(!linux_has_vvar) {
		ret = entry_setup_oldstyle_vvar();
		if (ret)
			goto fail;
	}

	/* step 4: map the system call handler page */
	ret = entry_setup_syscall();
	if (ret)
		goto fail;

	return 0;

fail:
	vm_destroy_pt(kern_pgtbl);
	return ret;
}

/**
 * entry_init - initialization for entry
 */
int entry_init(void)
{
	int ret;
	struct dune_layout layout;

	dune_fd = open("/dev/dune", O_RDWR);
	if (dune_fd < 0) {
		log_err("entry: failed to open dune device\n");
		return -EIO;
	}

	entry_init_idt();

	ret = entry_init_layout(&layout);
	if (ret) {
		log_err("entry: unable to get dune memory layout\n");
		return ret;
	}

	ret = entry_init_pgtbl(&layout);
	if (ret) {
		log_err("entry: failed to create kernel page table\n");
		return ret;
	}

	return 0;
}

static __thread uint64_t gdt[GDT_ENTRIES] __aligned(CACHE_LINE_SIZE);
static __thread struct tssd tss __aligned(CACHE_LINE_SIZE);
static __thread struct idtd idt __aligned(CACHE_LINE_SIZE);
static __thread struct entry_percpu cpu_entry;

/* FIXME: protect the stacks with guard pages */
static int entry_setup_stacks(struct tssd *tss)
{
	int i;
	struct page *safe_stack_pg, *intr_stack_pg;
	char *safe_stack, *intr_stack;

	safe_stack_pg = page_alloc(PGSIZE_4KB);
	if (!safe_stack_pg)
		return -ENOMEM;

	safe_stack = page_to_addr(safe_stack_pg);
	safe_stack += PGSIZE_4KB;
	tss->iomb = offsetof(struct tssd, iopb);

	for (i = 0; i < 8; i++)
		tss->ist[i] = (uintptr_t) safe_stack;

	intr_stack_pg = page_alloc(PGSIZE_4KB);
	if (!intr_stack_pg) {
		page_put_addr(safe_stack_pg);
		return -ENOMEM;
	}

	intr_stack = page_to_addr(intr_stack_pg);
	intr_stack += PGSIZE_4KB;
	tss->rsp[0] = (uintptr_t)intr_stack;
	kern_sp = (uintptr_t)intr_stack;

	return 0;
}

static int entry_start_dune(void)
{
	struct dune_config conf;
	int ret;

	conf.rip = (uintptr_t)&__dune_ret;
	conf.rsp = 0;
	conf.cr3 = (uintptr_t)kern_pgtbl;

	ret = __dune_enter(dune_fd, &conf);
	if (ret) {
		log_err("entry: failed to enter dune mode\n");
		return ret;
	}

	return 0;
}

static int entry_boot_cpu(struct entry_percpu *ent,
			  uintptr_t gdt_addr, uintptr_t idt_addr)
{
	struct tptr _idtr, _gdtr;

	_gdtr.base = gdt_addr;
	_gdtr.limit = sizeof(gdt_template) - 1;

	_idtr.base = idt_addr;
	_idtr.limit = sizeof(idt_template) - 1;

	asm volatile(
		/* STEP 1: load the new GDT */
		"lgdt %0\n"

		/* STEP 2: initialize data segements */
		"mov $" __cstr(GD_KD) ", %%ax\n"
		"mov %%ax, %%ds\n"
		"mov %%ax, %%es\n"
		"mov %%ax, %%ss\n"

		/* STEP 3: long jump into the new code segment */
		"mov $" __cstr(GD_KT) ", %%rax\n"
		"pushq %%rax\n"
		"pushq $1f\n"
		"lretq\n"
		"1: nop\n"

		/* STEP 4: load the task register (for safe stack switching) */
		"mov $" __cstr(GD_TSS) ", %%ax\n"
		"ltr %%ax\n"

		/* STEP 5: load the new IDT */
		"lidt %1\n"

		: : "m" (_gdtr), "m" (_idtr) : "rax");

	/* STEP 6: FS and GS require special initialization on 64-bit */
	setfsbase(ent->kfs_base);
	setgsbase((uintptr_t)ent);
	setgskernbase((uintptr_t)ent);
	irq_enable();

	return 0;
}

extern int arch_prctl(int code, unsigned long *addr);

/*
 * entry_init_one - per-cpu initialization for entry
 */
int entry_init_one(void)
{
	int ret;
	struct entry_percpu *ent = &cpu_entry;
	unsigned long fs_base;

	/* step 1: set up the TSS */
	ret = entry_setup_stacks(&tss);
	if (ret)
		return ret;

	/* step 2: set up the GDT */
	memcpy(gdt, gdt_template, sizeof(gdt_template));
	gdt[GD_TSS >> 3] = (SEG_TSSA | SEG_P | SEG_A |
			    SEG_BASELO(&tss) |
			    SEG_LIM(sizeof(struct tssd) - 1));
	gdt[GD_TSS2 >> 3] = SEG_BASEHI(&tss);

	/* step 3: set up the IDT */
	memcpy(&idt, idt_template, sizeof(idt));

	/* step 4: setup the entry per-cpu structure */
	if (arch_prctl(ARCH_GET_FS, &fs_base) == -1) {
		log_err("entry: failed to get current FS.base\n");
		return -EIO;
	}

	ent->kfs_base = fs_base;
	ent->ugs_base = 0;

	/* step 5: enter dune mode */
	ret = entry_start_dune();
	if (ret)
		return ret;

	/* step 6: set up architectural state */
	ret = entry_boot_cpu(ent, (uintptr_t)gdt, (uintptr_t)&idt);
	if (ret)
		return ret;

	return 0;
}
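The stack-region mapping in entry_procmap_cb() above places each host virtual address at the same offset within the top GPA_STACK_SIZE bytes of guest-physical memory. The hypothetical helper below restates that arithmetic in isolation, written against the definitions in dune.h; it is a sketch, not repository code.

```c
/*
 * Sketch of the stack-region address translation used by
 * entry_procmap_cb(): va in [base_stack, base_stack + GPA_STACK_SIZE)
 * maps to gpa_stack_base(layout) + (va - base_stack).
 */
#include <sys/types.h>

#include "dune.h"

static inline physaddr_t stack_va_to_gpa(const struct dune_layout *layout,
					 uintptr_t va)
{
	off_t off = va - layout->base_stack;

	/* equals phys_limit - GPA_STACK_SIZE + off */
	return gpa_stack_base(layout) + off;
}
```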
dune/procmap.c (deleted, 136 lines)
@@ -1,136 +0,0 @@
/*
 * procmap.c - Parse linux process map information.
 */

/*
 * Format:
 * start addr-end addr perms offset dev(xx:yy) inode path
 *
 * Permsissions:
 * rwxp
 * ||||
 * Readable ---------+|||
 * (r or -)           |||
 * Writable ----------+||
 * (w or -)            ||
 * Executable ---------+|
 * (X or -)             |
 * Private/Shared ------+
 * (p or s)
 *
 * Special Paths:
 * - <filename>
 * - anonymous
 * - [heap]
 * - [stack]
 * - [vsyscall]
 * - [vdso]
 *
 * Example /proc/self/maps:
 * 00400000-0040b000 r-xp 00000000 fe:00 917797 /bin/cat
 * 0060a000-0060b000 r--p 0000a000 fe:00 917797 /bin/cat
 * 0060b000-0060c000 rw-p 0000b000 fe:00 917797 /bin/cat
 * 022cf000-022f0000 rw-p 00000000 00:00 0 [heap]
 * 7fe598687000-7fe59881e000 r-xp 00000000 fe:00 917523 /lib/libc-2.15.so
 * 7fe59881e000-7fe598a1e000 ---p 00197000 fe:00 917523 /lib/libc-2.15.so
 * 7fe598a1e000-7fe598a22000 r--p 00197000 fe:00 917523 /lib/libc-2.15.so
 * 7fe598a22000-7fe598a24000 rw-p 0019b000 fe:00 917523 /lib/libc-2.15.so
 * 7fe598a24000-7fe598a28000 rw-p 00000000 00:00 0
 * 7fe598a28000-7fe598a49000 r-xp 00000000 fe:00 917531 /lib/ld-2.15.so
 * 7fe598c37000-7fe598c3a000 rw-p 00000000 00:00 0
 * 7fe598c47000-7fe598c48000 rw-p 00000000 00:00 0
 * 7fe598c48000-7fe598c49000 r--p 00020000 fe:00 917531 /lib/ld-2.15.so
 * 7fe598c49000-7fe598c4a000 rw-p 00021000 fe:00 917531 /lib/ld-2.15.so
 * 7fe598c4a000-7fe598c4b000 rw-p 00000000 00:00 0
 * 7fff601ca000-7fff601eb000 rw-p 00000000 00:00 0 [stack]
 * 7fff601ff000-7fff60200000 r-xp 00000000 00:00 0 [vdso]
 * ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
 */

#include <stdio.h>
#include <string.h>

#include <base/stddef.h>
#include <base/log.h>
#include <dune/procmap.h>

static int get_type(const char *path)
{
	if (path[0] != '[' && path[0] != '\0')
		return PROCMAP_TYPE_FILE;
	if (path[0] == '\0')
		return PROCMAP_TYPE_ANONYMOUS;
	if (strcmp(path, "[heap]") == 0)
		return PROCMAP_TYPE_HEAP;
	if (strncmp(path, "[stack", 6) == 0)
		return PROCMAP_TYPE_STACK;
	if (strcmp(path, "[vsyscall]") == 0)
		return PROCMAP_TYPE_VSYSCALL;
	if (strcmp(path, "[vdso]") == 0)
		return PROCMAP_TYPE_VDSO;
	if (strcmp(path, "[vvar]") == 0)
		return PROCMAP_TYPE_VVAR;
	return PROCMAP_TYPE_UNKNOWN;
}

int procmap_iterate(procmap_cb_t cb, unsigned long data)
{
	struct procmap_entry e;
	FILE *map;
	unsigned int dev1, dev2, inode;
	char read, write, execute, private;
	char line[512];
	char path[256];
	int ret = 0;

	map = fopen("/proc/self/maps", "r");
	if (map == NULL) {
		log_err("procmap: could not open /proc/self/maps!\n");
		return -EIO;
	}

	setvbuf(map, NULL, _IOFBF, 8192);

	while (!feof(map)) {
		path[0] = '\0';
		if (fgets(line, 512, map) == NULL)
			break;
		sscanf((char *)&line, "%lx-%lx %c%c%c%c %lx %x:%x %d %s",
		       &e.begin, &e.end,
		       &read, &write, &execute, &private, &e.offset,
		       &dev1, &dev2, &inode, path);
		e.r = (read == 'r');
		e.w = (write == 'w');
		e.x = (execute == 'x');
		e.p = (private == 'p');
		e.path = path;
		e.type = get_type(path);
		ret = cb(&e, data);
		if (ret)
			break;
	}

	fclose(map);

	return ret;
}

static int
procmap_dump_helper(const struct procmap_entry *e, unsigned long data)
{
	log_info("0x%016lx-0x%016lx %c%c%c%c %08lx %s\n",
		 e->begin, e->end,
		 e->r ? 'R' : '-',
		 e->w ? 'W' : '-',
		 e->x ? 'X' : '-',
		 e->p ? 'P' : 'S',
		 e->offset, e->path);

	return 0;
}

void procmap_dump()
{
	log_info("--- Process Map Dump ---\n");
	procmap_iterate(&procmap_dump_helper, 0);
}
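The file above exposes a callback-style iterator, procmap_iterate(), which entry.c and procmap_dump() both use. The sketch below shows a hypothetical caller written against the same callback signature as procmap_dump_helper(); it is illustrative only.

```c
/*
 * Example use of procmap_iterate(): count the executable mappings in
 * the current process. Hypothetical caller, written against the
 * callback signature used by procmap_dump_helper() above.
 */
#include <base/log.h>
#include <dune/procmap.h>

static int count_exec_cb(const struct procmap_entry *e, unsigned long data)
{
	int *count = (int *)data;

	if (e->x)
		(*count)++;
	return 0;	/* returning non-zero would stop the iteration */
}

void log_exec_region_count(void)
{
	int count = 0;

	procmap_iterate(&count_exec_cb, (unsigned long)&count);
	log_info("found %d executable regions\n", count);
}
```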
dune/trap.c (deleted, 137 lines)
@@ -1,137 +0,0 @@
/*
 * trap.c - x86 exception and interrupt support
 */

#include <base/stddef.h>
#include <base/log.h>
#include <dune/vm.h>
#include <dune/entry.h>
#include <dune/trap.h>
#include <dune/msr.h>

#define STACK_DUMP_DEPTH 16
#define NUM_CODES 20

static const char *trap_codes[NUM_CODES] = {
	"divide error",
	"debug exception",
	"non-maskable interrupt",
	"breakpoint",
	"overflow",
	"bounds check",
	"illegal opcode",
	"dev not available",
	"double fault",
	"reserved",
	"invalid TSS",
	"segment not present",
	"stack exception",
	"general protection fault",
	"page fault",
	"reserved",
	"floating point error",
	"alignment check",
	"machine check",
	"SIMD error",
};

static int safe_peekq(uint64_t *addr, uint64_t *val)
{
	int ret, level;
	ptent_t *pte;

	ret = vm_lookup_pte(kern_pgtbl, addr, &level, &pte);
	if (ret)
		return ret;

	if (!(*pte & PTE_P))
		return -EINVAL;

	if (*pte & PTE_PAGE) {
		uint64_t *direct_ptr = (uint64_t *)((char *)PTE_ADDR(*pte) +
			((off_t)addr & (PGLEVEL_TO_SIZE(level) - 1)));
		*val = *direct_ptr;
	} else {
		*val = *(uint64_t *)addr;
	}

	return 0;
}

static void dump_stack(uintptr_t rsp)
{
	int i;
	uint64_t *sp = (uint64_t *)rsp;

	log_info("dumping stack contents:\n");

	if (rsp & (sizeof(uint64_t) - 1)) {
		log_err("misaligned stack\n");
		return;
	}

	for (i = 0; i < STACK_DUMP_DEPTH; i++) {
		uint64_t val;

		if (!safe_peekq(&sp[i], &val)) {
			log_info("*(RSP+%03d) 0x%016lx\n",
				 (int)(i * sizeof(uint64_t)), val);
		} else {
			log_info("*(RSP+%03d) <unmapped>\n",
				 (int)(i * sizeof(uint64_t)));
			break;
		}
	}
}

void dump_trap_frame(struct env_tf *tf)
{
	log_info("--- Begin Frame Dump ---\n");
	log_info("RIP 0x%016lx\n", tf->rip);
	log_info("CS 0x%02x SS 0x%02x\n", tf->cs, tf->ss);
	log_info("ERR 0x%08x RFLAGS 0x%08lx\n", tf->err, tf->rflags);
	log_info("RAX 0x%016lx RCX 0x%016lx\n", tf->rax, tf->rcx);
	log_info("RDX 0x%016lx RBX 0x%016lx\n", tf->rdx, tf->rbx);
	log_info("RSP 0x%016lx RBP 0x%016lx\n", tf->rsp, tf->rbp);
	log_info("RSI 0x%016lx RDI 0x%016lx\n", tf->rsi, tf->rdi);
	log_info("R8 0x%016lx R9 0x%016lx\n", tf->r8, tf->r9);
	log_info("R10 0x%016lx R11 0x%016lx\n", tf->r10, tf->r11);
	log_info("R12 0x%016lx R13 0x%016lx\n", tf->r12, tf->r13);
	log_info("R14 0x%016lx R15 0x%016lx\n", tf->r14, tf->r15);
	log_info("FS.base 0x%016lx GS.base 0x%016lx\n",
		 getfsbase(), getgsbase());
	dump_stack(tf->rsp);
	log_info("--- End Frame Dump ---\n");
}

static void dump_pgflt(struct env_tf *tf)
{
	uint32_t fec = tf->err;
	uintptr_t fault_addr;

	asm volatile("mov %%cr2, %0" : "=r" (fault_addr));

	log_err("trap: %s page fault at ADDR 0x%016lx (%s, %s%s)\n",
		(fec & FEC_U) ? "user" : "kernel", fault_addr,
		(fec & FEC_P) ? "protection" : "non-present page",
		(fec & FEC_RSV) ? "reserved bit error, " : "",
		(fec & FEC_I) ? "code" : "data");
	if (fault_addr < PGSIZE_4KB)
		log_err("trap: likely NULL pointer exception\n");
}

void trap_handler(int num, struct env_tf *tf)
{
	bool user = ((tf->cs & 0x3) == 0x3);

	if (num == T_PGFLT) {
		dump_pgflt(tf);
	} else {
		log_err("trap: unhandled trap %d (%s) in %s\n", num,
			num < NUM_CODES ? trap_codes[num] : "spurious",
			user ? "user" : "kernel");
	}

	dump_trap_frame(tf);
	init_shutdown(EXIT_FAILURE);
}
454
dune/trapasm.S
454
dune/trapasm.S
@ -1,454 +0,0 @@
|
||||
/*
|
||||
* trapasm.S - assembly helper routines (e.g. system calls, interrupts, traps)
|
||||
*/
|
||||
|
||||
/*
|
||||
* Enabling USE_RDRWGSFS can reduce system call overhead but this feature
|
||||
* is only available on Ivy Bridge and later Intel CPUs.
|
||||
*
|
||||
* FIXME: detect this automatically
|
||||
*/
|
||||
|
||||
#define MSR_FS_BASE 0xc0000100
|
||||
#define MSR_GS_BASE 0xc0000101
|
||||
#define GD_KT 0x10
|
||||
#define GD_KD 0x18
|
||||
#define GD_UD 0x28 | 0x03
|
||||
#define GD_UT 0x30 | 0x03
|
||||
|
||||
/*
|
||||
* Trap Frame Format
|
||||
* NOTE: this reflects the layout of struct dune_tf
|
||||
*/
|
||||
|
||||
/* arguments */
|
||||
#define RDI (0)
|
||||
#define RSI (8)
|
||||
#define RDX (16)
|
||||
#define RCX (24)
|
||||
#define R8 (32)
|
||||
#define R9 (40)
|
||||
|
||||
/* other registers */
|
||||
#define R10 (48)
|
||||
#define R11 (56)
|
||||
#define RBX (64)
|
||||
#define RBP (72)
|
||||
#define R12 (80)
|
||||
#define R13 (88)
|
||||
#define R14 (96)
|
||||
#define R15 (104)
|
||||
|
||||
#define REG_END (112)
|
||||
|
||||
/* syscall num / return code */
|
||||
#define RAX (112)
|
||||
|
||||
/* exception frame */
|
||||
#define ERR (120)
|
||||
#define RIP (128)
|
||||
#define CS (136)
|
||||
#define RFLAGS (144)
|
||||
#define RSP (152)
|
||||
#define SS (160)
|
||||
|
||||
#define EF_START (128)
|
||||
#define TF_END (168)
|
||||
#define TF_ALIGN (176)
|
||||
|
||||
/*
|
||||
* Dune Config Format
|
||||
* NOTE: this reflects the layout of struct dune_config
|
||||
*/
|
||||
#define DUNE_CFG_RIP (0)
|
||||
#define DUNE_CFG_RSP (8)
|
||||
#define DUNE_CFG_CR3 (16)
|
||||
#define DUNE_CFG_RET (24)
|
||||
|
||||
/*
|
||||
* Supervisor Private Area Format
|
||||
*/
|
||||
#define TMP (8)
|
||||
#define KFS_BASE (16)
|
||||
#define UFS_BASE (24)
|
||||
#define UGS_BASE (32)
|
||||
#define FLAGS (40)
|
||||
#define THREAD_STACK (48)
|
||||
|
||||
#define FLAG_IN_USER 0x1
|
||||
#define FLAG_LOAD_USER 0x2
|
||||
|
||||
|
||||
.text
|
||||
|
||||
/*
|
||||
* macro to save destructable register state
|
||||
*/
|
||||
.macro SAVE_REGS save_full=1, include_rax=1
|
||||
movq %rdi, RDI(%rsp)
|
||||
movq %rsi, RSI(%rsp)
|
||||
movq %rdx, RDX(%rsp)
|
||||
movq %r8, R8(%rsp)
|
||||
movq %r9, R9(%rsp)
|
||||
|
||||
.if \save_full
|
||||
movq %r10, R10(%rsp)
|
||||
movq %r11, R11(%rsp)
|
||||
movq %rcx, RCX(%rsp)
|
||||
.endif
|
||||
|
||||
.if \include_rax
|
||||
movq %rax, RAX(%rsp)
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/*
|
||||
* macro to save the rest of register state
|
||||
*
|
||||
* useful for operations that violate AMD64 calling conventions
|
||||
* by destroying callee restored state
|
||||
*/
|
||||
.macro SAVE_REST
|
||||
movq %rbx, RBX(%rsp)
|
||||
movq %rbp, RBP(%rsp)
|
||||
movq %r12, R12(%rsp)
|
||||
movq %r13, R13(%rsp)
|
||||
movq %r14, R14(%rsp)
|
||||
movq %r15, R15(%rsp)
|
||||
.endm
|
||||
|
||||
/*
|
||||
 * macro to restore destructible register state
|
||||
*/
|
||||
.macro RESTORE_REGS rstor_full=1, include_rax=1
|
||||
.if \include_rax
|
||||
movq RAX(%rsp), %rax
|
||||
.endif
|
||||
|
||||
.if \rstor_full
|
||||
movq RCX(%rsp), %rcx
|
||||
movq R11(%rsp), %r11
|
||||
movq R10(%rsp), %r10
|
||||
.endif
|
||||
|
||||
movq R9(%rsp), %r9
|
||||
movq R8(%rsp), %r8
|
||||
movq RDX(%rsp), %rdx
|
||||
movq RSI(%rsp), %rsi
|
||||
movq RDI(%rsp), %rdi
|
||||
.endm
|
||||
|
||||
/*
|
||||
* macro to restore the rest of register state
|
||||
*
|
||||
* useful for operations that violate AMD64 calling conventions
|
||||
 * by destroying callee-saved state
|
||||
*/
|
||||
.macro RESTORE_REST
|
||||
movq R15(%rsp), %r15
|
||||
movq R14(%rsp), %r14
|
||||
movq R13(%rsp), %r13
|
||||
movq R12(%rsp), %r12
|
||||
movq RBP(%rsp), %rbp
|
||||
movq RBX(%rsp), %rbx
|
||||
.endm
|
||||
|
||||
/*
|
||||
* macro to setup FS and GS segments for kernel mode
|
||||
*/
|
||||
.macro SETUP_KERNEL_SEGS
|
||||
movq $0, %gs:FLAGS
|
||||
.endm
|
||||
|
||||
/*
|
||||
* macro to setup FS and GS segments for user mode
|
||||
*
|
||||
* NOTE: clobbers %rax, %rdx, and %rcx
|
||||
* WARNING: unsafe if interrupts are not disabled
|
||||
*/
|
||||
.macro SETUP_USER_SEGS check=1
|
||||
orq $FLAG_IN_USER, %gs:FLAGS
|
||||
|
||||
.if \check
|
||||
testq $FLAG_LOAD_USER, %gs:FLAGS
|
||||
jz 1f
|
||||
.endif
|
||||
|
||||
movq %gs:UFS_BASE, %rax
|
||||
#ifdef USE_RDWRGSFS
|
||||
wrfsbase %rax
|
||||
#else
|
||||
movq %rax, %rdx
|
||||
shrq $32, %rdx
|
||||
movl $MSR_FS_BASE, %ecx
|
||||
wrmsr
|
||||
#endif /* USE_RDWRGSFS */
|
||||
|
||||
movq %gs:UGS_BASE, %rax
|
||||
swapgs
|
||||
#ifdef USE_RDWRGSFS
|
||||
wrgsbase %rax
|
||||
#else
|
||||
movq %rax, %rdx
|
||||
shrq $32, %rdx
|
||||
movl $MSR_GS_BASE, %ecx
|
||||
wrmsr
|
||||
#endif /* USE_RDWRGSFS */
|
||||
|
||||
.if \check
|
||||
jmp 2f
|
||||
1: swapgs
|
||||
2:
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
||||
.globl __dune_enter
|
||||
__dune_enter:
|
||||
subq $REG_END, %rsp
|
||||
SAVE_REGS 1, 0
|
||||
SAVE_REST
|
||||
movq %rsp, DUNE_CFG_RSP(%rsi)
|
||||
movq %rsi, %rdx
|
||||
movq $0x8020e901, %rsi /* XXX DUNE_ENTER */
|
||||
movq $16, %rax /* __NR_ioctl */
|
||||
syscall
|
||||
|
||||
cmpq $0, %rax
|
||||
jnz __dune_ret
|
||||
mov %rdx, %rbx
|
||||
call init_shutdown_late
|
||||
movq DUNE_CFG_RET(%rbx), %rdi
|
||||
movq $231, %rax /* __NR_exit_group */
|
||||
syscall
|
||||
|
||||
.globl __dune_ret
|
||||
__dune_ret:
|
||||
RESTORE_REST
|
||||
RESTORE_REGS 1, 0
|
||||
addq $REG_END, %rsp
|
||||
retq
|
||||
|
||||
/*
|
||||
* System Call ABI
|
||||
* ---------------
|
||||
*
|
||||
* User Parameters:
|
||||
* %rsp - stack pointer
|
||||
* %rcx - instruction pointer
|
||||
* %r11 - eflags
|
||||
* %rax - system call number
|
||||
*
|
||||
* Arguments:
|
||||
* %rdi - arg0, %rsi - arg1, %rdx - arg2
|
||||
* %r10 - arg3, %r8 - arg4, %r9 - arg5
|
||||
*
|
||||
* Return code goes in %rax
|
||||
*
|
||||
 * XXX: don't do relative jumps - watch out, this code is copied with memcpy
|
||||
* XXX: Invoked with interrupts disabled...
|
||||
*/
|
||||
.globl syscall_enter
|
||||
syscall_enter:
|
||||
/*
|
||||
* Hack to redirect any syscall instructions executed
|
||||
* in kernel mode to the hypervisor through vmcall.
|
||||
*/
|
||||
swapgs
|
||||
testq $FLAG_IN_USER, %gs:FLAGS
|
||||
jnz 1f
|
||||
pushq %r11
|
||||
popfq
|
||||
vmcall
|
||||
jmp *%rcx
|
||||
|
||||
1:
|
||||
/* first switch to the kernel stack */
|
||||
movq %rsp, %gs:TMP
|
||||
movq %gs:THREAD_STACK, %rsp
|
||||
|
||||
/* now push the trap frame onto the stack */
|
||||
subq $TF_END, %rsp
|
||||
movq %rcx, RIP(%rsp)
|
||||
movq %r11, RFLAGS(%rsp)
|
||||
movq %r10, RCX(%rsp) /* fixup to standard 64-bit calling ABI */
|
||||
SAVE_REGS 0, 1
|
||||
SAVE_REST
|
||||
movq %gs:TMP, %rax
|
||||
movq %rax, RSP(%rsp)
|
||||
|
||||
/* configure the segment bases */
|
||||
SETUP_KERNEL_SEGS
|
||||
|
||||
/* then finally re-enable interrupts and jump to the handler */
|
||||
sti
|
||||
movq %rsp, %rdi /* argument 0 */
|
||||
lea syscall_handler, %rax
|
||||
call *%rax
|
||||
cli
|
||||
|
||||
/* restore the segment bases */
|
||||
SETUP_USER_SEGS
|
||||
|
||||
/* then pop the trap frame off the stack */
|
||||
RESTORE_REGS 0, 1
|
||||
RESTORE_REST
|
||||
movq RCX(%rsp), %r10
|
||||
movq RFLAGS(%rsp), %r11
|
||||
movq RIP(%rsp), %rcx
|
||||
|
||||
/* switch to the user stack and return to ring 3 */
|
||||
movq RSP(%rsp), %rsp
|
||||
sysretq
|
||||
|
||||
.globl syscall_enter_end
|
||||
syscall_enter_end:
|
||||
nop
|
||||
|
||||
.globl pop_tf
|
||||
pop_tf:
|
||||
/* restore callee regs */
|
||||
movq RBX(%rdi), %rbx
|
||||
movq RBP(%rdi), %rbp
|
||||
movq R12(%rdi), %r12
|
||||
movq R13(%rdi), %r13
|
||||
movq R14(%rdi), %r14
|
||||
movq R15(%rdi), %r15
|
||||
|
||||
/* restore ip and stack */
|
||||
movq RSP(%rdi), %rsp
|
||||
movq RIP(%rdi), %rcx
|
||||
|
||||
jmpq *%rcx
|
||||
|
||||
.globl pop_tf_user
|
||||
pop_tf_user:
|
||||
movq %rdi, %rsp /* might not be a stack! */
|
||||
SETUP_USER_SEGS 0
|
||||
RESTORE_REGS
|
||||
RESTORE_REST
|
||||
addq $EF_START, %rsp
|
||||
iretq
|
||||
|
||||
.globl pop_tf_user_fast
|
||||
pop_tf_user_fast:
|
||||
movq %rdi, %rsp /* might not be a stack! */
|
||||
SETUP_USER_SEGS 0
|
||||
RESTORE_REGS 0, 1
|
||||
RESTORE_REST
|
||||
movq R10(%rsp), %r10
|
||||
movq RIP(%rsp), %rcx
|
||||
movq RFLAGS(%rsp), %r11
|
||||
movq RSP(%rsp), %rsp
|
||||
sysretq
|
||||
|
||||
/**
|
||||
* switch_tf - saves the current kernel frame and pops
|
||||
* the next kernel frame
|
||||
* @cur: the current trap frame
|
||||
* @next: the next trap frame
|
||||
*/
|
||||
.globl switch_tf
|
||||
switch_tf:
|
||||
/* save callee regs */
|
||||
movq %rbx, RBX(%rdi)
|
||||
movq %rbp, RBP(%rdi)
|
||||
movq %r12, R12(%rdi)
|
||||
movq %r13, R13(%rdi)
|
||||
movq %r14, R14(%rdi)
|
||||
movq %r15, R15(%rdi)
|
||||
|
||||
/* save ip and stack */
|
||||
movq (%rsp), %rcx
|
||||
movq %rcx, RIP(%rdi)
|
||||
leaq 8(%rsp), %rcx
|
||||
movq %rcx, RSP(%rdi)
|
||||
|
||||
/* restore callee regs */
|
||||
movq RBX(%rsi), %rbx
|
||||
movq RBP(%rsi), %rbp
|
||||
movq R12(%rsi), %r12
|
||||
movq R13(%rsi), %r13
|
||||
movq R14(%rsi), %r14
|
||||
movq R15(%rsi), %r15
|
||||
|
||||
/* restore ip and stack */
|
||||
movq RSP(%rsi), %rsp
|
||||
movq RIP(%rsi), %rcx
|
||||
|
||||
/* restore arguments (in case new thread) */
|
||||
movq RDI(%rsi), %rdi # ARG0
|
||||
movq RSI(%rsi), %rsi # ARG1
|
||||
|
||||
jmpq *%rcx
|
||||
|
||||
/*
|
||||
* NOTE: interrupts start out disabled.
|
||||
* The macro generates a fixed-sized array of handlers, one for each vector.
|
||||
*/
|
||||
.globl trap_entry_tbl
|
||||
.align 16
|
||||
trap_entry_tbl:
|
||||
i = 0
|
||||
.rept 256
|
||||
.align 16
|
||||
.if i <> 8 && (i <= 9 || i >= 15) && i <> 17
|
||||
pushq %rax /* placeholder for no error code */
|
||||
.endif
|
||||
pushq %rax /* save %rax */
|
||||
mov $i, %rax
|
||||
jmp 1f
|
||||
i = i + 1
|
||||
.endr
|
||||
|
||||
1:
|
||||
/* save the remaining destructible registers */
|
||||
subq $REG_END, %rsp
|
||||
SAVE_REGS 1, 0 /* %rax already is pushed */
|
||||
SAVE_REST
|
||||
movq %rax, %rdi
|
||||
|
||||
/* determine if we were in user mode before the trap */
|
||||
testq $3, CS(%rsp)
|
||||
jz 2f
|
||||
swapgs
|
||||
SETUP_KERNEL_SEGS
|
||||
|
||||
2:
|
||||
sti
|
||||
/* setup arguments and call the handler */
|
||||
movq %rsp, %rsi
|
||||
call trap_handler
|
||||
|
||||
/* determine if we're returning to user mode */
|
||||
testq $3, CS(%rsp)
|
||||
jz 3f
|
||||
|
||||
/* return to user mode */
|
||||
cli
|
||||
SETUP_USER_SEGS
|
||||
RESTORE_REGS
|
||||
addq $EF_START, %rsp
|
||||
iretq
|
||||
|
||||
/*
|
||||
* This is the exception return fast path. It is only
|
||||
* available when returning to the kernel instead of user
|
||||
* space. The reason it is faster is that iretq has a
|
||||
* fair amount of overhead and we can avoid that by using
|
||||
* a regular retq instead.
|
||||
*/
|
||||
3:
|
||||
movq RIP(%rsp), %rax
|
||||
movq RSP(%rsp), %rcx
|
||||
subq $8, %rcx
|
||||
movq %rax, (%rcx) /* XXX: this overwrites SS in the trap frame */
|
||||
movq %rcx, RSP(%rsp)
|
||||
movq RFLAGS(%rsp), %rcx
|
||||
pushq %rcx
|
||||
popfq
|
||||
RESTORE_REGS
|
||||
|
||||
/* jump to the frame */
|
||||
movq RSP(%rsp), %rsp
|
||||
retq
dune/vm.c
@@ -1,701 +0,0 @@
/*
|
||||
* vm.h - virtual memory management support
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <base/stddef.h>
|
||||
#include <base/mem.h>
|
||||
#include <base/page.h>
|
||||
#include <dune/vm.h>
|
||||
|
||||
#define PTE_DEF_FLAGS CAST64(PTE_P | PTE_W | PTE_U)
|
||||
#define PTE_PERM_FLAGS CAST64(PTE_P | PTE_W | PTE_NX | PTE_U)
|
||||
#define PTE_COW_FLAGS CAST64(PTE_P | PTE_NX | PTE_U)
|
||||
|
||||
static bool pte_present(ptent_t e)
|
||||
{
|
||||
return (PTE_FLAGS(e) & PTE_P) > 0;
|
||||
}
|
||||
|
||||
static bool pte_big(ptent_t e)
|
||||
{
|
||||
return (PTE_FLAGS(e) & PTE_PS) > 0;
|
||||
}
|
||||
|
||||
static bool addr_is_aligned(const void *addr, int pgsize)
|
||||
{
|
||||
return !((uintptr_t)addr & (pgsize - 1));
|
||||
}
|
||||
|
||||
static bool addr_is_aligned_to_level(const void *addr, int level)
|
||||
{
|
||||
return addr_is_aligned(addr, PGLEVEL_TO_SIZE(level));
|
||||
}
|
||||
|
||||
static struct page *vm_alloc_pgdir(void)
|
||||
{
|
||||
struct page *pg = page_zalloc(PGSIZE_4KB);
|
||||
if (unlikely(!pg))
|
||||
return NULL;
|
||||
|
||||
pg->flags |= PAGE_FLAG_PGDIR;
|
||||
pg->item_count = 0;
|
||||
return pg;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_lookup_pte - looks up a page table entry
|
||||
* @tbl: the page table
|
||||
* @va: the virtual address
|
||||
* @level_out: a pointer to store the page level
|
||||
* @pte_out: a pointer to store the PTE pointer
|
||||
*
|
||||
* WARNING: Synchronization not provided...
|
||||
*
|
||||
* Returns 0 if successful, otherwise fail.
|
||||
*/
|
||||
int vm_lookup_pte(ptent_t *tbl, const void *va, int *level_out,
|
||||
ptent_t **pte_out)
|
||||
{
|
||||
ptent_t *pte = tbl;
|
||||
int level;
|
||||
|
||||
for (level = PGLEVEL_NUM - 1; level >= 0; level--) {
|
||||
pte = &pte[PDX(level, va)];
|
||||
if (!*pte)
|
||||
return -ENOENT;
|
||||
if (!level || (level <= PGLEVEL_1GB && pte_big(*pte)))
|
||||
break;
|
||||
|
||||
pte = (ptent_t *)PTE_ADDR(*pte);
|
||||
}
|
||||
|
||||
if (!addr_is_aligned_to_level(va, level))
|
||||
return -EINVAL;
|
||||
|
||||
if (level_out)
|
||||
*level_out = level;
|
||||
if (pte_out)
|
||||
*pte_out = pte;
|
||||
|
||||
return 0;
|
||||
}
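/*
 * Illustrative sketch (not from the original file): one way a caller might
 * use vm_lookup_pte() to test whether a mapping exists. The helper name is
 * hypothetical; @va must be aligned to the size of the mapping, as
 * vm_lookup_pte() requires.
 */
static inline bool example_va_is_present(ptent_t *tbl, const void *va)
{
        ptent_t *pte;

        /* fails with -ENOENT (no entry) or -EINVAL (misaligned @va) */
        if (vm_lookup_pte(tbl, va, NULL, &pte))
                return false;

        /* an entry may exist yet be marked not-present (see vm_disable()) */
        return (*pte & PTE_P) != 0;
}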
|
||||
|
||||
/**
|
||||
* vm_insert_pte - inserts an entry into the page table
|
||||
* @tbl: the page table
|
||||
* @va: the virtual address
|
||||
* @level: the level to insert the pte
|
||||
* @pte_in: the pte to insert
|
||||
*
|
||||
* WARNING: Synchronization is not provided.
|
||||
*
|
||||
* Returns 0 if successful, otherwise fail.
|
||||
*/
|
||||
int vm_insert_pte(ptent_t *tbl, const void *va, int level, ptent_t pte_in)
|
||||
{
|
||||
ptent_t *hist[PGLEVEL_NUM];
|
||||
ptent_t *pte = tbl;
|
||||
struct page *pg;
|
||||
int pos;
|
||||
|
||||
if (level < PGLEVEL_4KB || level >= PGLEVEL_NUM)
|
||||
return -EINVAL;
|
||||
if (!(pte_in & PTE_PS) && level > PGLEVEL_4KB)
|
||||
return -EINVAL;
|
||||
if (!addr_is_aligned_to_level(va, level))
|
||||
return -EINVAL;
|
||||
|
||||
for (pos = PGLEVEL_NUM - 1; pos > level; pos--) {
|
||||
pte = &pte[PDX(pos, va)];
|
||||
hist[pos] = pte;
|
||||
|
||||
if (!*pte) {
|
||||
addr_to_smpage(pte)->item_count++;
|
||||
pg = vm_alloc_pgdir();
|
||||
if (unlikely(!pg))
|
||||
goto fail;
|
||||
|
||||
*pte = (ptent_t)smpage_to_addr(pg) | PTE_DEF_FLAGS;
|
||||
} else if (pos <= PGLEVEL_1GB && pte_big(*pte)) {
|
||||
return -EEXIST;
|
||||
}
|
||||
|
||||
pte = (ptent_t *)PTE_ADDR(*pte);
|
||||
}
|
||||
|
||||
pte = &pte[PDX(level, va)];
|
||||
if (unlikely(*pte))
|
||||
return -EEXIST;
|
||||
|
||||
addr_to_smpage(pte)->item_count++;
|
||||
*pte = pte_in;
|
||||
return 0;
|
||||
|
||||
|
||||
fail:
|
||||
for (; pos < PGLEVEL_NUM; pos++) {
|
||||
*hist[pos] = 0;
|
||||
pg = addr_to_smpage(hist[pos]);
|
||||
if (!--pg->item_count)
|
||||
break;
|
||||
|
||||
page_put(pg);
|
||||
}
|
||||
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_remove_pte - removes an entry from the page table
|
||||
* @tbl: the page table
|
||||
* @va: the virtual address
|
||||
* @level_out: a pointer to store the page level
|
||||
* @pte_out: a pointer to store the pte value
|
||||
*
|
||||
* WARNING: Synchronization is not provided.
|
||||
*
|
||||
* Returns 0 if successful, otherwise -ENOENT if nothing to remove.
|
||||
*/
|
||||
int vm_remove_pte(ptent_t *tbl, const void *va,
|
||||
int *level_out, ptent_t *pte_out)
|
||||
{
|
||||
ptent_t *hist[PGLEVEL_NUM];
|
||||
ptent_t *pte = tbl;
|
||||
struct page *pg;
|
||||
int level;
|
||||
|
||||
for (level = PGLEVEL_NUM - 1; level >= PGLEVEL_4KB; level--) {
|
||||
pte = &pte[PDX(level, va)];
|
||||
hist[level] = pte;
|
||||
if (!*pte)
|
||||
return -ENOENT;
|
||||
if (!level || (level <= PGLEVEL_1GB && pte_big(*pte)))
|
||||
break;
|
||||
|
||||
pte = (ptent_t *)PTE_ADDR(*pte);
|
||||
}
|
||||
|
||||
if (!addr_is_aligned_to_level(va, level))
|
||||
return -EINVAL;
|
||||
|
||||
if (level_out)
|
||||
*level_out = level;
|
||||
if (pte_out)
|
||||
*pte_out = *pte;
|
||||
|
||||
for (; level < PGLEVEL_NUM; level++) {
|
||||
pg = addr_to_smpage(hist[level]);
|
||||
*hist[level] = 0;
|
||||
if (!--pg->item_count)
|
||||
break;
|
||||
|
||||
page_put(pg);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_lookup_page - gets the page mapped at a virtual address
|
||||
* @tbl: the page table
|
||||
* @va: the virtual address
|
||||
* @pg_out: the page to get
|
||||
*
|
||||
 * WARNING: Synchronization is not provided.
|
||||
*
|
||||
* Returns a struct page, or NULL if none was mapped.
|
||||
*/
|
||||
int vm_lookup_page(ptent_t *tbl, const void *va, struct page **pg_out)
|
||||
{
|
||||
int ret;
|
||||
ptent_t *pte;
|
||||
|
||||
ret = vm_lookup_pte(tbl, va, NULL, &pte);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
assert(*pte & PTE_PAGE);
|
||||
*pg_out = addr_to_page((void *)PTE_ADDR(*pte));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_insert_page - inserts a page at a virtual address
|
||||
* @tbl: the page table
|
||||
* @va: the virtual address
|
||||
* @pg: the page to insert
|
||||
* @flags: the PTE flags
|
||||
*
|
||||
* WARNING: Synchronization is not provided.
|
||||
* The caller is responsible for incrementing the page refcount.
|
||||
*
|
||||
* Returns 0 if successful, otherwise fail.
|
||||
*/
|
||||
int vm_insert_page(ptent_t *tbl, const void *va, struct page *pg, ptent_t flags)
|
||||
{
|
||||
int ret;
|
||||
ptent_t pte;
|
||||
bool large = (pg->flags & PAGE_FLAG_LARGE) > 0;
|
||||
|
||||
pte = (ptent_t)smpage_to_addr(pg) | flags | PTE_PAGE;
|
||||
if (large)
|
||||
pte |= PTE_PS;
|
||||
|
||||
ret = vm_insert_pte(tbl, va, large ? PGLEVEL_2MB : PGLEVEL_4KB, pte);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_remove_page - removes a page at a virtual address
|
||||
* @tbl: the page table
|
||||
* @va: the virtual address
|
||||
* @pg_out: a pointer to store the removed page (can be NULL)
|
||||
*
|
||||
* WARNING: Synchronization is not provided.
|
||||
* The caller is responsible for dropping the page refcount.
|
||||
*
|
||||
* Returns 0 if successful, or -ENOENT if there wasn't a page mapped.
|
||||
*/
|
||||
int vm_remove_page(ptent_t *tbl, const void *va, struct page **pg_out)
|
||||
{
|
||||
int ret;
|
||||
ptent_t pte;
|
||||
|
||||
ret = vm_remove_pte(tbl, va, NULL, &pte);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
assert(pte & PTE_PAGE);
|
||||
if (pg_out)
|
||||
*pg_out = addr_to_page((void *)PTE_ADDR(pte));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_map_phys - maps a range of physical memory to a range of virtual addresses
|
||||
* @tbl: the page table
|
||||
* @pa: the starting physical address
|
||||
* @va: the starting virtual address
|
||||
* @len: the length of the mapping (in bytes)
|
||||
* @pgsize: the page size to use for the mappings
|
||||
* @flags: the PTE flags
|
||||
*
|
||||
* WARNING: Synchronization is not provided.
|
||||
*
|
||||
* Returns 0 if successful, otherwise fail.
|
||||
*/
|
||||
int vm_map_phys(ptent_t *tbl, physaddr_t pa, const void *va,
|
||||
size_t len, int pgsize, ptent_t flags)
|
||||
{
|
||||
intptr_t pos;
|
||||
int ret;
|
||||
|
||||
if (unlikely(!addr_is_aligned(va, pgsize)))
|
||||
return -EINVAL;
|
||||
|
||||
if (pgsize > PGSIZE_4KB)
|
||||
flags |= PTE_PS;
|
||||
|
||||
for (pos = 0; pos < len; pos += pgsize) {
|
||||
ptent_t pte = PTE_FLAGS(flags) | PTE_ADDR(pa + pos);
|
||||
|
||||
ret = vm_insert_pte(tbl, va + pos,
|
||||
PGSIZE_TO_LEVEL(pgsize), pte);
|
||||
if (unlikely(ret))
|
||||
goto fail;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
for (pos -= pgsize; pos >= 0; pos -= pgsize)
|
||||
vm_remove_pte(tbl, va + pos, NULL, NULL);
|
||||
return ret;
|
||||
}
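/*
 * Illustrative sketch (not from the original file): mapping a 2MB
 * uncacheable window with vm_map_phys(). The physical and virtual
 * addresses are made-up example values (both 2MB-aligned).
 */
static int example_map_mmio_window(ptent_t *tbl)
{
        physaddr_t pa = 0xfe000000;                           /* hypothetical */
        const void *va = (const void *)0xffff800000000000UL;  /* hypothetical */

        /* PTE_PCD disables caching, as device MMIO typically requires */
        return vm_map_phys(tbl, pa, va, PGSIZE_2MB, PGSIZE_2MB,
                           PTE_P | PTE_W | PTE_PCD);
}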
|
||||
|
||||
/**
|
||||
* vm_map_pages - maps pages to a range of virtual addresses
|
||||
 * @tbl: the page table
|
||||
* @va: the starting virtual address
|
||||
* @len: the length of the mapping (in bytes)
|
||||
* @pgsize: the page size to use for the mappings
|
||||
* @flags: the PTE flags
|
||||
*
|
||||
* WARNING: Synchronization is not provided.
|
||||
*
|
||||
* Returns 0 if successful, otherwise fail.
|
||||
*/
|
||||
int vm_map_pages(ptent_t *tbl, const void *va, size_t len,
|
||||
int pgsize, ptent_t flags)
|
||||
{
|
||||
const char *start = (const char *)va;
|
||||
intptr_t pos;
|
||||
int ret;
|
||||
|
||||
if (unlikely(pgsize != PGSIZE_4KB && pgsize != PGSIZE_2MB))
|
||||
return -EINVAL;
|
||||
if (unlikely(!addr_is_aligned(va, pgsize)))
|
||||
return -EINVAL;
|
||||
|
||||
for (pos = 0; pos < len; pos += pgsize) {
|
||||
struct page *pg = page_zalloc(pgsize);
|
||||
if (unlikely(!pg))
|
||||
goto fail;
|
||||
|
||||
ret = vm_insert_page(tbl, start + pos, pg, flags);
|
||||
if (unlikely(ret)) {
|
||||
page_put(pg);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
for (pos -= pgsize; pos >= 0; pos -= pgsize) {
|
||||
struct page *pg;
|
||||
if (!vm_remove_page(tbl, start + pos, &pg))
|
||||
page_put(pg);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_map_copy - copies memory to new pages for a range of virtual addresses
|
||||
* @tbl: the page table
|
||||
* @src_va: the source data (from the current page table)
|
||||
* @map_va: the destination address (in page table @tbl)
|
||||
* @len: the length to copy
|
||||
* @pgsize: the page size
|
||||
* @flags: the PTE flags
|
||||
*
|
||||
* WARNING: Synchronization is not provided.
|
||||
*
|
||||
* Returns 0 if successful, otherwise fail.
|
||||
*/
|
||||
int vm_map_copy(ptent_t *tbl, const void *src_va, const void *map_va,
|
||||
size_t len, int pgsize, ptent_t flags)
|
||||
{
|
||||
const char *src_start = (const char *)src_va;
|
||||
const char *map_start = (const char *)map_va;
|
||||
intptr_t pos;
|
||||
int ret;
|
||||
|
||||
if (unlikely(pgsize != PGSIZE_4KB && pgsize != PGSIZE_2MB))
|
||||
return -EINVAL;
|
||||
if (unlikely(!addr_is_aligned(map_va, pgsize)))
|
||||
return -EINVAL;
|
||||
|
||||
for (pos = 0; pos < len; pos += pgsize) {
|
||||
struct page *pg = page_alloc(pgsize);
|
||||
if (unlikely(!pg))
|
||||
goto fail;
|
||||
|
||||
memcpy(page_to_addr(pg), src_start + pos,
|
||||
min(pgsize, len - pos));
|
||||
ret = vm_insert_page(tbl, map_start + pos, pg, flags);
|
||||
if (unlikely(ret)) {
|
||||
page_put(pg);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
for (pos -= pgsize; pos >= 0; pos -= pgsize) {
|
||||
struct page *pg;
|
||||
if (!vm_remove_page(tbl, map_start + pos, &pg))
|
||||
page_put(pg);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_mod - changes the PTE flags for a range of virtual addresses
|
||||
* @tbl: the page table
|
||||
* @va: the starting virtual address
|
||||
* @len: the length of the range (in bytes)
|
||||
* @pgsize: the smallest possible page size
|
||||
* @flags: the new PTE flags
|
||||
*
|
||||
* Will silently skip missing mappings.
|
||||
*
|
||||
* Returns true if one or more PTE permissions were changed, otherwise false.
|
||||
*/
|
||||
bool vm_mod(ptent_t *tbl, const void *va, size_t len, int pgsize, ptent_t flags)
|
||||
{
|
||||
const char *start = (const char *) va;
|
||||
intptr_t pos;
|
||||
int ret, level;
|
||||
bool changed = false;
|
||||
|
||||
/* check alignment */
|
||||
assert(addr_is_aligned(va, pgsize));
|
||||
|
||||
for (pos = 0; pos < len;) {
|
||||
ptent_t *pte;
|
||||
ptent_t old;
|
||||
|
||||
ret = vm_lookup_pte(tbl, start + pos, &level, &pte);
|
||||
if (ret) {
|
||||
pos += pgsize;
|
||||
continue;
|
||||
}
|
||||
|
||||
old = *pte;
|
||||
*pte &= ~(PTE_PERM_FLAGS);
|
||||
if (old & PTE_COW)
|
||||
*pte |= (flags & PTE_COW_FLAGS);
|
||||
else
|
||||
*pte |= (flags & PTE_PERM_FLAGS);
|
||||
if (*pte != old)
|
||||
changed = true;
|
||||
|
||||
assert(pgsize <= PGLEVEL_TO_SIZE(level));
|
||||
pos += PGLEVEL_TO_SIZE(level);
|
||||
}
|
||||
|
||||
return changed;
|
||||
}
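/*
 * Illustrative sketch (not from the original file): using vm_mod() to drop
 * write permission on a region. The helper name is hypothetical.
 */
static bool example_make_readonly(ptent_t *tbl, const void *va, size_t len)
{
        /*
         * Only the PTE_PERM_FLAGS bits are rewritten, so leaving PTE_W out
         * of the new flags clears write access. The caller still has to
         * flush the TLB for the range afterwards.
         */
        return vm_mod(tbl, va, len, PGSIZE_4KB, PTE_P | PTE_U);
}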
|
||||
|
||||
/**
|
||||
* vm_disable - marks a range of PTEs not present
|
||||
* @tbl: the page table
|
||||
* @va: the starting virtual address
|
||||
* @len: the length of the range (in bytes)
|
||||
* @pgsize: the smallest possible page size
|
||||
*
|
||||
* Will silently skip missing mappings.
|
||||
*
|
||||
* Returns true if one or more PTEs were disabled, otherwise false.
|
||||
*/
|
||||
bool vm_disable(ptent_t *tbl, const void *va, size_t len, int pgsize)
|
||||
{
|
||||
const char *start = (const char *) va;
|
||||
intptr_t pos;
|
||||
int ret, level;
|
||||
bool changed = false;
|
||||
|
||||
/* check alignment */
|
||||
assert(addr_is_aligned(va, pgsize));
|
||||
|
||||
for (pos = 0; pos < len;) {
|
||||
ptent_t *pte;
|
||||
|
||||
ret = vm_lookup_pte(tbl, start + pos, &level, &pte);
|
||||
if (ret) {
|
||||
pos += pgsize;
|
||||
continue;
|
||||
}
|
||||
|
||||
*pte &= ~(CAST64(PTE_P));
|
||||
assert(pgsize <= PGLEVEL_TO_SIZE(level));
|
||||
pos += PGLEVEL_TO_SIZE(level);
|
||||
changed = true;
|
||||
}
|
||||
|
||||
return changed;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_unmap - removes mappings from a range of virtual addresses
|
||||
* @tbl: the page table
|
||||
* @va: the starting virtual address
|
||||
* @len: the length of the range (in bytes)
|
||||
* @pgsize: the smallest possible page size
|
||||
*
|
||||
* Use this variant for mappings that are not backed by pages.
|
||||
*
|
||||
* Cannot fail, but may skip missing mappings.
|
||||
*/
|
||||
void vm_unmap(ptent_t *tbl, const void *va, size_t len, int pgsize)
|
||||
{
|
||||
uintptr_t pos;
|
||||
int ret, level;
|
||||
|
||||
/* check alignment */
|
||||
assert(addr_is_aligned(va, pgsize));
|
||||
|
||||
for (pos = 0; pos < len;) {
|
||||
ret = vm_remove_pte(tbl, va + pos, &level, NULL);
|
||||
if (ret) {
|
||||
pos += pgsize;
|
||||
} else {
|
||||
assert(pgsize <= PGLEVEL_TO_SIZE(level));
|
||||
pos += PGLEVEL_TO_SIZE(level);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_unmap_pages - removes pages from a range of virtual addresses
|
||||
* @tbl: the page table
|
||||
* @va: the starting virtual address
|
||||
* @len: the length of the range (in bytes)
|
||||
* @pgsize: the smallest possible page size
|
||||
*
|
||||
* Use this variant for mappings backed by pages (does ref counting).
|
||||
*
|
||||
* Cannot fail, but may skip missing mappings.
|
||||
*/
|
||||
void vm_unmap_pages(ptent_t *tbl, const void *va, size_t len, int pgsize)
|
||||
{
|
||||
intptr_t pos;
|
||||
|
||||
/* check alignment */
|
||||
assert(addr_is_aligned(va, pgsize));
|
||||
|
||||
for (pos = 0; pos < len;) {
|
||||
struct page *pg;
|
||||
if (!vm_remove_page(tbl, va + pos, &pg)) {
|
||||
assert(pgsize <= page_to_size(pg));
|
||||
pos += page_to_size(pg);
|
||||
page_put(pg);
|
||||
} else
|
||||
pos += pgsize;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_create_pt - creates a page table
|
||||
*
|
||||
* Returns a page table, or NULL if out of memory.
|
||||
*/
|
||||
ptent_t *vm_create_pt(void)
|
||||
{
|
||||
struct page *pg = vm_alloc_pgdir();
|
||||
if (!pg)
|
||||
return NULL;
|
||||
|
||||
return (ptent_t *)smpage_to_addr(pg);
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_clone_kern_pt - creates a copy of the kernel page table
|
||||
*
|
||||
* WARNING: Pages in the kernel page table won't be refcounted. It's assumed
|
||||
* they are never deallocated for the life of the process.
|
||||
*
|
||||
* Returns a page table, or NULL if out of memory.
|
||||
*/
|
||||
ptent_t *vm_clone_kern_pt(void)
|
||||
{
|
||||
int i, j, k, l;
|
||||
struct page *pg;
|
||||
ptent_t *src_pud, *src_pmd, *src_pd;
|
||||
ptent_t *dst_pud, *dst_pmd, *dst_pd;
|
||||
ptent_t *pgtbl = vm_create_pt();
|
||||
if (unlikely(!pgtbl))
|
||||
return NULL;
|
||||
|
||||
for (i = 0; i < NPTENTRIES; i++) {
|
||||
if (!pte_present(kern_pgtbl[i]))
|
||||
continue;
|
||||
|
||||
pg = vm_alloc_pgdir();
|
||||
if (unlikely(!pg))
|
||||
goto err;
|
||||
|
||||
src_pud = (ptent_t *)PTE_ADDR(kern_pgtbl[i]);
|
||||
dst_pud = (ptent_t *)smpage_to_addr(pg);
|
||||
pgtbl[i] = (ptent_t)dst_pud | PTE_DEF_FLAGS;
|
||||
addr_to_smpage(pgtbl)->item_count++;
|
||||
|
||||
for (j = 0; j < NPTENTRIES; j++) {
|
||||
if (!src_pud[j])
|
||||
continue;
|
||||
if (pte_big(src_pud[j])) {
|
||||
assert(!(src_pud[j] & PTE_PAGE));
|
||||
dst_pud[j] = src_pud[j];
|
||||
pg->item_count++;
|
||||
continue;
|
||||
}
|
||||
|
||||
pg = vm_alloc_pgdir();
|
||||
if (unlikely(!pg))
|
||||
goto err;
|
||||
|
||||
src_pmd = (ptent_t *)PTE_ADDR(src_pud[j]);
|
||||
dst_pmd = (ptent_t *)smpage_to_addr(pg);
|
||||
dst_pud[j] = (ptent_t)dst_pmd | PTE_DEF_FLAGS;
|
||||
addr_to_smpage(dst_pud)->item_count++;
|
||||
|
||||
for (k = 0; k < NPTENTRIES; k++) {
|
||||
if (!src_pmd[k])
|
||||
continue;
|
||||
if (pte_big(src_pmd[k])) {
|
||||
dst_pmd[k] = src_pmd[k];
|
||||
pg->item_count++;
|
||||
continue;
|
||||
}
|
||||
|
||||
pg = vm_alloc_pgdir();
|
||||
if (unlikely(!pg))
|
||||
goto err;
|
||||
|
||||
src_pd = (ptent_t *)PTE_ADDR(src_pmd[k]);
|
||||
dst_pd = (ptent_t *)smpage_to_addr(pg);
|
||||
dst_pmd[k] = (ptent_t)smpage_to_addr(pg) |
|
||||
PTE_DEF_FLAGS;
|
||||
addr_to_smpage(dst_pmd)->item_count++;
|
||||
|
||||
for (l = 0; l < NPTENTRIES; l++) {
|
||||
dst_pd[l] = src_pd[l];
|
||||
pg->item_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return pgtbl;
|
||||
|
||||
err:
|
||||
vm_destroy_pt(pgtbl);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_destroy_pt - destroys a page table
|
||||
* @tbl: the page table
|
||||
*/
|
||||
void vm_destroy_pt(ptent_t *tbl)
|
||||
{
|
||||
int i, j, k;
|
||||
ptent_t *pud, *pmd;
|
||||
|
||||
for (i = 0; i < NPTENTRIES; i++) {
|
||||
if (!pte_present(tbl[i]))
|
||||
continue;
|
||||
|
||||
pud = (ptent_t *)PTE_ADDR(tbl[i]);
|
||||
|
||||
for (j = 0; j < NPTENTRIES; j++) {
|
||||
if (!pud[j])
|
||||
continue;
|
||||
if (pte_big(pud[j]))
|
||||
continue;
|
||||
|
||||
pmd = (ptent_t *)PTE_ADDR(pud[j]);
|
||||
|
||||
for (k = 0; k < NPTENTRIES; k++) {
|
||||
if (!pmd[k])
|
||||
continue;
|
||||
if (pte_big(pmd[k]))
|
||||
continue;
|
||||
|
||||
page_put_addr((ptent_t *)PTE_ADDR(pmd[k]));
|
||||
}
|
||||
|
||||
page_put_addr(pmd);
|
||||
}
|
||||
|
||||
page_put_addr(pud);
|
||||
}
|
||||
|
||||
page_put_addr(tbl);
|
||||
}
@@ -1,45 +0,0 @@
#include <asm/unistd_64.h>
|
||||
|
||||
.data
|
||||
.globl vsyscall_page
|
||||
.balign 4096, 0xcc
|
||||
.type vsyscall_page, @object
|
||||
vsyscall_page:
|
||||
|
||||
/* handle gettimeofday() */
|
||||
mov %cs, %rax
|
||||
test $3, %rax
|
||||
mov $__NR_gettimeofday, %rax
|
||||
jnz 1f
|
||||
vmcall
|
||||
ret
|
||||
1:
|
||||
syscall
|
||||
ret
|
||||
|
||||
/* handle time() */
|
||||
.balign 1024, 0xcc
|
||||
mov %cs, %rax
|
||||
test $3, %rax
|
||||
mov $__NR_time, %rax
|
||||
jnz 2f
|
||||
vmcall
|
||||
ret
|
||||
2:
|
||||
syscall
|
||||
ret
|
||||
|
||||
/* handle getcpu() */
|
||||
.balign 1024, 0xcc
|
||||
mov %cs, %rax
|
||||
test $3, %rax
|
||||
mov $__NR_getcpu, %rax
|
||||
jnz 3f
|
||||
vmcall
|
||||
ret
|
||||
3:
|
||||
syscall
|
||||
ret
|
||||
|
||||
.balign 4096, 0xcc
|
||||
.size vsyscall_page, 4096
inc/dune/entry.h
@@ -1,189 +0,0 @@
/*
|
||||
* entry.h - routines for entering and exiting the kernel
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <base/stddef.h>
|
||||
|
||||
/* the base address of the linux kernel vdso mapping */
|
||||
extern uintptr_t entry_vdso_base;
|
||||
|
||||
/* asm entry routines */
|
||||
extern const char syscall_enter[];
|
||||
extern const char syscall_enter_end[];
|
||||
extern const char trap_entry_tbl[];
|
||||
extern const char vsyscall_page[];
|
||||
|
||||
#define TRAP_ENTRY_SIZE 16
|
||||
|
||||
/*
|
||||
 * We use the same general GDT layout as Linux so that we can use
 * the same syscall MSR values. In practice only code segments
 * matter, since IA-32e mode ignores most segment values anyway,
 * but just to be extra careful we match data as well.
|
||||
*/
|
||||
#define GD_KT 0x10
|
||||
#define GD_KD 0x18
|
||||
#define GD_UD 0x28
|
||||
#define GD_UT 0x30
|
||||
#define GD_TSS 0x38
|
||||
#define GD_TSS2 0x40
|
||||
#define GDT_ENTRIES 9
|
||||
|
||||
struct env_tf {
|
||||
/* manually saved, arguments */
|
||||
uint64_t rdi;
|
||||
uint64_t rsi;
|
||||
uint64_t rdx;
|
||||
uint64_t rcx;
|
||||
uint64_t r8;
|
||||
uint64_t r9;
|
||||
uint64_t r10;
|
||||
uint64_t r11;
|
||||
|
||||
/* saved by C calling conventions */
|
||||
uint64_t rbx;
|
||||
uint64_t rbp;
|
||||
uint64_t r12;
|
||||
uint64_t r13;
|
||||
uint64_t r14;
|
||||
uint64_t r15;
|
||||
|
||||
/* system call number, ret */
|
||||
uint64_t rax;
|
||||
|
||||
/* exception frame */
|
||||
uint32_t err;
|
||||
uint32_t pad1;
|
||||
uint64_t rip;
|
||||
uint16_t cs;
|
||||
uint16_t pad2[3];
|
||||
uint64_t rflags;
|
||||
uint64_t rsp;
|
||||
uint16_t ss;
|
||||
uint16_t pad3[3];
|
||||
} __packed;
|
||||
|
||||
#define ARG0(tf) ((tf)->rdi)
|
||||
#define ARG1(tf) ((tf)->rsi)
|
||||
#define ARG2(tf) ((tf)->rdx)
|
||||
#define ARG3(tf) ((tf)->rcx)
|
||||
#define ARG4(tf) ((tf)->r8)
|
||||
#define ARG5(tf) ((tf)->r9)
|
||||
|
||||
extern void pop_tf(struct env_tf *tf) __noreturn;
|
||||
extern void pop_tf_user(struct env_tf *tf) __noreturn;
|
||||
extern void pop_tf_user_fast(struct env_tf *tf) __noreturn;
|
||||
extern void switch_tf(struct env_tf *curtf, struct env_tf *newtf);
|
||||
|
||||
struct entry_percpu {
|
||||
void *percpu_ptr;
|
||||
uint64_t tmp;
|
||||
uintptr_t kfs_base;
|
||||
uintptr_t ufs_base;
|
||||
uintptr_t ugs_base;
|
||||
uint64_t flags;
|
||||
void *thread_stack;
|
||||
uint32_t preempt_cnt;
|
||||
uint32_t pad;
|
||||
} __packed;
|
||||
|
||||
#define ENTRY_FLAG_IN_USER 0x1 /* in usermode? */
|
||||
#define ENTRY_FLAG_LOAD_USER 0x2 /* restore usermode segs? */
|
||||
|
||||
static inline void entry_set_thread_stack(uintptr_t val)
|
||||
{
|
||||
asm("movq %0, %%gs:%c[thread_stack]"
|
||||
: /* no outputs */
|
||||
: "r"(val), [thread_stack]"i"(offsetof(struct entry_percpu, thread_stack))
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline uint64_t entry_get_kfs_base(void)
|
||||
{
|
||||
uint64_t val;
|
||||
asm("movq %%gs:%c[kfs_base], %0"
|
||||
: "=r"(val)
|
||||
: [kfs_base]"i"(offsetof(struct entry_percpu, kfs_base))
|
||||
: "memory");
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
static inline void entry_set_kfs_base(uint64_t val)
|
||||
{
|
||||
asm("movq %0, %%gs:%c[kfs_base]"
|
||||
: /* no outputs */
|
||||
: "r"(val), [kfs_base]"i"(offsetof(struct entry_percpu, kfs_base))
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline uint64_t entry_get_ufs_base(void)
|
||||
{
|
||||
uint64_t val;
|
||||
asm("movq %%gs:%c[ufs_base], %0"
|
||||
: "=r"(val)
|
||||
: [ufs_base]"i"(offsetof(struct entry_percpu, ufs_base))
|
||||
: "memory");
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
static inline void entry_set_ufs_base(uint64_t val)
|
||||
{
|
||||
asm("movq %0, %%gs:%c[ufs_base]"
|
||||
: /* no outputs */
|
||||
: "r"(val), [ufs_base]"i"(offsetof(struct entry_percpu, ufs_base))
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline uint64_t entry_get_ugs_base(void)
|
||||
{
|
||||
uint64_t val;
|
||||
asm("movq %%gs:%c[ugs_base], %0"
|
||||
: "=r"(val)
|
||||
: [ugs_base]"i"(offsetof(struct entry_percpu, ugs_base))
|
||||
: "memory");
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
static inline void entry_set_ugs_base(uint64_t val)
|
||||
{
|
||||
asm("movq %0, %%gs:%c[ugs_base]"
|
||||
: /* no outputs */
|
||||
: "r"(val), [ugs_base]"i"(offsetof(struct entry_percpu, ugs_base))
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline void entry_set_flag_mask(uint64_t val)
|
||||
{
|
||||
asm("orq %0, %%gs:%c[flags]"
|
||||
: /* no outputs */
|
||||
: "r"(val), [flags]"i"(offsetof(struct entry_percpu, flags))
|
||||
: "memory", "cc");
|
||||
}
|
||||
|
||||
static inline void entry_clear_flag_mask(uint64_t val)
|
||||
{
|
||||
asm("andq %0, %%gs:%c[flags]"
|
||||
: /* no outputs */
|
||||
: "r"(~(val)), [flags]"i"(offsetof(struct entry_percpu, flags))
|
||||
: "memory", "cc");
|
||||
}
|
||||
|
||||
static inline bool entry_test_flag_mask(uint64_t val)
|
||||
{
|
||||
asm goto("testq %0, %%gs:%c[flags]\n\t"
|
||||
"jz %l[no_match]\n\t"
|
||||
: /* no outputs */
|
||||
: "r"(val), [flags]"i"(offsetof(struct entry_percpu, flags))
|
||||
: "memory", "cc"
|
||||
: no_match);
|
||||
|
||||
return true;
|
||||
|
||||
no_match:
|
||||
return false;
|
||||
}
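/*
 * Illustrative sketch (not from the original header): how the per-CPU flag
 * helpers above might be combined. The wrapper names are hypothetical.
 */
static inline bool example_entered_from_user(void)
{
        /* true if the last kernel entry came from ring 3 */
        return entry_test_flag_mask(ENTRY_FLAG_IN_USER);
}

static inline void example_request_user_seg_reload(void)
{
        /* ask the exit path to rewrite the user FS/GS bases */
        entry_set_flag_mask(ENTRY_FLAG_LOAD_USER);
}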
inc/dune/fpu.h
@@ -1,123 +0,0 @@
/*
|
||||
* fpu.h - x86 floating point, MMX, SSE, and AVX support for Dune
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <base/types.h>
|
||||
|
||||
struct fxsave_area {
|
||||
uint16_t cwd;
|
||||
uint16_t swd;
|
||||
uint16_t twd;
|
||||
uint16_t fop;
|
||||
uint64_t rip;
|
||||
uint64_t rdp;
|
||||
uint32_t mxcsr;
|
||||
uint32_t mxcsr_mask;
|
||||
uint32_t st_regs[32]; // 8 128-bit FP registers
|
||||
uint32_t xmm_regs[64]; // 16 128-bit XMM registers
|
||||
uint32_t padding[24];
|
||||
} __attribute__((packed));
|
||||
|
||||
struct xsave_header {
|
||||
uint64_t xstate_bv;
|
||||
uint64_t xcomp_bv;
|
||||
uint64_t reserved_zero;
|
||||
uint64_t reserved[5];
|
||||
} __attribute__((packed));
|
||||
|
||||
struct xsave_area {
|
||||
struct fxsave_area fxsave;
|
||||
struct xsave_header header;
|
||||
uint32_t ymm_regs[64]; // extends XMM registers to 256-bit
|
||||
/* FIXME: check CPUID, could be other extensions in the future */
|
||||
} __attribute__((packed, aligned(64)));
|
||||
|
||||
struct fpu_area {
|
||||
/* we only support xsave, since it's available in nehalem and later */
|
||||
struct xsave_area xsave;
|
||||
};
|
||||
|
||||
static inline void fpu_xsave(struct fpu_area *fp, uint64_t mask)
|
||||
{
|
||||
uint32_t lmask = mask;
|
||||
uint32_t umask = mask >> 32;
|
||||
|
||||
asm volatile("xsaveq %0\n\t" : "=m"(fp->xsave) :
|
||||
"a"(lmask), "d"(umask) :
|
||||
"memory");
|
||||
}
|
||||
|
||||
static inline void fpu_xsaveopt(struct fpu_area *fp, uint64_t mask)
|
||||
{
|
||||
uint32_t lmask = mask;
|
||||
uint32_t umask = mask >> 32;
|
||||
|
||||
asm volatile("xsaveoptq %0\n\t" : "=m"(fp->xsave) :
|
||||
"a"(lmask), "d"(umask) :
|
||||
"memory");
|
||||
}
|
||||
|
||||
static inline void fpu_xrstor(struct fpu_area *fp, uint64_t mask)
|
||||
{
|
||||
uint32_t lmask = mask;
|
||||
uint32_t umask = mask >> 32;
|
||||
|
||||
asm volatile("xrstorq %0\n\t" : : "m"(fp->xsave),
|
||||
"a"(lmask), "d"(umask) :
|
||||
"memory");
|
||||
}
|
||||
|
||||
/*
|
||||
* fpu_init - initializes an fpu area
|
||||
* @fp: the fpu area
|
||||
*/
|
||||
static inline void fpu_init(struct fpu_area *fp)
|
||||
{
|
||||
fp->xsave.header.xstate_bv = 0;
|
||||
fp->xsave.header.xcomp_bv = 0;
|
||||
fp->xsave.header.reserved_zero = 0;
|
||||
fp->xsave.fxsave.cwd = 0x37f;
|
||||
fp->xsave.fxsave.mxcsr = 0x1f80;
|
||||
}
|
||||
|
||||
/*
|
||||
* fpu_load - loads an fpu area into fpu registers
|
||||
* @fp: the fpu area
|
||||
*/
|
||||
static inline void fpu_load(struct fpu_area *fp)
|
||||
{
|
||||
fpu_xrstor(fp, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* fpu_save - saves fpu registers to an fpu area
|
||||
* @fp: the fpu area
|
||||
*
|
||||
* WARNING: Do not call this function on a memory region
|
||||
* that was not previously loaded with fpu_load().
|
||||
*
|
||||
* If you do, register state corruption might be possible. See
|
||||
* "XSAVEOPT Usage Guidlines" under the XSAVEOPT instruction
|
||||
* description in the Intel Manual Instruction Set Reference
|
||||
* for more details.
|
||||
*/
|
||||
static inline void fpu_save(struct fpu_area *fp)
|
||||
{
|
||||
// FIXME: need to check CPUID because only
|
||||
// sandybridge and later support XSAVEOPT
|
||||
fpu_xsaveopt(fp, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
 * fpu_save_safe - saves fpu registers to an fpu area
|
||||
* @fp: the fpu area
|
||||
*
|
||||
* Works under all conditions, but may be slower.
|
||||
*/
|
||||
static inline void fpu_save_safe(struct fpu_area *fp)
|
||||
{
|
||||
fpu_xsave(fp, -1);
|
||||
}
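/*
 * Illustrative sketch (not from the original header): the intended
 * save/restore pairing around a context switch. The per-thread areas are
 * hypothetical.
 */
static inline void example_fpu_switch(struct fpu_area *prev,
                                      struct fpu_area *next)
{
        fpu_save_safe(prev);    /* safe even if prev was never fpu_load()ed */
        fpu_load(next);         /* next must hold valid state, e.g. from fpu_init() */
}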
inc/dune/mmu.h
@@ -1,264 +0,0 @@
/*
|
||||
* mmu.h - x86 MMU definitions
|
||||
*
|
||||
* NOTE: This code is derived from JOS, created by MIT PDOS.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <base/types.h>
|
||||
|
||||
typedef uint64_t ptent_t;
|
||||
|
||||
#define UINT64(x) ((uint64_t) x)
|
||||
#define CAST64(x) ((uint64_t) x)
|
||||
#define ONE UINT64 (1)
|
||||
|
||||
#define NPTBITS 9 /* log2(NPTENTRIES) */
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
* Part 1. Paging data structures and control registers
|
||||
*
|
||||
*/
|
||||
|
||||
/* index into:
|
||||
* n = 0 => page table
|
||||
* n = 1 => page directory
|
||||
* n = 2 => page directory pointer
|
||||
* n = 3 => page map level 4
|
||||
*/
|
||||
#define PDXMASK ((1 << NPTBITS) - 1)
|
||||
#define PDSHIFT(n) (12 + NPTBITS * (n))
|
||||
#define PDX(n, la) ((((uintptr_t) (la)) >> PDSHIFT(n)) & PDXMASK)
|
||||
|
||||
#define NPTENTRIES (1 << NPTBITS)
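/*
 * Illustrative sketch (not from the original header): extracting the four
 * paging-structure indices for an address with PDX(). The helper name is
 * hypothetical.
 */
static inline void example_page_indices(uintptr_t la, int idx[4])
{
        idx[3] = PDX(3, la);    /* page map level 4 */
        idx[2] = PDX(2, la);    /* page directory pointer */
        idx[1] = PDX(1, la);    /* page directory */
        idx[0] = PDX(0, la);    /* page table */
}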
|
||||
|
||||
/* page number field of address */
|
||||
#define PPN(la) ((la) >> PGSHIFT)
|
||||
|
||||
/* page size */
|
||||
#define PGSHIFT 12 /* log2(PGSIZE) */
|
||||
#define PGSIZE (1 << PGSHIFT) /* bytes mapped by a page */
|
||||
#define PGMASK (PGSIZE - 1)
|
||||
|
||||
/* offset in page */
|
||||
#define PGOFF(la) (((uintptr_t) (la)) & PGMASK)
|
||||
#define PGADDR(la) (((uintptr_t) (la)) & ~CAST64(PGMASK))
|
||||
|
||||
/* big page size */
|
||||
#define BIG_PGSHIFT 21
|
||||
#define BIG_PGSIZE (1 << BIG_PGSHIFT)
|
||||
#define BIG_PGMASK (BIG_PGSIZE - 1)
|
||||
|
||||
/* offset in big page */
|
||||
#define BIG_PGOFF(la) (((uintptr_t) (la)) & BIG_PGMASK)
|
||||
#define BIG_PGADDR(la) (((uintptr_t) (la)) & ~CAST64(BIG_PGMASK))
|
||||
|
||||
/* Page table/directory entry flags. */
|
||||
#define PTE_P 0x0001 /* Present */
|
||||
#define PTE_W 0x0002 /* Writeable */
|
||||
#define PTE_U 0x0004 /* User */
|
||||
#define PTE_PWT 0x0008 /* Write-Through */
|
||||
#define PTE_PCD 0x0010 /* Cache-Disable */
|
||||
#define PTE_A 0x0020 /* Accessed */
|
||||
#define PTE_D 0x0040 /* Dirty */
|
||||
#define PTE_PS 0x0080 /* Page size, in PD/PDP/PML4 */
|
||||
#define PTE_PAT 0x0080 /* Page attribute table, in 4KB PTE */
|
||||
#define PTE_G 0x0100 /* Global */
|
||||
#define PTE_AVAIL 0x0E00 /* 3 bits not used by hardware */
|
||||
#define PTE_PAT_PS 0x1000 /* Page attribute table, in 2MB PTE */
|
||||
#define PTE_AVAIL2 0x7FF0000000000000UL /* 11 bits not used by hardware */
|
||||
#define PTE_NX 0x8000000000000000UL /* No execute */
|
||||
|
||||
/* OS Specific Flags - Using available bits in PTE */
|
||||
#define PTE_PAGE 0x0200 /* backed by a page */
|
||||
#define PTE_COW 0x0400 /* copy-on-write */
|
||||
|
||||
/* address in page table entry */
|
||||
#define PTE_ADDR(pte) ((physaddr_t)(pte) & 0xffffffffff000UL)
|
||||
#define PTE_FLAGS(pte) ((physaddr_t)(pte) & 0xfff0000000000fffUL)
|
||||
|
||||
/* Control Register flags */
|
||||
#define CR0_PE 0x1 /* Protected mode enable */
|
||||
#define CR0_MP 0x2 /* Monitor coProcessor */
|
||||
#define CR0_EM 0x4 /* Emulation */
|
||||
#define CR0_TS 0x8 /* Task Switched */
|
||||
#define CR0_ET 0x10 /* Extension Type */
|
||||
#define CR0_NE 0x20 /* Numeric Error */
|
||||
#define CR0_WP 0x10000 /* Write Protect */
|
||||
#define CR0_AM 0x40000 /* Alignment Mask */
|
||||
#define CR0_NW 0x20000000 /* Not Writethrough */
|
||||
#define CR0_CD 0x40000000 /* Cache Disable */
|
||||
#define CR0_PG 0x80000000 /* Paging */
|
||||
|
||||
#define CR3_PWT 0x8 /* Page-level writethrough */
|
||||
#define CR3_PCD 0x10 /* Page-level cache disable */
|
||||
|
||||
#define CR4_VME 0x1 /* V86 Mode Extensions */
|
||||
#define CR4_PVI 0x2 /* Protected-Mode Virtual Interrupts */
|
||||
#define CR4_TSD 0x4 /* Time Stamp Disable */
|
||||
#define CR4_DE 0x8 /* Debugging Extensions */
|
||||
#define CR4_PSE 0x10 /* Page Size Extensions */
|
||||
#define CR4_PAE 0x20 /* Page address extension */
|
||||
#define CR4_MCE 0x40 /* Machine Check Enable */
|
||||
#define CR4_PGE 0x80 /* Page-global enable */
|
||||
#define CR4_PCE 0x100 /* Performance counter enable */
|
||||
#define CR4_OSFXSR 0x200 /* FXSAVE/FXRSTOR support */
|
||||
#define CR4_OSX 0x400 /* OS unmasked exception support */
|
||||
|
||||
/* MTRR registers */
|
||||
#define MTRR_CAP 0xfe /* MTRR capabilities */
|
||||
#define MTRR_CAP_VCNT_MASK 0xff /* Variable-size register count */
|
||||
#define MTRR_CAP_FIX 0x100 /* Fixed-size register support */
|
||||
#define MTRR_CAP_WC 0x400 /* Write-combining support */
|
||||
#define MTRR_BASE(i) (0x200 + 2*(i)) /* Physical address base */
|
||||
#define MTRR_BASE_UC 0x00 /* Uncacheable */
|
||||
#define MTRR_BASE_WC 0x01 /* Write-Combining */
|
||||
#define MTRR_BASE_WT 0x04 /* Writethrough */
|
||||
#define MTRR_BASE_WP 0x05 /* Write-Protect */
|
||||
#define MTRR_BASE_WB 0x06 /* Writeback */
|
||||
#define MTRR_MASK(i) (0x201 + 2*(i)) /* Physical address mask */
|
||||
#define MTRR_MASK_FULL PGADDR((ONE << 36) - 1)
|
||||
#define MTRR_MASK_VALID 0x800
|
||||
|
||||
/* EFER Register */
|
||||
#define EFER 0xc0000080 /* MSR number */
|
||||
#define EFER_SCE 0x1 /* System-call extension */
|
||||
#define EFER_LME 0x100 /* Long mode enable */
|
||||
#define EFER_LMA 0x400 /* Long mode active */
|
||||
#define EFER_NXE 0x800 /* No-execute enable */
|
||||
#define EFER_FFXSR 0x4000 /* Fast FXSAVE/FXRSTOR */
|
||||
|
||||
/* FS/GS base registers */
|
||||
#define MSR_FS_BASE 0xc0000100
|
||||
#define MSR_GS_BASE 0xc0000101
|
||||
|
||||
/* Debug registers */
|
||||
#define MSR_DEBUG_CTL 0x1d9 /* MSR number */
|
||||
#define DEBUG_CTL_LBR (1 << 0) /* Last-Branch Record */
|
||||
|
||||
#define MSR_LBR_FROM_IP 0x1db /* Last branch from IP */
|
||||
#define MSR_LBR_TO_IP 0x1dc /* Last branch to IP */
|
||||
#define MSR_LEX_FROM_IP 0x1dd /* Last exception from IP */
|
||||
#define MSR_LEX_TO_IP 0x1de /* Last exception to IP */
|
||||
|
||||
#define DR7_L(n) (ONE << ((n)*2)) /* Local breakpoint enable */
|
||||
#define DR7_G(n) (ONE << ((n)*2+1)) /* Global breakpoint enable */
|
||||
#define DR7_LE (ONE << 8) /* Local enable */
|
||||
#define DR7_GE (ONE << 9) /* Global enable */
|
||||
#define DR7_GD (ONE << 13) /* General-detect enable */
|
||||
#define DR7_RW_SHIFT(n) ((n) * 4 + 16) /* Breakpoint access mode */
|
||||
#define DR7_LEN_SHIFT(n) ((n) * 4 + 18) /* Breakpoint addr length */
|
||||
|
||||
#define DR7_RW_EXEC 0x0
|
||||
#define DR7_RW_WRITE 0x1
|
||||
#define DR7_RW_IO 0x2
|
||||
#define DR7_RW_RW 0x3
|
||||
|
||||
#define DR7_LEN_1 0x0
|
||||
#define DR7_LEN_2 0x1
|
||||
#define DR7_LEN_8 0x2
|
||||
#define DR7_LEN_4 0x3
|
||||
|
||||
/* Rflags register */
|
||||
#define FL_CF 0x00000001 /* Carry Flag */
|
||||
#define FL_PF 0x00000004 /* Parity Flag */
|
||||
#define FL_AF 0x00000010 /* Auxiliary carry Flag */
|
||||
#define FL_ZF 0x00000040 /* Zero Flag */
|
||||
#define FL_SF 0x00000080 /* Sign Flag */
|
||||
#define FL_TF 0x00000100 /* Trap Flag */
|
||||
#define FL_IF 0x00000200 /* Interrupt Flag */
|
||||
#define FL_DF 0x00000400 /* Direction Flag */
|
||||
#define FL_OF 0x00000800 /* Overflow Flag */
|
||||
#define FL_IOPL_MASK 0x00003000 /* I/O Privilege Level bitmask */
|
||||
#define FL_IOPL_0 0x00000000 /* IOPL == 0 */
|
||||
#define FL_IOPL_1 0x00001000 /* IOPL == 1 */
|
||||
#define FL_IOPL_2 0x00002000 /* IOPL == 2 */
|
||||
#define FL_IOPL_3 0x00003000 /* IOPL == 3 */
|
||||
#define FL_NT 0x00004000 /* Nested Task */
|
||||
#define FL_RF 0x00010000 /* Resume Flag */
|
||||
#define FL_VM 0x00020000 /* Virtual 8086 mode */
|
||||
#define FL_AC 0x00040000 /* Alignment Check */
|
||||
#define FL_VIF 0x00080000 /* Virtual Interrupt Flag */
|
||||
#define FL_VIP 0x00100000 /* Virtual Interrupt Pending */
|
||||
#define FL_ID 0x00200000 /* ID flag */
|
||||
|
||||
/* Page fault error codes */
|
||||
#define FEC_P 0x1 /* Fault caused by protection violation */
|
||||
#define FEC_W 0x2 /* Fault caused by a write */
|
||||
#define FEC_U 0x4 /* Fault occurred in user mode */
|
||||
#define FEC_RSV 0x8 /* Fault caused by reserved PTE bit */
|
||||
#define FEC_I 0x10 /* Fault caused by instruction fetch */
|
||||
|
||||
/*
|
||||
*
|
||||
* Part 2. Segmentation data structures and constants.
|
||||
*
|
||||
*/
|
||||
|
||||
/* STA_ macros are for segment type values */
|
||||
#define STA_A (ONE << 0) /* Accessed */
|
||||
#define STA_W (ONE << 1) /* Writable (for data segments) */
|
||||
#define STA_E (ONE << 2) /* Expand down (for data segments) */
|
||||
#define STA_X (ONE << 3) /* 1 = Code segment (executable) */
|
||||
#define STA_R (ONE << 1) /* Readable (for code segments) */
|
||||
#define STA_C (ONE << 2) /* Conforming (for code segments) */
|
||||
|
||||
/* SEG_ macros specify segment type values shifted into place */
|
||||
#define SEG_A (STA_A << 40) /* Accessed */
|
||||
#define SEG_W (STA_W << 40) /* Writable (for data segments) */
|
||||
#define SEG_E (STA_E << 40) /* Expand down (for data segments) */
|
||||
#define SEG_X (STA_X << 40) /* 1 = Code segment (executable) */
|
||||
#define SEG_R (STA_R << 40) /* Readable (for code segments) */
|
||||
#define SEG_C (STA_C << 40) /* Conforming (for code segments) */
|
||||
|
||||
#define SEG_S (ONE << 44) /* 1 = non-system, 0 = system segment */
|
||||
|
||||
#define SEG_LDT (UINT64 (0x2) << 40) /* 64-bit local descriptor segment */
|
||||
#define SEG_TSSA (UINT64 (0x9) << 40) /* Available 64-bit TSS */
|
||||
#define SEG_TSSB (UINT64 (0xa) << 40) /* Busy 64-bit TSS */
|
||||
#define SEG_CG (UINT64 (0xc) << 40) /* 64-bit Call Gate */
|
||||
#define SEG_IG (UINT64 (0xe) << 40) /* 64-bit Interrupt Gate */
|
||||
#define SEG_TG (UINT64 (0xf) << 40) /* 64-bit Trap Gate */
|
||||
|
||||
#define SEG_DPL(x) (((x) & UINT64(3)) << 45) /* Descriptor privilege level */
|
||||
#define SEG_P (ONE << 47) /* Present */
|
||||
#define SEG_L (ONE << 53) /* Long mode */
|
||||
#define SEG_D (ONE << 54) /* 1 = 32-bit in legacy, 0 in long mode */
|
||||
#define SEG_G (ONE << 55) /* Granularity: 1 = scale limit by 4K */
|
||||
|
||||
/* Base and limit for 32-bit or low half of 64-bit segments */
|
||||
#define SEG_LIM(x) (((x) & 0xffff) | ((x) & UINT64 (0xf0000)) << 32)
|
||||
#define SEG_BASELO(x) (((CAST64 (x) & 0xffffff) << 16) \
|
||||
| ((CAST64 (x) & 0xff000000) << 32))
|
||||
#define SEG_BASEHI(x) (CAST64 (x) >> 32)
|
||||
|
||||
#define SEG32_ASM(type, base, lim) \
|
||||
.word (((lim) >> 12) & 0xffff), ((base) & 0xffff); \
|
||||
.byte (((base) >> 16) & 0xff), (0x90 | (type)), \
|
||||
(0xC0 | (((lim) >> 28) & 0xf)), (((base) >> 24) & 0xff)
|
||||
|
||||
#define SEG32(type, base, lim, dpl) \
|
||||
((type) | SEG_S | SEG_P | SEG_D | SEG_G | SEG_A | SEG_DPL (dpl) \
|
||||
| SEG_BASELO (base) | SEG_LIM ((lim) >> 12))
|
||||
|
||||
#define SEG64(type, dpl) \
|
||||
((type) | SEG_S | SEG_P | SEG_G | SEG_L | SEG_A | SEG_DPL (dpl) \
|
||||
| SEG_LIM (0xffffffff))
|
||||
|
||||
/* Target and segment selector for trap/interrupt gates */
|
||||
#define SEG_SEL(x) (((x) & 0xffff) << 16)
|
||||
#define SEG_TARGETLO(x) ((CAST64 (x) & 0xffff) \
|
||||
| ((CAST64 (x) & 0xffff0000) << 32))
|
||||
#define SEG_TARGETHI(x) (CAST64 (x) >> 32)
|
||||
|
||||
#define GATE32(type, sel, target, dpl) \
|
||||
((type) | SEG_DPL (dpl) | SEG_P | SEG_SEL (sel) | SEG_TARGETLO (target))
|
||||
#define SETGATE(gate, type, sel, target, dpl) \
|
||||
do { \
|
||||
gate.gd_lo = GATE32 (type, sel, target, dpl); \
|
||||
gate.gd_hi = SEG_TARGETHI (target); \
|
||||
} while (0)
inc/dune/msr.h
@@ -1,131 +0,0 @@
/*
|
||||
* msr.h - x86 Machine-specific Register (MSR) support
|
||||
*
|
||||
* Based on code from XV6, created by MIT PDOS.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <base/types.h>
|
||||
|
||||
static inline uint64_t rdmsr(uint64_t msr)
|
||||
{
|
||||
uint32_t low, high;
|
||||
asm volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr));
|
||||
return (uint64_t)low | ((uint64_t)high << 32);
|
||||
}
|
||||
|
||||
static inline void wrmsr(uint64_t msr, uint64_t val)
|
||||
{
|
||||
uint32_t low = (val & 0xffffffff);
|
||||
uint32_t high = (val >> 32);
|
||||
asm volatile("wrmsr" : : "c" (msr), "a" (low), "d" (high) : "memory");
|
||||
}
|
||||
|
||||
// FS/GS base registers
|
||||
#define MSR_FS_BASE 0xc0000100
|
||||
#define MSR_GS_BASE 0xc0000101
|
||||
#define MSR_GS_KERNBASE 0xc0000102
|
||||
|
||||
// SYSCALL and SYSRET registers
|
||||
#define MSR_STAR 0xc0000081
|
||||
#define MSR_LSTAR 0xc0000082
|
||||
#define MSR_CSTAR 0xc0000083
|
||||
#define MSR_SFMASK 0xc0000084
|
||||
|
||||
#define MSR_INTEL_MISC_ENABLE 0x1a0
|
||||
#define MISC_ENABLE_PEBS_UNAVAILABLE (1<<12) // Read-only
|
||||
|
||||
// AMD performance event-select registers
|
||||
#define MSR_AMD_PERF_SEL0 0xC0010000
|
||||
#define MSR_AMD_PERF_SEL1 0xC0010001
|
||||
#define MSR_AMD_PERF_SEL2 0xC0010002
|
||||
#define MSR_AMD_PERF_SEL3 0xC0010003
|
||||
// AMD performance event-count registers
|
||||
#define MSR_AMD_PERF_CNT0 0xC0010004
|
||||
#define MSR_AMD_PERF_CNT1 0xC0010005
|
||||
#define MSR_AMD_PERF_CNT2 0xC0010006
|
||||
#define MSR_AMD_PERF_CNT3 0xC0010007
|
||||
|
||||
// Intel performance event-select registers
|
||||
#define MSR_INTEL_PERF_SEL0 0x00000186
|
||||
// Intel performance event-count registers
|
||||
#define MSR_INTEL_PERF_CNT0 0x000000c1
|
||||
#define MSR_INTEL_PERF_GLOBAL_STATUS 0x38e
|
||||
#define PERF_GLOBAL_STATUS_PEBS (1ull << 62)
|
||||
#define MSR_INTEL_PERF_GLOBAL_CTRL 0x38f
|
||||
#define MSR_INTEL_PERF_GLOBAL_OVF_CTRL 0x390
|
||||
|
||||
#define MSR_INTEL_PERF_CAPABILITIES 0x345 // RO
|
||||
#define MSR_INTEL_PEBS_ENABLE 0x3f1
|
||||
#define MSR_INTEL_PEBS_LD_LAT 0x3f6
|
||||
#define MSR_INTEL_DS_AREA 0x600
|
||||
|
||||
// Common event-select bits
|
||||
#define PERF_SEL_USR (1ULL << 16)
|
||||
#define PERF_SEL_OS (1ULL << 17)
|
||||
#define PERF_SEL_EDGE (1ULL << 18)
|
||||
#define PERF_SEL_INT (1ULL << 20)
|
||||
#define PERF_SEL_ENABLE (1ULL << 22)
|
||||
#define PERF_SEL_INV (1ULL << 23)
|
||||
#define PERF_SEL_CMASK_SHIFT 24
|
||||
|
||||
// APIC Base Address Register MSR
|
||||
#define MSR_APIC_BAR 0x0000001b
|
||||
#define APIC_BAR_XAPIC_EN (1 << 11)
|
||||
#define APIC_BAR_X2APIC_EN (1 << 10)
|
||||
|
||||
#define MSR_PKG_ENERGY_STATUS 0x00000611
|
||||
|
||||
static inline uintptr_t getfsbase(void)
|
||||
{
|
||||
#ifdef USE_RDWRGSFS
|
||||
uintptr_t base;
|
||||
asm volatile("rdfsbase %0" : "=r"(base));
|
||||
return base;
|
||||
#else
|
||||
return rdmsr(MSR_FS_BASE);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uintptr_t getgsbase(void)
|
||||
{
|
||||
#ifdef USE_RDWRGSFS
|
||||
uintptr_t base;
|
||||
asm volatile("rdgsbase %0" : "=r"(base));
|
||||
return base;
|
||||
#else
|
||||
return rdmsr(MSR_GS_BASE);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void setfsbase(uintptr_t base)
|
||||
{
|
||||
#ifdef USE_RDWRGSFS
|
||||
asm volatile("wrfsbase %0" : : "r"(base));
|
||||
#else
|
||||
wrmsr(MSR_FS_BASE, base);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void setgsbase(uintptr_t base)
|
||||
{
|
||||
#ifdef USE_RDWRGSFS
|
||||
asm volatile("wrgsbase %0" : : "r"(base));
|
||||
#else
|
||||
wrmsr(MSR_GS_BASE, base);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void setgskernbase(uintptr_t base)
|
||||
{
|
||||
assert(!is_irq_enabled());
|
||||
|
||||
asm volatile("swapgs");
|
||||
#ifdef USE_RDWRGSFS
|
||||
asm volatile("wrgsbase %0" : : "r"(base));
|
||||
#else
|
||||
wrmsr(MSR_GS_BASE, base);
|
||||
#endif
|
||||
asm volatile("swapgs");
|
||||
}
inc/dune/ops.h
@@ -1,148 +0,0 @@
/*
|
||||
* ops.h - useful x86 opcodes
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ix/stddef.h>
|
||||
|
||||
/* CPUID Flags. */
|
||||
#define CPUID_FLAG_FPU 0x1 /* Floating Point Unit. */
|
||||
#define CPUID_FLAG_VME 0x2 /* Virtual Mode Extensions. */
|
||||
#define CPUID_FLAG_DE 0x4 /* Debugging Extensions. */
|
||||
#define CPUID_FLAG_PSE 0x8 /* Page Size Extensions. */
|
||||
#define CPUID_FLAG_TSC 0x10 /* Time Stamp Counter. */
|
||||
#define CPUID_FLAG_MSR 0x20 /* Model-specific registers. */
|
||||
#define CPUID_FLAG_PAE 0x40 /* Physical Address Extensions. */
|
||||
#define CPUID_FLAG_MCE 0x80 /* Machine Check Exceptions. */
|
||||
#define CPUID_FLAG_CXCHG8 0x100 /* Compare and exchange 8-byte. */
|
||||
#define CPUID_FLAG_APIC 0x200 /* On-chip APIC. */
|
||||
#define CPUID_FLAG_SEP 0x800 /* Fast System Calls. */
|
||||
#define CPUID_FLAG_MTRR 0x1000 /* Memory Type Range Registers. */
|
||||
#define CPUID_FLAG_PGE 0x2000 /* Page Global Enable. */
|
||||
#define CPUID_FLAG_MCA 0x4000 /* Machine Check Architecture. */
|
||||
#define CPUID_FLAG_CMOV 0x8000 /* Conditional move-instruction. */
|
||||
#define CPUID_FLAG_PAT 0x10000 /* Page Attribute Table. */
|
||||
#define CPUID_FLAG_PSE36 0x20000 /* 36-bit Page Size Extensions. */
|
||||
#define CPUID_FLAG_PSN 0x40000 /* Processor Serial Number. */
|
||||
#define CPUID_FLAG_CLFL 0x80000 /* CLFLUSH - fixme? */
|
||||
#define CPUID_FLAG_DTES 0x200000 /* Debug Trace and EMON Store MSRs. */
|
||||
#define CPUID_FLAG_ACPI 0x400000 /* Thermal Control MSR. */
|
||||
#define CPUID_FLAG_MMX 0x800000 /* MMX instruction set. */
|
||||
#define CPUID_FLAG_FXSR 0x1000000 /* Fast floating point save/restore. */
|
||||
#define CPUID_FLAG_SSE 0x2000000 /* SSE (Streaming SIMD Extensions) */
|
||||
#define CPUID_FLAG_SSE2 0x4000000 /* SSE2 (Streaming SIMD Extensions - #2) */
|
||||
#define CPUID_FLAG_SS 0x8000000 /* Self-snoop. */
|
||||
#define CPUID_FLAG_HTT 0x10000000 /* Hyper-Threading Technology. */
|
||||
#define CPUID_FLAG_TM1 0x20000000 /* Thermal Interrupts, Status MSRs. */
|
||||
#define CPUID_FLAG_IA64 0x40000000 /* IA-64 (64-bit Intel CPU) */
|
||||
#define CPUID_FLAG_PBE 0x80000000 /* Pending Break Event. */
|
||||
|
||||
/* from xv6, created by MIT PDOS */
|
||||
static inline void cpuid(uint32_t info, uint32_t *eaxp,
|
||||
uint32_t *ebxp, uint32_t *ecxp,
|
||||
uint32_t *edxp)
|
||||
{
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
asm volatile("cpuid"
|
||||
: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
|
||||
: "a" (info));
|
||||
if (eaxp)
|
||||
*eaxp = eax;
|
||||
if (ebxp)
|
||||
*ebxp = ebx;
|
||||
if (ecxp)
|
||||
*ecxp = ecx;
|
||||
if (edxp)
|
||||
*edxp = edx;
|
||||
}
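/*
 * Illustrative sketch (not from the original header): testing a CPUID
 * feature bit with the flags defined above. Leaf 0x1 reports the basic
 * feature bits in EDX. The helper name is hypothetical.
 */
static inline bool example_cpu_has_sse2(void)
{
        uint32_t edx;

        cpuid(0x1, NULL, NULL, NULL, &edx);
        return (edx & CPUID_FLAG_SSE2) != 0;
}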
|
||||
|
||||
static inline uint64_t rdtsc(void)
|
||||
{
|
||||
uint32_t a, d;
|
||||
asm volatile("rdtsc" : "=a" (a), "=d" (d));
|
||||
return ((uint64_t) a) | (((uint64_t) d) << 32);
|
||||
}
|
||||
|
||||
static inline uint64_t rdtscp(uint32_t *auxp)
|
||||
{
|
||||
unsigned int a, d, c;
|
||||
asm volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
|
||||
if (auxp)
|
||||
*auxp = c;
|
||||
return ((uint64_t) a) | (((uint64_t) d) << 32);
|
||||
}
|
||||
|
||||
static inline uint64_t read_cr3(void)
|
||||
{
|
||||
uint64_t val;
|
||||
asm volatile("movq %%cr3, %0" : "=r" (val));
|
||||
return val;
|
||||
}
|
||||
|
||||
static inline void write_cr3(uint64_t val)
|
||||
{
|
||||
asm volatile("movq %0, %%cr3" : : "r" (val));
|
||||
}

#define PCID_COUNT (1 << 12)

#ifdef USE_INVPCID

static inline void invpcid(uint16_t pcid, uint64_t type, uintptr_t la)
{
        struct {
                uint64_t pcid:12;
                uint64_t rsv:52;
                uint64_t la;
        } desc;

        assert(pcid < PCID_COUNT);

        desc.pcid = pcid;
        desc.rsv = 0;
        desc.la = la;

        asm volatile("invpcid (%0), %1" : :
                     "r" (&desc), "r" (type) : "memory");
}

enum {
        INVPCID_TYPE_ADDR = 0,    /* individual address invalidation */
        INVPCID_TYPE_CTX,         /* single context invalidation */
        INVPCID_TYPE_ALL_GLB,     /* all contexts and global translations */
        INVPCID_TYPE_ALL,         /* all contexts except global translations */
};

#endif /* USE_INVPCID */

static inline void flush_tlb_addr(const void *va)
{
        asm volatile("invlpg (%0)" : : "r" (va) : "memory");
}

static inline void set_pgroot(uint16_t pcid, uintptr_t pa, bool inval)
{
        assert(pcid < PCID_COUNT);

        if (inval)
                write_cr3(pa | (uintptr_t) pcid);
        else
                write_cr3(pa | (uintptr_t) pcid | (1UL << 63));
}
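
/*
 * Example (illustrative, not part of the original header): installing a new
 * page-table root tagged with a PCID. Passing inval=false sets CR3 bit 63,
 * which preserves TLB entries cached under that PCID; inval=true flushes
 * them on the switch. root_pa is a hypothetical page-table physical address
 * supplied by the caller.
 */
static inline void switch_address_space_example(uintptr_t root_pa,
                                                uint16_t pcid)
{
        /* keep cached translations if the tables for this PCID are unchanged */
        set_pgroot(pcid, root_pa, false);

        /* after editing a single mapping, invalidate just that address: */
        /* flush_tlb_addr(va); */
}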

static inline void monitor(void const *p, unsigned extensions, unsigned hints)
{
        asm volatile("monitor" : : "a" (p), "c" (extensions), "d" (hints));
}

static inline void mwait(unsigned idle_state, unsigned flags)
{
        asm volatile("mwait" : : "a" (idle_state), "c" (flags));
}

#define IDLE_STATE_C1 0x00 /* ~2 microseconds */
#define IDLE_STATE_C1E 0x01 /* ~10 microseconds */
#define IDLE_STATE_C3 0x10 /* ~33 microseconds */
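
/*
 * Example (illustrative, not part of the original header): waiting for a
 * shared flag to change using monitor/mwait rather than spinning. The
 * monitored location must be re-checked after mwait() returns because
 * wakeups can be spurious. wait_flag is a hypothetical shared variable.
 */
static inline void wait_for_flag_example(volatile int *wait_flag)
{
        while (!*wait_flag) {
                monitor((const void *) wait_flag, 0, 0);
                if (!*wait_flag)
                        mwait(IDLE_STATE_C1, 0);
        }
}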

@@ -1,33 +0,0 @@
/*
 * procmap.h - parses linux process map information
 */

#pragma once

#include <base/stddef.h>

struct procmap_entry {
        uintptr_t begin;
        uintptr_t end;
        uint64_t offset;
        bool r;      // Readable
        bool w;      // Writable
        bool x;      // Executable
        bool p;      // Private (or shared)
        char *path;
        int type;
};

#define PROCMAP_TYPE_UNKNOWN 0x00
#define PROCMAP_TYPE_FILE 0x01
#define PROCMAP_TYPE_ANONYMOUS 0x02
#define PROCMAP_TYPE_HEAP 0x03
#define PROCMAP_TYPE_STACK 0x04
#define PROCMAP_TYPE_VSYSCALL 0x05
#define PROCMAP_TYPE_VDSO 0x06
#define PROCMAP_TYPE_VVAR 0x07

typedef int (*procmap_cb_t)(const struct procmap_entry *, unsigned long data);

extern int procmap_iterate(procmap_cb_t cb, unsigned long data);
extern void procmap_dump(void);
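
/*
 * Example (illustrative, not part of the original header): a procmap_cb_t
 * callback that counts executable mappings. procmap_iterate() calls the
 * callback once per map entry and passes `data` through unchanged; the
 * meaning of a non-zero return value is an assumption here.
 */
static int count_exec_mappings_cb(const struct procmap_entry *e,
                                  unsigned long data)
{
        int *count = (int *) data;

        if (e->x)
                (*count)++;
        return 0;
}

/* usage: int n = 0; procmap_iterate(count_exec_mappings_cb, (unsigned long) &n); */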

128 inc/dune/trap.h
@@ -1,128 +0,0 @@
/*
 * trap.h - x86 exception and interrupt support
 */

#pragma once

#include <base/stddef.h>
#include <base/cpu.h>
#include <dune/mmu.h>

/* format used by LGDT and LIDT instructions */
struct tptr {
        uint16_t limit;
        uint64_t base;
} __packed;

/* the interrupt descriptor table (IDT) descriptor format */
struct idtd {
        uint16_t low;
        uint16_t selector;
        uint8_t ist;
        uint8_t type;
        uint16_t middle;
        uint32_t high;
        uint32_t zero;
} __packed;

#define IDTD_P (1 << 7)
#define IDTD_CPL3 (3 << 5)
#define IDTD_TRAP_GATE 0xF
#define IDTD_INTERRUPT_GATE 0xE

#define IDT_ENTRIES 256

/* the task-state segment (TSS) descriptor format */
struct tssd {
        char ign1[4];
        uint64_t rsp[3];
        uint64_t ist[8];
        char ign2[10];
        uint16_t iomb;
        uint8_t iopb[];
} __packed;

/* x86 trap codes */
#define T_DIVIDE 0       // divide error
#define T_DEBUG 1        // debug exception
#define T_NMI 2          // non-maskable interrupt
#define T_BRKPT 3        // breakpoint
#define T_OFLOW 4        // overflow
#define T_BOUND 5        // bounds check
#define T_ILLOP 6        // illegal opcode
#define T_DEVICE 7       // device not available
#define T_DBLFLT 8       // double fault
/* #define T_COPROC 9 */ // reserved (not generated by recent processors)
#define T_TSS 10         // invalid task switch segment
#define T_SEGNP 11       // segment not present
#define T_STACK 12       // stack exception
#define T_GPFLT 13       // general protection fault
#define T_PGFLT 14       // page fault
/* #define T_RES 15 */   // reserved
#define T_FPERR 16       // floating point error
#define T_ALIGN 17       // alignment check
#define T_MCHK 18        // machine check
#define T_SIMDERR 19     // SIMD floating point error
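
/*
 * Example (illustrative, not part of the original header): mapping a trap
 * code to a printable name, e.g. for fault logging. Only a few codes are
 * shown; trap_name() is a hypothetical helper, not part of this API.
 */
static inline const char *trap_name(int num)
{
        switch (num) {
        case T_DIVIDE:
                return "divide error";
        case T_GPFLT:
                return "general protection fault";
        case T_PGFLT:
                return "page fault";
        case T_ALIGN:
                return "alignment check";
        default:
                return "unknown trap";
        }
}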

/**
 * irq_disable - disables interrupts
 */
static inline void irq_disable(void)
{
        asm volatile("cli" : : : "memory");
}

/**
 * irq_enable - enables interrupts
 */
static inline void irq_enable(void)
{
        asm volatile("sti" : : : "memory");
}

/**
 * is_irq_enabled - are interrupts currently enabled?
 *
 * Returns true if interrupts are enabled.
 */
static inline bool is_irq_enabled(void)
{
        unsigned long flags;

        asm volatile("pushf\n\t"
                     "pop %0\n\t"
                     : "=rm" (flags) : : "memory");

        return ((flags & FL_IF) > 0);
}

/**
 * irq_save - disables interrupts, saving the current interrupt status
 *
 * Returns the current FLAGS.
 */
static inline unsigned long irq_save(void)
{
        unsigned long flags;

        asm volatile("pushf\n\t"
                     "pop %0\n\t"
                     : "=rm" (flags) : : "memory");

        if (flags & FL_IF)
                irq_disable();

        return flags;
}

/**
 * irq_restore - restores the previous interrupt status
 * @flags: the previous FLAGS
 */
static inline void irq_restore(unsigned long flags)
{
        asm volatile("push %0\n\t"
                     "popf\n\t"
                     : : "g" (flags) : "memory", "cc");
}
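
/*
 * Example (illustrative, not part of the original header): protecting a
 * short critical section from local interrupts. irq_save() returns the
 * prior FLAGS value, so nested sections compose correctly, and
 * irq_restore() only re-enables interrupts if they were enabled on entry.
 */
static inline void critical_section_example(void)
{
        unsigned long flags;

        flags = irq_save();
        /* ... code that must not be interrupted on this core ... */
        irq_restore(flags);
}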

@@ -1,84 +0,0 @@
/*
 * vm.h - virtual memory management
 */

#pragma once

#include <base/mem.h>
#include <base/page.h>
#include <dune/mmu.h>

#define PGLEVEL_4KB 0
#define PGLEVEL_2MB 1
#define PGLEVEL_1GB 2
#define PGLEVEL_NUM 4

#define PGLEVEL_TO_SIZE(level) (1 << PDSHIFT(level))
#define PGSIZE_TO_LEVEL(size) ((__builtin_ctz(size) - PGSHIFT_4KB) / NPTBITS)


/*
 * Raw Operations
 */

extern int
vm_lookup_pte(ptent_t *tbl, const void *va,
              int *level_out, ptent_t **pte_out);
extern int
vm_insert_pte(ptent_t *tbl, const void *va,
              int level, ptent_t pte_in);
extern int
vm_get_pte(ptent_t *tbl, const void *va,
           int level, ptent_t **pte_out);
extern int
vm_remove_pte(ptent_t *tbl, const void *va,
              int *level_out, ptent_t *pte_out);


/*
 * Page Operations
 */

extern int
vm_lookup_page(ptent_t *tbl, const void *va, struct page **pg_out);
extern int
vm_insert_page(ptent_t *tbl, const void *va,
               struct page *pg, ptent_t flags);
extern int
vm_remove_page(ptent_t *tbl, const void *va,
               struct page **pg_out);


/*
 * Ranged Operations
 */

extern int
vm_map_phys(ptent_t *tbl, physaddr_t pa, const void *va,
            size_t len, int pgsize, ptent_t flags);
extern int
vm_map_pages(ptent_t *tbl, const void *va, size_t len,
             int pgsize, ptent_t flags);
extern int
vm_map_copy(ptent_t *tbl, const void *src_va, const void *map_va,
            size_t len, int pgsize, ptent_t flags);
extern bool
vm_mod(ptent_t *tbl, const void *va, size_t len, int pgsize, ptent_t flags);
extern bool
vm_disable(ptent_t *tbl, const void *va, size_t len, int pgsize);
extern void
vm_unmap(ptent_t *tbl, const void *va, size_t len, int pgsize);
extern void
vm_unmap_pages(ptent_t *tbl, const void *va, size_t len, int pgsize);


/*
 * Page Tables
 */

extern ptent_t *vm_create_pt(void);
extern ptent_t *vm_clone_kern_pt(void);
extern void vm_destroy_pt(ptent_t *tbl);

extern ptent_t *kern_pgtbl;
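
/*
 * Example (illustrative, not part of the original header): building a fresh
 * page table and mapping a physical region into it at 4KB granularity. The
 * return conventions (NULL / non-zero on failure), the PTE_W flag (from
 * dune/mmu.h, not shown here), and the assumption that pgsize is a byte
 * size are not confirmed by this header.
 */
static inline ptent_t *map_region_example(physaddr_t pa, void *va, size_t len)
{
        ptent_t *tbl;
        int ret;

        tbl = vm_create_pt();
        if (!tbl)
                return NULL;

        ret = vm_map_phys(tbl, pa, va, len,
                          PGLEVEL_TO_SIZE(PGLEVEL_4KB), PTE_W);
        if (ret) {
                vm_destroy_pt(tbl);
                return NULL;
        }

        return tbl;
}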