diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 6ba3820e7fbd..3a2b17026e02 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1494,6 +1494,14 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); +#ifdef XENHVM + if (inw(0x10) == 0x49d2) { + if (bootverbose) + printf("Xen detected: disabling emulated block and network devices\n"); + outw(0x10, 3); + } +#endif + /* Location of kernel stack for locore */ return ((u_int64_t)thread0.td_pcb); } diff --git a/sys/amd64/conf/XENHVM b/sys/amd64/conf/XENHVM new file mode 100644 index 000000000000..ed2f70f96d9c --- /dev/null +++ b/sys/amd64/conf/XENHVM @@ -0,0 +1,160 @@ +# +# XENHVM -- Xen HVM kernel configuration file for FreeBSD/amd64 +# +# For more information on this file, please read the config(5) manual page, +# and/or the handbook section on Kernel Configuration Files: +# +# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html +# +# The handbook is also available locally in /usr/share/doc/handbook +# if you've installed the doc distribution, otherwise always see the +# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the +# latest information. +# +# An exhaustive list of options and more detailed explanations of the +# device lines is also present in the ../../conf/NOTES and NOTES files. +# If you are in doubt as to the purpose or necessity of a line, check first +# in NOTES. +# +# $FreeBSD$ + +cpu HAMMER +ident GENERIC + +# To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" # Default places to look for devices. + +# Use the following to compile in values accessible to the kernel +# through getenv() (or kenv(1) in userland). 
The format of the file +# is 'variable=value', see kenv(1) +# +# env "GENERIC.env" + +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols +makeoptions MODULES_OVERRIDE="" + +options SCHED_ULE # ULE scheduler +options PREEMPTION # Enable kernel thread preemption +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options SCTP # Stream Control Transmission Protocol +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options UFS_GJOURNAL # Enable gjournal-based UFS journaling +options MD_ROOT # MD is a potential root device +options NFSCLIENT # Network Filesystem Client +options NFSSERVER # Network Filesystem Server +options NFSLOCKD # Network Lock Manager +options NFS_ROOT # NFS usable as /, requires NFSCLIENT +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options GEOM_PART_GPT # GUID Partition Tables. 
+options GEOM_LABEL # Provides labelization +options COMPAT_43TTY # BSD 4.3 TTY compat (sgtty) +options COMPAT_IA32 # Compatible with i386 binaries +options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options COMPAT_FREEBSD6 # Compatible with FreeBSD6 +options COMPAT_FREEBSD7 # Compatible with FreeBSD7 +options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI +options KTRACE # ktrace(1) support +options STACK # stack(9) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options STOP_NMI # Stop CPUS using NMI instead of IPI +options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) +options AUDIT # Security event auditing +#options KDTRACE_FRAME # Ensure frames are compiled in +#options KDTRACE_HOOKS # Kernel DTrace hooks + +# Debugging for use in -current +options KDB # Enable kernel debugger support. +options DDB # Support DDB. +options GDB # Support remote GDB. +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options WITNESS # Enable checks to detect deadlocks and cycles +options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed + +# Make an SMP-capable kernel by default +options SMP # Symmetric MultiProcessor Kernel + +# CPU frequency control +device cpufreq + +# Bus support. 
+device acpi +device pci + +# Floppy drives +device fdc + +# Xen HVM support +options XENHVM +device xenpci + +# ATA and ATAPI devices +device ata +device atadisk # ATA disk drives +device ataraid # ATA RAID drives +device atapicd # ATAPI CDROM drives +device atapifd # ATAPI floppy drives +device atapist # ATAPI tape drives +options ATA_STATIC_ID # Static device numbering + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +device ch # SCSI media changers +device da # Direct Access (disks) +device sa # Sequential Access (tape etc) +device cd # CD +device pass # Passthrough device (direct SCSI access) +device ses # SCSI Environmental Services (and SAF-TE) + + +# atkbdc0 controls both the keyboard and the PS/2 mouse +device atkbdc # AT keyboard controller +device atkbd # AT keyboard +device psm # PS/2 mouse + +device kbdmux # keyboard multiplexer + +device vga # VGA video card driver + +device splash # Splash screen and screen saver support + +# syscons is the default console driver, resembling an SCO console +device sc + +device agp # support several AGP chipsets + +# Serial (COM) ports +device uart # Generic UART driver + +# PCI Ethernet NICs that use the common MII bus controller code. +# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! +device miibus # MII bus support +device re # RealTek 8139C+/8169/8169S/8110S + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device tun # Packet tunnel. +device pty # BSD-style compatibility pseudo ttys +device md # Memory "disks" +device gif # IPv6 and IPv4 tunneling +device faith # IPv6-to-IPv4 relaying (translation) +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. 
+device bpf # Berkeley packet filter diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h index e9faf282c77a..23818ca81328 100644 --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -33,6 +33,24 @@ #error "sys/cdefs.h is a prerequisite for this file" #endif +#if defined(XEN) || defined(XENHVM) +#ifndef NR_VIRQS +#define NR_VIRQS 24 +#endif +#ifndef NR_IPIS +#define NR_IPIS 2 +#endif +#endif + +#ifdef XENHVM +#define PCPU_XEN_FIELDS \ + ; \ + unsigned int pc_last_processed_l1i; \ + unsigned int pc_last_processed_l2i +#else +#define PCPU_XEN_FIELDS +#endif + /* * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. @@ -49,7 +67,8 @@ register_t pc_scratch_rsp; /* User %rsp in syscall */ \ u_int pc_apic_id; \ u_int pc_acpi_id; /* ACPI CPU id */ \ - struct user_segment_descriptor *pc_gs32p + struct user_segment_descriptor *pc_gs32p \ + PCPU_XEN_FIELDS #ifdef _KERNEL diff --git a/sys/amd64/include/xen/hypercall.h b/sys/amd64/include/xen/hypercall.h new file mode 100644 index 000000000000..50fa376ff90b --- /dev/null +++ b/sys/amd64/include/xen/hypercall.h @@ -0,0 +1,415 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. 
+ * + * Copyright (c) 2002-2004, K A Fraser + * + * 64-bit updates: + * Benjamin Liu + * Jun Nakajima + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __MACHINE_XEN_HYPERCALL_H__ +#define __MACHINE_XEN_HYPERCALL_H__ + +#include + +#ifndef __XEN_HYPERVISOR_H__ +# error "please don't include this file directly" +#endif + +#define __STR(x) #x +#define STR(x) __STR(x) +#define ENOXENSYS 38 +#define CONFIG_XEN_COMPAT 0x030002 +#define __must_check + +#ifdef XEN +#define HYPERCALL_STR(name) \ + "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)" +#else +#define HYPERCALL_STR(name) \ + "mov $("STR(__HYPERVISOR_##name)" * 32),%%eax; "\ + "add hypercall_stubs(%%rip),%%rax; " \ + "call *%%rax" +#endif + +#define _hypercall0(type, name) \ +({ \ + type __res; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res) \ + : \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall1(type, name, a1) \ +({ \ + type __res; \ + long __ign1; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1) \ + : "1" ((long)(a1)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + type __res; \ + long __ign1, __ign2; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \ + : "1" ((long)(a1)), "2" ((long)(a2)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + type __res; \ + long __ign1, __ign2, __ign3; \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + type __res; \ + long __ign1, __ign2, __ign3; \ + register long __arg4 __asm__("r10") = (long)(a4); \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3), "+r" (__arg4) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + 
long __ign1, __ign2, __ign3; \ + register long __arg4 __asm__("r10") = (long)(a4); \ + register long __arg5 __asm__("r8") = (long)(a5); \ + __asm__ volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + __res; \ +}) + +static inline int __must_check +HYPERVISOR_set_trap_table( + const trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int __must_check +HYPERVISOR_mmu_update( + mmu_update_t *req, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_mmuext_op( + struct mmuext_op *op, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_set_gdt( + unsigned long *frame_list, unsigned int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int __must_check +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_address, unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, syscall_address); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +static inline int __must_check +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +static inline long __must_check 
+HYPERVISOR_set_timer_op( + uint64_t timeout) +{ + return _hypercall1(long, set_timer_op, timeout); +} + +static inline int __must_check +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} + +static inline int __must_check +HYPERVISOR_set_debugreg( + unsigned int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long __must_check +HYPERVISOR_get_debugreg( + unsigned int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + unsigned long ma, unsigned long word) +{ + return _hypercall2(int, update_descriptor, ma, word); +} + +static inline int __must_check +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_multicall( + multicall_entry_t *call_list, unsigned int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, uint64_t new_val, unsigned long flags) +{ + return _hypercall3(int, update_va_mapping, va, new_val, flags); +} + +static inline int __must_check +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOXENSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_console_io( + int cmd, unsigned int count, char *str) +{ + return _hypercall3(int, 
console_io, cmd, count, str); +} + +static inline int __must_check +HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOXENSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, uint64_t new_val, unsigned long flags, domid_t domid) +{ + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val, flags, domid); +} + +static inline int __must_check +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int __must_check +HYPERVISOR_vcpu_op( + int cmd, unsigned int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int __must_check +HYPERVISOR_set_segment_base( + int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} + +static inline int __must_check +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOXENSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + + return rc; +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +#ifndef CONFIG_XEN +static inline unsigned long __must_check +HYPERVISOR_hvm_op( 
+ int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_callback_op( + int cmd, const void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int __must_check +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + +#undef __must_check + +#endif /* __MACHINE_XEN_HYPERCALL_H__ */ diff --git a/sys/amd64/include/xen/synch_bitops.h b/sys/amd64/include/xen/synch_bitops.h new file mode 100644 index 000000000000..746687aa91bd --- /dev/null +++ b/sys/amd64/include/xen/synch_bitops.h @@ -0,0 +1,129 @@ +#ifndef __XEN_SYNCH_BITOPS_H__ +#define __XEN_SYNCH_BITOPS_H__ + +/* + * Copyright 1992, Linus Torvalds. + * Heavily modified to provide guaranteed strong synchronisation + * when communicating with Xen or other guest OSes running on other CPUs. 
+ */ + + +#define ADDR (*(volatile long *) addr) + +static __inline__ void synch_set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btsl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btrl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btcl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__ ( + "lock btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +struct __synch_xchg_dummy { unsigned long a[100]; }; +#define __synch_xg(x) ((volatile struct __synch_xchg_dummy *)(x)) + +#define synch_cmpxchg(ptr, old, new) \ +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ + (unsigned long)(old), \ + (unsigned long)(new), \ + sizeof(*(ptr)))) + +static inline unsigned long __synch_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 4: + 
__asm__ __volatile__("lock; cmpxchgl %k1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 8: + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + } + return old; +} + +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & + (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline__ int synch_var_test_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "btl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) ); + return oldbit; +} + +#define synch_test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + synch_const_test_bit((nr),(addr)) : \ + synch_var_test_bit((nr),(addr))) + +#endif /* __XEN_SYNCH_BITOPS_H__ */ diff --git a/sys/amd64/include/xen/xen-os.h b/sys/amd64/include/xen/xen-os.h new file mode 100644 index 000000000000..163e7f2e0574 --- /dev/null +++ b/sys/amd64/include/xen/xen-os.h @@ -0,0 +1,296 @@ +/****************************************************************************** + * os.h + * + * random collection of macros and definition + */ + +#ifndef _XEN_OS_H_ +#define _XEN_OS_H_ + +#ifdef PAE +#define CONFIG_X86_PAE +#endif + +#if !defined(__XEN_INTERFACE_VERSION__) +/* + * Can update to a more recent version when we implement + * the hypercall page + */ +#define __XEN_INTERFACE_VERSION__ 0x00030204 +#endif + +#include + +/* Force a proper event-channel callback from Xen. */ +void force_evtchn_callback(void); + +extern int gdtset; + +extern shared_info_t *HYPERVISOR_shared_info; + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
*/ +static inline void rep_nop(void) +{ + __asm__ __volatile__ ( "rep;nop" : : : "memory" ); +} +#define cpu_relax() rep_nop() + +/* crude memory allocator for memory allocation early in + * boot + */ +void *bootmem_alloc(unsigned int size); +void bootmem_free(void *ptr, unsigned int size); + + +/* Everything below this point is not included by assembler (.S) files. */ +#ifndef __ASSEMBLY__ + +void printk(const char *fmt, ...); + +/* some function prototypes */ +void trap_init(void); + +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + +#ifndef XENHVM + +/* + * STI/CLI equivalents. These basically set and clear the virtual + * event_enable flag in teh shared_info structure. Note that when + * the enable bit is set, there may be pending events to be handled. + * We may therefore call into do_hypervisor_callback() directly. + */ + +#define __cli() \ +do { \ + vcpu_info_t *_vcpu; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \ + _vcpu->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + +#define __sti() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ +} while (0) + +#define __restore_flags(x) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \ + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ + } \ +} while (0) + +/* + * Add critical_{enter, exit}? 
+ * + */ +#define __save_and_cli(x) \ +do { \ + vcpu_info_t *_vcpu; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[PCPU_GET(cpuid)]; \ + (x) = _vcpu->evtchn_upcall_mask; \ + _vcpu->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + + +#define cli() __cli() +#define sti() __sti() +#define save_flags(x) __save_flags(x) +#define restore_flags(x) __restore_flags(x) +#define save_and_cli(x) __save_and_cli(x) + +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_restore(x) __restore_flags(x) +#define local_irq_disable() __cli() +#define local_irq_enable() __sti() + +#define mtx_lock_irqsave(lock, x) {local_irq_save((x)); mtx_lock_spin((lock));} +#define mtx_unlock_irqrestore(lock, x) {mtx_unlock_spin((lock)); local_irq_restore((x)); } +#define spin_lock_irqsave mtx_lock_irqsave +#define spin_unlock_irqrestore mtx_unlock_irqrestore + +#else +#endif + +#ifndef mb +#define mb() __asm__ __volatile__("mfence":::"memory") +#endif +#ifndef rmb +#define rmb() __asm__ __volatile__("lfence":::"memory"); +#endif +#ifndef wmb +#define wmb() barrier() +#endif +#ifdef SMP +#define smp_mb() mb() +#define smp_rmb() rmb() +#define smp_wmb() wmb() +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while(0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + + +/* This is a barrier for the compiler only, NOT the processor! */ +#define barrier() __asm__ __volatile__("": : :"memory") + +#define LOCK_PREFIX "" +#define LOCK "" +#define ADDR (*(volatile long *) addr) +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. 
+ */ +typedef struct { volatile int counter; } atomic_t; + + + +#define xen_xchg(ptr,v) \ + ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) +struct __xchg_dummy { unsigned long a[100]; }; +#define __xg(x) ((volatile struct __xchg_dummy *)(x)) +static __inline unsigned long __xchg(unsigned long x, volatile void * ptr, + int size) +{ + switch (size) { + case 1: + __asm__ __volatile__("xchgb %b0,%1" + :"=q" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 2: + __asm__ __volatile__("xchgw %w0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 4: + __asm__ __volatile__("xchgl %0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + } + return x; +} + +/** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static __inline int test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + "btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"=m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +static __inline int constant_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline int variable_test_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( + "btl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit) + :"m" (ADDR),"Ir" (nr)); + return oldbit; +} + +#define test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + constant_test_bit((nr),(addr)) : \ + variable_test_bit((nr),(addr))) + + +/** + * set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. 
+ * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + "btsl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static __inline__ void clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + "btrl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + LOCK "incl %0" + :"=m" (v->counter) + :"m" (v->counter)); +} + + +#define rdtscll(val) \ + __asm__ __volatile__("rdtsc" : "=A" (val)) + +#endif /* !__ASSEMBLY__ */ + +#endif /* _OS_H_ */ diff --git a/sys/amd64/include/xen/xenfunc.h b/sys/amd64/include/xen/xenfunc.h new file mode 100644 index 000000000000..b3a6672576cb --- /dev/null +++ b/sys/amd64/include/xen/xenfunc.h @@ -0,0 +1,83 @@ +/* + * + * Copyright (c) 2004,2005 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENFUNC_H_ +#define _XEN_XENFUNC_H_ + +#ifdef XENHVM +#include +#else +#include +#include +#endif + +#define BKPT __asm__("int3"); +#define XPQ_CALL_DEPTH 5 +#define XPQ_CALL_COUNT 2 +#define PG_PRIV PG_AVAIL3 +typedef struct { + unsigned long pt_ref; + unsigned long pt_eip[XPQ_CALL_COUNT][XPQ_CALL_DEPTH]; +} pteinfo_t; + +extern pteinfo_t *pteinfo_list; +#ifdef XENDEBUG_LOW +#define __PRINTK(x) printk x +#else +#define __PRINTK(x) +#endif + +char *xen_setbootenv(char *cmd_line); + +int xen_boothowto(char *envp); + +void _xen_machphys_update(vm_paddr_t, vm_paddr_t, char *file, int line); + +#ifdef INVARIANTS +#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), __FILE__, __LINE__) +#else +#define xen_machphys_update(a, b) _xen_machphys_update((a), (b), NULL, 0) +#endif + +#ifndef XENHVM +void xen_update_descriptor(union descriptor *, union descriptor *); +#endif + +extern struct mtx balloon_lock; +#if 0 +#define balloon_lock(__flags) mtx_lock_irqsave(&balloon_lock, __flags) +#define balloon_unlock(__flags) mtx_unlock_irqrestore(&balloon_lock, __flags) +#else +#define balloon_lock(__flags) __flags = 1 +#define balloon_unlock(__flags) __flags = 0 +#endif + + + +#endif /* _XEN_XENFUNC_H_ */ diff --git a/sys/amd64/include/xen/xenpmap.h b/sys/amd64/include/xen/xenpmap.h new file mode 100644 index 000000000000..d768dad5f311 --- /dev/null +++ b/sys/amd64/include/xen/xenpmap.h @@ -0,0 +1,227 @@ +/* + * + * Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004,2005 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENPMAP_H_ +#define _XEN_XENPMAP_H_ + +#include + +void _xen_queue_pt_update(vm_paddr_t, vm_paddr_t, char *, int); +void xen_pt_switch(vm_paddr_t); +void xen_set_ldt(vm_paddr_t, unsigned long); +void xen_pgdpt_pin(vm_paddr_t); +void xen_pgd_pin(vm_paddr_t); +void xen_pgd_unpin(vm_paddr_t); +void xen_pt_pin(vm_paddr_t); +void xen_pt_unpin(vm_paddr_t); +void xen_flush_queue(void); +void xen_check_queue(void); +#if 0 +void pmap_ref(pt_entry_t *pte, vm_paddr_t ma); +#endif + +#ifdef INVARIANTS +#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), __FILE__, __LINE__) +#else +#define xen_queue_pt_update(a, b) _xen_queue_pt_update((a), (b), NULL, 0) +#endif + +#ifdef PMAP_DEBUG +#define PMAP_REF pmap_ref +#define PMAP_DEC_REF_PAGE pmap_dec_ref_page +#define PMAP_MARK_PRIV pmap_mark_privileged +#define PMAP_MARK_UNPRIV pmap_mark_unprivileged +#else +#define PMAP_MARK_PRIV(a) +#define PMAP_MARK_UNPRIV(a) +#define PMAP_REF(a, b) +#define PMAP_DEC_REF_PAGE(a) +#endif + +#define ALWAYS_SYNC 0 + +#ifdef PT_DEBUG +#define PT_LOG() printk("WP PT_SET %s:%d\n", __FILE__, __LINE__) +#else +#define PT_LOG() +#endif + +#define INVALID_P2M_ENTRY (~0UL) + +#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */ + +#define SH_PD_SET_VA 1 +#define SH_PD_SET_VA_MA 2 +#define SH_PD_SET_VA_CLEAR 3 + +struct pmap; +void pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type); +#ifdef notyet +static vm_paddr_t +vptetomachpte(vm_paddr_t *pte) +{ + vm_offset_t offset, ppte; + vm_paddr_t pgoffset, retval, *pdir_shadow_ptr; + int pgindex; + + ppte = (vm_offset_t)pte; + pgoffset = (ppte & PAGE_MASK); + offset = ppte - (vm_offset_t)PTmap; + pgindex = ppte >> PDRSHIFT; + + pdir_shadow_ptr = (vm_paddr_t *)PCPU_GET(pdir_shadow); + retval = (pdir_shadow_ptr[pgindex] & ~PAGE_MASK) + pgoffset; + return (retval); +} +#endif +#define PT_GET(_ptp) \ + (pmap_valid_entry(*(_ptp)) ? 
xpmap_mtop(*(_ptp)) : (0)) + +#ifdef WRITABLE_PAGETABLES + +#define PT_SET_VA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + PT_LOG(); \ + *(_ptp) = xpmap_ptom((_npte)); \ +} while (/*CONSTCOND*/0) +#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + PT_LOG(); \ + *(_ptp) = (_npte); \ +} while (/*CONSTCOND*/0) +#define PT_CLEAR_VA(_ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + PT_LOG(); \ + *(_ptp) = 0; \ +} while (/*CONSTCOND*/0) + +#define PD_SET_VA(_pmap, _ptp, _npte, sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_SET_VA_MA(_pmap, _ptp, _npte, sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + pd_set((_pmap),(_ptp),(_npte), SH_PD_SET_VA_MA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_CLEAR_VA(_pmap, _ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + pd_set((_pmap),(_ptp), 0, SH_PD_SET_VA_CLEAR); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#else /* !WRITABLE_PAGETABLES */ + +#define PT_SET_VA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + xen_queue_pt_update(vtomach(_ptp), \ + xpmap_ptom(_npte)); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + xen_queue_pt_update(vtomach(_ptp), _npte); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_CLEAR_VA(_ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + xen_queue_pt_update(vtomach(_ptp), 0); \ + if (sync || ALWAYS_SYNC) \ + xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#define PD_SET_VA(_pmap, _ptepindex,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} 
while (/*CONSTCOND*/0) +#define PD_SET_VA_MA(_pmap, _ptepindex,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + pd_set((_pmap),(_ptepindex),(_npte), SH_PD_SET_VA_MA); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PD_CLEAR_VA(_pmap, _ptepindex, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + pd_set((_pmap),(_ptepindex), 0, SH_PD_SET_VA_CLEAR); \ + if (sync || ALWAYS_SYNC) xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#endif + +#define PT_SET_MA(_va, _ma) \ +do { \ + PANIC_IF(HYPERVISOR_update_va_mapping(((unsigned long)(_va)),\ + (_ma), \ + UVMF_INVLPG| UVMF_ALL) < 0); \ +} while (/*CONSTCOND*/0) + +#define PT_UPDATES_FLUSH() do { \ + xen_flush_queue(); \ +} while (/*CONSTCOND*/0) + +static __inline vm_paddr_t +xpmap_mtop(vm_paddr_t mpa) +{ + vm_paddr_t tmp = (mpa & PG_FRAME); + + return machtophys(tmp) | (mpa & ~PG_FRAME); +} + +static __inline vm_paddr_t +xpmap_ptom(vm_paddr_t ppa) +{ + vm_paddr_t tmp = (ppa & PG_FRAME); + + return phystomach(tmp) | (ppa & ~PG_FRAME); +} + +static __inline void +set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ +#ifdef notyet + PANIC_IF(max_mapnr && pfn >= max_mapnr); +#endif + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef notyet + PANIC_IF((pfn != mfn && mfn != INVALID_P2M_ENTRY)); +#endif + return; + } + xen_phys_machine[pfn] = mfn; +} + + + + +#endif /* _XEN_XENPMAP_H_ */ diff --git a/sys/amd64/include/xen/xenvar.h b/sys/amd64/include/xen/xenvar.h new file mode 100644 index 000000000000..1433b76871ec --- /dev/null +++ b/sys/amd64/include/xen/xenvar.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2008 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * + * $FreeBSD$ + */ +#ifndef XENVAR_H_ +#define XENVAR_H_ +#define XBOOTUP 0x1 +#define XPMAP 0x2 +extern int xendebug_flags; +#ifndef NOXENDEBUG +#define XENPRINTF printk +#else +#define XENPRINTF printf +#endif +#include + +#if 0 +#define TRACE_ENTER XENPRINTF("(file=%s, line=%d) entered %s\n", __FILE__, __LINE__, __FUNCTION__) +#define TRACE_EXIT XENPRINTF("(file=%s, line=%d) exiting %s\n", __FILE__, __LINE__, __FUNCTION__) +#define TRACE_DEBUG(argflags, _f, _a...) \ +if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__, __LINE__, ## _a); +#else +#define TRACE_ENTER +#define TRACE_EXIT +#define TRACE_DEBUG(argflags, _f, _a...) 
+#endif + +#ifdef XENHVM + +static inline vm_paddr_t +phystomach(vm_paddr_t pa) +{ + + return (pa); +} + +static inline vm_paddr_t +machtophys(vm_paddr_t ma) +{ + + return (ma); +} + +#define vtomach(va) pmap_kextract((vm_offset_t) (va)) +#define PFNTOMFN(pa) (pa) +#define MFNTOPFN(ma) (ma) + +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) (TRUE) +#define PT_UPDATES_FLUSH() ((void)0) + +#else + +extern xen_pfn_t *xen_phys_machine; + + +extern xen_pfn_t *xen_machine_phys; +/* Xen starts physical pages after the 4MB ISA hole - + * FreeBSD doesn't + */ + + +#undef ADD_ISA_HOLE /* XXX */ + +#ifdef ADD_ISA_HOLE +#define ISA_INDEX_OFFSET 1024 +#define ISA_PDR_OFFSET 1 +#else +#define ISA_INDEX_OFFSET 0 +#define ISA_PDR_OFFSET 0 +#endif + + +#define PFNTOMFN(i) (xen_phys_machine[(i)]) +#define MFNTOPFN(i) ((vm_paddr_t)xen_machine_phys[(i)]) + +#define VTOP(x) ((((uintptr_t)(x))) - KERNBASE) +#define PTOV(x) (((uintptr_t)(x)) + KERNBASE) + +#define VTOPFN(x) (VTOP(x) >> PAGE_SHIFT) +#define PFNTOV(x) PTOV((vm_paddr_t)(x) << PAGE_SHIFT) + +#define VTOMFN(va) (vtomach(va) >> PAGE_SHIFT) +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) + +#define phystomach(pa) (((vm_paddr_t)(PFNTOMFN((pa) >> PAGE_SHIFT))) << PAGE_SHIFT) +#define machtophys(ma) (((vm_paddr_t)(MFNTOPFN((ma) >> PAGE_SHIFT))) << PAGE_SHIFT) + +#endif + +void xpq_init(void); + +int xen_create_contiguous_region(vm_page_t pages, int npages); + +void xen_destroy_contiguous_region(void * addr, int npages); + +#endif diff --git a/sys/conf/files b/sys/conf/files index f3e90aa77729..6a14ef9fbbf1 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2758,21 +2758,24 @@ gnu/fs/xfs/xfs_iomap.c optional xfs \ gnu/fs/xfs/xfs_behavior.c optional xfs \ compile-with "${NORMAL_C} -I$S/gnu/fs/xfs/FreeBSD -I$S/gnu/fs/xfs/FreeBSD/support -I$S/gnu/fs/xfs" -xen/gnttab.c optional xen -xen/features.c optional xen -xen/evtchn/evtchn.c optional xen -xen/evtchn/evtchn_dev.c optional xen 
-xen/xenbus/xenbus_client.c optional xen -xen/xenbus/xenbus_comms.c optional xen -xen/xenbus/xenbus_dev.c optional xen -xen/xenbus/xenbus_if.m optional xen -xen/xenbus/xenbus_probe.c optional xen -#xen/xenbus/xenbus_probe_backend.c optional xen -xen/xenbus/xenbus_xs.c optional xen -dev/xen/console/console.c optional xen -dev/xen/console/xencons_ring.c optional xen -dev/xen/blkfront/blkfront.c optional xen -dev/xen/netfront/netfront.c optional xen -#dev/xen/xenpci/xenpci.c optional xen -#xen/xenbus/xenbus_newbus.c optional xenhvm +xen/gnttab.c optional xen | xenhvm +xen/features.c optional xen | xenhvm +xen/evtchn/evtchn.c optional xen +xen/evtchn/evtchn_dev.c optional xen | xenhvm +xen/reboot.c optional xen +xen/xenbus/xenbus_client.c optional xen | xenhvm +xen/xenbus/xenbus_comms.c optional xen | xenhvm +xen/xenbus/xenbus_dev.c optional xen | xenhvm +xen/xenbus/xenbus_if.m optional xen | xenhvm +xen/xenbus/xenbus_probe.c optional xen | xenhvm +#xen/xenbus/xenbus_probe_backend.c optional xen +xen/xenbus/xenbus_xs.c optional xen | xenhvm +dev/xen/balloon/balloon.c optional xen | xenhvm +dev/xen/console/console.c optional xen +dev/xen/console/xencons_ring.c optional xen +dev/xen/blkfront/blkfront.c optional xen | xenhvm +dev/xen/netfront/netfront.c optional xen | xenhvm +dev/xen/xenpci/xenpci.c optional xenpci +dev/xen/xenpci/evtchn.c optional xenpci +dev/xen/xenpci/machine_reboot.c optional xenpci diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64 index 1e693632ef43..5247921eb8ac 100644 --- a/sys/conf/options.amd64 +++ b/sys/conf/options.amd64 @@ -57,3 +57,5 @@ KDTRACE_FRAME opt_kdtrace.h # BPF just-in-time compiler BPF_JITTER opt_bpf.h + +XENHVM opt_global.h diff --git a/sys/dev/xen/balloon/balloon.c b/sys/dev/xen/balloon/balloon.c index fa49196bc02e..c23433cbebcd 100644 --- a/sys/dev/xen/balloon/balloon.c +++ b/sys/dev/xen/balloon/balloon.c @@ -34,11 +34,24 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include +#include #include +#include 
-#include -#include -#include +#include +#include +#include +#include +#include + +#include +#include + +MALLOC_DEFINE(M_BALLOON, "Balloon", "Xen Balloon Driver"); + +struct mtx balloon_mutex; /* * Protects atomic reservation decrease/increase against concurrent increases. @@ -46,23 +59,44 @@ __FBSDID("$FreeBSD$"); * balloon lists. */ struct mtx balloon_lock; -#ifdef notyet -/* We aim for 'current allocation' == 'target allocation'. */ -static unsigned long current_pages; -static unsigned long target_pages; +/* We increase/decrease in batches which fit in a page */ +static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +#define ARRAY_SIZE(A) (sizeof(A) / sizeof(A[0])) -/* VM /proc information for memory */ -extern unsigned long totalram_pages; +struct balloon_stats { + /* We aim for 'current allocation' == 'target allocation'. */ + unsigned long current_pages; + unsigned long target_pages; + /* We may hit the hard limit in Xen. If we do then we remember it. */ + unsigned long hard_limit; + /* + * Drivers may alter the memory reservation independently, but they + * must inform the balloon driver so we avoid hitting the hard limit. + */ + unsigned long driver_pages; + /* Number of pages in high- and low-memory balloons. */ + unsigned long balloon_low; + unsigned long balloon_high; +}; -/* We may hit the hard limit in Xen. If we do then we remember it. */ -static unsigned long hard_limit; +static struct balloon_stats balloon_stats; +#define bs balloon_stats -/* - * Drivers may alter the memory reservation independently, but they must - * inform the balloon driver so that we can avoid hitting the hard limit. 
- */ -static unsigned long driver_pages; +SYSCTL_DECL(_dev_xen); +SYSCTL_NODE(_dev_xen, OID_AUTO, balloon, CTLFLAG_RD, NULL, "Balloon"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, current, CTLFLAG_RD, + &bs.current_pages, 0, "Current allocation"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, target, CTLFLAG_RD, + &bs.target_pages, 0, "Target allocation"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, driver_pages, CTLFLAG_RD, + &bs.driver_pages, 0, "Driver pages"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, hard_limit, CTLFLAG_RD, + &bs.hard_limit, 0, "Xen hard limit"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, low_mem, CTLFLAG_RD, + &bs.balloon_low, 0, "Low-mem balloon"); +SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD, + &bs.balloon_high, 0, "High-mem balloon"); struct balloon_entry { vm_page_t page; @@ -72,9 +106,6 @@ struct balloon_entry { /* List of ballooned pages, threaded through the mem_map array. */ static STAILQ_HEAD(,balloon_entry) ballooned_pages; -static unsigned long balloon_low, balloon_high; - - /* Main work function, always executed in process context. */ static void balloon_process(void *unused); @@ -89,10 +120,10 @@ balloon_append(vm_page_t page) { struct balloon_entry *entry; - entry = malloc(sizeof(struct balloon_entry), M_WAITOK); - + entry = malloc(sizeof(struct balloon_entry), M_BALLOON, M_WAITOK); + entry->page = page; STAILQ_INSERT_HEAD(&ballooned_pages, entry, list); - balloon_low++; + bs.balloon_low++; } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ @@ -111,13 +142,13 @@ balloon_retrieve(void) page = entry->page; free(entry, M_DEVBUF); - balloon_low--; + bs.balloon_low--; return page; } static void -balloon_alarm(unsigned long unused) +balloon_alarm(void *unused) { wakeup(balloon_process); } @@ -125,17 +156,56 @@ balloon_alarm(unsigned long unused) static unsigned long current_target(void) { - unsigned long target = min(target_pages, hard_limit); - if (target > (current_pages + balloon_low + balloon_high)) - target = current_pages + balloon_low + balloon_high; + unsigned long target = min(bs.target_pages, bs.hard_limit); + if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high)) + target = bs.current_pages + bs.balloon_low + bs.balloon_high; return target; } +static unsigned long +minimum_target(void) +{ +#ifdef XENHVM +#define max_pfn physmem +#endif + unsigned long min_pages, curr_pages = current_target(); + +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + /* Simple continuous piecewiese linear function: + * max MiB -> min MiB gradient + * 0 0 + * 16 16 + * 32 24 + * 128 72 (1/2) + * 512 168 (1/4) + * 2048 360 (1/8) + * 8192 552 (1/32) + * 32768 1320 + * 131072 4392 + */ + if (max_pfn < MB2PAGES(128)) + min_pages = MB2PAGES(8) + (max_pfn >> 1); + else if (max_pfn < MB2PAGES(512)) + min_pages = MB2PAGES(40) + (max_pfn >> 2); + else if (max_pfn < MB2PAGES(2048)) + min_pages = MB2PAGES(104) + (max_pfn >> 3); + else + min_pages = MB2PAGES(296) + (max_pfn >> 5); +#undef MB2PAGES + + /* Don't enforce growth */ + return min(min_pages, curr_pages); +#ifndef CONFIG_XEN +#undef max_pfn +#endif +} + static int increase_reservation(unsigned long nr_pages) { - unsigned long *mfn_list, pfn, i, flags; - struct page *page; + unsigned long pfn, i; + struct balloon_entry *entry; + vm_page_t page; long rc; struct xen_memory_reservation reservation = { .address_bits = 0, @@ -143,64 +213,81 @@ increase_reservation(unsigned long nr_pages) .domid = DOMID_SELF }; - if (nr_pages > (PAGE_SIZE / sizeof(unsigned 
long))) - nr_pages = PAGE_SIZE / sizeof(unsigned long); + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); - mfn_list = (unsigned long *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); - if (mfn_list == NULL) - return ENOMEM; + mtx_lock(&balloon_lock); + for (entry = STAILQ_FIRST(&ballooned_pages), i = 0; + i < nr_pages; i++, entry = STAILQ_NEXT(entry, list)) { + KASSERT(entry, ("ballooned_pages list corrupt")); + page = entry->page; + frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); + } - reservation.extent_start = mfn_list; + set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op( - XENMEM_increase_reservation, &reservation); + XENMEM_populate_physmap, &reservation); if (rc < nr_pages) { - int ret; - /* We hit the Xen hard limit: reprobe. */ - reservation.extent_start = mfn_list; - reservation.nr_extents = rc; - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - PANIC_IF(ret != rc); - hard_limit = current_pages + rc - driver_pages; + if (rc > 0) { + int ret; + + /* We hit the Xen hard limit: reprobe. */ + reservation.nr_extents = rc; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + KASSERT(ret == rc, ("HYPERVISOR_memory_op failed")); + } + if (rc >= 0) + bs.hard_limit = (bs.current_pages + rc - + bs.driver_pages); goto out; } for (i = 0; i < nr_pages; i++) { page = balloon_retrieve(); - PANIC_IF(page == NULL); + KASSERT(page, ("balloon_retrieve failed")); pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); - PANIC_IF(phys_to_machine_mapping_valid(pfn)); + KASSERT((xen_feature(XENFEAT_auto_translated_physmap) || + !phys_to_machine_mapping_valid(pfn)), + ("auto translated physmap but mapping is valid")); + + set_phys_to_machine(pfn, frame_list[i]); + +#ifndef XENHVM + /* Link back into the page tables if not highmem. 
*/ + if (pfn < max_low_pfn) { + int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte_ma(frame_list[i], PAGE_KERNEL), + 0); + PASSING(ret == 0, + ("HYPERVISOR_update_va_mapping failed")); + } +#endif - /* Update P->M and M->P tables. */ - PFNTOMFN(pfn) = mfn_list[i]; - xen_machphys_update(mfn_list[i], pfn); - /* Relinquish the page back to the allocator. */ - ClearPageReserved(page); - set_page_count(page, 1); + vm_page_unwire(page, 0); vm_page_free(page); } - current_pages += nr_pages; - totalram_pages = current_pages; + bs.current_pages += nr_pages; + //totalram_pages = bs.current_pages; out: - balloon_unlock(flags); - - free((mfn_list); + mtx_unlock(&balloon_lock); return 0; } -static int +static int decrease_reservation(unsigned long nr_pages) { - unsigned long *mfn_list, pfn, i, flags; - struct page *page; - void *v; + unsigned long pfn, i; + vm_page_t page; int need_sleep = 0; int ret; struct xen_memory_reservation reservation = { @@ -209,48 +296,68 @@ decrease_reservation(unsigned long nr_pages) .domid = DOMID_SELF }; - if (nr_pages > (PAGE_SIZE / sizeof(unsigned long))) - nr_pages = PAGE_SIZE / sizeof(unsigned long); - - mfn_list = (unsigned long *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); - if (mfn_list == NULL) - return ENOMEM; + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { int color = 0; if ((page = vm_page_alloc(NULL, color++, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { nr_pages = i; need_sleep = 1; break; } + pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); - mfn_list[i] = PFNTOMFN(pfn); + frame_list[i] = PFNTOMFN(pfn); + +#if 0 + if (!PageHighMem(page)) { + v = phys_to_virt(pfn << PAGE_SHIFT); + scrub_pages(v, 1); +#ifdef CONFIG_XEN + ret = HYPERVISOR_update_va_mapping( + (unsigned long)v, __pte_ma(0), 0); + BUG_ON(ret); +#endif + 
} +#endif +#ifdef CONFIG_XEN_SCRUB_PAGES + else { + v = kmap(page); + scrub_pages(v, 1); + kunmap(page); + } +#endif } - balloon_lock(flags); +#ifdef CONFIG_XEN + /* Ensure that ballooned highmem pages don't have kmaps. */ + kmap_flush_unused(); + flush_tlb_all(); +#endif + + mtx_lock(&balloon_lock); /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { - pfn = MFNTOPFN(mfn_list[i]); - PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + pfn = MFNTOPFN(frame_list[i]); + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); balloon_append(PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT)); } - reservation.extent_start = mfn_list; + set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); - PANIC_IF(ret != nr_pages); + KASSERT(ret == nr_pages, ("HYPERVISOR_memory_op failed")); - current_pages -= nr_pages; - totalram_pages = current_pages; + bs.current_pages -= nr_pages; + //totalram_pages = bs.current_pages; - balloon_unlock(flags); + mtx_unlock(&balloon_lock); - free(mfn_list, M_DEVBUF); - - return need_sleep; + return (need_sleep); } /* @@ -265,27 +372,24 @@ balloon_process(void *unused) int need_sleep = 0; long credit; + mtx_lock(&balloon_mutex); for (;;) { do { - credit = current_target() - current_pages; + credit = current_target() - bs.current_pages; if (credit > 0) need_sleep = (increase_reservation(credit) != 0); if (credit < 0) need_sleep = (decrease_reservation(-credit) != 0); -#ifndef CONFIG_PREEMPT - if (need_resched()) - schedule(); -#endif } while ((credit != 0) && !need_sleep); /* Schedule more work if there is some still to be done. 
*/ - if (current_target() != current_pages) - timeout(balloon_alarm, NULL, ticks + HZ); + if (current_target() != bs.current_pages) + timeout(balloon_alarm, NULL, ticks + hz); - msleep(balloon_process, balloon_lock, 0, "balloon", -1); + msleep(balloon_process, &balloon_mutex, 0, "balloon", -1); } - + mtx_unlock(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ @@ -293,8 +397,8 @@ static void set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ - hard_limit = ~0UL; - target_pages = target; + bs.hard_limit = ~0UL; + bs.target_pages = max(target, minimum_target()); wakeup(balloon_process); } @@ -311,8 +415,9 @@ watch_target(struct xenbus_watch *watch, unsigned long long new_target; int err; - err = xenbus_scanf(NULL, "memory", "target", "%llu", &new_target); - if (err != 1) { + err = xenbus_scanf(XBT_NIL, "memory", "target", NULL, + "%llu", &new_target); + if (err) { /* This is ok (for domain0 at least) - so just return */ return; } @@ -325,7 +430,7 @@ watch_target(struct xenbus_watch *watch, } static void -balloon_init_watcher(void *) +balloon_init_watcher(void *arg) { int err; @@ -334,48 +439,60 @@ balloon_init_watcher(void *) printf("Failed to set balloon watcher\n"); } +SYSINIT(balloon_init_watcher, SI_SUB_PSEUDO, SI_ORDER_ANY, + balloon_init_watcher, NULL); static void -balloon_init(void *) +balloon_init(void *arg) { - unsigned long pfn; - struct page *page; +#ifndef XENHVM + vm_page_t page; +#endif - IPRINTK("Initialising balloon driver.\n"); + if (!is_running_on_xen()) + return; - if (xen_init() < 0) - return -1; + mtx_init(&balloon_lock, "balloon_lock", NULL, MTX_DEF); + mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF); - current_pages = min(xen_start_info->nr_pages, max_pfn); - target_pages = current_pages; - balloon_low = 0; - balloon_high = 0; - driver_pages = 0UL; - hard_limit = ~0UL; +#ifndef XENHVM + bs.current_pages = min(xen_start_info->nr_pages, max_pfn); +#else + 
bs.current_pages = physmem; +#endif + bs.target_pages = bs.current_pages; + bs.balloon_low = 0; + bs.balloon_high = 0; + bs.driver_pages = 0UL; + bs.hard_limit = ~0UL; - init_timer(&balloon_timer); - balloon_timer.data = 0; - balloon_timer.function = balloon_alarm; + kproc_create(balloon_process, NULL, NULL, 0, 0, "balloon"); +// init_timer(&balloon_timer); +// balloon_timer.data = 0; +// balloon_timer.function = balloon_alarm; +#ifndef XENHVM /* Initialise the balloon with excess memory space. */ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { page = PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT); balloon_append(page); } +#endif target_watch.callback = watch_target; - return 0; + return; } +SYSINIT(balloon_init, SI_SUB_PSEUDO, SI_ORDER_ANY, balloon_init, NULL); + +void balloon_update_driver_allowance(long delta); void balloon_update_driver_allowance(long delta) { - unsigned long flags; - - balloon_lock(flags); - driver_pages += delta; - balloon_unlock(flags); + mtx_lock(&balloon_lock); + bs.driver_pages += delta; + mtx_unlock(&balloon_lock); } #if 0 @@ -393,17 +510,18 @@ static int dealloc_pte_fn( set_pte_at(&init_mm, addr, pte, __pte_ma(0)); set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); - PANIC_IF(ret != 1); + KASSERT(ret == 1, ("HYPERVISOR_memory_op failed")); return 0; } #endif + +#if 0 vm_page_t balloon_alloc_empty_page_range(unsigned long nr_pages) { - unsigned long flags; vm_page_t pages; - int i; + int i, rc; unsigned long *mfn_list; struct xen_memory_reservation reservation = { .address_bits = 0, @@ -422,7 +540,9 @@ balloon_alloc_empty_page_range(unsigned long nr_pages) PFNTOMFN(i) = INVALID_P2M_ENTRY; reservation.extent_start = mfn_list; reservation.nr_extents = nr_pages; - PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != nr_pages); + rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + KASSERT(rc == nr_pages, 
("HYPERVISOR_memory_op failed")); } current_pages -= nr_pages; @@ -435,12 +555,11 @@ balloon_alloc_empty_page_range(unsigned long nr_pages) void balloon_dealloc_empty_page_range(vm_page_t page, unsigned long nr_pages) { - unsigned long i, flags; + unsigned long i; for (i = 0; i < nr_pages; i++) balloon_append(page + i); wakeup(balloon_process); } - #endif diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c index fdebc9d34b2c..01493a6a8595 100644 --- a/sys/dev/xen/blkfront/blkfront.c +++ b/sys/dev/xen/blkfront/blkfront.c @@ -40,17 +40,17 @@ __FBSDID("$FreeBSD$"); #include #include -#include #include +#include +#include #include #include +#include #include #include #include #include -#include -#include #include @@ -106,7 +106,7 @@ static char * blkif_status_name[] = { #endif #define WPRINTK(fmt, args...) printf("[XEN] " fmt, ##args) #if 0 -#define DPRINTK(fmt, args...) printf("[XEN] %s:%d" fmt ".\n", __FUNCTION__, __LINE__,##args) +#define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args) #else #define DPRINTK(fmt, args...) #endif @@ -138,7 +138,6 @@ pfn_to_mfn(vm_paddr_t pfn) return (phystomach(pfn << PAGE_SHIFT) >> PAGE_SHIFT); } - /* * Translate Linux major/minor to an appropriate name and unit * number. For HVM guests, this allows us to use the same drive names @@ -323,17 +322,17 @@ blkfront_probe(device_t dev) static int blkfront_attach(device_t dev) { - int err, vdevice, i, unit; + int error, vdevice, i, unit; struct blkfront_info *info; const char *name; /* FIXME: Use dynamic device id if this is not set. 
*/ - err = xenbus_scanf(XBT_NIL, xenbus_get_node(dev), + error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev), "virtual-device", NULL, "%i", &vdevice); - if (err) { - xenbus_dev_fatal(dev, err, "reading virtual-device"); + if (error) { + xenbus_dev_fatal(dev, error, "reading virtual-device"); printf("couldn't find virtual device"); - return (err); + return (error); } blkfront_vdevice_to_unit(vdevice, &unit, &name); @@ -362,9 +361,22 @@ blkfront_attach(device_t dev) /* Front end dir is a number, which is used as the id. */ info->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0); - err = talk_to_backend(dev, info); - if (err) - return (err); + error = talk_to_backend(dev, info); + if (error) + return (error); + + return (0); +} + +static int +blkfront_suspend(device_t dev) +{ + struct blkfront_info *info = device_get_softc(dev); + + /* Prevent new requests being issued until we fix things up. */ + mtx_lock(&blkif_io_lock); + info->connected = BLKIF_STATE_SUSPENDED; + mtx_unlock(&blkif_io_lock); return (0); } @@ -375,16 +387,14 @@ blkfront_resume(device_t dev) struct blkfront_info *info = device_get_softc(dev); int err; - DPRINTK("blkfront_resume: %s\n", dev->nodename); + DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev)); blkif_free(info, 1); - err = talk_to_backend(dev, info); - if (info->connected == BLKIF_STATE_SUSPENDED && !err) blkif_recover(info); - return err; + return (err); } /* Common code used when first setting up, and when resuming. 
*/ @@ -425,6 +435,7 @@ talk_to_backend(device_t dev, struct blkfront_info *info) message = "writing protocol"; goto abort_transaction; } + err = xenbus_transaction_end(xbt, 0); if (err) { if (err == EAGAIN) @@ -462,8 +473,8 @@ setup_blkring(device_t dev, struct blkfront_info *info) SHARED_RING_INIT(sring); FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); - error = xenbus_grant_ring(dev, (vtomach(info->ring.sring) >> PAGE_SHIFT), - &info->ring_ref); + error = xenbus_grant_ring(dev, + (vtomach(info->ring.sring) >> PAGE_SHIFT), &info->ring_ref); if (error) { free(sring, M_DEVBUF); info->ring.sring = NULL; @@ -471,11 +482,11 @@ setup_blkring(device_t dev, struct blkfront_info *info) } error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev), - "xbd", (driver_intr_t *)blkif_int, info, - INTR_TYPE_BIO | INTR_MPSAFE, &info->irq); + "xbd", (driver_intr_t *)blkif_int, info, + INTR_TYPE_BIO | INTR_MPSAFE, &info->irq); if (error) { xenbus_dev_fatal(dev, error, - "bind_evtchn_to_irqhandler failed"); + "bind_evtchn_to_irqhandler failed"); goto fail; } @@ -494,7 +505,7 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state) { struct blkfront_info *info = device_get_softc(dev); - DPRINTK("blkfront:backend_changed.\n"); + DPRINTK("backend_state=%d\n", backend_state); switch (backend_state) { case XenbusStateUnknown: @@ -707,7 +718,7 @@ blkif_open(struct disk *dp) struct xb_softc *sc = (struct xb_softc *)dp->d_drv1; if (sc == NULL) { - printk("xb%d: not found", sc->xb_unit); + printf("xb%d: not found", sc->xb_unit); return (ENXIO); } @@ -1019,9 +1030,11 @@ blkif_recover(struct blkfront_info *info) blkif_request_t *req; struct blk_shadow *copy; + if (!info->sc) + return; + /* Stage 1: Make a safe copy of the shadow state. */ copy = (struct blk_shadow *)malloc(sizeof(info->shadow), M_DEVBUF, M_NOWAIT|M_ZERO); - PANIC_IF(copy == NULL); memcpy(copy, info->shadow, sizeof(info->shadow)); /* Stage 2: Set up free list. 
*/ @@ -1084,7 +1097,7 @@ static device_method_t blkfront_methods[] = { DEVMETHOD(device_attach, blkfront_attach), DEVMETHOD(device_detach, blkfront_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), - DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_suspend, blkfront_suspend), DEVMETHOD(device_resume, blkfront_resume), /* Xenbus interface */ diff --git a/sys/dev/xen/console/console.c b/sys/dev/xen/console/console.c index a3d616a74856..0634dadada30 100644 --- a/sys/dev/xen/console/console.c +++ b/sys/dev/xen/console/console.c @@ -5,6 +5,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -18,7 +19,7 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include +#include #include #include @@ -125,12 +126,17 @@ xccngetc(struct consdev *dev) return 0; do { if ((c = xccncheckc(dev)) == -1) { - /* polling without sleeping in Xen doesn't work well. - * Sleeping gives other things like clock a chance to - * run - */ - tsleep(&cn_mtx, PWAIT | PCATCH, "console sleep", - XC_POLLTIME); +#ifdef KDB + if (!kdb_active) +#endif + /* + * Polling without sleeping in Xen + * doesn't work well. Sleeping gives + * other things like clock a chance to + * run + */ + tsleep(&cn_mtx, PWAIT | PCATCH, + "console sleep", XC_POLLTIME); } } while(c == -1); return c; @@ -140,11 +146,13 @@ int xccncheckc(struct consdev *dev) { int ret = (xc_mute ? 
0 : -1); - if (xencons_has_input()) - xencons_handle_input(NULL); + + if (xencons_has_input()) + xencons_handle_input(NULL); CN_LOCK(cn_mtx); if ((rp - rc)) { + if (kdb_active) printf("%s:%d\n", __func__, __LINE__); /* we need to return only one char */ ret = (int)rbuf[RBUF_MASK(rc)]; rc++; @@ -235,17 +243,16 @@ xc_attach(device_t dev) if (xen_start_info->flags & SIF_INITDOMAIN) { error = bind_virq_to_irqhandler( - VIRQ_CONSOLE, - 0, - "console", - NULL, - xencons_priv_interrupt, - sc, INTR_TYPE_TTY, NULL); + VIRQ_CONSOLE, + 0, + "console", + NULL, + xencons_priv_interrupt, + INTR_TYPE_TTY, NULL); KASSERT(error >= 0, ("can't register console interrupt")); } - /* register handler to flush console on shutdown */ if ((EVENTHANDLER_REGISTER(shutdown_post_sync, xc_shutdown, NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) @@ -270,7 +277,11 @@ xencons_rx(char *buf, unsigned len) int i; struct tty *tp = xccons; - if (xen_console_up) { + if (xen_console_up +#ifdef DDB + && !kdb_active +#endif + ) { tty_lock(tp); for (i = 0; i < len; i++) ttydisc_rint(tp, buf[i], 0); @@ -423,12 +434,3 @@ xcons_force_flush(void) } DRIVER_MODULE(xc, nexus, xc_driver, xc_devclass, 0, 0); -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 8 - * tab-width: 4 - * indent-tabs-mode: t - * End: - */ diff --git a/sys/dev/xen/console/xencons_ring.c b/sys/dev/xen/console/xencons_ring.c index 596b5de48d36..fc9522e4f44b 100644 --- a/sys/dev/xen/console/xencons_ring.c +++ b/sys/dev/xen/console/xencons_ring.c @@ -13,19 +13,24 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include + #include #include #include #include #include +#include +#include +#include #include #include #include - #define console_evtchn console.domU.evtchn +static unsigned int console_irq; extern char *console_page; extern struct mtx cn_mtx; @@ -60,7 +65,8 @@ xencons_ring_send(const char *data, unsigned len) sent = 0; mb(); - PANIC_IF((prod - cons) > sizeof(intf->out)); + KASSERT((prod - cons) <= 
sizeof(intf->out), + ("console send ring inconsistent")); while ((sent < len) && ((prod - cons) < sizeof(intf->out))) intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++]; @@ -119,15 +125,18 @@ xencons_ring_init(void) return 0; err = bind_caller_port_to_irqhandler(xen_start_info->console_evtchn, - "xencons", xencons_handle_input, NULL, - INTR_TYPE_MISC | INTR_MPSAFE, NULL); + "xencons", xencons_handle_input, NULL, + INTR_TYPE_MISC | INTR_MPSAFE, &console_irq); if (err) { return err; } return 0; } -#ifdef notyet + +extern void xencons_suspend(void); +extern void xencons_resume(void); + void xencons_suspend(void) { @@ -135,7 +144,7 @@ xencons_suspend(void) if (!xen_start_info->console_evtchn) return; - unbind_evtchn_from_irqhandler(xen_start_info->console_evtchn, NULL); + unbind_from_irqhandler(console_irq); } void @@ -144,7 +153,7 @@ xencons_resume(void) (void)xencons_ring_init(); } -#endif + /* * Local variables: * mode: C diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c index dbf50137b0b8..a70c47cffd82 100644 --- a/sys/dev/xen/netfront/netfront.c +++ b/sys/dev/xen/netfront/netfront.c @@ -24,11 +24,11 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include #include +#include #include #include @@ -47,6 +47,10 @@ __FBSDID("$FreeBSD$"); #include #include #include +#if __FreeBSD_version >= 700000 +#include +#include +#endif #include #include @@ -63,23 +67,42 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include -#include -#include #include #include +#include + #include "xenbus_if.h" +#define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP | CSUM_TSO) + #define GRANT_INVALID_REF 0 #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) +#if __FreeBSD_version >= 700000 +/* + * Should the driver do LRO on the RX end + * this can be toggled on the fly, but the + * interface must be 
reset (down/up) for it + * to take effect. + */ +static int xn_enable_lro = 1; +TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro); +#else + +#define IFCAP_TSO4 0 +#define CSUM_TSO 0 + +#endif + #ifdef CONFIG_XEN static int MODPARM_rx_copy = 0; module_param_named(rx_copy, MODPARM_rx_copy, bool, 0); @@ -92,6 +115,7 @@ static const int MODPARM_rx_copy = 1; static const int MODPARM_rx_flip = 0; #endif +#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2) #define RX_COPY_THRESHOLD 256 #define net_ratelimit() 0 @@ -192,6 +216,9 @@ struct net_device_stats struct netfront_info { struct ifnet *xn_ifp; +#if __FreeBSD_version >= 700000 + struct lro_ctrl xn_lro; +#endif struct net_device_stats stats; u_int tx_full; @@ -329,31 +356,12 @@ xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri) printf("[XEN] " fmt, ##args) #define WPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) +#if 0 #define DPRINTK(fmt, args...) \ printf("[XEN] %s: " fmt, __func__, ##args) - - -static __inline struct mbuf* -makembuf (struct mbuf *buf) -{ - struct mbuf *m = NULL; - - MGETHDR (m, M_DONTWAIT, MT_DATA); - - if (! m) - return 0; - - M_MOVE_PKTHDR(m, buf); - - m_cljget(m, M_DONTWAIT, MJUMPAGESIZE); - m->m_pkthdr.len = buf->m_pkthdr.len; - m->m_len = buf->m_len; - m_copydata(buf, 0, buf->m_pkthdr.len, mtod(m,caddr_t) ); - - m->m_ext.ext_arg1 = (caddr_t *)(uintptr_t)(vtophys(mtod(m,caddr_t)) >> PAGE_SHIFT); - - return m; -} +#else +#define DPRINTK(fmt, args...) 
+#endif /** * Read the 'mac' node at the given device's node in the store, and parse that @@ -414,6 +422,13 @@ netfront_attach(device_t dev) return err; } +#if __FreeBSD_version >= 700000 + SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), + SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "enable_lro", CTLTYPE_INT|CTLFLAG_RW, + &xn_enable_lro, 0, "Large Receive Offload"); +#endif + return 0; } @@ -489,17 +504,12 @@ talk_to_backend(device_t dev, struct netfront_info *info) message = "writing feature-rx-notify"; goto abort_transaction; } - err = xenbus_printf(xbt, node, "feature-no-csum-offload", "%d", 1); - if (err) { - message = "writing feature-no-csum-offload"; - goto abort_transaction; - } err = xenbus_printf(xbt, node, "feature-sg", "%d", 1); if (err) { message = "writing feature-sg"; goto abort_transaction; } -#ifdef HAVE_TSO +#if __FreeBSD_version >= 700000 err = xenbus_printf(xbt, node, "feature-gso-tcpv4", "%d", 1); if (err) { message = "writing feature-gso-tcpv4"; @@ -569,7 +579,7 @@ setup_device(device_t dev, struct netfront_info *info) goto fail; error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(dev), - "xn", xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE, &info->irq); + "xn", xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE, &info->irq); if (error) { xenbus_dev_fatal(dev, error, @@ -586,6 +596,24 @@ setup_device(device_t dev, struct netfront_info *info) return (error); } +/** + * If this interface has an ipv4 address, send an arp for it. This + * helps to get the network going again after migrating hosts. + */ +static void +netfront_send_fake_arp(device_t dev, struct netfront_info *info) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + + ifp = info->xn_ifp; + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) { + arp_ifinit(ifp, ifa); + } + } +} + /** * Callback received when the backend's state changes. 
*/ @@ -611,9 +639,7 @@ netfront_backend_changed(device_t dev, XenbusState newstate) if (network_connect(sc) != 0) break; xenbus_set_state(dev, XenbusStateConnected); -#ifdef notyet - (void)send_fake_arp(netdev); -#endif + netfront_send_fake_arp(dev, sc); break; case XenbusStateClosing: xenbus_set_state(dev, XenbusStateClosed); @@ -851,6 +877,10 @@ static void xn_rxeof(struct netfront_info *np) { struct ifnet *ifp; +#if __FreeBSD_version >= 700000 + struct lro_ctrl *lro = &np->xn_lro; + struct lro_entry *queued; +#endif struct netfront_rx_info rinfo; struct netif_rx_response *rx = &rinfo.rx; struct netif_extra_info *extras = rinfo.extras; @@ -945,13 +975,35 @@ xn_rxeof(struct netfront_info *np) * Do we really need to drop the rx lock? */ XN_RX_UNLOCK(np); - /* Pass it up. */ +#if __FreeBSD_version >= 700000 + /* Use LRO if possible */ + if ((ifp->if_capenable & IFCAP_LRO) == 0 || + lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) { + /* + * If LRO fails, pass up to the stack + * directly. + */ + (*ifp->if_input)(ifp, m); + } +#else (*ifp->if_input)(ifp, m); +#endif XN_RX_LOCK(np); } np->rx.rsp_cons = i; +#if __FreeBSD_version >= 700000 + /* + * Flush any outstanding LRO work + */ + while (!SLIST_EMPTY(&lro->lro_active)) { + queued = SLIST_FIRST(&lro->lro_active); + SLIST_REMOVE_HEAD(&lro->lro_active, next); + tcp_lro_flush(lro, queued); + } +#endif + #if 0 /* If we get a callback with very few responses, reduce fill target. */ /* NB. Note exponential increase, linear decrease. */ @@ -972,6 +1024,7 @@ xn_txeof(struct netfront_info *np) RING_IDX i, prod; unsigned short id; struct ifnet *ifp; + netif_tx_response_t *txr; struct mbuf *m; XN_TX_LOCK_ASSERT(np); @@ -987,10 +1040,19 @@ xn_txeof(struct netfront_info *np) rmb(); /* Ensure we see responses up to 'rp'. 
*/ for (i = np->tx.rsp_cons; i != prod; i++) { - id = RING_GET_RESPONSE(&np->tx, i)->id; + txr = RING_GET_RESPONSE(&np->tx, i); + if (txr->status == NETIF_RSP_NULL) + continue; + + id = txr->id; m = np->xn_cdata.xn_tx_chain[id]; - ifp->if_opackets++; + /* + * Increment packet count if this is the last + * mbuf of the chain. + */ + if (!m->m_next) + ifp->if_opackets++; KASSERT(m != NULL, ("mbuf not found in xn_tx_chain")); M_ASSERTVALID(m); if (unlikely(gnttab_query_foreign_access( @@ -1008,7 +1070,7 @@ xn_txeof(struct netfront_info *np) np->xn_cdata.xn_tx_chain[id] = NULL; add_id_to_freelist(np->xn_cdata.xn_tx_chain, id); - m_freem(m); + m_free(m); } np->tx.rsp_cons = prod; @@ -1235,12 +1297,11 @@ xennet_get_responses(struct netfront_info *np, gnttab_release_grant_reference(&np->gref_rx_head, ref); next: - if (m == NULL) - break; - - m->m_len = rx->status; - m->m_data += rx->offset; - m0->m_pkthdr.len += rx->status; + if (m != NULL) { + m->m_len = rx->status; + m->m_data += rx->offset; + m0->m_pkthdr.len += rx->status; + } if (!(rx->flags & NETRXF_more_data)) break; @@ -1304,13 +1365,14 @@ xn_start_locked(struct ifnet *ifp) { int otherend_id; unsigned short id; - struct mbuf *m_head, *new_m; + struct mbuf *m_head, *m; struct netfront_info *sc; netif_tx_request_t *tx; + netif_extra_info_t *extra; RING_IDX i; grant_ref_t ref; u_long mfn, tx_bytes; - int notify; + int notify, nfrags; sc = ifp->if_softc; otherend_id = xenbus_get_otherend_id(sc->xbdev); @@ -1330,36 +1392,96 @@ xn_start_locked(struct ifnet *ifp) break; } - id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain); + + /* + * Defragment the mbuf if necessary. + */ + for (m = m_head, nfrags = 0; m; m = m->m_next) + nfrags++; + if (nfrags > MAX_SKB_FRAGS) { + m = m_defrag(m_head, M_DONTWAIT); + if (!m) { + m_freem(m_head); + break; + } + m_head = m; + } /* * Start packing the mbufs in this chain into * the fragment pointers. Stop when we run out * of fragments or hit the end of the mbuf chain. 
*/ - new_m = makembuf(m_head); - tx = RING_GET_REQUEST(&sc->tx, i); - tx->id = id; - ref = gnttab_claim_grant_reference(&sc->gref_tx_head); - KASSERT((short)ref >= 0, ("Negative ref")); - mfn = virt_to_mfn(mtod(new_m, vm_offset_t)); - gnttab_grant_foreign_access_ref(ref, otherend_id, - mfn, GNTMAP_readonly); - tx->gref = sc->grant_tx_ref[id] = ref; - tx->size = new_m->m_pkthdr.len; -#if 0 - tx->flags = (skb->ip_summed == CHECKSUM_HW) ? NETTXF_csum_blank : 0; + m = m_head; + extra = NULL; + for (m = m_head; m; m = m->m_next) { + tx = RING_GET_REQUEST(&sc->tx, i); + id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain); + sc->xn_cdata.xn_tx_chain[id] = m; + tx->id = id; + ref = gnttab_claim_grant_reference(&sc->gref_tx_head); + KASSERT((short)ref >= 0, ("Negative ref")); + mfn = virt_to_mfn(mtod(m, vm_offset_t)); + gnttab_grant_foreign_access_ref(ref, otherend_id, + mfn, GNTMAP_readonly); + tx->gref = sc->grant_tx_ref[id] = ref; + tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1); + tx->flags = 0; + if (m == m_head) { + /* + * The first fragment has the entire packet + * size, subsequent fragments have just the + * fragment size. The backend works out the + * true size of the first fragment by + * subtracting the sizes of the other + * fragments. + */ + tx->size = m->m_pkthdr.len; + + /* + * The first fragment contains the + * checksum flags and is optionally + * followed by extra data for TSO etc. 
+ */ + if (m->m_pkthdr.csum_flags + & CSUM_DELAY_DATA) { + tx->flags |= (NETTXF_csum_blank + | NETTXF_data_validated); + } +#if __FreeBSD_version >= 700000 + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + struct netif_extra_info *gso = + (struct netif_extra_info *) + RING_GET_REQUEST(&sc->tx, ++i); + + if (extra) + extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE; + else + tx->flags |= NETTXF_extra_info; + + gso->u.gso.size = m->m_pkthdr.tso_segsz; + gso->u.gso.type = + XEN_NETIF_GSO_TYPE_TCPV4; + gso->u.gso.pad = 0; + gso->u.gso.features = 0; + + gso->type = XEN_NETIF_EXTRA_TYPE_GSO; + gso->flags = 0; + extra = gso; + } #endif - tx->flags = 0; - new_m->m_next = NULL; - new_m->m_nextpkt = NULL; + } else { + tx->size = m->m_len; + } + if (m->m_next) { + tx->flags |= NETTXF_more_data; + i++; + } + } - m_freem(m_head); + BPF_MTAP(ifp, m_head); - sc->xn_cdata.xn_tx_chain[id] = new_m; - BPF_MTAP(ifp, new_m); - - sc->stats.tx_bytes += new_m->m_pkthdr.len; + sc->stats.tx_bytes += m_head->m_pkthdr.len; sc->stats.tx_packets++; } @@ -1445,9 +1567,9 @@ xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) xn_ifinit_locked(sc); arp_ifinit(ifp, ifa); - XN_UNLOCK(sc); + XN_UNLOCK(sc); } else { - XN_UNLOCK(sc); + XN_UNLOCK(sc); error = ether_ioctl(ifp, cmd, data); } break; @@ -1501,12 +1623,39 @@ xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; case SIOCSIFCAP: mask = ifr->ifr_reqcap ^ ifp->if_capenable; - if (mask & IFCAP_HWCSUM) { - if (IFCAP_HWCSUM & ifp->if_capenable) - ifp->if_capenable &= ~IFCAP_HWCSUM; - else - ifp->if_capenable |= IFCAP_HWCSUM; + if (mask & IFCAP_TXCSUM) { + if (IFCAP_TXCSUM & ifp->if_capenable) { + ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4); + ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP + | CSUM_IP | CSUM_TSO); + } else { + ifp->if_capenable |= IFCAP_TXCSUM; + ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP + | CSUM_IP); + } } + if (mask & IFCAP_RXCSUM) { + ifp->if_capenable ^= IFCAP_RXCSUM; + } +#if 
__FreeBSD_version >= 700000 + if (mask & IFCAP_TSO4) { + if (IFCAP_TSO4 & ifp->if_capenable) { + ifp->if_capenable &= ~IFCAP_TSO4; + ifp->if_hwassist &= ~CSUM_TSO; + } else if (IFCAP_TXCSUM & ifp->if_capenable) { + ifp->if_capenable |= IFCAP_TSO4; + ifp->if_hwassist |= CSUM_TSO; + } else { + DPRINTK("Xen requires tx checksum offload" + " be enabled to use TSO\n"); + error = EINVAL; + } + } + if (mask & IFCAP_LRO) { + ifp->if_capenable ^= IFCAP_LRO; + + } +#endif error = 0; break; case SIOCADDMULTI: @@ -1715,11 +1864,21 @@ create_netdev(device_t dev) ifp->if_mtu = ETHERMTU; ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1; -#ifdef notyet ifp->if_hwassist = XN_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; +#if __FreeBSD_version >= 700000 + ifp->if_capabilities |= IFCAP_TSO4; + if (xn_enable_lro) { + int err = tcp_lro_init(&np->xn_lro); + if (err) { + device_printf(dev, "LRO initialization failed\n"); + goto exit; + } + np->xn_lro.ifp = ifp; + ifp->if_capabilities |= IFCAP_LRO; + } +#endif ifp->if_capenable = ifp->if_capabilities; -#endif ether_ifattach(ifp, np->mac); callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE); diff --git a/sys/dev/xen/xenpci/evtchn.c b/sys/dev/xen/xenpci/evtchn.c new file mode 100644 index 000000000000..bdf3ad155722 --- /dev/null +++ b/sys/dev/xen/xenpci/evtchn.c @@ -0,0 +1,418 @@ +/****************************************************************************** + * evtchn.c + * + * A simplified event channel for para-drivers in unmodified linux + * + * Copyright (c) 2002-2005, K A Fraser + * Copyright (c) 2005, Intel Corporation + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, 
sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +static inline unsigned long __ffs(unsigned long word) +{ + __asm__("bsfq %1,%0" + :"=r" (word) + :"rm" (word)); + return word; +} + +#define is_valid_evtchn(x) ((x) != 0) +#define evtchn_from_irq(x) (irq_evtchn[irq].evtchn) + +static struct { + struct mtx lock; + driver_intr_t *handler; + void *arg; + int evtchn; + int close:1; /* close on unbind_from_irqhandler()? */ + int inuse:1; + int in_handler:1; + int mpsafe:1; +} irq_evtchn[256]; +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... 
NR_EVENT_CHANNELS-1] = -1 }; + +static struct mtx irq_alloc_lock; +static device_t xenpci_device; + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +static unsigned int +alloc_xen_irq(void) +{ + static int warned; + unsigned int irq; + + mtx_lock(&irq_alloc_lock); + + for (irq = 1; irq < ARRAY_SIZE(irq_evtchn); irq++) { + if (irq_evtchn[irq].inuse) + continue; + irq_evtchn[irq].inuse = 1; + mtx_unlock(&irq_alloc_lock); + return irq; + } + + if (!warned) { + warned = 1; + printf("alloc_xen_irq: No available IRQ to bind to: " + "increase irq_evtchn[] size in evtchn.c.\n"); + } + + mtx_unlock(&irq_alloc_lock); + + return -ENOSPC; +} + +static void +free_xen_irq(int irq) +{ + + mtx_lock(&irq_alloc_lock); + irq_evtchn[irq].inuse = 0; + mtx_unlock(&irq_alloc_lock); +} + +int +irq_to_evtchn_port(int irq) +{ + + return irq_evtchn[irq].evtchn; +} + +void +mask_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + synch_set_bit(port, &s->evtchn_mask[0]); +} + +void +unmask_evtchn(int port) +{ + evtchn_unmask_t op = { .port = port }; + + HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &op); +} + +int +bind_listening_port_to_irqhandler(unsigned int remote_domain, + const char *devname, driver_intr_t handler, void *arg, + unsigned long irqflags, unsigned int *irqp) +{ + struct evtchn_alloc_unbound alloc_unbound; + unsigned int irq; + int error; + + irq = alloc_xen_irq(); + if (irq < 0) + return irq; + + mtx_lock(&irq_evtchn[irq].lock); + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = remote_domain; + error = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (error) { + mtx_unlock(&irq_evtchn[irq].lock); + free_xen_irq(irq); + return (-error); + } + + irq_evtchn[irq].handler = handler; + irq_evtchn[irq].arg = arg; + irq_evtchn[irq].evtchn = alloc_unbound.port; + irq_evtchn[irq].close = 1; + irq_evtchn[irq].mpsafe = (irqflags & INTR_MPSAFE) != 0; + + evtchn_to_irq[alloc_unbound.port] = irq; + + 
unmask_evtchn(alloc_unbound.port); + + mtx_unlock(&irq_evtchn[irq].lock); + + if (irqp) + *irqp = irq; + return (0); +} + +int +bind_caller_port_to_irqhandler(unsigned int caller_port, + const char *devname, driver_intr_t handler, void *arg, + unsigned long irqflags, unsigned int *irqp) +{ + unsigned int irq; + + irq = alloc_xen_irq(); + if (irq < 0) + return irq; + + mtx_lock(&irq_evtchn[irq].lock); + + irq_evtchn[irq].handler = handler; + irq_evtchn[irq].arg = arg; + irq_evtchn[irq].evtchn = caller_port; + irq_evtchn[irq].close = 0; + irq_evtchn[irq].mpsafe = (irqflags & INTR_MPSAFE) != 0; + + evtchn_to_irq[caller_port] = irq; + + unmask_evtchn(caller_port); + + mtx_unlock(&irq_evtchn[irq].lock); + + if (irqp) + *irqp = irq; + return (0); +} + +void +unbind_from_irqhandler(unsigned int irq) +{ + int evtchn; + + mtx_lock(&irq_evtchn[irq].lock); + + evtchn = evtchn_from_irq(irq); + + if (is_valid_evtchn(evtchn)) { + evtchn_to_irq[evtchn] = -1; + mask_evtchn(evtchn); + if (irq_evtchn[irq].close) { + struct evtchn_close close = { .port = evtchn }; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) + panic("EVTCHNOP_close failed"); + } + } + + irq_evtchn[irq].handler = NULL; + irq_evtchn[irq].evtchn = 0; + + mtx_unlock(&irq_evtchn[irq].lock); + + while (irq_evtchn[irq].in_handler) + cpu_relax(); + + free_xen_irq(irq); +} + +void notify_remote_via_irq(int irq) +{ + int evtchn; + + evtchn = evtchn_from_irq(irq); + if (is_valid_evtchn(evtchn)) + notify_remote_via_evtchn(evtchn); +} + +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]); +} + +static void +evtchn_interrupt(void *arg) +{ + unsigned int l1i, l2i, port; + unsigned long masked_l1, masked_l2; + /* XXX: All events are bound to vcpu0 but irq may be redirected. 
*/ + int cpu = 0; /*smp_processor_id();*/ + driver_intr_t *handler; + void *handler_arg; + int irq, handler_mpsafe; + shared_info_t *s = HYPERVISOR_shared_info; + vcpu_info_t *v = &s->vcpu_info[cpu]; + struct pcpu *pc = pcpu_find(cpu); + unsigned long l1, l2; + + v->evtchn_upcall_pending = 0; + +#if 0 +#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ + /* Clear master flag /before/ clearing selector flag. */ + wmb(); +#endif +#endif + + l1 = atomic_readandclear_long(&v->evtchn_pending_sel); + + l1i = pc->pc_last_processed_l1i; + l2i = pc->pc_last_processed_l2i; + + while (l1 != 0) { + + l1i = (l1i + 1) % LONG_BIT; + masked_l1 = l1 & ((~0UL) << l1i); + + if (masked_l1 == 0) { /* if we masked out all events, wrap around to the beginning */ + l1i = LONG_BIT - 1; + l2i = LONG_BIT - 1; + continue; + } + l1i = __ffs(masked_l1); + + do { + l2 = active_evtchns(cpu, s, l1i); + + l2i = (l2i + 1) % LONG_BIT; + masked_l2 = l2 & ((~0UL) << l2i); + + if (masked_l2 == 0) { /* if we masked out all events, move on */ + l2i = LONG_BIT - 1; + break; + } + l2i = __ffs(masked_l2); + + /* process port */ + port = (l1i * LONG_BIT) + l2i; + synch_clear_bit(port, &s->evtchn_pending[0]); + + irq = evtchn_to_irq[port]; + if (irq < 0) + continue; + + mtx_lock(&irq_evtchn[irq].lock); + handler = irq_evtchn[irq].handler; + handler_arg = irq_evtchn[irq].arg; + handler_mpsafe = irq_evtchn[irq].mpsafe; + if (unlikely(handler == NULL)) { + printf("Xen IRQ%d (port %d) has no handler!\n", + irq, port); + mtx_unlock(&irq_evtchn[irq].lock); + continue; + } + irq_evtchn[irq].in_handler = 1; + mtx_unlock(&irq_evtchn[irq].lock); + + //local_irq_enable(); + if (!handler_mpsafe) + mtx_lock(&Giant); + handler(handler_arg); + if (!handler_mpsafe) + mtx_unlock(&Giant); + //local_irq_disable(); + + mtx_lock(&irq_evtchn[irq].lock); + irq_evtchn[irq].in_handler = 0; + mtx_unlock(&irq_evtchn[irq].lock); + + /* if this is the final port processed, we'll pick up here+1 next time */ + 
pc->pc_last_processed_l1i = l1i; + pc->pc_last_processed_l2i = l2i; + + } while (l2i != LONG_BIT - 1); + + l2 = active_evtchns(cpu, s, l1i); + if (l2 == 0) /* we handled all ports, so we can clear the selector bit */ + l1 &= ~(1UL << l1i); + } +} + +void +irq_suspend(void) +{ + struct xenpci_softc *scp = device_get_softc(xenpci_device); + + /* + * Take our interrupt handler out of the list of handlers + * that can handle this irq. + */ + if (scp->intr_cookie != NULL) { + if (BUS_TEARDOWN_INTR(device_get_parent(xenpci_device), + xenpci_device, scp->res_irq, scp->intr_cookie) != 0) + printf("intr teardown failed.. continuing\n"); + scp->intr_cookie = NULL; + } +} + +void +irq_resume(void) +{ + struct xenpci_softc *scp = device_get_softc(xenpci_device); + int evtchn, irq; + + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) { + mask_evtchn(evtchn); + evtchn_to_irq[evtchn] = -1; + } + + for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) + irq_evtchn[irq].evtchn = 0; + + BUS_SETUP_INTR(device_get_parent(xenpci_device), + xenpci_device, scp->res_irq, INTR_TYPE_MISC, + NULL, evtchn_interrupt, NULL, &scp->intr_cookie); +} + +int +xenpci_irq_init(device_t device, struct xenpci_softc *scp) +{ + int irq, cpu; + int error; + + mtx_init(&irq_alloc_lock, "xen-irq-lock", NULL, MTX_DEF); + + for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) + mtx_init(&irq_evtchn[irq].lock, "irq-evtchn", NULL, MTX_DEF); + + for (cpu = 0; cpu < mp_ncpus; cpu++) { + pcpu_find(cpu)->pc_last_processed_l1i = LONG_BIT - 1; + pcpu_find(cpu)->pc_last_processed_l2i = LONG_BIT - 1; + } + + error = BUS_SETUP_INTR(device_get_parent(device), device, + scp->res_irq, INTR_MPSAFE|INTR_TYPE_MISC, NULL, evtchn_interrupt, + NULL, &scp->intr_cookie); + if (error) + return (error); + + xenpci_device = device; + + return (0); +} diff --git a/sys/dev/xen/xenpci/machine_reboot.c b/sys/dev/xen/xenpci/machine_reboot.c new file mode 100644 index 000000000000..40365545b23e --- /dev/null +++ 
b/sys/dev/xen/xenpci/machine_reboot.c @@ -0,0 +1,80 @@ +/*- + * Copyright (c) 2008 Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +void +xen_suspend() +{ + int suspend_cancelled; + + if (DEVICE_SUSPEND(root_bus)) { + printf("xen_suspend: device_suspend failed\n"); + return; + } + + /* + * Make sure we don't change cpus or switch to some other + * thread. for the duration. + */ + critical_enter(); + + /* + * Prevent any races with evtchn_interrupt() handler. 
+ */ + irq_suspend(); + disable_intr(); + + suspend_cancelled = HYPERVISOR_suspend(0); + if (!suspend_cancelled) + xenpci_resume(); + + /* + * Re-enable interrupts and put the scheduler back to normal. + */ + enable_intr(); + critical_exit(); + + /* + * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or + * similar. + */ + if (!suspend_cancelled) + DEVICE_RESUME(root_bus); +} diff --git a/sys/dev/xen/xenpci/xenpci.c b/sys/dev/xen/xenpci/xenpci.c new file mode 100644 index 000000000000..2f2a79fff21d --- /dev/null +++ b/sys/dev/xen/xenpci/xenpci.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2008 Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
 */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +/* + * These variables are used by the rest of the kernel to access the + * hypervisor. + */ +char *hypercall_stubs; +shared_info_t *HYPERVISOR_shared_info; +static vm_paddr_t shared_info_pa; + +/* + * This is used to find our platform device instance. + */ +static devclass_t xenpci_devclass; + +/* + * Return the CPUID base address for Xen functions. + */ +static uint32_t +xenpci_cpuid_base(void) +{ + uint32_t base, regs[4]; + + for (base = 0x40000000; base < 0x40001000; base += 0x100) { + do_cpuid(base, regs); + if (!memcmp("XenVMMXenVMM", &regs[1], 12) + && (regs[0] - base) >= 2) + return (base); + } + return (0); +} + +/* + * Allocate and fill in the hypercall page. + */ +static int +xenpci_init_hypercall_stubs(device_t dev, struct xenpci_softc * scp) +{ + uint32_t base, regs[4]; + int i; + + base = xenpci_cpuid_base(); + if (!base) { + device_printf(dev, "Xen platform device but not Xen VMM\n"); + return (EINVAL); + } + + if (bootverbose) { + do_cpuid(base + 1, regs); + device_printf(dev, "Xen version %d.%d.\n", + regs[0] >> 16, regs[0] & 0xffff); + } + + /* + * Find the hypercall pages. + */ + do_cpuid(base + 2, regs); + + hypercall_stubs = malloc(regs[0] * PAGE_SIZE, M_TEMP, M_WAITOK); + + for (i = 0; i < regs[0]; i++) { + wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i); + } + + return (0); +} + +/* + * After a resume, re-initialise the hypercall page.
+ */ +static void +xenpci_resume_hypercall_stubs(device_t dev, struct xenpci_softc * scp) +{ + uint32_t base, regs[4]; + int i; + + base = xenpci_cpuid_base(); + + do_cpuid(base + 2, regs); + for (i = 0; i < regs[0]; i++) { + wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i); + } +} + +/* + * Tell the hypervisor how to contact us for event channel callbacks. + */ +static void +xenpci_set_callback(device_t dev) +{ + int irq; + uint64_t callback; + struct xen_hvm_param xhp; + + irq = pci_get_irq(dev); + if (irq < 16) { + callback = irq; + } else { + callback = (pci_get_intpin(dev) - 1) & 3; + callback |= pci_get_slot(dev) << 11; + callback |= 1ull << 56; + } + + xhp.domid = DOMID_SELF; + xhp.index = HVM_PARAM_CALLBACK_IRQ; + xhp.value = callback; + if (HYPERVISOR_hvm_op(HVMOP_set_param, &xhp)) + panic("Can't set evtchn callback"); +} + + +/* + * Deallocate anything allocated by xenpci_allocate_resources. + */ +static int +xenpci_deallocate_resources(device_t dev) +{ + struct xenpci_softc *scp = device_get_softc(dev); + + if (scp->res_irq != 0) { + bus_deactivate_resource(dev, SYS_RES_IRQ, + scp->rid_irq, scp->res_irq); + bus_release_resource(dev, SYS_RES_IRQ, + scp->rid_irq, scp->res_irq); + scp->res_irq = 0; + } + if (scp->res_memory != 0) { + bus_deactivate_resource(dev, SYS_RES_MEMORY, + scp->rid_memory, scp->res_memory); + bus_release_resource(dev, SYS_RES_MEMORY, + scp->rid_memory, scp->res_memory); + scp->res_memory = 0; + } + + return (0); +} + +/* + * Allocate irq and memory resources. 
+ */ +static int +xenpci_allocate_resources(device_t dev) +{ + struct xenpci_softc *scp = device_get_softc(dev); + + scp->res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &scp->rid_irq, RF_SHAREABLE|RF_ACTIVE); + if (scp->res_irq == NULL) + goto errexit; + + scp->rid_memory = PCIR_BAR(1); + scp->res_memory = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &scp->rid_memory, RF_ACTIVE); + if (scp->res_memory == NULL) + goto errexit; + return (0); + +errexit: + /* Cleanup anything we may have assigned. */ + xenpci_deallocate_resources(dev); + return (ENXIO); /* For want of a better idea. */ +} + +/* + * Allocate a physical address range from our mmio region. + */ +static int +xenpci_alloc_space_int(struct xenpci_softc *scp, size_t sz, + vm_paddr_t *pa) +{ + + if (scp->phys_next + sz > rman_get_end(scp->res_memory)) { + return (ENOMEM); + } + + *pa = scp->phys_next; + scp->phys_next += sz; + + return (0); +} + +/* + * Allocate a physical address range from our mmio region. + */ +int +xenpci_alloc_space(size_t sz, vm_paddr_t *pa) +{ + device_t dev = devclass_get_device(xenpci_devclass, 0); + + if (dev) { + return (xenpci_alloc_space_int(device_get_softc(dev), + sz, pa)); + } else { + return (ENOMEM); + } +} + +/* + * Called very early in the resume sequence - reinitialise the various + * bits of Xen machinery including the hypercall page and the shared + * info page. 
+ */ +void +xenpci_resume() +{ + device_t dev = devclass_get_device(xenpci_devclass, 0); + struct xenpci_softc *scp = device_get_softc(dev); + struct xen_add_to_physmap xatp; + + xenpci_resume_hypercall_stubs(dev, scp); + + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = shared_info_pa >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + panic("HYPERVISOR_memory_op failed"); + + pmap_kenter((vm_offset_t) HYPERVISOR_shared_info, shared_info_pa); + + xenpci_set_callback(dev); + + gnttab_resume(); + irq_resume(); +} + +/* + * Probe - just check device ID. + */ +static int +xenpci_probe(device_t dev) +{ + + if (pci_get_devid(dev) != 0x00015853) + return (ENXIO); + + device_set_desc(dev, "Xen Platform Device"); + return (bus_generic_probe(dev)); +} + +/* + * Attach - find resources and talk to Xen. + */ +static int +xenpci_attach(device_t dev) +{ + int error; + struct xenpci_softc *scp = device_get_softc(dev); + struct xen_add_to_physmap xatp; + vm_offset_t shared_va; + + error = xenpci_allocate_resources(dev); + if (error) + goto errexit; + + scp->phys_next = rman_get_start(scp->res_memory); + + error = xenpci_init_hypercall_stubs(dev, scp); + if (error) + goto errexit; + + setup_xen_features(); + + xenpci_alloc_space_int(scp, PAGE_SIZE, &shared_info_pa); + + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = shared_info_pa >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + panic("HYPERVISOR_memory_op failed"); + + shared_va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); + pmap_kenter(shared_va, shared_info_pa); + HYPERVISOR_shared_info = (void *) shared_va; + + /* + * Hook the irq up to evtchn + */ + xenpci_irq_init(dev, scp); + xenpci_set_callback(dev); + + return (bus_generic_attach(dev)); + +errexit: + /* + * Undo anything we may have done. 
+ */ + xenpci_deallocate_resources(dev); + return (error); +} + +/* + * Detach - reverse anything done by attach. + */ +static int +xenpci_detach(device_t dev) +{ + struct xenpci_softc *scp = device_get_softc(dev); + device_t parent = device_get_parent(dev); + + /* + * Take our interrupt handler out of the list of handlers + * that can handle this irq. + */ + if (scp->intr_cookie != NULL) { + if (BUS_TEARDOWN_INTR(parent, dev, + scp->res_irq, scp->intr_cookie) != 0) + printf("intr teardown failed.. continuing\n"); + scp->intr_cookie = NULL; + } + + /* + * Deallocate any system resources we may have + * allocated on behalf of this driver. + */ + return (xenpci_deallocate_resources(dev)); +} + +static device_method_t xenpci_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, xenpci_probe), + DEVMETHOD(device_attach, xenpci_attach), + DEVMETHOD(device_detach, xenpci_detach), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_add_child, bus_generic_add_child), + + { 0, 0 } +}; + +static driver_t xenpci_driver = { + "xenpci", + xenpci_methods, + sizeof(struct xenpci_softc), +}; + +DRIVER_MODULE(xenpci, pci, xenpci_driver, xenpci_devclass, 0, 0); diff --git a/sys/dev/xen/xenpci/xenpcivar.h b/sys/dev/xen/xenpci/xenpcivar.h new file mode 100644 index 000000000000..a57c080b31d3 --- /dev/null +++ b/sys/dev/xen/xenpci/xenpcivar.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2008 Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * One of these per allocated device. + */ +struct xenpci_softc { + int rid_ioport; + int rid_memory; + int rid_irq; + struct resource* res_memory; /* Resource for mem range. */ + struct resource* res_irq; /* Resource for irq range. 
*/ + void *intr_cookie; + + vm_paddr_t phys_next; /* next page from mem range */ +}; + +extern int xenpci_irq_init(device_t device, struct xenpci_softc *scp); +extern int xenpci_alloc_space(size_t sz, vm_paddr_t *pa); +extern void xenpci_resume(void); +extern void xen_suspend(void); diff --git a/sys/i386/include/xen/xenpmap.h b/sys/i386/include/xen/xenpmap.h index 17d1f9254e42..4bfd99e65e55 100644 --- a/sys/i386/include/xen/xenpmap.h +++ b/sys/i386/include/xen/xenpmap.h @@ -222,7 +222,11 @@ set_phys_to_machine(unsigned long pfn, unsigned long mfn) xen_phys_machine[pfn] = mfn; } - +static __inline int +phys_to_machine_mapping_valid(unsigned long pfn) +{ + return xen_phys_machine[pfn] != INVALID_P2M_ENTRY; +} #endif /* _XEN_XENPMAP_H_ */ diff --git a/sys/xen/evtchn/evtchn.c b/sys/xen/evtchn/evtchn.c index 884270c666a6..61b738b0d981 100644 --- a/sys/xen/evtchn/evtchn.c +++ b/sys/xen/evtchn/evtchn.c @@ -13,56 +13,28 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include +#include #include #include + #include +#include #include #include #include #include #include - - -/* linux helper functions that got sucked in - * rename and move XXX - */ - - -static inline int find_first_bit(const unsigned long *addr, unsigned size) -{ - int d0, d1; - int res; - - /* This looks at memory. Mark it volatile to tell gcc not to move it around */ - __asm__ __volatile__( - "xorl %%eax,%%eax\n\t" - "repe; scasl\n\t" - "jz 1f\n\t" - "leal -4(%%edi),%%edi\n\t" - "bsfl (%%edi),%%eax\n" - "1:\tsubl %%ebx,%%edi\n\t" - "shll $3,%%edi\n\t" - "addl %%edi,%%eax" - :"=a" (res), "=&c" (d0), "=&D" (d1) - :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); - return res; -} - -#define min_t(type,x,y) \ - ({ type __x = (x); type __y = (y); __x < __y ? 
__x: __y; }) -#define first_cpu(src) __first_cpu(&(src), NR_CPUS) -static inline int __first_cpu(const xen_cpumask_t *srcp, int nbits) -{ - return min_t(int, nbits, find_first_bit(srcp->bits, nbits)); -} +#include +#include static inline unsigned long __ffs(unsigned long word) { @@ -166,7 +138,7 @@ static int irq_bindcount[NR_IRQS]; #ifdef SMP static uint8_t cpu_evtchn[NR_EVENT_CHANNELS]; -static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +static unsigned long cpu_evtchn_mask[MAX_VIRT_CPUS][NR_EVENT_CHANNELS/LONG_BIT]; #define active_evtchns(cpu,sh,idx) \ ((sh)->evtchn_pending[idx] & \ @@ -220,7 +192,7 @@ evtchn_do_upcall(struct trapframe *frame) shared_info_t *s; vcpu_info_t *vcpu_info; - cpu = smp_processor_id(); + cpu = PCPU_GET(cpuid); s = HYPERVISOR_shared_info; vcpu_info = &s->vcpu_info[cpu]; @@ -236,7 +208,7 @@ evtchn_do_upcall(struct trapframe *frame) while ((l2 = active_evtchns(cpu, s, l1i)) != 0) { l2i = __ffs(l2); - port = (l1i * BITS_PER_LONG) + l2i; + port = (l1i * LONG_BIT) + l2i; if ((irq = evtchn_to_irq[port]) != -1) { struct intsrc *isrc = intr_lookup_source(irq); /* @@ -258,7 +230,7 @@ ipi_pcpu(unsigned int cpu, int vector) { int irq; - irq = per_cpu(ipi_to_irq, cpu)[vector]; + irq = PCPU_GET(ipi_to_irq[vector]); notify_remote_via_irq(irq); } @@ -310,11 +282,12 @@ bind_local_port_to_irq(unsigned int local_port) mtx_lock_spin(&irq_mapping_update_lock); - PANIC_IF(evtchn_to_irq[local_port] != -1); - + KASSERT(evtchn_to_irq[local_port] == -1, + ("evtchn_to_irq inconsistent")); + if ((irq = find_unbound_irq()) < 0) { struct evtchn_close close = { .port = local_port }; - PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)); + HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); goto out; } @@ -368,21 +341,20 @@ bind_virq_to_irq(unsigned int virq, unsigned int cpu) mtx_lock_spin(&irq_mapping_update_lock); - if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { + if ((irq = pcpu_find(cpu)->pc_virq_to_irq[virq]) == 
-1) { if ((irq = find_unbound_irq()) < 0) goto out; bind_virq.virq = virq; bind_virq.vcpu = cpu; - PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0); + HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); evtchn = bind_virq.port; evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); - per_cpu(virq_to_irq, cpu)[virq] = irq; + pcpu_find(cpu)->pc_virq_to_irq[virq] = irq; bind_evtchn_to_cpu(evtchn, cpu); } @@ -407,18 +379,18 @@ bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) mtx_lock_spin(&irq_mapping_update_lock); - if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { + if ((irq = pcpu_find(cpu)->pc_ipi_to_irq[ipi]) == -1) { if ((irq = find_unbound_irq()) < 0) goto out; bind_ipi.vcpu = cpu; - PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi) != 0); + HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi); evtchn = bind_ipi.port; evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); - per_cpu(ipi_to_irq, cpu)[ipi] = irq; + pcpu_find(cpu)->pc_ipi_to_irq[ipi] = irq; bind_evtchn_to_cpu(evtchn, cpu); } @@ -432,24 +404,27 @@ bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) } -void +static void unbind_from_irq(int irq) { struct evtchn_close close; int evtchn = evtchn_from_irq(irq); + int cpu; mtx_lock_spin(&irq_mapping_update_lock); if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { close.port = evtchn; - PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0); + HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); switch (type_from_irq(irq)) { case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))[index_from_irq(irq)] = -1; + cpu = cpu_from_evtchn(evtchn); + pcpu_find(cpu)->pc_virq_to_irq[index_from_irq(irq)] = -1; break; case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))[index_from_irq(irq)] = -1; + cpu = cpu_from_evtchn(evtchn); + pcpu_find(cpu)->pc_ipi_to_irq[index_from_irq(irq)] = -1; break; default: break; @@ -467,11 
+442,8 @@ unbind_from_irq(int irq) int bind_caller_port_to_irqhandler(unsigned int caller_port, - const char *devname, - driver_intr_t handler, - void *arg, - unsigned long irqflags, - unsigned int *irqp) + const char *devname, driver_intr_t handler, void *arg, + unsigned long irqflags, unsigned int *irqp) { unsigned int irq; int error; @@ -493,13 +465,9 @@ bind_caller_port_to_irqhandler(unsigned int caller_port, } int -bind_listening_port_to_irqhandler( - unsigned int remote_domain, - const char *devname, - driver_intr_t handler, - void *arg, - unsigned long irqflags, - unsigned int *irqp) +bind_listening_port_to_irqhandler(unsigned int remote_domain, + const char *devname, driver_intr_t handler, void *arg, + unsigned long irqflags, unsigned int *irqp) { unsigned int irq; int error; @@ -519,14 +487,10 @@ bind_listening_port_to_irqhandler( } int -bind_interdomain_evtchn_to_irqhandler( - unsigned int remote_domain, - unsigned int remote_port, - const char *devname, - driver_filter_t filter, - driver_intr_t handler, - unsigned long irqflags, - unsigned int *irqp) +bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, + unsigned int remote_port, const char *devname, + driver_filter_t filter, driver_intr_t handler, + unsigned long irqflags, unsigned int *irqp) { unsigned int irq; int error; @@ -546,14 +510,9 @@ bind_interdomain_evtchn_to_irqhandler( } int -bind_virq_to_irqhandler(unsigned int virq, - unsigned int cpu, - const char *devname, - driver_filter_t filter, - driver_intr_t handler, - void *arg, - unsigned long irqflags, - unsigned int *irqp) +bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + const char *devname, driver_filter_t filter, driver_intr_t handler, + unsigned long irqflags, unsigned int *irqp) { unsigned int irq; int error; @@ -573,12 +532,9 @@ bind_virq_to_irqhandler(unsigned int virq, } int -bind_ipi_to_irqhandler(unsigned int ipi, - unsigned int cpu, - const char *devname, - driver_filter_t filter, - unsigned long 
irqflags, - unsigned int *irqp) +bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu, + const char *devname, driver_filter_t filter, + unsigned long irqflags, unsigned int *irqp) { unsigned int irq; int error; @@ -636,9 +592,9 @@ rebind_irq_to_cpu(unsigned irq, unsigned tcpu) } -static void set_affinity_irq(unsigned irq, xen_cpumask_t dest) +static void set_affinity_irq(unsigned irq, cpumask_t dest) { - unsigned tcpu = first_cpu(dest); + unsigned tcpu = ffs(dest) - 1; rebind_irq_to_cpu(irq, tcpu); } #endif @@ -656,13 +612,11 @@ static void xenpic_dynirq_enable_source(struct intsrc *isrc); static void xenpic_dynirq_disable_source(struct intsrc *isrc, int); static void xenpic_dynirq_eoi_source(struct intsrc *isrc); static void xenpic_dynirq_enable_intr(struct intsrc *isrc); -static void xenpic_dynirq_disable_intr(struct intsrc *isrc); static void xenpic_pirq_enable_source(struct intsrc *isrc); static void xenpic_pirq_disable_source(struct intsrc *isrc, int); static void xenpic_pirq_eoi_source(struct intsrc *isrc); static void xenpic_pirq_enable_intr(struct intsrc *isrc); -static void xenpic_pirq_disable_intr(struct intsrc *isrc); static int xenpic_vector(struct intsrc *isrc); @@ -677,7 +631,6 @@ struct pic xenpic_dynirq_template = { .pic_disable_source = xenpic_dynirq_disable_source, .pic_eoi_source = xenpic_dynirq_eoi_source, .pic_enable_intr = xenpic_dynirq_enable_intr, - .pic_disable_intr = xenpic_dynirq_disable_intr, .pic_vector = xenpic_vector, .pic_source_pending = xenpic_source_pending, .pic_suspend = xenpic_suspend, @@ -689,7 +642,6 @@ struct pic xenpic_pirq_template = { .pic_disable_source = xenpic_pirq_disable_source, .pic_eoi_source = xenpic_pirq_eoi_source, .pic_enable_intr = xenpic_pirq_enable_intr, - .pic_disable_intr = xenpic_pirq_disable_intr, .pic_vector = xenpic_vector, .pic_source_pending = xenpic_source_pending, .pic_suspend = xenpic_suspend, @@ -747,20 +699,6 @@ xenpic_dynirq_enable_intr(struct intsrc *isrc) 
mtx_unlock_spin(&irq_mapping_update_lock); } -static void -xenpic_dynirq_disable_intr(struct intsrc *isrc) -{ - unsigned int irq; - struct xenpic_intsrc *xp; - - xp = (struct xenpic_intsrc *)isrc; - mtx_lock_spin(&irq_mapping_update_lock); - xp->xp_masked = 1; - irq = xenpic_vector(isrc); - mask_evtchn(evtchn_from_irq(irq)); - mtx_unlock_spin(&irq_mapping_update_lock); -} - static void xenpic_dynirq_eoi_source(struct intsrc *isrc) { @@ -825,7 +763,7 @@ notify_remote_via_irq(int irq) if (VALID_EVTCHN(evtchn)) notify_remote_via_evtchn(evtchn); else - panic("invalid evtchn"); + panic("invalid evtchn %d", irq); } /* required for support of physical devices */ @@ -898,32 +836,6 @@ xenpic_pirq_enable_intr(struct intsrc *isrc) mtx_unlock_spin(&irq_mapping_update_lock); } -static void -xenpic_pirq_disable_intr(struct intsrc *isrc) -{ - unsigned int irq; - int evtchn; - struct evtchn_close close; - - mtx_lock_spin(&irq_mapping_update_lock); - irq = xenpic_vector(isrc); - evtchn = evtchn_from_irq(irq); - - if (!VALID_EVTCHN(evtchn)) - goto done; - - mask_evtchn(evtchn); - - close.port = evtchn; - PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - irq_info[irq] = IRQ_UNBOUND; - done: - mtx_unlock_spin(&irq_mapping_update_lock); -} - static void xenpic_pirq_enable_source(struct intsrc *isrc) { @@ -998,7 +910,7 @@ void unmask_evtchn(int port) { shared_info_t *s = HYPERVISOR_shared_info; - unsigned int cpu = smp_processor_id(); + unsigned int cpu = PCPU_GET(cpuid); vcpu_info_t *vcpu_info = &s->vcpu_info[cpu]; /* Slow path (hypercall) if this is a non-local port. */ @@ -1016,7 +928,7 @@ unmask_evtchn(int port) * masked. 
*/ if (synch_test_bit(port, &s->evtchn_pending) && - !synch_test_and_set_bit(port / BITS_PER_LONG, + !synch_test_and_set_bit(port / LONG_BIT, &vcpu_info->evtchn_pending_sel)) { vcpu_info->evtchn_upcall_pending = 1; if (!vcpu_info->evtchn_upcall_mask) @@ -1039,15 +951,21 @@ void irq_resume(void) mask_evtchn(evtchn); /* Check that no PIRQs are still bound. */ - for (pirq = 0; pirq < NR_PIRQS; pirq++) - PANIC_IF(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND); + for (pirq = 0; pirq < NR_PIRQS; pirq++) { + KASSERT(irq_info[pirq_to_irq(pirq)] == IRQ_UNBOUND, + ("pirq_to_irq inconsistent")); + } /* Secondary CPUs must have no VIRQ or IPI bindings. */ - for (cpu = 1; cpu < NR_CPUS; cpu++) { - for (virq = 0; virq < NR_VIRQS; virq++) - PANIC_IF(per_cpu(virq_to_irq, cpu)[virq] != -1); - for (ipi = 0; ipi < NR_IPIS; ipi++) - PANIC_IF(per_cpu(ipi_to_irq, cpu)[ipi] != -1); + for (cpu = 1; cpu < MAX_VIRT_CPUS; cpu++) { + for (virq = 0; virq < NR_VIRQS; virq++) { + KASSERT(pcpu_find(cpu)->pc_virq_to_irq[virq] == -1, + ("virq_to_irq inconsistent")); + } + for (ipi = 0; ipi < NR_IPIS; ipi++) { + KASSERT(pcpu_find(cpu)->pc_ipi_to_irq[ipi] == -1, + ("ipi_to_irq inconsistent")); + } } /* No IRQ <-> event-channel mappings. */ @@ -1058,15 +976,16 @@ void irq_resume(void) /* Primary CPU: rebind VIRQs automatically. */ for (virq = 0; virq < NR_VIRQS; virq++) { - if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1) + if ((irq = pcpu_find(0)->pc_virq_to_irq[virq]) == -1) continue; - PANIC_IF(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0)); + KASSERT(irq_info[irq] == mk_irq_info(IRQT_VIRQ, virq, 0), + ("irq_info inconsistent")); /* Get a new binding from Xen. */ bind_virq.virq = virq; bind_virq.vcpu = 0; - PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq) != 0); + HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); evtchn = bind_virq.port; /* Record the new mapping. */ @@ -1079,15 +998,16 @@ void irq_resume(void) /* Primary CPU: rebind IPIs automatically. 
*/ for (ipi = 0; ipi < NR_IPIS; ipi++) { - if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1) + if ((irq = pcpu_find(0)->pc_ipi_to_irq[ipi]) == -1) continue; - PANIC_IF(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0)); + KASSERT(irq_info[irq] == mk_irq_info(IRQT_IPI, ipi, 0), + ("irq_info inconsistent")); /* Get a new binding from Xen. */ memset(&op, 0, sizeof(op)); bind_ipi.vcpu = 0; - PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi) != 0); + HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi); evtchn = bind_ipi.port; /* Record the new mapping. */ @@ -1111,9 +1031,9 @@ evtchn_init(void *dummy __unused) /* No VIRQ or IPI bindings. */ for (cpu = 0; cpu < mp_ncpus; cpu++) { for (i = 0; i < NR_VIRQS; i++) - per_cpu(virq_to_irq, cpu)[i] = -1; + pcpu_find(cpu)->pc_virq_to_irq[i] = -1; for (i = 0; i < NR_IPIS; i++) - per_cpu(ipi_to_irq, cpu)[i] = -1; + pcpu_find(cpu)->pc_ipi_to_irq[i] = -1; } /* No event-channel -> IRQ mappings. */ diff --git a/sys/xen/evtchn/evtchn_dev.c b/sys/xen/evtchn/evtchn_dev.c index ea12860dbefb..4253d8a1700d 100644 --- a/sys/xen/evtchn/evtchn_dev.c +++ b/sys/xen/evtchn/evtchn_dev.c @@ -23,8 +23,6 @@ __FBSDID("$FreeBSD$"); #include #include -#include -#include #include #include #include @@ -234,14 +232,14 @@ evtchn_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg, __evtchn_reset_buffer_ring(); break; case EVTCHN_BIND: - if ( !synch_test_and_set_bit((int)arg, &bound_ports[0]) ) - unmask_evtchn((int)arg); + if ( !synch_test_and_set_bit((uintptr_t)arg, &bound_ports[0]) ) + unmask_evtchn((uintptr_t)arg); else rc = EINVAL; break; case EVTCHN_UNBIND: - if ( synch_test_and_clear_bit((int)arg, &bound_ports[0]) ) - mask_evtchn((int)arg); + if ( synch_test_and_clear_bit((uintptr_t)arg, &bound_ports[0]) ) + mask_evtchn((uintptr_t)arg); else rc = EINVAL; break; @@ -383,12 +381,12 @@ evtchn_dev_init(void *dummy __unused) /* (DEVFS) automatically destroy the symlink with its destination. 
*/ devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); #endif - printk("Event-channel device installed.\n"); + if (bootverbose) + printf("Event-channel device installed.\n"); return 0; } - SYSINIT(evtchn_dev_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, evtchn_dev_init, NULL); diff --git a/sys/xen/features.c b/sys/xen/features.c index 876a7d1e568f..f28fe049177c 100644 --- a/sys/xen/features.c +++ b/sys/xen/features.c @@ -1,10 +1,12 @@ #include __FBSDID("$FreeBSD$"); -#include +#include +#include + #include #include -#include +#include uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32] /* __read_mostly */; diff --git a/sys/xen/features.h b/sys/xen/features.h new file mode 100644 index 000000000000..b4cce2fd4b1b --- /dev/null +++ b/sys/xen/features.h @@ -0,0 +1,20 @@ +/****************************************************************************** + * features.h + * + * Query the features reported by Xen. + * + * Copyright (c) 2006, Ian Campbell + */ + +#ifndef __ASM_XEN_FEATURES_H__ +#define __ASM_XEN_FEATURES_H__ + +#include + +extern void setup_xen_features(void); + +extern uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32]; + +#define xen_feature(flag) (xen_features[flag]) + +#endif /* __ASM_XEN_FEATURES_H__ */ diff --git a/sys/xen/gnttab.c b/sys/xen/gnttab.c index 967565506dfc..d05790bbf84f 100644 --- a/sys/xen/gnttab.c +++ b/sys/xen/gnttab.c @@ -25,29 +25,21 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include -#include - -#include -#include +#include #include #include + +#include #include +#include +#include +#include +#include + #define cmpxchg(a, b, c) atomic_cmpset_int((volatile u_int *)(a),(b),(c)) -#if 1 -#define ASSERT(_p) \ - if ( !(_p) ) { printk("Assertion '%s': line %d, file %s\n", \ - #_p , __LINE__, __FILE__); *(int*)0=0; } -#else -#define ASSERT(_p) ((void)0) -#endif - -#define WPRINTK(fmt, args...) \ - printk("xen_grant: " fmt, ##args) - /* External tools reserve first few grant table entries. 
*/ #define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff @@ -72,14 +64,14 @@ static int gnttab_expand(unsigned int req_entries); static int get_free_entries(int count, int *entries) { - int ref, rc; + int ref, error; grant_ref_t head; mtx_lock(&gnttab_list_lock); if ((gnttab_free_count < count) && - ((rc = gnttab_expand(count - gnttab_free_count)) != 0)) { + ((error = gnttab_expand(count - gnttab_free_count)) != 0)) { mtx_unlock(&gnttab_list_lock); - return (rc); + return (error); } ref = head = gnttab_free_head; gnttab_free_count -= count; @@ -163,6 +155,7 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly) { + shared[ref].frame = frame; shared[ref].domid = domid; wmb(); @@ -213,7 +206,8 @@ gnttab_end_foreign_access(grant_ref_t ref, void *page) } int -gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) +gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn, + grant_ref_t *result) { int error, ref; @@ -223,7 +217,8 @@ gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) gnttab_grant_foreign_transfer_ref(ref, domid, pfn); - return (ref); + *result = ref; + return (0); } void @@ -261,7 +256,7 @@ gnttab_end_foreign_transfer_ref(grant_ref_t ref) /* Read the frame number /after/ reading completion status. 
*/ rmb(); frame = shared[ref].frame; - PANIC_IF(frame == 0); + KASSERT(frame != 0, ("grant table inconsistent")); return (frame); } @@ -320,6 +315,7 @@ gnttab_alloc_grant_references(uint16_t count, grant_ref_t *head) int gnttab_empty_grant_references(const grant_ref_t *private_head) { + return (*private_head == GNTTAB_LIST_END); } @@ -331,20 +327,20 @@ gnttab_claim_grant_reference(grant_ref_t *private_head) if (unlikely(g == GNTTAB_LIST_END)) return (ENOSPC); *private_head = gnttab_entry(g); - return (g); } void gnttab_release_grant_reference(grant_ref_t *private_head, grant_ref_t release) { + gnttab_entry(release) = *private_head; *private_head = release; } void gnttab_request_free_callback(struct gnttab_free_callback *callback, - void (*fn)(void *), void *arg, uint16_t count) + void (*fn)(void *), void *arg, uint16_t count) { mtx_lock(&gnttab_list_lock); @@ -387,7 +383,8 @@ grow_gnttab_list(unsigned int more_frames) for (i = nr_grant_frames; i < new_nr_grant_frames; i++) { - gnttab_list[i] = (grant_ref_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); + gnttab_list[i] = (grant_ref_t *) + malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); if (!gnttab_list[i]) goto grow_nomem; @@ -405,12 +402,12 @@ grow_gnttab_list(unsigned int more_frames) check_free_callbacks(); - return 0; + return (0); grow_nomem: for ( ; i >= nr_grant_frames; i--) free(gnttab_list[i], M_DEVBUF); - return (-ENOMEM); + return (ENOMEM); } static unsigned int @@ -464,6 +461,8 @@ unmap_pte_fn(pte_t *pte, struct page *pmd_page, } #endif +#ifndef XENHVM + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { @@ -486,50 +485,117 @@ gnttab_map(unsigned int start_idx, unsigned int end_idx) free(frames, M_DEVBUF); return (ENOSYS); } - PANIC_IF(rc || setup.status); + KASSERT(!(rc || setup.status), + ("unexpected result from grant_table_op")); if (shared == NULL) { vm_offset_t area; area = kmem_alloc_nofault(kernel_map, PAGE_SIZE * max_nr_grant_frames()); - PANIC_IF(area == 0); + KASSERT(area, ("can't 
allocate VM space for grant table")); shared = (grant_entry_t *)area; } + for (i = 0; i < nr_gframes; i++) PT_SET_MA(((caddr_t)shared) + i*PAGE_SIZE, ((vm_paddr_t)frames[i]) << PAGE_SHIFT | PG_RW | PG_V); free(frames, M_DEVBUF); - return 0; + return (0); } int gnttab_resume(void) { + if (max_nr_grant_frames() < nr_grant_frames) - return -ENOSYS; - return gnttab_map(0, nr_grant_frames - 1); + return (ENOSYS); + return (gnttab_map(0, nr_grant_frames - 1)); } int gnttab_suspend(void) { - int i, pages; + int i; - pages = (PAGE_SIZE*nr_grant_frames) >> PAGE_SHIFT; - - for (i = 0; i < pages; i++) - PT_SET_MA(shared + (i*PAGE_SIZE), (vm_paddr_t)0); + for (i = 0; i < nr_grant_frames; i++) + pmap_kremove((vm_offset_t) shared + i * PAGE_SIZE); return (0); } +#else /* XENHVM */ + +#include + +static vm_paddr_t resume_frames; + +static int gnttab_map(unsigned int start_idx, unsigned int end_idx) +{ + struct xen_add_to_physmap xatp; + unsigned int i = end_idx; + + /* + * Loop backwards, so that the first hypercall has the largest index, + * ensuring that the table will grow only once. 
+ */ + do { + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; + xatp.gpfn = (resume_frames >> PAGE_SHIFT) + i; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + panic("HYPERVISOR_memory_op failed to map gnttab"); + } while (i-- > start_idx); + + if (shared == NULL) { + vm_offset_t area; + + area = kmem_alloc_nofault(kernel_map, + PAGE_SIZE * max_nr_grant_frames()); + KASSERT(area, ("can't allocate VM space for grant table")); + shared = (grant_entry_t *)area; + } + + for (i = start_idx; i <= end_idx; i++) { + pmap_kenter((vm_offset_t) shared + i * PAGE_SIZE, + resume_frames + i * PAGE_SIZE); + } + + return (0); +} + +int +gnttab_resume(void) +{ + int error; + unsigned int max_nr_gframes, nr_gframes; + + nr_gframes = nr_grant_frames; + max_nr_gframes = max_nr_grant_frames(); + if (max_nr_gframes < nr_gframes) + return (ENOSYS); + + if (!resume_frames) { + error = xenpci_alloc_space(PAGE_SIZE * max_nr_gframes, + &resume_frames); + if (error) { + printf("error mapping gnttab share frames\n"); + return (error); + } + } + + return (gnttab_map(0, nr_gframes - 1)); +} + +#endif + static int gnttab_expand(unsigned int req_entries) { - int rc; + int error; unsigned int cur, extra; cur = nr_grant_frames; @@ -538,10 +604,11 @@ gnttab_expand(unsigned int req_entries) if (cur + extra > max_nr_grant_frames()) return (ENOSPC); - if ((rc = gnttab_map(cur, cur + extra - 1)) == 0) - rc = grow_gnttab_list(extra); + error = gnttab_map(cur, cur + extra - 1); + if (!error) + error = grow_gnttab_list(extra); - return rc; + return (error); } int @@ -552,7 +619,7 @@ gnttab_init() unsigned int nr_init_grefs; if (!is_running_on_xen()) - return -ENODEV; + return (ENODEV); nr_grant_frames = 1; boot_max_nr_grant_frames = __max_nr_grant_frames(); @@ -571,7 +638,8 @@ gnttab_init() return (ENOMEM); for (i = 0; i < nr_grant_frames; i++) { - gnttab_list[i] = (grant_ref_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); + gnttab_list[i] = (grant_ref_t *) + 
malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); if (gnttab_list[i] == NULL) goto ini_nomem; } @@ -588,8 +656,10 @@ gnttab_init() gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; gnttab_free_head = NR_RESERVED_ENTRIES; - printk("Grant table initialized\n"); - return 0; + if (bootverbose) + printf("Grant table initialized\n"); + + return (0); ini_nomem: for (i--; i >= 0; i--) diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h index bcefbbc131bf..8348af5351f1 100644 --- a/sys/xen/gnttab.h +++ b/sys/xen/gnttab.h @@ -36,10 +36,12 @@ #ifndef __ASM_GNTTAB_H__ +#include + #include #include #include -#include +#include struct gnttab_free_callback { struct gnttab_free_callback *next; @@ -50,6 +52,10 @@ struct gnttab_free_callback { int gnttab_init(void); +/* + * Allocate a grant table reference and return it in *result. Returns + * zero on success or errno on error. + */ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int flags, grant_ref_t *result); @@ -68,7 +74,7 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref); */ void gnttab_end_foreign_access(grant_ref_t ref, void *page); -int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn, grant_ref_t *result); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); @@ -104,6 +110,10 @@ void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, int gnttab_suspend(void); int gnttab_resume(void); +#if 0 + +#include + static inline void gnttab_set_map_op(struct gnttab_map_grant_ref *map, vm_paddr_t addr, uint32_t flags, grant_ref_t ref, domid_t domid) @@ -149,5 +159,6 @@ gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, vm_paddr_t addr, unmap->handle = handle; } +#endif #endif /* __ASM_GNTTAB_H__ */ diff --git a/sys/xen/hypervisor.h b/sys/xen/hypervisor.h index 369b0c4d5b6c..0d93f66dbd18 100644 --- a/sys/xen/hypervisor.h +++ 
b/sys/xen/hypervisor.h @@ -8,11 +8,19 @@ * $FreeBSD$ */ -#ifndef __HYPERVISOR_H__ -#define __HYPERVISOR_H__ +#ifndef __XEN_HYPERVISOR_H__ +#define __XEN_HYPERVISOR_H__ + +#ifdef XENHVM + +#define is_running_on_xen() (HYPERVISOR_shared_info != NULL) + +#else #define is_running_on_xen() 1 +#endif + #ifdef PAE #ifndef CONFIG_X86_PAE #define CONFIG_X86_PAE @@ -27,6 +35,7 @@ #include #include #include +#include #include #if defined(__amd64__) @@ -131,7 +140,7 @@ MULTI_update_va_mapping( mcl->op = __HYPERVISOR_update_va_mapping; mcl->args[0] = va; #if defined(__amd64__) - mcl->args[1] = new_val.pte; + mcl->args[1] = new_val; #elif defined(PAE) mcl->args[1] = (uint32_t)(new_val & 0xffffffff) ; mcl->args[2] = (uint32_t)(new_val >> 32); @@ -142,4 +151,4 @@ MULTI_update_va_mapping( mcl->args[MULTI_UVMFLAGS_INDEX] = flags; } -#endif /* __HYPERVISOR_H__ */ +#endif /* __XEN_HYPERVISOR_H__ */ diff --git a/sys/xen/interface/arch-x86/xen.h b/sys/xen/interface/arch-x86/xen.h index 038048ef279d..2c878ef464c5 100644 --- a/sys/xen/interface/arch-x86/xen.h +++ b/sys/xen/interface/arch-x86/xen.h @@ -32,7 +32,8 @@ #define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #else -#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ +#error "using old handle" +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef type * __guest_handle_ ## name #endif @@ -50,7 +51,7 @@ #if defined(__i386__) #include #elif defined(__x86_64__) -#include "xen-x86_64.h" +#include #endif #ifndef __ASSEMBLY__ diff --git a/sys/xen/interface/hvm/params.h b/sys/xen/interface/hvm/params.h index 5f75ed78e8a7..6befa78df8a0 100644 --- a/sys/xen/interface/hvm/params.h +++ b/sys/xen/interface/hvm/params.h @@ -21,7 +21,7 @@ #ifndef __XEN_PUBLIC_HVM_PARAMS_H__ #define __XEN_PUBLIC_HVM_PARAMS_H__ -#include "hvm_op.h" +#include /* * Parameter space for HVMOP_{set,get}_param. 
diff --git a/sys/xen/reboot.c b/sys/xen/reboot.c new file mode 100644 index 000000000000..892dfbf3c91a --- /dev/null +++ b/sys/xen/reboot.c @@ -0,0 +1,262 @@ +/* + * + * Copyright (c) 2004 Christian Limpach. + * Copyright (c) 2004-2006,2008 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifdef XENHVM + +#include + +#else + +static void xen_suspend(void); + +#endif + +static void +shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + char *str; + struct xenbus_transaction xbt; + int error, howto; + + howto = 0; + + again: + error = xenbus_transaction_start(&xbt); + if (error) + return; + + error = xenbus_read(xbt, "control", "shutdown", NULL, (void **) &str); + + /* Ignore read errors and empty reads. */ + if (error || strlen(str) == 0) { + xenbus_transaction_end(xbt, 1); + return; + } + + xenbus_write(xbt, "control", "shutdown", ""); + + error = xenbus_transaction_end(xbt, 0); + if (error == EAGAIN) { + free(str, M_DEVBUF); + goto again; + } + + if (strcmp(str, "reboot") == 0) + howto = 0; + else if (strcmp(str, "poweroff") == 0) + howto |= (RB_POWEROFF | RB_HALT); + else if (strcmp(str, "halt") == 0) +#ifdef XENHVM + /* + * We rely on acpi powerdown to halt the VM. + */ + howto |= (RB_POWEROFF | RB_HALT); +#else + howto |= RB_HALT; +#endif + else if (strcmp(str, "suspend") == 0) + howto = -1; + else { + printf("Ignoring shutdown request: %s\n", str); + goto done; + } + + if (howto == -1) { + xen_suspend(); + goto done; + } + + shutdown_nice(howto); + done: + free(str, M_DEVBUF); +} + +#ifndef XENHVM + +/* + * In HV mode, we let acpi take care of halts and reboots. 
+ */ + +static void +xen_shutdown_final(void *arg, int howto) +{ + + if (howto & (RB_HALT | RB_POWEROFF)) + HYPERVISOR_shutdown(SHUTDOWN_poweroff); + else + HYPERVISOR_shutdown(SHUTDOWN_reboot); +} + +#endif + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +static void +setup_shutdown_watcher(void *unused) +{ + + if (register_xenbus_watch(&shutdown_watch)) + printf("Failed to set shutdown watcher\n"); +#ifndef XENHVM + EVENTHANDLER_REGISTER(shutdown_final, xen_shutdown_final, NULL, + SHUTDOWN_PRI_LAST); +#endif +} + +SYSINIT(shutdown, SI_SUB_PSEUDO, SI_ORDER_ANY, setup_shutdown_watcher, NULL); + +#ifndef XENHVM + +extern void xencons_suspend(void); +extern void xencons_resume(void); + +static void +xen_suspend() +{ + int i, j, k, fpp; + unsigned long max_pfn, start_info_mfn; + +#ifdef SMP + cpumask_t map; + /* + * Bind us to CPU 0 and stop any other VCPUs. + */ + mtx_lock_spin(&sched_lock); + sched_bind(curthread, 0); + mtx_unlock_spin(&sched_lock); + KASSERT(PCPU_GET(cpuid) == 0, ("xen_suspend: not running on cpu 0")); + + map = PCPU_GET(other_cpus) & ~stopped_cpus; + if (map) + stop_cpus(map); +#endif + + if (DEVICE_SUSPEND(root_bus) != 0) { + printf("xen_suspend: device_suspend failed\n"); + if (map) + restart_cpus(map); + return; + } + + local_irq_disable(); + + xencons_suspend(); + gnttab_suspend(); + + max_pfn = HYPERVISOR_shared_info->arch.max_pfn; + + void *shared_info = HYPERVISOR_shared_info; + HYPERVISOR_shared_info = NULL; + pmap_kremove((vm_offset_t) shared_info); + PT_UPDATES_FLUSH(); + + xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn); + + /* + * We'll stop somewhere inside this hypercall. When it returns, + * we'll start resuming after the restore. 
+ */ + start_info_mfn = VTOMFN(xen_start_info); + pmap_suspend(); + HYPERVISOR_suspend(start_info_mfn); + pmap_resume(); + + pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info); + HYPERVISOR_shared_info = shared_info; + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + VTOMFN(xen_pfn_to_mfn_frame_list_list); + + fpp = PAGE_SIZE/sizeof(unsigned long); + for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { + if ((j % fpp) == 0) { + k++; + xen_pfn_to_mfn_frame_list_list[k] = + VTOMFN(xen_pfn_to_mfn_frame_list[k]); + j = 0; + } + xen_pfn_to_mfn_frame_list[k][j] = + VTOMFN(&xen_phys_machine[i]); + } + HYPERVISOR_shared_info->arch.max_pfn = max_pfn; + + gnttab_resume(); + irq_resume(); + local_irq_enable(); + xencons_resume(); + +#ifdef CONFIG_SMP + for_each_cpu(i) + vcpu_prepare(i); + +#endif + /* + * Only resume xenbus /after/ we've prepared our VCPUs; otherwise + * the VCPU hotplug callback can race with our vcpu_prepare + */ + DEVICE_RESUME(root_bus); + +#ifdef SMP + sched_unbind(curthread); + if (map) + restart_cpus(map); +#endif +} + +#endif diff --git a/sys/xen/xen_intr.h b/sys/xen/xen_intr.h index 528fa7f40d80..68f594333fdd 100644 --- a/sys/xen/xen_intr.h +++ b/sys/xen/xen_intr.h @@ -29,37 +29,63 @@ #define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE) #define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE) -/* Dynamic binding of event channels and VIRQ sources to Linux IRQ space. */ -extern void unbind_from_irq(int irq); +/* + * Dynamic binding of event channels and VIRQ sources to guest IRQ space. + */ +/* + * Bind a caller port event channel to an interrupt handler. If + * successful, the guest IRQ number is returned in *irqp. Return zero + * on success or errno otherwise. + */ extern int bind_caller_port_to_irqhandler(unsigned int caller_port, const char *devname, driver_intr_t handler, void *arg, unsigned long irqflags, unsigned int *irqp); + +/* + * Bind a listening port to an interrupt handler. 
If successful, the + * guest IRQ number is returned in *irqp. Return zero on success or + * errno otherwise. + */ extern int bind_listening_port_to_irqhandler(unsigned int remote_domain, - const char *devname, driver_intr_t handler, void *arg, unsigned long irqflags, - unsigned int *irqp); + const char *devname, driver_intr_t handler, void *arg, + unsigned long irqflags, unsigned int *irqp); + +/* + * Bind a VIRQ to an interrupt handler. If successful, the guest IRQ + * number is returned in *irqp. Return zero on success or errno + * otherwise. + */ extern int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, const char *devname, driver_filter_t filter, driver_intr_t handler, void *arg, unsigned long irqflags, unsigned int *irqp); -extern int bind_ipi_to_irqhandler(unsigned int ipi, - unsigned int cpu, - const char *devname, - driver_filter_t handler, - unsigned long irqflags, - unsigned int *irqp); +/* + * Bind an IPI to an interrupt handler. If successful, the guest + * IRQ number is returned in *irqp. Return zero on success or errno + * otherwise. + */ +extern int bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu, + const char *devname, driver_filter_t filter, + unsigned long irqflags, unsigned int *irqp); + +/* + * Bind an interdomain event channel to an interrupt handler. If + * successful, the guest IRQ number is returned in *irqp. Return zero + * on success or errno otherwise. + */ extern int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, - unsigned int remote_port, - const char *devname, - driver_filter_t filter, - driver_intr_t handler, - unsigned long irqflags, - unsigned int *irqp); + unsigned int remote_port, const char *devname, + driver_filter_t filter, driver_intr_t handler, + unsigned long irqflags, unsigned int *irqp); +/* + * Unbind an interrupt handler using the guest IRQ number returned + * when it was bound. 
+ */ +extern void unbind_from_irqhandler(unsigned int irq); - -extern void unbind_from_irqhandler(unsigned int evtchn); -static __inline__ int irq_cannonicalize(int irq) +static __inline__ int irq_cannonicalize(unsigned int irq) { return (irq == 2) ? 9 : irq; } diff --git a/sys/xen/xenbus/xenbus_probe.c b/sys/xen/xenbus/xenbus_probe.c index 3d2cb4bb81ca..f04f8eca4638 100644 --- a/sys/xen/xenbus/xenbus_probe.c +++ b/sys/xen/xenbus/xenbus_probe.c @@ -565,7 +565,6 @@ xenbus_write_ivar(device_t dev, device_t child, int index, uintptr_t value) return (ENOENT); } -SYSCTL_DECL(_dev); SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen"); SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xen_store_evtchn, 0, ""); SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, ""); diff --git a/sys/xen/xenbus/xenbus_xs.c b/sys/xen/xenbus/xenbus_xs.c index 9e0f7798fcf7..806955678457 100644 --- a/sys/xen/xenbus/xenbus_xs.c +++ b/sys/xen/xenbus/xenbus_xs.c @@ -142,21 +142,17 @@ xs_read_reply(enum xsd_sockmsg_type *type, unsigned int *len, void **result) mtx_lock(&xs_state.reply_lock); while (TAILQ_EMPTY(&xs_state.reply_list)) { - while (TAILQ_EMPTY(&xs_state.reply_list)) { - error = mtx_sleep(&xs_state.reply_waitq, - &xs_state.reply_lock, - PCATCH, "xswait", hz/10); - if (error && error != EWOULDBLOCK) { - mtx_unlock(&xs_state.reply_lock); - return (error); - } - + while (TAILQ_EMPTY(&xs_state.reply_list)) { + error = mtx_sleep(&xs_state.reply_waitq, + &xs_state.reply_lock, + PCATCH, "xswait", hz/10); + if (error && error != EWOULDBLOCK) { + mtx_unlock(&xs_state.reply_lock); + return (error); } - - } + } - msg = TAILQ_FIRST(&xs_state.reply_list); TAILQ_REMOVE(&xs_state.reply_list, msg, list); @@ -202,7 +198,8 @@ xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void **result) sx_xlock(&xs_state.request_mutex); - error = xb_write(msg, sizeof(*msg) + msg->len, &xs_state.request_mutex.lock_object); + error = xb_write(msg, sizeof(*msg) + msg->len, + 
&xs_state.request_mutex.lock_object); if (error) { msg->type = XS_ERROR; } else { @@ -243,7 +240,8 @@ xs_talkv(struct xenbus_transaction t, enum xsd_sockmsg_type type, sx_xlock(&xs_state.request_mutex); - error = xb_write(&msg, sizeof(msg), &xs_state.request_mutex.lock_object); + error = xb_write(&msg, sizeof(msg), + &xs_state.request_mutex.lock_object); if (error) { sx_xunlock(&xs_state.request_mutex); printf("xs_talkv failed %d\n", error); @@ -251,7 +249,8 @@ xs_talkv(struct xenbus_transaction t, enum xsd_sockmsg_type type, } for (i = 0; i < num_vecs; i++) { - error = xb_write(iovec[i].iov_base, iovec[i].iov_len, &xs_state.request_mutex.lock_object); + error = xb_write(iovec[i].iov_base, iovec[i].iov_len, + &xs_state.request_mutex.lock_object); if (error) { sx_xunlock(&xs_state.request_mutex); printf("xs_talkv failed %d\n", error); @@ -791,7 +790,8 @@ xs_process_msg(enum xsd_sockmsg_type *type) msg = malloc(sizeof(*msg), M_DEVBUF, M_WAITOK); mtx_lock(&xs_state.reply_lock); - error = xb_read(&msg->hdr, sizeof(msg->hdr), &xs_state.reply_lock.lock_object); + error = xb_read(&msg->hdr, sizeof(msg->hdr), + &xs_state.reply_lock.lock_object); mtx_unlock(&xs_state.reply_lock); if (error) { free(msg, M_DEVBUF); @@ -800,7 +800,8 @@ xs_process_msg(enum xsd_sockmsg_type *type) body = malloc(msg->hdr.len + 1, M_DEVBUF, M_WAITOK); mtx_lock(&xs_state.reply_lock); - error = xb_read(body, msg->hdr.len, &xs_state.reply_lock.lock_object); + error = xb_read(body, msg->hdr.len, + &xs_state.reply_lock.lock_object); mtx_unlock(&xs_state.reply_lock); if (error) { free(body, M_DEVBUF);