freebsd-dev/sys/kern/vfs_aio.c

3020 lines
75 KiB
C
Raw Normal View History

/*-
* Copyright (c) 1997 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. John S. Dyson's name may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* DISCLAIMER: This code isn't warranted to do anything useful. Anything
* bad that happens because of using this software isn't the responsibility
* of the author. This software is distributed AS-IS.
*/
/*
* This file contains support for the POSIX 1003.1B AIO/LIO facility.
*/
2003-06-11 00:56:59 +00:00
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capability.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
2004-05-30 20:34:58 +00:00
#include <sys/module.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
#include <sys/rwlock.h>
2006-01-22 05:59:27 +00:00
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
2006-01-22 05:59:27 +00:00
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
2006-03-23 08:46:42 +00:00
#include <sys/mount.h>
2006-01-22 05:59:27 +00:00
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
2006-03-23 08:46:42 +00:00
#include <vm/vm_object.h>
#include <vm/uma.h>
#include <sys/aio.h>
#include "opt_vfs_aio.h"
/*
* Counter for allocating reference ids to new jobs. Wrapped to 1 on
2006-03-23 08:46:42 +00:00
* overflow. (XXX will be removed soon.)
*/
2006-03-23 08:46:42 +00:00
static u_long jobrefid;
2006-03-23 08:46:42 +00:00
/*
* Counter for aio_fsync.
*/
static uint64_t jobseqno;
/*
 * Job states, stored in the jobstate member of struct aiocblist.
 */
#define JOBST_NULL 0 /* assigned by aio_free_entry() just before the job is freed */
#define JOBST_JOBQSOCK 1 /* queued on a socket's so_aiojobq */
#define JOBST_JOBQGLOBAL 2 /* queued on the global aio_jobs list */
#define JOBST_JOBRUNNING 3
#define JOBST_JOBFINISHED 4 /* finished; the only state aio_free_entry() accepts */
#define JOBST_JOBQBUF 5
#define JOBST_JOBQSYNC 6 /* queued on the per-process kaio_syncqueue */
1997-11-30 04:36:31 +00:00
/*
 * Compile-time defaults for the AIO tunables.  Every value is only defined
 * here when it has not already been defined, so an earlier definition wins.
 * The two AIOD_* defaults are expressed in ticks (they multiply by hz).
 */
#ifndef MAX_AIO_PER_PROC
#define	MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define	MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define	MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define	TARGET_AIO_PROCS	4
#endif

#ifndef MAX_BUF_AIO
#define	MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define	AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif
FEATURE(aio, "Asynchronous I/O");
static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
2005-02-10 12:23:29 +00:00
static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
CTLFLAG_RW, &max_aio_procs, 0,
"Maximum number of kernel threads to use for handling async IO ");
static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
CTLFLAG_RD, &num_aio_procs, 0,
"Number of presently active kernel threads for async IO");
/*
* The code will adjust the actual number of AIO processes towards this
* number when it gets a chance.
*/
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
0, "Preferred number of ready kernel threads for async IO");
static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
"Maximum number of aio requests to queue, globally");
static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
"Number of queued aio requests");
static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
"Number of aio requests presently handled by the buf subsystem");
/* Number of async I/O thread in the process of being started */
/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;
static int aiod_timeout;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
"Timeout value for synchronous aio operations");
1997-11-30 04:36:31 +00:00
static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
"Maximum lifetime for idle aiod");
1997-11-30 04:36:31 +00:00
static int unloadable = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
"Allow unload of aio (not recommended)");
static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
0, "Maximum active aio requests per process (stored in the process)");
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
&max_aio_queue_per_proc, 0,
"Maximum queued aio requests per process (stored in the process)");
static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
"Maximum buf aio requests per process (stored in the process)");
/*
 * Old-layout AIO control block.  It has the members listed below and uses
 * struct osigevent for the notification description.
 * NOTE(review): presumably this is the userland layout consumed by the
 * oaio_read/oaio_write/olio_listio entries in aio_syscalls -- confirm.
 * The member order and types define a binary layout; do not rearrange.
 */
typedef struct oaiocb {
int aio_fildes; /* File descriptor */
off_t aio_offset; /* File offset for I/O */
volatile void *aio_buf; /* I/O buffer in process space */
size_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private _aiocb_private;
} oaiocb_t;
/*
* Below is a key of locks used to protect each member of struct aiocblist
* aioliojob and kaioinfo and any backends.
*
* * - need not be protected
* a - locked by kaioinfo lock
* b - locked by backend lock, the backend lock can be null in some cases,
* for example, BIO belongs to this type, in this case, proc lock is
* reused.
* c - locked by aio_job_mtx, the lock for the generic file I/O backend.
*/
/*
* Currently, there are only two backends: BIO and generic file I/O.
* Socket I/O is served by generic file I/O.  This is not a good idea, since
* disk file I/O and any other types without the O_NONBLOCK flag can block
* daemon threads; if there is no thread to serve socket I/O, the socket I/O
* will be delayed too long or starved.  We should create some threads
* dedicated to sockets to do non-blocking I/O, and the same for pipes and
* fifos.  For these I/O systems we really need a non-blocking interface;
* fiddling with O_NONBLOCK in the file structure is not safe because there
* is a race between userland and the aio daemons.
*/
struct aiocblist {
TAILQ_ENTRY(aiocblist) list; /* (b) internal list of for backend */
TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */
TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */
int jobflags; /* (a) job flags */
int jobstate; /* (b) job state */
int inputcharge; /* (*) input blockes */
int outputcharge; /* (*) output blockes */
struct buf *bp; /* (*) private to BIO backend,
* buffer pointer
*/
struct proc *userproc; /* (*) user process */
struct ucred *cred; /* (*) active credential when created */
struct file *fd_file; /* (*) pointer to file structure */
struct aioliojob *lio; /* (*) optional lio job */
struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */
struct knlist klist; /* (a) list of knotes */
struct aiocb uaiocb; /* (*) kernel I/O control block */
ksiginfo_t ksi; /* (a) realtime signal info */
2006-03-23 08:46:42 +00:00
struct task biotask; /* (*) private to BIO backend */
uint64_t seqno; /* (*) job number */
int pending; /* (a) number of pending I/O, aio_fsync only */
};
/* Bits for the jobflags member of struct aiocblist. */
#define	AIOCBLIST_DONE		0x01
#define	AIOCBLIST_BUFDONE	0x02
#define	AIOCBLIST_RUNDOWN	0x04
#define	AIOCBLIST_CHECKSYNC	0x08
/*
* AIO process info
*/
1997-11-30 04:36:31 +00:00
/* Bit for aiothreadflags below. */
#define AIOP_FREE 0x1 /* proc on free queue */
/*
 * Per-daemon bookkeeping record, allocated from aiop_zone and linked on
 * lists such as aio_freeproc.  The "(c)" members follow the lock key above
 * (aio_job_mtx).
 */
struct aiothreadlist {
int aiothreadflags; /* (c) AIO proc flags */
TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */
struct thread *aiothread; /* (*) the AIO thread */
};
1997-11-30 04:36:31 +00:00
/*
* data-structure for lio signal management
*/
2006-01-22 05:59:27 +00:00
/*
 * State shared by all jobs that belong to one list-I/O request.  It is
 * linked on the owning process' kaio_liojoblist and released by
 * aio_free_entry() once lioj_count drops to zero.
 */
struct aioliojob {
	int	lioj_flags;		/* (a) listio flags (LIOJ_*) */
	int	lioj_count;		/* (a) jobs still attached to this lio */
	int	lioj_finished_count;	/* (a) decremented together with
					 * lioj_count when a finished job is
					 * freed; presumably the number of
					 * finished jobs -- confirm
					 */
	struct	sigevent lioj_signal;	/* (a) signal on all I/O done */
	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
	struct	knlist klist;		/* (a) list of knotes */
	ksiginfo_t lioj_ksi;		/* (a) Realtime signal info */
};
2006-01-22 05:59:27 +00:00
/* Bits for the lioj_flags member of struct aioliojob. */
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
#define	LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
1997-11-30 04:36:31 +00:00
/*
* per process aio data structure
*/
struct kaioinfo {
struct mtx kaio_mtx; /* the lock to protect this struct */
int kaio_flags; /* (a) per process kaio flags */
int kaio_maxactive_count; /* (*) maximum number of AIOs */
int kaio_active_count; /* (c) number of currently used AIOs */
int kaio_qallowed_count; /* (*) maxiumu size of AIO queue */
int kaio_count; /* (a) size of AIO queue */
int kaio_ballowed_count; /* (*) maximum number of buffers */
int kaio_buffer_count; /* (a) number of physio buffers */
TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */
TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */
TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */
TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */
TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets,
2006-03-23 08:46:42 +00:00
* NOT USED YET.
*/
2006-03-23 08:46:42 +00:00
TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */
struct task kaio_task; /* (*) task to kick aio threads */
};
/*
 * Helpers for the kaio_mtx mutex embedded in struct kaioinfo: acquire,
 * release, assert ownership state, and obtain a pointer to the mutex
 * (e.g. to hand to msleep()).
 */
#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
#define AIO_MTX(ki) (&(ki)->kaio_mtx)
/* Bits for the kaio_flags member of struct kaioinfo. */
#define KAIO_RUNDOWN 0x1 /* process is being run down */
#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
/*
* Operations used to interact with userland aio control blocks.
* Different ABIs provide their own operations.
*
* In every member "ujob" is the userland control block pointer; "kjob" in
* copyin is the kernel-side copy.
* NOTE(review): the per-member descriptions below are inferred from the
* member names and signatures only -- confirm against the implementations.
*/
struct aiocb_ops {
int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); /* presumably user -> kernel copy */
long (*fetch_status)(struct aiocb *ujob); /* presumably reads the status field */
long (*fetch_error)(struct aiocb *ujob); /* presumably reads the error field */
int (*store_status)(struct aiocb *ujob, long status); /* presumably writes the status field */
int (*store_error)(struct aiocb *ujob, long error); /* presumably writes the error field */
int (*store_kernelinfo)(struct aiocb *ujob, long jobref); /* presumably records the job reference id */
int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); /* presumably stores a control block pointer to userland */
};
static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */
2006-01-22 05:59:27 +00:00
static struct sema aio_newproc_sem;
static struct mtx aio_job_mtx;
static struct mtx aio_sock_mtx;
static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */
2006-01-22 05:59:27 +00:00
static struct unrhdr *aiod_unr;
MFP4 (with some minor changes): Implement the linux_io_* syscalls (AIO). They are only enabled if the native AIO code is available (either compiled in to the kernel or as a module) at the time the functions are used. If the AIO stuff is not available there will be a ENOSYS. From the submitter: ---snip--- DESIGN NOTES: 1. Linux permits a process to own multiple AIO queues (distinguished by "context"), but FreeBSD creates only one single AIO queue per process. My code maintains a request queue (STAILQ of queue(3)) per "context", and throws all AIO requests of all contexts owned by a process into the single FreeBSD per-process AIO queue. When the process calls io_destroy(2), io_getevents(2), io_submit(2) and io_cancel(2), my code can pick out requests owned by the specified context from the single FreeBSD per-process AIO queue according to the per-context request queues maintained by my code. 2. The request queue maintained by my code stores contrast information between Linux IO control blocks (struct linux_iocb) and FreeBSD IO control blocks (struct aiocb). FreeBSD IO control block actually exists in userland memory space, required by FreeBSD native aio_XXXXXX(2). 3. It is quite troubling that the function io_getevents() of libaio-0.3.105 needs to use Linux-specific "struct aio_ring", which is a partial mirror of context in user space. I would rather take the address of context in kernel as the context ID, but the io_getevents() of libaio forces me to take the address of the "ring" in user space as the context ID. To my surprise, one comment line in the file "io_getevents.c" of libaio-0.3.105 reads: Ben will hate me for this REFERENCE: 1. Linux kernel source code: http://www.kernel.org/pub/linux/kernel/v2.6/ (include/linux/aio_abi.h, fs/aio.c) 2. Linux manual pages: http://www.kernel.org/pub/linux/docs/manpages/ (io_setup(2), io_destroy(2), io_getevents(2), io_submit(2), io_cancel(2)) 3. 
Linux Scalability Effort: http://lse.sourceforge.net/io/aio.html The design notes: http://lse.sourceforge.net/io/aionotes.txt 4. The package libaio, both source and binary: http://rpmfind.net/linux/rpm2html/search.php?query=libaio Simple transparent interface to Linux AIO system calls. 5. Libaio-oracle: http://oss.oracle.com/projects/libaio-oracle/ POSIX AIO implementation based on Linux AIO system calls (depending on libaio). ---snip--- Submitted by: Li, Xiao <intron@intron.ac>
2006-10-15 14:22:14 +00:00
/*
 * Forward declarations.  aio_init_aioinfo() and aio_aqueue() are the only
 * two without internal linkage.
 */
void	aio_init_aioinfo(struct proc *p);
static int	aio_onceonly(void);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist *aiocbe);
static int	aio_newproc(int *);
int	aio_aqueue(struct thread *td, struct aiocb *job,
	    struct aioliojob *lio, int type, struct aiocb_ops *ops);
static void	aio_physwakeup(struct buf *bp);
static void	aio_proc_rundown(void *arg, struct proc *p);
static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void	biohelper(void *, int);
static void	aio_daemon(void *param);
static void	aio_swake_cb(struct socket *, struct sockbuf *);
static int	aio_unload(void);
static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
/* Values for the "type" argument of aio_bio_done_notify(). */
#define	DONE_BUF	1
#define	DONE_QUEUE	2
static int	aio_kick(struct proc *userp);
static void	aio_kick_nowait(struct proc *userp);
static void	aio_kick_helper(void *context, int pending);
static int	filt_aioattach(struct knote *kn);
static void	filt_aiodetach(struct knote *kn);
static int	filt_aio(struct knote *kn, long hint);
static int	filt_lioattach(struct knote *kn);
static void	filt_liodetach(struct knote *kn);
static int	filt_lio(struct knote *kn, long hint);
/*
* UMA zones, created in aio_onceonly() and destroyed in aio_unload():
* kaio Per process async io info (struct kaioinfo)
* aiop async io thread data (struct aiothreadlist)
* aiocb async io jobs (struct aiocblist)
* aiol list io job pointer - internal to aio_suspend XXX
* (AIO_LISTIO_MAX * sizeof(intptr_t) bytes per item)
* aiolio list io jobs (struct aioliojob)
*/
static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
/*
* kqueue filters for aio.
* aio_filtops is installed for EVFILT_AIO by aio_onceonly(); f_isfd is 0.
*/
static struct filterops aio_filtops = {
.f_isfd = 0,
.f_attach = filt_aioattach,
.f_detach = filt_aiodetach,
.f_event = filt_aio,
};
/*
* kqueue filter for list-I/O completion; installed for EVFILT_LIO by
* aio_onceonly().  f_isfd is 0.
*/
static struct filterops lio_filtops = {
.f_isfd = 0,
.f_attach = filt_lioattach,
.f_detach = filt_liodetach,
.f_event = filt_lio
};
/*
 * Tags returned by EVENTHANDLER_REGISTER() in aio_onceonly(); they are
 * needed again by aio_unload() to deregister the handlers.
 */
static eventhandler_tag exit_tag, exec_tag;

/* Defines taskqueue_aiod_bio, which aio_unload() frees. */
TASKQUEUE_DEFINE_THREAD(aiod_bio);
/*
 * Module event handler for AIO when it is used as a kernel module.
 *
 * MOD_LOAD runs the one-time initialization, MOD_UNLOAD returns whatever
 * aio_unload() reports, MOD_SHUTDOWN is accepted as a no-op and every other
 * command is rejected with EINVAL.
 *
 * NOTE(review): the value returned by aio_onceonly() is discarded, so
 * MOD_LOAD always reports success.
 */
static int
aio_modload(struct module *module, int cmd, void *arg)
{

	if (cmd == MOD_LOAD) {
		aio_onceonly();
		return (0);
	}
	if (cmd == MOD_UNLOAD)
		return (aio_unload());
	if (cmd == MOD_SHUTDOWN)
		return (0);
	return (EINVAL);
}
/*
* Module description handed to DECLARE_MODULE() below.
* NOTE(review): the positional initializers are presumably (module name,
* event handler, private argument) -- confirm against moduledata_t.
*/
static moduledata_t aio_mod = {
"aio",
&aio_modload,
NULL
};
/*
* Native system calls provided by this file.  The table is passed to
* syscall_helper_register() in aio_onceonly() and to
* syscall_helper_unregister() in aio_unload(); SYSCALL_INIT_LAST terminates
* it.
*/
static struct syscall_helper_data aio_syscalls[] = {
SYSCALL_INIT_HELPER(aio_cancel),
SYSCALL_INIT_HELPER(aio_error),
SYSCALL_INIT_HELPER(aio_fsync),
SYSCALL_INIT_HELPER(aio_read),
SYSCALL_INIT_HELPER(aio_return),
SYSCALL_INIT_HELPER(aio_suspend),
SYSCALL_INIT_HELPER(aio_waitcomplete),
SYSCALL_INIT_HELPER(aio_write),
SYSCALL_INIT_HELPER(lio_listio),
SYSCALL_INIT_HELPER(oaio_read),
SYSCALL_INIT_HELPER(oaio_write),
SYSCALL_INIT_HELPER(olio_listio),
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
/*
* 32-bit compatibility system calls; only built under COMPAT_FREEBSD32.
* The table is passed to syscall32_helper_register() in aio_onceonly() and
* to syscall32_helper_unregister() in aio_unload(); SYSCALL_INIT_LAST
* terminates it.
*/
static struct syscall_helper_data aio32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_aio_return),
SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
SYSCALL32_INIT_HELPER(freebsd32_aio_error),
SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
SYSCALL32_INIT_HELPER(freebsd32_aio_read),
SYSCALL32_INIT_HELPER(freebsd32_aio_write),
SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
SYSCALL_INIT_LAST
};
#endif
/*
* Register the "aio" module (described by aio_mod) at SI_SUB_VFS /
* SI_ORDER_ANY and advertise it as version 1.
*/
DECLARE_MODULE(aio, aio_mod,
SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);
/*
* Startup initialization
*/
static int
aio_onceonly(void)
{
int error;
/* XXX: should probably just use so->callback */
aio_swake = &aio_swake_cb;
exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
EVENTHANDLER_PRI_ANY);
exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
EVENTHANDLER_PRI_ANY);
kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
TAILQ_INIT(&aio_freeproc);
2006-01-22 05:59:27 +00:00
sema_init(&aio_newproc_sem, 0, "aio_new_proc");
mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
TAILQ_INIT(&aio_jobs);
2006-01-22 05:59:27 +00:00
aiod_unr = new_unrhdr(1, INT_MAX, NULL);
kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
2006-01-22 05:59:27 +00:00
aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1997-11-30 04:36:31 +00:00
aiod_timeout = AIOD_TIMEOUT_DEFAULT;
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
jobrefid = 1;
async_io_version = _POSIX_VERSION;
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
error = syscall_helper_register(aio_syscalls);
if (error)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(aio32_syscalls);
if (error)
return (error);
#endif
return (0);
}
/*
* Callback for unload of AIO when used as a module.
*/
static int
aio_unload(void)
{
int error;
/*
* XXX: no unloads by default, it's too dangerous.
* perhaps we could do it if locked out callers and then
* did an aio_proc_rundown() on each process.
*
* jhb: aio_proc_rundown() needs to run on curproc though,
* so I don't think that would fly.
*/
if (!unloadable)
return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(aio32_syscalls);
#endif
syscall_helper_unregister(aio_syscalls);
error = kqueue_del_filteropts(EVFILT_AIO);
if (error)
return error;
error = kqueue_del_filteropts(EVFILT_LIO);
if (error)
return error;
async_io_version = 0;
aio_swake = NULL;
2006-01-22 05:59:27 +00:00
taskqueue_free(taskqueue_aiod_bio);
delete_unrhdr(aiod_unr);
uma_zdestroy(kaio_zone);
uma_zdestroy(aiop_zone);
uma_zdestroy(aiocb_zone);
uma_zdestroy(aiol_zone);
uma_zdestroy(aiolio_zone);
EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
2006-01-22 05:59:27 +00:00
mtx_destroy(&aio_job_mtx);
mtx_destroy(&aio_sock_mtx);
sema_destroy(&aio_newproc_sem);
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
return (0);
}
/*
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
MFP4 (with some minor changes): Implement the linux_io_* syscalls (AIO). They are only enabled if the native AIO code is available (either compiled in to the kernel or as a module) at the time the functions are used. If the AIO stuff is not available there will be a ENOSYS. From the submitter: ---snip--- DESIGN NOTES: 1. Linux permits a process to own multiple AIO queues (distinguished by "context"), but FreeBSD creates only one single AIO queue per process. My code maintains a request queue (STAILQ of queue(3)) per "context", and throws all AIO requests of all contexts owned by a process into the single FreeBSD per-process AIO queue. When the process calls io_destroy(2), io_getevents(2), io_submit(2) and io_cancel(2), my code can pick out requests owned by the specified context from the single FreeBSD per-process AIO queue according to the per-context request queues maintained by my code. 2. The request queue maintained by my code stores contrast information between Linux IO control blocks (struct linux_iocb) and FreeBSD IO control blocks (struct aiocb). FreeBSD IO control block actually exists in userland memory space, required by FreeBSD native aio_XXXXXX(2). 3. It is quite troubling that the function io_getevents() of libaio-0.3.105 needs to use Linux-specific "struct aio_ring", which is a partial mirror of context in user space. I would rather take the address of context in kernel as the context ID, but the io_getevents() of libaio forces me to take the address of the "ring" in user space as the context ID. To my surprise, one comment line in the file "io_getevents.c" of libaio-0.3.105 reads: Ben will hate me for this REFERENCE: 1. Linux kernel source code: http://www.kernel.org/pub/linux/kernel/v2.6/ (include/linux/aio_abi.h, fs/aio.c) 2. Linux manual pages: http://www.kernel.org/pub/linux/docs/manpages/ (io_setup(2), io_destroy(2), io_getevents(2), io_submit(2), io_cancel(2)) 3. 
Linux Scalability Effort: http://lse.sourceforge.net/io/aio.html The design notes: http://lse.sourceforge.net/io/aionotes.txt 4. The package libaio, both source and binary: http://rpmfind.net/linux/rpm2html/search.php?query=libaio Simple transparent interface to Linux AIO system calls. 5. Libaio-oracle: http://oss.oracle.com/projects/libaio-oracle/ POSIX AIO implementation based on Linux AIO system calls (depending on libaio). ---snip--- Submitted by: Li, Xiao <intron@intron.ac>
2006-10-15 14:22:14 +00:00
void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
ki = uma_zalloc(kaio_zone, M_WAITOK);
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
ki->kaio_flags = 0;
ki->kaio_maxactive_count = max_aio_per_proc;
ki->kaio_active_count = 0;
ki->kaio_qallowed_count = max_aio_queue_per_proc;
2006-01-22 05:59:27 +00:00
ki->kaio_count = 0;
ki->kaio_ballowed_count = max_buf_aio;
ki->kaio_buffer_count = 0;
2006-01-22 05:59:27 +00:00
TAILQ_INIT(&ki->kaio_all);
TAILQ_INIT(&ki->kaio_done);
TAILQ_INIT(&ki->kaio_jobqueue);
TAILQ_INIT(&ki->kaio_bufqueue);
TAILQ_INIT(&ki->kaio_liojoblist);
TAILQ_INIT(&ki->kaio_sockqueue);
2006-03-23 08:46:42 +00:00
TAILQ_INIT(&ki->kaio_syncqueue);
TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
PROC_LOCK(p);
if (p->p_aioinfo == NULL) {
p->p_aioinfo = ki;
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
}
2004-08-13 17:43:53 +00:00
while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
2006-01-22 05:59:27 +00:00
aio_newproc(NULL);
}
/*
 * Post the signal described by sigev to process p, using ksi as the
 * queued signal information.
 *
 * The target thread is looked up with sigev_findtd(); its error, if any, is
 * returned unchanged.  The ksiginfo is only filled in and sent when it is
 * not already on a signal queue.  Returns 0 otherwise.
 *
 * NOTE(review): the PROC_UNLOCK() below has no matching PROC_LOCK() in this
 * function, so sigev_findtd() presumably returns with the process locked on
 * success -- confirm.
 */
static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
{
	struct thread *target;
	int rc;

	rc = sigev_findtd(p, sigev, &target);
	if (rc != 0)
		return (rc);

	if (!KSI_ONQ(ksi)) {
		ksiginfo_set_sigev(ksi, sigev);
		ksi->ksi_code = SI_ASYNCIO;
		ksi->ksi_flags |= KSI_EXT | KSI_INS;
		tdsendsignal(p, target, ksi->ksi_signo, ksi);
	}
	PROC_UNLOCK(p);
	return (0);
}
/*
* Free a job entry. Wait for completion if it is currently active, but don't
* delay forever. If we delay, we return a flag that says that we have to
* restart the queue scan.
*/
static int
aio_free_entry(struct aiocblist *aiocbe)
{
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
struct aioliojob *lj;
struct proc *p;
p = aiocbe->userproc;
2006-01-22 05:59:27 +00:00
MPASS(curproc == p);
ki = p->p_aioinfo;
2006-01-22 05:59:27 +00:00
MPASS(ki != NULL);
AIO_LOCK_ASSERT(ki, MA_OWNED);
MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
2006-01-22 05:59:27 +00:00
atomic_subtract_int(&num_queue_count, 1);
ki->kaio_count--;
MPASS(ki->kaio_count >= 0);
TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
1997-11-30 04:36:31 +00:00
lj = aiocbe->lio;
2006-01-22 05:59:27 +00:00
if (lj) {
lj->lioj_count--;
lj->lioj_finished_count--;
if (lj->lioj_count == 0) {
2006-01-22 05:59:27 +00:00
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
/* lio is going away, we need to destroy any knotes */
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
2006-01-22 05:59:27 +00:00
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
2006-01-22 05:59:27 +00:00
uma_zfree(aiolio_zone, lj);
1997-11-30 04:36:31 +00:00
}
}
/* aiocbe is going away, we need to destroy any knotes */
2006-01-22 05:59:27 +00:00
knlist_delete(&aiocbe->klist, curthread, 1);
PROC_LOCK(p);
2006-01-22 05:59:27 +00:00
sigqueue_take(&aiocbe->ksi);
PROC_UNLOCK(p);
2006-01-22 05:59:27 +00:00
2006-03-23 08:46:42 +00:00
MPASS(aiocbe->bp == NULL);
2006-01-22 05:59:27 +00:00
aiocbe->jobstate = JOBST_NULL;
AIO_UNLOCK(ki);
/*
* The thread argument here is used to find the owning process
* and is also passed to fo_close() which may pass it to various
* places such as devsw close() routines. Because of that, we
* need a thread pointer from the process owning the job that is
* persistent and won't disappear out from under us or move to
* another process.
*
* Currently, all the callers of this function call it to remove
* an aiocblist from the current process' job list either via a
* syscall or due to the current process calling exit() or
* execve(). Thus, we know that p == curproc. We also know that
* curthread can't exit since we are curthread.
*
* Therefore, we use curthread as the thread to pass to
* knlist_delete(). This does mean that it is possible for the
* thread pointer at close time to differ from the thread pointer
* at open time, but this is already true of file descriptors in
* a multithreaded process.
*/
fdrop(aiocbe->fd_file, curthread);
crfree(aiocbe->cred);
uma_zfree(aiocb_zone, aiocbe);
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
return (0);
}
/*
 * Adapter with a three-argument signature: the image_params argument is
 * not used and the call is forwarded unchanged to aio_proc_rundown().
 */
static void
aio_proc_rundown_exec(void *arg, struct proc *p,
    struct image_params *imgp __unused)
{

	aio_proc_rundown(arg, p);
}
/*
2004-08-13 17:43:53 +00:00
* Rundown the jobs for a given process.
*/
static void
aio_proc_rundown(void *arg, struct proc *p)
{
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
struct aioliojob *lj;
struct aiocblist *cbe, *cbn;
struct file *fp;
struct socket *so;
int remove;
KASSERT(curthread->td_proc == p,
("%s: called on non-curproc", __func__));
ki = p->p_aioinfo;
if (ki == NULL)
return;
AIO_LOCK(ki);
ki->kaio_flags |= KAIO_RUNDOWN;
2006-01-22 05:59:27 +00:00
restart:
/*
2006-01-22 05:59:27 +00:00
* Try to cancel all pending requests. This code simulates
* aio_cancel on all pending I/O requests.
*/
2006-01-22 05:59:27 +00:00
TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
remove = 0;
2006-01-22 05:59:27 +00:00
mtx_lock(&aio_job_mtx);
if (cbe->jobstate == JOBST_JOBQGLOBAL) {
TAILQ_REMOVE(&aio_jobs, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSOCK) {
fp = cbe->fd_file;
MPASS(fp->f_type == DTYPE_SOCKET);
so = fp->f_data;
TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
remove = 1;
2006-03-23 08:46:42 +00:00
} else if (cbe->jobstate == JOBST_JOBQSYNC) {
TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
remove = 1;
}
mtx_unlock(&aio_job_mtx);
if (remove) {
2006-01-22 05:59:27 +00:00
cbe->jobstate = JOBST_JOBFINISHED;
cbe->uaiocb._aiocb_private.status = -1;
cbe->uaiocb._aiocb_private.error = ECANCELED;
TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
aio_bio_done_notify(p, cbe, DONE_QUEUE);
}
}
2006-01-22 05:59:27 +00:00
/* Wait for all running I/O to be finished */
if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
TAILQ_FIRST(&ki->kaio_jobqueue)) {
1997-11-30 04:36:31 +00:00
ki->kaio_flags |= KAIO_WAKEUP;
msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
2006-01-22 05:59:27 +00:00
goto restart;
1997-11-30 04:36:31 +00:00
}
2006-01-22 05:59:27 +00:00
/* Free all completed I/O requests. */
while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
aio_free_entry(cbe);
while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
2006-01-22 05:59:27 +00:00
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
2006-01-22 05:59:27 +00:00
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
} else {
panic("LIO job not cleaned up: C:%d, FC:%d\n",
lj->lioj_count, lj->lioj_finished_count);
}
1997-11-30 04:36:31 +00:00
}
AIO_UNLOCK(ki);
2006-03-23 08:46:42 +00:00
taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
p->p_aioinfo = NULL;
}
/*
* Select a job to run (called by an AIO daemon).
*/
static struct aiocblist *
aio_selectjob(struct aiothreadlist *aiop)
{
struct aiocblist *aiocbe;
struct kaioinfo *ki;
struct proc *userp;
2006-01-22 05:59:27 +00:00
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
if (ki->kaio_active_count < ki->kaio_maxactive_count) {
TAILQ_REMOVE(&aio_jobs, aiocbe, list);
2006-01-22 05:59:27 +00:00
/* Account for currently active jobs. */
ki->kaio_active_count++;
aiocbe->jobstate = JOBST_JOBRUNNING;
break;
}
}
2006-01-22 05:59:27 +00:00
return (aiocbe);
}
2006-03-23 08:46:42 +00:00
/*
* Move all data to a permanent storage device, this code
* simulates fsync syscall.
*/
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
struct mount *mp;
int error;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2006-03-23 08:46:42 +00:00
if (vp->v_object != NULL) {
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
VM_OBJECT_WLOCK(vp->v_object);
2006-03-23 08:46:42 +00:00
vm_object_page_clean(vp->v_object, 0, 0, 0);
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
VM_OBJECT_WUNLOCK(vp->v_object);
2006-03-23 08:46:42 +00:00
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp, 0);
2006-03-23 08:46:42 +00:00
vn_finished_write(mp);
drop:
return (error);
}
/*
* The AIO processing activity. This is the code that does the I/O request for
* the non-physio version of the operations. The normal vn operations are used,
* and this code should work in all instances for every type of file, including
* pipes, sockets, fifos, and regular files.
2006-01-22 05:59:27 +00:00
*
* XXX I don't think it works well for socket, pipe, and fifo.
*/
static void
aio_process(struct aiocblist *aiocbe)
{
struct ucred *td_savedcred;
struct thread *td;
struct aiocb *cb;
struct file *fp;
2006-01-22 05:59:27 +00:00
struct socket *so;
struct uio auio;
struct iovec aiov;
int cnt;
int error;
int oublock_st, oublock_end;
int inblock_st, inblock_end;
td = curthread;
td_savedcred = td->td_ucred;
td->td_ucred = aiocbe->cred;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
2006-03-23 08:46:42 +00:00
if (cb->aio_lio_opcode == LIO_SYNC) {
error = 0;
cnt = 0;
if (fp->f_vnode != NULL)
error = aio_fsync_vnode(td, fp->f_vnode);
cb->_aiocb_private.error = error;
cb->_aiocb_private.status = 0;
td->td_ucred = td_savedcred;
return;
}
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
aiov.iov_len = cb->aio_nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = cb->aio_offset;
auio.uio_resid = cb->aio_nbytes;
cnt = cb->aio_nbytes;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
inblock_st = td->td_ru.ru_inblock;
oublock_st = td->td_ru.ru_oublock;
/*
* aio_aqueue() acquires a reference to the file that is
* released in aio_free_entry().
*/
if (cb->aio_lio_opcode == LIO_READ) {
auio.uio_rw = UIO_READ;
if (auio.uio_resid == 0)
error = 0;
else
error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
} else {
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
auio.uio_rw = UIO_WRITE;
error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
}
inblock_end = td->td_ru.ru_inblock;
oublock_end = td->td_ru.ru_oublock;
aiocbe->inputcharge = inblock_end - inblock_st;
aiocbe->outputcharge = oublock_end - oublock_st;
if ((error) && (auio.uio_resid != cnt)) {
if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
error = 0;
if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
2006-01-22 05:59:27 +00:00
int sigpipe = 1;
if (fp->f_type == DTYPE_SOCKET) {
so = fp->f_data;
if (so->so_options & SO_NOSIGPIPE)
sigpipe = 0;
}
if (sigpipe) {
PROC_LOCK(aiocbe->userproc);
kern_psignal(aiocbe->userproc, SIGPIPE);
2006-01-22 05:59:27 +00:00
PROC_UNLOCK(aiocbe->userproc);
}
}
}
cnt -= auio.uio_resid;
cb->_aiocb_private.error = error;
cb->_aiocb_private.status = cnt;
td->td_ucred = td_savedcred;
}
static void
2006-01-22 05:59:27 +00:00
aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
{
struct aioliojob *lj;
struct kaioinfo *ki;
2006-03-23 08:46:42 +00:00
struct aiocblist *scb, *scbn;
2006-01-22 05:59:27 +00:00
int lj_done;
ki = userp->p_aioinfo;
AIO_LOCK_ASSERT(ki, MA_OWNED);
lj = aiocbe->lio;
lj_done = 0;
if (lj) {
2006-01-22 05:59:27 +00:00
lj->lioj_finished_count++;
if (lj->lioj_count == lj->lioj_finished_count)
lj_done = 1;
}
2006-01-22 05:59:27 +00:00
if (type == DONE_QUEUE) {
aiocbe->jobflags |= AIOCBLIST_DONE;
} else {
aiocbe->jobflags |= AIOCBLIST_BUFDONE;
}
TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
aiocbe->jobstate = JOBST_JOBFINISHED;
if (ki->kaio_flags & KAIO_RUNDOWN)
goto notification_done;
2006-01-22 05:59:27 +00:00
if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
2006-01-22 05:59:27 +00:00
KNOTE_LOCKED(&aiocbe->klist, 1);
2006-01-22 05:59:27 +00:00
if (lj_done) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
notification_done:
2006-03-23 08:46:42 +00:00
if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
if (aiocbe->fd_file == scb->fd_file &&
2006-03-23 08:46:42 +00:00
aiocbe->seqno < scb->seqno) {
if (--scb->pending == 0) {
mtx_lock(&aio_job_mtx);
scb->jobstate = JOBST_JOBQGLOBAL;
TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
aio_kick_nowait(userp);
mtx_unlock(&aio_job_mtx);
}
}
}
}
if (ki->kaio_flags & KAIO_WAKEUP) {
2006-01-22 05:59:27 +00:00
ki->kaio_flags &= ~KAIO_WAKEUP;
wakeup(&userp->p_aioinfo);
}
}
2006-01-22 05:59:27 +00:00
/*
1997-11-30 04:36:31 +00:00
* The AIO daemon, most of the actual work is done in aio_process,
* but the setup (and address space mgmt) is done in this routine.
*/
static void
2006-01-22 05:59:27 +00:00
aio_daemon(void *_id)
{
struct aiocblist *aiocbe;
struct aiothreadlist *aiop;
struct kaioinfo *ki;
struct proc *curcp, *mycp, *userp;
struct vmspace *myvm, *tmpvm;
struct thread *td = curthread;
2006-01-22 05:59:27 +00:00
int id = (intptr_t)_id;
/*
* Local copies of curproc (cp) and vmspace (myvm)
*/
mycp = td->td_proc;
myvm = mycp->p_vmspace;
KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
/*
* Allocate and ready the aio control info. There is one aiop structure
* per daemon.
*/
aiop = uma_zalloc(aiop_zone, M_WAITOK);
aiop->aiothread = td;
2006-03-23 08:46:42 +00:00
aiop->aiothreadflags = 0;
2006-01-24 02:50:42 +00:00
/* The daemon resides in its own pgrp. */
sys_setsid(td, NULL);
/*
* Wakeup parent process. (Parent sleeps to keep from blasting away
* and creating too many daemons.)
*/
2006-01-22 05:59:27 +00:00
sema_post(&aio_newproc_sem);
2006-01-22 05:59:27 +00:00
mtx_lock(&aio_job_mtx);
for (;;) {
/*
* curcp is the current daemon process context.
* userp is the current user process context.
*/
curcp = mycp;
/*
* Take daemon off of free queue
*/
if (aiop->aiothreadflags & AIOP_FREE) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
}
/*
* Check for jobs.
*/
while ((aiocbe = aio_selectjob(aiop)) != NULL) {
2006-01-22 05:59:27 +00:00
mtx_unlock(&aio_job_mtx);
userp = aiocbe->userproc;
/*
* Connect to process address space for user program.
*/
if (userp != curcp) {
/*
* Save the current address space that we are
* connected to.
*/
tmpvm = mycp->p_vmspace;
2004-08-13 17:43:53 +00:00
/*
* Point to the new user address space, and
* refer to it.
*/
mycp->p_vmspace = userp->p_vmspace;
atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
2004-08-13 17:43:53 +00:00
/* Activate the new mapping. */
pmap_activate(FIRST_THREAD_IN_PROC(mycp));
2004-08-13 17:43:53 +00:00
/*
* If the old address space wasn't the daemons
* own address space, then we need to remove the
* daemon's reference from the other process
* that it was acting on behalf of.
*/
if (tmpvm != myvm) {
vmspace_free(tmpvm);
}
curcp = userp;
}
ki = userp->p_aioinfo;
1997-11-30 04:36:31 +00:00
/* Do the I/O function. */
aio_process(aiocbe);
1997-11-30 04:36:31 +00:00
mtx_lock(&aio_job_mtx);
/* Decrement the active job count. */
ki->kaio_active_count--;
mtx_unlock(&aio_job_mtx);
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
AIO_UNLOCK(ki);
2006-01-22 05:59:27 +00:00
mtx_lock(&aio_job_mtx);
}
/*
* Disconnect from user address space.
*/
if (curcp != mycp) {
2006-01-22 05:59:27 +00:00
mtx_unlock(&aio_job_mtx);
/* Get the user address space to disconnect from. */
tmpvm = mycp->p_vmspace;
2004-08-13 17:43:53 +00:00
/* Get original address space for daemon. */
mycp->p_vmspace = myvm;
2004-08-13 17:43:53 +00:00
/* Activate the daemon's address space. */
pmap_activate(FIRST_THREAD_IN_PROC(mycp));
#ifdef DIAGNOSTIC
if (tmpvm == myvm) {
printf("AIOD: vmspace problem -- %d\n",
mycp->p_pid);
}
Fix error handling for VCHR type I/O. Also, fix another spl problem, and remove alot of overly verbose debugging statements. ioproclist { int aioprocflags; /* AIO proc flags */ TAILQ_ENTRY(aioproclist) list; /* List of processes */ struct proc *aioproc; /* The AIO thread */ TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ }; /* * data-structure for lio signal management */ struct aio_liojob { int lioj_flags; int lioj_buffer_count; int lioj_buffer_finished_count; int lioj_queue_count; int lioj_queue_finished_count; struct sigevent lioj_signal; /* signal on all I/O done */ TAILQ_ENTRY (aio_liojob) lioj_list; struct kaioinfo *lioj_ki; }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ /* * per process aio data structure */ struct kaioinfo { int kaio_flags; /* per process kaio flags */ int kaio_maxactive_count; /* maximum number of AIOs */ int kaio_active_count; /* number of currently used AIOs */ int kaio_qallowed_count; /* maxiumu size of AIO queue */ int kaio_queue_count; /* size of AIO queue */ int kaio_ballowed_count; /* maximum number of buffers */ int kaio_queue_finished_count; /* number of daemon jobs finished */ int kaio_buffer_count; /* number of physio buffers */ int kaio_buffer_finished_count; /* count of I/O done */ struct proc *kaio_p; /* process that uses this kaio block */ TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ }; #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc; TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ 
TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ static void aio_init_aioinfo(struct proc *p) ; static void aio_onceonly(void *) ; static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(void) ; static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ; static void aio_physwakeup(struct buf *bp); static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void aio_daemon(void *uproc); SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); static vm_zone_t kaio_zone=0, aiop_zone=0, aiocb_zone=0, aiol_zone=0, aiolio_zone=0; /* * Single AIOD vmspace shared amongst all of them */ static struct vmspace *aiovmspace = NULL; /* * Startup initialization */ void aio_onceonly(void *na) { TAILQ_INIT(&aio_freeproc); TAILQ_INIT(&aio_activeproc); TAILQ_INIT(&aio_jobs); TAILQ_INIT(&aio_bufjobs); TAILQ_INIT(&aio_freejobs); kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; } /* * Init the per-process aioinfo structure. * The aioinfo limits are set per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; if (p->p_aioinfo == NULL) { ki = zalloc(kaio_zone); p->p_aioinfo = ki
1997-12-01 07:01:45 +00:00
#endif
/* Remove our vmspace reference. */
vmspace_free(tmpvm);
2004-08-13 17:43:53 +00:00
curcp = mycp;
2006-01-22 05:59:27 +00:00
mtx_lock(&aio_job_mtx);
/*
* We have to restart to avoid race, we only sleep if
* no job can be selected, that should be
* curcp == mycp.
*/
continue;
}
2006-01-22 05:59:27 +00:00
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
aiop->aiothreadflags |= AIOP_FREE;
/*
* If daemon is inactive for a long time, allow it to exit,
* thereby freeing resources.
*/
2006-01-22 05:59:27 +00:00
if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
aiod_lifetime)) {
if (TAILQ_EMPTY(&aio_jobs)) {
if ((aiop->aiothreadflags & AIOP_FREE) &&
(num_aio_procs > target_aio_procs)) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
1997-11-30 04:36:31 +00:00
num_aio_procs--;
2006-01-22 05:59:27 +00:00
mtx_unlock(&aio_job_mtx);
uma_zfree(aiop_zone, aiop);
free_unr(aiod_unr, id);
#ifdef DIAGNOSTIC
if (mycp->p_vmspace->vm_refcnt <= 1) {
printf("AIOD: bad vm refcnt for"
" exiting daemon: %d\n",
mycp->p_vmspace->vm_refcnt);
}
Fix error handling for VCHR type I/O. Also, fix another spl problem, and remove alot of overly verbose debugging statements. ioproclist { int aioprocflags; /* AIO proc flags */ TAILQ_ENTRY(aioproclist) list; /* List of processes */ struct proc *aioproc; /* The AIO thread */ TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ }; /* * data-structure for lio signal management */ struct aio_liojob { int lioj_flags; int lioj_buffer_count; int lioj_buffer_finished_count; int lioj_queue_count; int lioj_queue_finished_count; struct sigevent lioj_signal; /* signal on all I/O done */ TAILQ_ENTRY (aio_liojob) lioj_list; struct kaioinfo *lioj_ki; }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ /* * per process aio data structure */ struct kaioinfo { int kaio_flags; /* per process kaio flags */ int kaio_maxactive_count; /* maximum number of AIOs */ int kaio_active_count; /* number of currently used AIOs */ int kaio_qallowed_count; /* maxiumu size of AIO queue */ int kaio_queue_count; /* size of AIO queue */ int kaio_ballowed_count; /* maximum number of buffers */ int kaio_queue_finished_count; /* number of daemon jobs finished */ int kaio_buffer_count; /* number of physio buffers */ int kaio_buffer_finished_count; /* count of I/O done */ struct proc *kaio_p; /* process that uses this kaio block */ TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ }; #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc; TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ 
TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ static void aio_init_aioinfo(struct proc *p) ; static void aio_onceonly(void *) ; static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(void) ; static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ; static void aio_physwakeup(struct buf *bp); static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void aio_daemon(void *uproc); SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); static vm_zone_t kaio_zone=0, aiop_zone=0, aiocb_zone=0, aiol_zone=0, aiolio_zone=0; /* * Single AIOD vmspace shared amongst all of them */ static struct vmspace *aiovmspace = NULL; /* * Startup initialization */ void aio_onceonly(void *na) { TAILQ_INIT(&aio_freeproc); TAILQ_INIT(&aio_activeproc); TAILQ_INIT(&aio_jobs); TAILQ_INIT(&aio_bufjobs); TAILQ_INIT(&aio_freejobs); kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; } /* * Init the per-process aioinfo structure. * The aioinfo limits are set per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; if (p->p_aioinfo == NULL) { ki = zalloc(kaio_zone); p->p_aioinfo = ki
1997-12-01 07:01:45 +00:00
#endif
kproc_exit(0);
}
}
}
}
2006-01-22 05:59:27 +00:00
mtx_unlock(&aio_job_mtx);
panic("shouldn't be here\n");
}
/*
2006-01-22 05:59:27 +00:00
* Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
* AIO daemon modifies its environment itself.
*/
static int
2006-01-22 05:59:27 +00:00
aio_newproc(int *start)
{
int error;
struct proc *p;
2006-01-22 05:59:27 +00:00
int id;
2006-01-22 05:59:27 +00:00
id = alloc_unr(aiod_unr);
error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
2006-01-22 05:59:27 +00:00
RFNOWAIT, 0, "aiod%d", id);
if (error == 0) {
/*
* Wait until daemon is started.
*/
sema_wait(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
num_aio_procs++;
if (start != NULL)
2006-01-23 23:46:30 +00:00
(*start)--;
2006-01-22 05:59:27 +00:00
mtx_unlock(&aio_job_mtx);
} else {
free_unr(aiod_unr, id);
}
return (error);
}
/*
* Try the high-performance, low-overhead physio method for eligible
* VCHR devices. This method doesn't use an aio helper thread, and
2004-08-13 17:43:53 +00:00
* thus has very low overhead.
*
* Assumes that the caller, aio_aqueue(), has incremented the file
* structure's reference count, preventing its deallocation for the
2004-08-13 17:43:53 +00:00
* duration of this call.
*/
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
struct aiocb *cb;
struct file *fp;
struct buf *bp;
struct vnode *vp;
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
struct aioliojob *lj;
int error;
1997-11-30 04:36:31 +00:00
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
2004-08-13 17:43:53 +00:00
if (fp->f_type != DTYPE_VNODE)
return (-1);
vp = fp->f_vnode;
Fix error handling for VCHR type I/O. Also, fix another spl problem, and remove alot of overly verbose debugging statements. ioproclist { int aioprocflags; /* AIO proc flags */ TAILQ_ENTRY(aioproclist) list; /* List of processes */ struct proc *aioproc; /* The AIO thread */ TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ }; /* * data-structure for lio signal management */ struct aio_liojob { int lioj_flags; int lioj_buffer_count; int lioj_buffer_finished_count; int lioj_queue_count; int lioj_queue_finished_count; struct sigevent lioj_signal; /* signal on all I/O done */ TAILQ_ENTRY (aio_liojob) lioj_list; struct kaioinfo *lioj_ki; }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ /* * per process aio data structure */ struct kaioinfo { int kaio_flags; /* per process kaio flags */ int kaio_maxactive_count; /* maximum number of AIOs */ int kaio_active_count; /* number of currently used AIOs */ int kaio_qallowed_count; /* maxiumu size of AIO queue */ int kaio_queue_count; /* size of AIO queue */ int kaio_ballowed_count; /* maximum number of buffers */ int kaio_queue_finished_count; /* number of daemon jobs finished */ int kaio_buffer_count; /* number of physio buffers */ int kaio_buffer_finished_count; /* count of I/O done */ struct proc *kaio_p; /* process that uses this kaio block */ TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ }; #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc; TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ 
TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ static void aio_init_aioinfo(struct proc *p) ; static void aio_onceonly(void *) ; static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(void) ; static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ; static void aio_physwakeup(struct buf *bp); static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void aio_daemon(void *uproc); SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); static vm_zone_t kaio_zone=0, aiop_zone=0, aiocb_zone=0, aiol_zone=0, aiolio_zone=0; /* * Single AIOD vmspace shared amongst all of them */ static struct vmspace *aiovmspace = NULL; /* * Startup initialization */ void aio_onceonly(void *na) { TAILQ_INIT(&aio_freeproc); TAILQ_INIT(&aio_activeproc); TAILQ_INIT(&aio_jobs); TAILQ_INIT(&aio_bufjobs); TAILQ_INIT(&aio_freejobs); kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; } /* * Init the per-process aioinfo structure. * The aioinfo limits are set per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; if (p->p_aioinfo == NULL) { ki = zalloc(kaio_zone); p->p_aioinfo = ki
1997-12-01 07:01:45 +00:00
/*
* If its not a disk, we don't want to return a positive error.
* It causes the aio code to not fall through to try the thread
* way when you're talking to a regular file.
*/
if (!vn_isdisk(vp, &error)) {
if (error == ENOTBLK)
return (-1);
else
return (error);
}
if (vp->v_bufobj.bo_bsize == 0)
return (-1);
if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
return (-1);
if (cb->aio_nbytes >
MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
return (-1);
ki = p->p_aioinfo;
2004-08-13 17:43:53 +00:00
if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
return (-1);
/* Create and build a buffer header for a transfer. */
bp = (struct buf *)getpbuf(NULL);
BUF_KERNPROC(bp);
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
ki->kaio_count++;
ki->kaio_buffer_count++;
lj = aiocbe->lio;
if (lj)
lj->lioj_count++;
AIO_UNLOCK(ki);
2006-01-22 05:59:27 +00:00
/*
* Get a copy of the kva from the physical buffer.
*/
error = 0;
bp->b_bcount = cb->aio_nbytes;
bp->b_bufsize = cb->aio_nbytes;
bp->b_iodone = aio_physwakeup;
bp->b_saveaddr = bp->b_data;
bp->b_data = (void *)(uintptr_t)cb->aio_buf;
bp->b_offset = cb->aio_offset;
bp->b_iooffset = cb->aio_offset;
bp->b_blkno = btodb(cb->aio_offset);
bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
/*
* Bring buffer into kernel space.
*/
if (vmapbuf(bp, 1) < 0) {
error = EFAULT;
goto doerror;
}
AIO_LOCK(ki);
aiocbe->bp = bp;
bp->b_caller1 = (void *)aiocbe;
1997-11-30 04:36:31 +00:00
TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
2006-01-22 05:59:27 +00:00
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
aiocbe->jobstate = JOBST_JOBQBUF;
1997-11-30 04:36:31 +00:00
cb->_aiocb_private.status = cb->aio_nbytes;
AIO_UNLOCK(ki);
2006-01-22 05:59:27 +00:00
atomic_add_int(&num_queue_count, 1);
atomic_add_int(&num_buf_aio, 1);
bp->b_error = 0;
2006-01-22 05:59:27 +00:00
TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
2004-08-13 17:43:53 +00:00
/* Perform transfer. */
dev_strategy(vp->v_rdev, bp);
return (0);
doerror:
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
ki->kaio_count--;
ki->kaio_buffer_count--;
if (lj)
2006-01-22 05:59:27 +00:00
lj->lioj_count--;
1997-11-30 04:36:31 +00:00
aiocbe->bp = NULL;
AIO_UNLOCK(ki);
relpbuf(bp, NULL);
return (error);
}
/*
* Wake up aio requests that may be serviceable now.
*/
static void
aio_swake_cb(struct socket *so, struct sockbuf *sb)
{
2006-01-22 05:59:27 +00:00
struct aiocblist *cb, *cbn;
2006-03-23 08:46:42 +00:00
int opcode;
Rework socket upcalls to close some races with setup/teardown of upcalls. - Each socket upcall is now invoked with the appropriate socket buffer locked. It is not permissible to call soisconnected() with this lock held; however, so socket upcalls now return an integer value. The two possible values are SU_OK and SU_ISCONNECTED. If an upcall returns SU_ISCONNECTED, then the soisconnected() will be invoked on the socket after the socket buffer lock is dropped. - A new API is provided for setting and clearing socket upcalls. The API consists of soupcall_set() and soupcall_clear(). - To simplify locking, each socket buffer now has a separate upcall. - When a socket upcall returns SU_ISCONNECTED, the upcall is cleared from the receive socket buffer automatically. Note that a SO_SND upcall should never return SU_ISCONNECTED. - All this means that accept filters should now return SU_ISCONNECTED instead of calling soisconnected() directly. They also no longer need to explicitly clear the upcall on the new socket. - The HTTP accept filter still uses soupcall_set() to manage its internal state machine, but other accept filters no longer have any explicit knowlege of socket upcall internals aside from their return value. - The various RPC client upcalls currently drop the socket buffer lock while invoking soreceive() as a temporary band-aid. The plan for the future is to add a new flag to allow soreceive() to be called with the socket buffer locked. - The AIO callback for socket I/O is now also invoked with the socket buffer locked. Previously sowakeup() would drop the socket buffer lock only to call aio_swake() which immediately re-acquired the socket buffer lock for the duration of the function call. Discussed with: rwatson, rmacklem
2009-06-01 21:17:03 +00:00
SOCKBUF_LOCK_ASSERT(sb);
if (sb == &so->so_snd)
opcode = LIO_WRITE;
else
opcode = LIO_READ;
sb->sb_flags &= ~SB_AIO;
mtx_lock(&aio_job_mtx);
TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
if (opcode == cb->uaiocb.aio_lio_opcode) {
2006-01-22 09:39:59 +00:00
if (cb->jobstate != JOBST_JOBQSOCK)
2006-01-22 05:59:27 +00:00
panic("invalid queue value");
/* XXX
* We don't have actual sockets backend yet,
* so we simply move the requests to the generic
* file I/O backend.
2006-01-22 05:59:27 +00:00
*/
TAILQ_REMOVE(&so->so_aiojobq, cb, list);
2006-01-22 05:59:27 +00:00
TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
2006-03-23 08:46:42 +00:00
aio_kick_nowait(cb->userproc);
}
}
mtx_unlock(&aio_job_mtx);
}
/*
 * Translate an old-layout sigevent (struct osigevent) into the current
 * struct sigevent.
 *
 * sigev_notify is copied first; after that only the fields belonging to
 * that notification type are copied.  Returns 0 on success, or EINVAL when
 * the notification type is not one of SIGEV_NONE, SIGEV_SIGNAL or
 * SIGEV_KEVENT (nsig->sigev_notify has already been written by then).
 *
 * NOTE(review): aiocb_copyin_old_sigevent() derives both osig and nsig from
 * the same kernel buffer, so the two structures may overlap in memory.
 * Keep the order of the assignments below unless that is confirmed not to
 * matter.
 */
static int
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
{
/*
 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
 * supported by AIO with the old sigevent structure.
 */
nsig->sigev_notify = osig->sigev_notify;
switch (nsig->sigev_notify) {
case SIGEV_NONE:
/* Nothing besides the notification type to carry over. */
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
/* kqueue descriptor plus the user cookie delivered with the event. */
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
break;
default:
return (EINVAL);
}
return (0);
}
/*
 * Copy in a control block that uses the old sigevent layout.
 *
 * The kernel copy is zeroed, sizeof(struct oaiocb) bytes are copied from
 * user space into it, and the embedded old sigevent is then converted in
 * place with convert_old_sigevent().  Returns the copyin() error, or the
 * result of the conversion.
 */
static int
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
	struct oaiocb *old;
	int rv;

	bzero(kjob, sizeof(*kjob));
	rv = copyin(ujob, kjob, sizeof(struct oaiocb));
	if (rv != 0)
		return (rv);

	/* The old-layout image now occupies the start of the kernel copy. */
	old = (struct oaiocb *)kjob;
	rv = convert_old_sigevent(&old->aio_sigevent, &kjob->aio_sigevent);
	return (rv);
}
/*
 * Copy a native control block from user space (ujob) into the kernel
 * copy (kjob).  Returns the result of copyin().
 */
static int
aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
	int rv;

	rv = copyin(ujob, kjob, sizeof(*kjob));
	return (rv);
}
/*
 * Read the private status word of the user-space control block `ujob'
 * and return what fuword() yields for it.
 */
static long
aiocb_fetch_status(struct aiocb *ujob)
{
	long status;

	status = fuword(&ujob->_aiocb_private.status);
	return (status);
}
static long
aiocb_fetch_error(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.error));
}
/*
 * Write `status' into the private status word of the user-space control
 * block `ujob'.  Returns the result of suword().
 */
static int
aiocb_store_status(struct aiocb *ujob, long status)
{
	int rv;

	rv = suword(&ujob->_aiocb_private.status, status);
	return (rv);
}
static int
aiocb_store_error(struct aiocb *ujob, long error)
{
return (suword(&ujob->_aiocb_private.error, error));
}
/*
 * Write the job reference `jobref' into the private kernelinfo word of the
 * user-space control block `ujob'.  Returns the result of suword().
 */
static int
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
{
	int rv;

	rv = suword(&ujob->_aiocb_private.kernelinfo, jobref);
	return (rv);
}
/*
 * Store the user-space control block pointer `ujob' into the user-space
 * pointer slot `ujobp'.  Returns the result of suword().
 */
static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
	long word;

	/* The pointer is written out as one machine word. */
	word = (long)ujob;
	return (suword(ujobp, word));
}
/*
 * Accessor table for control blocks in the native layout: the block is
 * copied in with aiocb_copyin() and the user-visible status, error,
 * kernelinfo and pointer words are read/written through the fuword()/
 * suword() based helpers defined in this file.
 */
static struct aiocb_ops aiocb_ops = {
.copyin = aiocb_copyin,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
/*
 * Accessor table for control blocks that carry the old sigevent layout.
 * Identical to the native table except for .copyin, which goes through
 * aiocb_copyin_old_sigevent() so the sigevent is converted on the way in.
 */
static struct aiocb_ops aiocb_ops_osigevent = {
.copyin = aiocb_copyin_old_sigevent,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
/*
* Queue a new AIO request. Choosing either the threaded or direct physio VCHR
* technique is done in this code.
*/
MFP4 (with some minor changes): Implement the linux_io_* syscalls (AIO). They are only enabled if the native AIO code is available (either compiled in to the kernel or as a module) at the time the functions are used. If the AIO stuff is not available there will be a ENOSYS. From the submitter: ---snip--- DESIGN NOTES: 1. Linux permits a process to own multiple AIO queues (distinguished by "context"), but FreeBSD creates only one single AIO queue per process. My code maintains a request queue (STAILQ of queue(3)) per "context", and throws all AIO requests of all contexts owned by a process into the single FreeBSD per-process AIO queue. When the process calls io_destroy(2), io_getevents(2), io_submit(2) and io_cancel(2), my code can pick out requests owned by the specified context from the single FreeBSD per-process AIO queue according to the per-context request queues maintained by my code. 2. The request queue maintained by my code stores contrast information between Linux IO control blocks (struct linux_iocb) and FreeBSD IO control blocks (struct aiocb). FreeBSD IO control block actually exists in userland memory space, required by FreeBSD native aio_XXXXXX(2). 3. It is quite troubling that the function io_getevents() of libaio-0.3.105 needs to use Linux-specific "struct aio_ring", which is a partial mirror of context in user space. I would rather take the address of context in kernel as the context ID, but the io_getevents() of libaio forces me to take the address of the "ring" in user space as the context ID. To my surprise, one comment line in the file "io_getevents.c" of libaio-0.3.105 reads: Ben will hate me for this REFERENCE: 1. Linux kernel source code: http://www.kernel.org/pub/linux/kernel/v2.6/ (include/linux/aio_abi.h, fs/aio.c) 2. Linux manual pages: http://www.kernel.org/pub/linux/docs/manpages/ (io_setup(2), io_destroy(2), io_getevents(2), io_submit(2), io_cancel(2)) 3. 
Linux Scalability Effort: http://lse.sourceforge.net/io/aio.html The design notes: http://lse.sourceforge.net/io/aionotes.txt 4. The package libaio, both source and binary: http://rpmfind.net/linux/rpm2html/search.php?query=libaio Simple transparent interface to Linux AIO system calls. 5. Libaio-oracle: http://oss.oracle.com/projects/libaio-oracle/ POSIX AIO implementation based on Linux AIO system calls (depending on libaio). ---snip--- Submitted by: Li, Xiao <intron@intron.ac>
2006-10-15 14:22:14 +00:00
int
aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct file *fp;
struct socket *so;
struct aiocblist *aiocbe, *cb;
struct kaioinfo *ki;
struct kevent kev;
struct sockbuf *sb;
2006-01-22 05:59:27 +00:00
int opcode;
int error;
int fd, kqfd;
2006-01-22 05:59:27 +00:00
int jid;
u_short evflags;
2006-01-22 05:59:27 +00:00
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
2006-01-22 05:59:27 +00:00
ki = p->p_aioinfo;
ops->store_status(job, -1);
ops->store_error(job, 0);
ops->store_kernelinfo(job, -1);
if (num_queue_count >= max_queue_count ||
ki->kaio_count >= ki->kaio_qallowed_count) {
ops->store_error(job, EAGAIN);
return (EAGAIN);
}
2006-01-22 05:59:27 +00:00
aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
aiocbe->inputcharge = 0;
aiocbe->outputcharge = 0;
knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
error = ops->copyin(job, &aiocbe->uaiocb);
if (error) {
ops->store_error(job, error);
uma_zfree(aiocb_zone, aiocbe);
return (error);
}
/* XXX: aio_nbytes is later casted to signed types. */
if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
uma_zfree(aiocb_zone, aiocbe);
return (EINVAL);
}
if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
ops->store_error(job, EINVAL);
uma_zfree(aiocb_zone, aiocbe);
return (EINVAL);
}
if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
uma_zfree(aiocb_zone, aiocbe);
return (EINVAL);
}
ksiginfo_init(&aiocbe->ksi);
/* Save userspace address of the job info. */
Fix error handling for VCHR type I/O. Also, fix another spl problem, and remove alot of overly verbose debugging statements. ioproclist { int aioprocflags; /* AIO proc flags */ TAILQ_ENTRY(aioproclist) list; /* List of processes */ struct proc *aioproc; /* The AIO thread */ TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ }; /* * data-structure for lio signal management */ struct aio_liojob { int lioj_flags; int lioj_buffer_count; int lioj_buffer_finished_count; int lioj_queue_count; int lioj_queue_finished_count; struct sigevent lioj_signal; /* signal on all I/O done */ TAILQ_ENTRY (aio_liojob) lioj_list; struct kaioinfo *lioj_ki; }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ /* * per process aio data structure */ struct kaioinfo { int kaio_flags; /* per process kaio flags */ int kaio_maxactive_count; /* maximum number of AIOs */ int kaio_active_count; /* number of currently used AIOs */ int kaio_qallowed_count; /* maxiumu size of AIO queue */ int kaio_queue_count; /* size of AIO queue */ int kaio_ballowed_count; /* maximum number of buffers */ int kaio_queue_finished_count; /* number of daemon jobs finished */ int kaio_buffer_count; /* number of physio buffers */ int kaio_buffer_finished_count; /* count of I/O done */ struct proc *kaio_p; /* process that uses this kaio block */ TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ }; #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc; TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ 
TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ static void aio_init_aioinfo(struct proc *p) ; static void aio_onceonly(void *) ; static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(void) ; static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ; static void aio_physwakeup(struct buf *bp); static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void aio_daemon(void *uproc); SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); static vm_zone_t kaio_zone=0, aiop_zone=0, aiocb_zone=0, aiol_zone=0, aiolio_zone=0; /* * Single AIOD vmspace shared amongst all of them */ static struct vmspace *aiovmspace = NULL; /* * Startup initialization */ void aio_onceonly(void *na) { TAILQ_INIT(&aio_freeproc); TAILQ_INIT(&aio_activeproc); TAILQ_INIT(&aio_jobs); TAILQ_INIT(&aio_bufjobs); TAILQ_INIT(&aio_freejobs); kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; } /* * Init the per-process aioinfo structure. * The aioinfo limits are set per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; if (p->p_aioinfo == NULL) { ki = zalloc(kaio_zone); p->p_aioinfo = ki
1997-12-01 07:01:45 +00:00
aiocbe->uuaiocb = job;
/* Get the opcode. */
if (type != LIO_NOP)
aiocbe->uaiocb.aio_lio_opcode = type;
opcode = aiocbe->uaiocb.aio_lio_opcode;
/*
* Validate the opcode and fetch the file object for the specified
* file descriptor.
*
* XXXRW: Moved the opcode validation up here so that we don't
* retrieve a file descriptor without knowing what the capabiltity
* should be.
*/
fd = aiocbe->uaiocb.aio_fildes;
switch (opcode) {
case LIO_WRITE:
Merge Capsicum overhaul: - Capability is no longer separate descriptor type. Now every descriptor has set of its own capability rights. - The cap_new(2) system call is left, but it is no longer documented and should not be used in new code. - The new syscall cap_rights_limit(2) should be used instead of cap_new(2), which limits capability rights of the given descriptor without creating a new one. - The cap_getrights(2) syscall is renamed to cap_rights_get(2). - If CAP_IOCTL capability right is present we can further reduce allowed ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed ioctls can be retrived with cap_ioctls_get(2) syscall. - If CAP_FCNTL capability right is present we can further reduce fcntls that can be used with the new cap_fcntls_limit(2) syscall and retrive them with cap_fcntls_get(2). - To support ioctl and fcntl white-listing the filedesc structure was heavly modified. - The audit subsystem, kdump and procstat tools were updated to recognize new syscalls. - Capability rights were revised and eventhough I tried hard to provide backward API and ABI compatibility there are some incompatible changes that are described in detail below: CAP_CREATE old behaviour: - Allow for openat(2)+O_CREAT. - Allow for linkat(2). - Allow for symlinkat(2). CAP_CREATE new behaviour: - Allow for openat(2)+O_CREAT. Added CAP_LINKAT: - Allow for linkat(2). ABI: Reuses CAP_RMDIR bit. - Allow to be target for renameat(2). Added CAP_SYMLINKAT: - Allow for symlinkat(2). Removed CAP_DELETE. Old behaviour: - Allow for unlinkat(2) when removing non-directory object. - Allow to be source for renameat(2). Removed CAP_RMDIR. Old behaviour: - Allow for unlinkat(2) when removing directory. Added CAP_RENAMEAT: - Required for source directory for the renameat(2) syscall. Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR): - Allow for unlinkat(2) on any object. - Required if target of renameat(2) exists and will be removed by this call. 
Removed CAP_MAPEXEC. CAP_MMAP old behaviour: - Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and PROT_WRITE. CAP_MMAP new behaviour: - Allow for mmap(2)+PROT_NONE. Added CAP_MMAP_R: - Allow for mmap(PROT_READ). Added CAP_MMAP_W: - Allow for mmap(PROT_WRITE). Added CAP_MMAP_X: - Allow for mmap(PROT_EXEC). Added CAP_MMAP_RW: - Allow for mmap(PROT_READ | PROT_WRITE). Added CAP_MMAP_RX: - Allow for mmap(PROT_READ | PROT_EXEC). Added CAP_MMAP_WX: - Allow for mmap(PROT_WRITE | PROT_EXEC). Added CAP_MMAP_RWX: - Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC). Renamed CAP_MKDIR to CAP_MKDIRAT. Renamed CAP_MKFIFO to CAP_MKFIFOAT. Renamed CAP_MKNODE to CAP_MKNODEAT. CAP_READ old behaviour: - Allow pread(2). - Disallow read(2), readv(2) (if there is no CAP_SEEK). CAP_READ new behaviour: - Allow read(2), readv(2). - Disallow pread(2) (CAP_SEEK was also required). CAP_WRITE old behaviour: - Allow pwrite(2). - Disallow write(2), writev(2) (if there is no CAP_SEEK). CAP_WRITE new behaviour: - Allow write(2), writev(2). - Disallow pwrite(2) (CAP_SEEK was also required). 
Added convinient defines: #define CAP_PREAD (CAP_SEEK | CAP_READ) #define CAP_PWRITE (CAP_SEEK | CAP_WRITE) #define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ) #define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE) #define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL) #define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W) #define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X) #define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X) #define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X) #define CAP_RECV CAP_READ #define CAP_SEND CAP_WRITE #define CAP_SOCK_CLIENT \ (CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \ CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN) #define CAP_SOCK_SERVER \ (CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \ CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \ CAP_SETSOCKOPT | CAP_SHUTDOWN) Added defines for backward API compatibility: #define CAP_MAPEXEC CAP_MMAP_X #define CAP_DELETE CAP_UNLINKAT #define CAP_MKDIR CAP_MKDIRAT #define CAP_RMDIR CAP_UNLINKAT #define CAP_MKFIFO CAP_MKFIFOAT #define CAP_MKNOD CAP_MKNODAT #define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER) Sponsored by: The FreeBSD Foundation Reviewed by: Christoph Mallon <christoph.mallon@gmx.de> Many aspects discussed with: rwatson, benl, jonathan ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
error = fget_write(td, fd, CAP_PWRITE, &fp);
break;
case LIO_READ:
Merge Capsicum overhaul: - Capability is no longer separate descriptor type. Now every descriptor has set of its own capability rights. - The cap_new(2) system call is left, but it is no longer documented and should not be used in new code. - The new syscall cap_rights_limit(2) should be used instead of cap_new(2), which limits capability rights of the given descriptor without creating a new one. - The cap_getrights(2) syscall is renamed to cap_rights_get(2). - If CAP_IOCTL capability right is present we can further reduce allowed ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed ioctls can be retrived with cap_ioctls_get(2) syscall. - If CAP_FCNTL capability right is present we can further reduce fcntls that can be used with the new cap_fcntls_limit(2) syscall and retrive them with cap_fcntls_get(2). - To support ioctl and fcntl white-listing the filedesc structure was heavly modified. - The audit subsystem, kdump and procstat tools were updated to recognize new syscalls. - Capability rights were revised and eventhough I tried hard to provide backward API and ABI compatibility there are some incompatible changes that are described in detail below: CAP_CREATE old behaviour: - Allow for openat(2)+O_CREAT. - Allow for linkat(2). - Allow for symlinkat(2). CAP_CREATE new behaviour: - Allow for openat(2)+O_CREAT. Added CAP_LINKAT: - Allow for linkat(2). ABI: Reuses CAP_RMDIR bit. - Allow to be target for renameat(2). Added CAP_SYMLINKAT: - Allow for symlinkat(2). Removed CAP_DELETE. Old behaviour: - Allow for unlinkat(2) when removing non-directory object. - Allow to be source for renameat(2). Removed CAP_RMDIR. Old behaviour: - Allow for unlinkat(2) when removing directory. Added CAP_RENAMEAT: - Required for source directory for the renameat(2) syscall. Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR): - Allow for unlinkat(2) on any object. - Required if target of renameat(2) exists and will be removed by this call. 
Removed CAP_MAPEXEC. CAP_MMAP old behaviour: - Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and PROT_WRITE. CAP_MMAP new behaviour: - Allow for mmap(2)+PROT_NONE. Added CAP_MMAP_R: - Allow for mmap(PROT_READ). Added CAP_MMAP_W: - Allow for mmap(PROT_WRITE). Added CAP_MMAP_X: - Allow for mmap(PROT_EXEC). Added CAP_MMAP_RW: - Allow for mmap(PROT_READ | PROT_WRITE). Added CAP_MMAP_RX: - Allow for mmap(PROT_READ | PROT_EXEC). Added CAP_MMAP_WX: - Allow for mmap(PROT_WRITE | PROT_EXEC). Added CAP_MMAP_RWX: - Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC). Renamed CAP_MKDIR to CAP_MKDIRAT. Renamed CAP_MKFIFO to CAP_MKFIFOAT. Renamed CAP_MKNODE to CAP_MKNODEAT. CAP_READ old behaviour: - Allow pread(2). - Disallow read(2), readv(2) (if there is no CAP_SEEK). CAP_READ new behaviour: - Allow read(2), readv(2). - Disallow pread(2) (CAP_SEEK was also required). CAP_WRITE old behaviour: - Allow pwrite(2). - Disallow write(2), writev(2) (if there is no CAP_SEEK). CAP_WRITE new behaviour: - Allow write(2), writev(2). - Disallow pwrite(2) (CAP_SEEK was also required). 
Added convinient defines: #define CAP_PREAD (CAP_SEEK | CAP_READ) #define CAP_PWRITE (CAP_SEEK | CAP_WRITE) #define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ) #define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE) #define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL) #define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W) #define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X) #define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X) #define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X) #define CAP_RECV CAP_READ #define CAP_SEND CAP_WRITE #define CAP_SOCK_CLIENT \ (CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \ CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN) #define CAP_SOCK_SERVER \ (CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \ CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \ CAP_SETSOCKOPT | CAP_SHUTDOWN) Added defines for backward API compatibility: #define CAP_MAPEXEC CAP_MMAP_X #define CAP_DELETE CAP_UNLINKAT #define CAP_MKDIR CAP_MKDIRAT #define CAP_RMDIR CAP_UNLINKAT #define CAP_MKFIFO CAP_MKFIFOAT #define CAP_MKNOD CAP_MKNODAT #define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER) Sponsored by: The FreeBSD Foundation Reviewed by: Christoph Mallon <christoph.mallon@gmx.de> Many aspects discussed with: rwatson, benl, jonathan ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
error = fget_read(td, fd, CAP_PREAD, &fp);
break;
case LIO_SYNC:
error = fget(td, fd, CAP_FSYNC, &fp);
break;
case LIO_NOP:
Merge Capsicum overhaul: - Capability is no longer separate descriptor type. Now every descriptor has set of its own capability rights. - The cap_new(2) system call is left, but it is no longer documented and should not be used in new code. - The new syscall cap_rights_limit(2) should be used instead of cap_new(2), which limits capability rights of the given descriptor without creating a new one. - The cap_getrights(2) syscall is renamed to cap_rights_get(2). - If CAP_IOCTL capability right is present we can further reduce allowed ioctls list with the new cap_ioctls_limit(2) syscall. List of allowed ioctls can be retrived with cap_ioctls_get(2) syscall. - If CAP_FCNTL capability right is present we can further reduce fcntls that can be used with the new cap_fcntls_limit(2) syscall and retrive them with cap_fcntls_get(2). - To support ioctl and fcntl white-listing the filedesc structure was heavly modified. - The audit subsystem, kdump and procstat tools were updated to recognize new syscalls. - Capability rights were revised and eventhough I tried hard to provide backward API and ABI compatibility there are some incompatible changes that are described in detail below: CAP_CREATE old behaviour: - Allow for openat(2)+O_CREAT. - Allow for linkat(2). - Allow for symlinkat(2). CAP_CREATE new behaviour: - Allow for openat(2)+O_CREAT. Added CAP_LINKAT: - Allow for linkat(2). ABI: Reuses CAP_RMDIR bit. - Allow to be target for renameat(2). Added CAP_SYMLINKAT: - Allow for symlinkat(2). Removed CAP_DELETE. Old behaviour: - Allow for unlinkat(2) when removing non-directory object. - Allow to be source for renameat(2). Removed CAP_RMDIR. Old behaviour: - Allow for unlinkat(2) when removing directory. Added CAP_RENAMEAT: - Required for source directory for the renameat(2) syscall. Added CAP_UNLINKAT (effectively it replaces CAP_DELETE and CAP_RMDIR): - Allow for unlinkat(2) on any object. - Required if target of renameat(2) exists and will be removed by this call. 
Removed CAP_MAPEXEC. CAP_MMAP old behaviour: - Allow for mmap(2) with any combination of PROT_NONE, PROT_READ and PROT_WRITE. CAP_MMAP new behaviour: - Allow for mmap(2)+PROT_NONE. Added CAP_MMAP_R: - Allow for mmap(PROT_READ). Added CAP_MMAP_W: - Allow for mmap(PROT_WRITE). Added CAP_MMAP_X: - Allow for mmap(PROT_EXEC). Added CAP_MMAP_RW: - Allow for mmap(PROT_READ | PROT_WRITE). Added CAP_MMAP_RX: - Allow for mmap(PROT_READ | PROT_EXEC). Added CAP_MMAP_WX: - Allow for mmap(PROT_WRITE | PROT_EXEC). Added CAP_MMAP_RWX: - Allow for mmap(PROT_READ | PROT_WRITE | PROT_EXEC). Renamed CAP_MKDIR to CAP_MKDIRAT. Renamed CAP_MKFIFO to CAP_MKFIFOAT. Renamed CAP_MKNODE to CAP_MKNODEAT. CAP_READ old behaviour: - Allow pread(2). - Disallow read(2), readv(2) (if there is no CAP_SEEK). CAP_READ new behaviour: - Allow read(2), readv(2). - Disallow pread(2) (CAP_SEEK was also required). CAP_WRITE old behaviour: - Allow pwrite(2). - Disallow write(2), writev(2) (if there is no CAP_SEEK). CAP_WRITE new behaviour: - Allow write(2), writev(2). - Disallow pwrite(2) (CAP_SEEK was also required). 
Added convinient defines: #define CAP_PREAD (CAP_SEEK | CAP_READ) #define CAP_PWRITE (CAP_SEEK | CAP_WRITE) #define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ) #define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE) #define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000008ULL) #define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W) #define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X) #define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X) #define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X) #define CAP_RECV CAP_READ #define CAP_SEND CAP_WRITE #define CAP_SOCK_CLIENT \ (CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \ CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN) #define CAP_SOCK_SERVER \ (CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \ CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \ CAP_SETSOCKOPT | CAP_SHUTDOWN) Added defines for backward API compatibility: #define CAP_MAPEXEC CAP_MMAP_X #define CAP_DELETE CAP_UNLINKAT #define CAP_MKDIR CAP_MKDIRAT #define CAP_RMDIR CAP_UNLINKAT #define CAP_MKFIFO CAP_MKFIFOAT #define CAP_MKNOD CAP_MKNODAT #define CAP_SOCK_ALL (CAP_SOCK_CLIENT | CAP_SOCK_SERVER) Sponsored by: The FreeBSD Foundation Reviewed by: Christoph Mallon <christoph.mallon@gmx.de> Many aspects discussed with: rwatson, benl, jonathan ABI compatibility discussed with: kib
2013-03-02 00:53:12 +00:00
error = fget(td, fd, CAP_NONE, &fp);
break;
default:
error = EINVAL;
}
if (error) {
uma_zfree(aiocb_zone, aiocbe);
ops->store_error(job, error);
return (error);
}
2006-03-23 08:46:42 +00:00
if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
error = EINVAL;
goto aqueue_fail;
}
if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
error = EINVAL;
goto aqueue_fail;
}
2006-01-22 05:59:27 +00:00
2006-03-23 08:46:42 +00:00
aiocbe->fd_file = fp;
2006-01-22 05:59:27 +00:00
mtx_lock(&aio_job_mtx);
2006-03-23 08:46:42 +00:00
jid = jobrefid++;
aiocbe->seqno = jobseqno++;
2006-01-22 05:59:27 +00:00
mtx_unlock(&aio_job_mtx);
error = ops->store_kernelinfo(job, jid);
2006-01-22 05:59:27 +00:00
if (error) {
error = EINVAL;
goto aqueue_fail;
}
aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
2004-08-13 17:43:53 +00:00
if (opcode == LIO_NOP) {
fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
return (0);
}
if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
goto no_kqueue;
evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
error = EINVAL;
goto aqueue_fail;
}
kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
kev.ident = (uintptr_t)aiocbe->uuaiocb;
kev.filter = EVFILT_AIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
kev.data = (intptr_t)aiocbe;
2006-01-22 05:59:27 +00:00
kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
error = kqfd_register(kqfd, &kev, td, 1);
aqueue_fail:
if (error) {
fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
ops->store_error(job, error);
goto done;
}
no_kqueue:
ops->store_error(job, EINPROGRESS);
aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
aiocbe->userproc = p;
aiocbe->cred = crhold(td->td_ucred);
aiocbe->jobflags = 0;
1997-11-30 04:36:31 +00:00
aiocbe->lio = lj;
2006-03-23 08:46:42 +00:00
if (opcode == LIO_SYNC)
goto queueit;
if (fp->f_type == DTYPE_SOCKET) {
/*
* Alternate queueing for socket ops: Reach down into the
* descriptor to get the socket data. Then check to see if the
* socket is ready to be read or written (based on the requested
* operation).
*
* If it is not ready for io, then queue the aiocbe on the
* socket, and set the flags so we get a call when sbnotify()
* happens.
*
* Note if opcode is neither LIO_WRITE nor LIO_READ we lock
* and unlock the snd sockbuf for no reason.
*/
so = fp->f_data;
sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
SOCKBUF_LOCK(sb);
if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
LIO_WRITE) && (!sowriteable(so)))) {
sb->sb_flags |= SB_AIO;
mtx_lock(&aio_job_mtx);
TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
mtx_unlock(&aio_job_mtx);
2006-01-22 05:59:27 +00:00
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
2006-01-22 05:59:27 +00:00
aiocbe->jobstate = JOBST_JOBQSOCK;
ki->kaio_count++;
if (lj)
lj->lioj_count++;
AIO_UNLOCK(ki);
SOCKBUF_UNLOCK(sb);
2006-01-22 05:59:27 +00:00
atomic_add_int(&num_queue_count, 1);
error = 0;
goto done;
}
SOCKBUF_UNLOCK(sb);
}
if ((error = aio_qphysio(p, aiocbe)) == 0)
goto done;
2006-01-22 05:59:27 +00:00
#if 0
if (error > 0) {
aiocbe->uaiocb._aiocb_private.error = error;
ops->store_error(job, error);
goto done;
}
2006-01-22 05:59:27 +00:00
#endif
2006-03-23 08:46:42 +00:00
queueit:
/* No buffer for daemon I/O. */
1997-11-30 04:36:31 +00:00
aiocbe->bp = NULL;
2006-03-23 08:46:42 +00:00
atomic_add_int(&num_queue_count, 1);
1997-11-30 04:36:31 +00:00
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
ki->kaio_count++;
if (lj)
2006-01-22 05:59:27 +00:00
lj->lioj_count++;
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
2006-01-22 05:59:27 +00:00
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
2006-03-23 08:46:42 +00:00
if (opcode == LIO_SYNC) {
TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
if (cb->fd_file == aiocbe->fd_file &&
cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
cb->seqno < aiocbe->seqno) {
cb->jobflags |= AIOCBLIST_CHECKSYNC;
aiocbe->pending++;
}
}
TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
if (cb->fd_file == aiocbe->fd_file &&
cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
cb->seqno < aiocbe->seqno) {
cb->jobflags |= AIOCBLIST_CHECKSYNC;
aiocbe->pending++;
}
}
if (aiocbe->pending != 0) {
TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
aiocbe->jobstate = JOBST_JOBQSYNC;
AIO_UNLOCK(ki);
goto done;
}
2006-03-23 08:46:42 +00:00
}
mtx_lock(&aio_job_mtx);
TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
aiocbe->jobstate = JOBST_JOBQGLOBAL;
aio_kick_nowait(p);
mtx_unlock(&aio_job_mtx);
AIO_UNLOCK(ki);
2006-03-23 08:46:42 +00:00
error = 0;
done:
return (error);
}
2006-03-23 08:46:42 +00:00
static void
aio_kick_nowait(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aiothreadlist *aiop;
2006-03-23 08:46:42 +00:00
mtx_assert(&aio_job_mtx, MA_OWNED);
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
wakeup(aiop->aiothread);
} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
((ki->kaio_active_count + num_aio_resv_start) <
ki->kaio_maxactive_count)) {
2006-03-23 08:46:42 +00:00
taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
}
}
static int
2006-03-23 08:46:42 +00:00
aio_kick(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aiothreadlist *aiop;
int error, ret = 0;
2006-03-23 08:46:42 +00:00
mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
wakeup(aiop->aiothread);
} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
((ki->kaio_active_count + num_aio_resv_start) <
ki->kaio_maxactive_count)) {
num_aio_resv_start++;
2006-01-22 05:59:27 +00:00
mtx_unlock(&aio_job_mtx);
error = aio_newproc(&num_aio_resv_start);
mtx_lock(&aio_job_mtx);
if (error) {
num_aio_resv_start--;
goto retryproc;
2006-01-22 05:59:27 +00:00
}
} else {
ret = -1;
}
return (ret);
2006-03-23 08:46:42 +00:00
}
2006-01-22 05:59:27 +00:00
2006-03-23 08:46:42 +00:00
/*
 * Task-style callback (context, pending): context is the owning process.
 * Calls aio_kick() up to 'pending' times under aio_job_mtx and stops early
 * as soon as aio_kick() reports a non-zero result.
 */
static void
aio_kick_helper(void *context, int pending)
{
	struct proc *owner = context;

	mtx_lock(&aio_job_mtx);
	for (; pending > 0; pending--) {
		if (aio_kick(owner) != 0)
			break;
	}
	mtx_unlock(&aio_job_mtx);
}
/*
* Support the aio_return system call, as a side-effect, kernel resources are
* released.
*/
static int
kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
2006-01-22 05:59:27 +00:00
struct aiocblist *cb;
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
int status, error;
ki = p->p_aioinfo;
if (ki == NULL)
return (EINVAL);
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
if (cb->uuaiocb == uaiocb)
break;
1997-11-30 04:36:31 +00:00
}
if (cb != NULL) {
2006-01-22 05:59:27 +00:00
MPASS(cb->jobstate == JOBST_JOBFINISHED);
status = cb->uaiocb._aiocb_private.status;
error = cb->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
td->td_ru.ru_oublock += cb->outputcharge;
cb->outputcharge = 0;
} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
td->td_ru.ru_inblock += cb->inputcharge;
cb->inputcharge = 0;
}
aio_free_entry(cb);
AIO_UNLOCK(ki);
ops->store_error(uaiocb, error);
ops->store_status(uaiocb, status);
} else {
2006-01-22 05:59:27 +00:00
error = EINVAL;
AIO_UNLOCK(ki);
}
2006-01-22 05:59:27 +00:00
return (error);
}
/* aio_return system call: forwards to kern_aio_return() with aiocb_ops. */
int
sys_aio_return(struct thread *td, struct aio_return_args *uap)
{
	struct aiocb *ujob = uap->aiocbp;

	return (kern_aio_return(td, ujob, &aiocb_ops));
}
/*
 * Allow a process to wakeup when any of the I/O requests are completed.
 *
 * 'ujoblist' holds 'njoblist' user-space aiocb pointers that have already
 * been copied into kernel memory; 'ts' is an optional timeout.
 *
 * Returns EINVAL for a malformed timeout, EAGAIN when the process has no
 * aio state, and 0 when the list is empty, when one of the listed requests
 * has finished, or when none of them is tracked by the kernel any more.
 * Otherwise the msleep() error is returned, with ERESTART mapped to EINTR.
 */
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
struct timespec *ts)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
struct aiocblist *cb, *cbfirst;
int error, i, timo;
/* timo stays 0 when the caller supplied no timeout. */
timo = 0;
if (ts) {
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
ki = p->p_aioinfo;
if (ki == NULL)
return (EAGAIN);
if (njoblist == 0)
return (0);
AIO_LOCK(ki);
for (;;) {
2006-01-22 05:59:27 +00:00
cbfirst = NULL;
error = 0;
/*
 * Rescan every request of the process against the caller's list on
 * each pass; leave immediately (error == 0) once a match is finished.
 */
TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
for (i = 0; i < njoblist; i++) {
2006-01-22 05:59:27 +00:00
if (cb->uuaiocb == ujoblist[i]) {
if (cbfirst == NULL)
cbfirst = cb;
if (cb->jobstate == JOBST_JOBFINISHED)
goto RETURN;
}
}
}
2006-01-22 05:59:27 +00:00
/*
 * None of the listed requests is present in kaio_all, so there is
 * nothing left to wait for.
 */
if (cbfirst == NULL)
break;
/* Flag that a sleeper is waiting on &p->p_aioinfo before sleeping. */
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2006-01-22 05:59:27 +00:00
"aiospn", timo);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
2006-01-22 05:59:27 +00:00
RETURN:
AIO_UNLOCK(ki);
return (error);
}
int
sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
struct timespec ts, *tsp;
struct aiocb **ujoblist;
int error;
if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
return (error);
tsp = &ts;
} else
tsp = NULL;
ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
if (error == 0)
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2006-01-22 05:59:27 +00:00
uma_zfree(aiol_zone, ujoblist);
return (error);
}
/*
 * aio_cancel cancels any non-physio aio operations not currently in
 * progress.
 *
 * Only requests still sitting on the process's kaio_jobqueue are examined.
 * uap->aiocbp == NULL selects every queued request on uap->fd; otherwise
 * only the request with that user control block is considered.
 *
 * The syscall itself fails only when fget() fails.  The outcome is
 * reported in td_retval[0]: AIO_CANCELED, AIO_NOTCANCELED or AIO_ALLDONE.
 */
int
sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
struct aiocblist *cbe, *cbn;
struct file *fp;
struct socket *so;
2006-01-22 05:59:27 +00:00
int error;
int remove;
2006-01-22 05:59:27 +00:00
int cancelled = 0;
int notcancelled = 0;
struct vnode *vp;
/* Lookup file object. */
error = fget(td, uap->fd, 0, &fp);
if (error)
return (error);
2006-01-22 05:59:27 +00:00
/* No aio state: both counters stay 0 and AIO_ALLDONE is reported. */
ki = p->p_aioinfo;
if (ki == NULL)
goto done;
2004-08-13 17:43:53 +00:00
/* Requests on disk vnodes are never cancelled here. */
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
2006-01-22 05:59:27 +00:00
if (vn_isdisk(vp, &error)) {
fdrop(fp, td);
td->td_retval[0] = AIO_NOTCANCELED;
2004-08-13 17:43:53 +00:00
return (0);
}
}
AIO_LOCK(ki);
/* _SAFE iteration: the current entry may be unlinked below. */
TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
if ((uap->fd == cbe->uaiocb.aio_fildes) &&
2006-01-22 05:59:27 +00:00
((uap->aiocbp == NULL) ||
(uap->aiocbp == cbe->uuaiocb))) {
remove = 0;
2006-01-22 05:59:27 +00:00
/*
 * A request can only be pulled back while it is still parked on one
 * of the secondary queues (global, socket, or sync); any other state
 * counts as not cancellable.
 */
mtx_lock(&aio_job_mtx);
if (cbe->jobstate == JOBST_JOBQGLOBAL) {
TAILQ_REMOVE(&aio_jobs, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSOCK) {
MPASS(fp->f_type == DTYPE_SOCKET);
so = fp->f_data;
TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
remove = 1;
2006-03-23 08:46:42 +00:00
} else if (cbe->jobstate == JOBST_JOBQSYNC) {
TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
remove = 1;
}
mtx_unlock(&aio_job_mtx);
if (remove) {
2006-01-22 05:59:27 +00:00
/* Complete the request as cancelled: status -1, error ECANCELED. */
TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
cbe->uaiocb._aiocb_private.status = -1;
cbe->uaiocb._aiocb_private.error = ECANCELED;
2006-01-22 05:59:27 +00:00
aio_bio_done_notify(p, cbe, DONE_QUEUE);
cancelled++;
} else {
notcancelled++;
}
/* A specific control block was requested; stop after the first match. */
if (uap->aiocbp != NULL)
break;
}
}
AIO_UNLOCK(ki);
2006-01-22 05:59:27 +00:00
done:
fdrop(fp, td);
/*
 * Result precedence: a cancelled specific request wins; otherwise any
 * request that could not be cancelled yields AIO_NOTCANCELED, then any
 * cancelled request yields AIO_CANCELED, and no match at all yields
 * AIO_ALLDONE.
 */
if (uap->aiocbp != NULL) {
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
}
if (notcancelled) {
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
td->td_retval[0] = AIO_ALLDONE;
return (0);
}
/*
* aio_error is implemented in the kernel level for compatibility purposes
* only. For a user mode async implementation, it would be best to do it in
* a userland subroutine.
*/
static int
kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocblist *cb;
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
int status;
ki = p->p_aioinfo;
2006-01-22 05:59:27 +00:00
if (ki == NULL) {
td->td_retval[0] = EINVAL;
return (0);
1997-11-30 04:36:31 +00:00
}
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
if (cb->uuaiocb == aiocbp) {
2006-01-22 05:59:27 +00:00
if (cb->jobstate == JOBST_JOBFINISHED)
td->td_retval[0] =
cb->uaiocb._aiocb_private.error;
else
td->td_retval[0] = EINPROGRESS;
AIO_UNLOCK(ki);
return (0);
1997-11-30 04:36:31 +00:00
}
}
AIO_UNLOCK(ki);
1997-11-30 04:36:31 +00:00
/*
* Hack for failure of aio_aqueue.
*/
status = ops->fetch_status(aiocbp);
2006-01-22 05:59:27 +00:00
if (status == -1) {
td->td_retval[0] = ops->fetch_error(aiocbp);
2006-01-22 05:59:27 +00:00
return (0);
}
td->td_retval[0] = EINVAL;
return (0);
}
/* aio_error system call: forwards to kern_aio_error() with aiocb_ops. */
int
sys_aio_error(struct thread *td, struct aio_error_args *uap)
{
	struct aiocb *ujob = uap->aiocbp;

	return (kern_aio_error(td, ujob, &aiocb_ops));
}
/*
 * syscall - asynchronous read from a file (REALTIME), variant that uses
 * the aiocb_ops_osigevent accessors.
 */
int
sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb_ops_osigevent));
}
/* aio_read system call: queue a LIO_READ request outside of any lio job. */
int
sys_aio_read(struct thread *td, struct aio_read_args *uap)
{
	struct aiocb *ujob = uap->aiocbp;

	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb_ops));
}
/*
 * syscall - asynchronous write to a file (REALTIME), variant that uses
 * the aiocb_ops_osigevent accessors.
 */
int
sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb_ops_osigevent));
}
/* aio_write system call: queue a LIO_WRITE request outside of any lio job. */
int
sys_aio_write(struct thread *td, struct aio_write_args *uap)
{
	struct aiocb *ujob = uap->aiocbp;

	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb_ops));
}
/*
 * Back end of lio_listio: queue a batch of requests as one list job.
 *
 *  mode       LIO_WAIT (sleep until the batch has finished) or LIO_NOWAIT.
 *  uacb_list  user address of the request array; used only as the kevent
 *             identifier.
 *  acb_list   kernel copy of the array: 'nent' user aiocb pointers, NULL
 *             entries are skipped.
 *  sig        optional completion notification; honoured only for
 *             LIO_NOWAIT.
 *  ops        accessors matching the caller's aiocb layout.
 *
 * Returns EINVAL for a bad mode, count or notification, the
 * kqfd_register() error if kevent registration fails, EIO when at least
 * one request could not be queued, and otherwise the result of the
 * LIO_WAIT sleep (0 or an msleep() error with ERESTART mapped to EINTR).
 */
static int
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
struct aiocb **acb_list, int nent, struct sigevent *sig,
struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocb *iocb;
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
struct aioliojob *lj;
struct kevent kev;
2006-01-22 05:59:27 +00:00
int error;
int nerror;
int i;
if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
return (EINVAL);
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
/* Allocate and initialise the list job that ties the batch together. */
lj = uma_zalloc(aiolio_zone, M_WAITOK);
1997-11-30 04:36:31 +00:00
lj->lioj_flags = 0;
2006-01-22 05:59:27 +00:00
lj->lioj_count = 0;
lj->lioj_finished_count = 0;
knlist_init_mtx(&lj->klist, AIO_MTX(ki));
ksiginfo_init(&lj->lioj_ksi);
1997-11-30 04:36:31 +00:00
/*
* Setup signal.
1997-11-30 04:36:31 +00:00
*/
if (sig && (mode == LIO_NOWAIT)) {
bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
/* Assume only new style KEVENT */
kev.filter = EVFILT_LIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.ident = (uintptr_t)uacb_list; /* something unique */
kev.data = (intptr_t)lj;
2006-01-22 05:59:27 +00:00
/* pass user defined sigval data */
kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
error = kqfd_register(
lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
if (error) {
uma_zfree(aiolio_zone, lj);
return (error);
}
2006-01-22 05:59:27 +00:00
} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
;
} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
lj->lioj_flags |= LIOJ_SIGNAL;
} else {
/* Unknown notification type. */
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
2006-01-22 05:59:27 +00:00
}
AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2006-01-22 05:59:27 +00:00
/*
* Add extra aiocb count to avoid the lio to be freed
* by other threads doing aio_waitcomplete or aio_return,
* and prevent event from being sent until we have queued
* all tasks.
*/
lj->lioj_count = 1;
AIO_UNLOCK(ki);
2006-01-22 05:59:27 +00:00
/*
* Queue every non-NULL request of the list against this list job.
* Individual failures are only counted here and reported as EIO at
* the end.
*/
nerror = 0;
for (i = 0; i < nent; i++) {
iocb = acb_list[i];
if (iocb != NULL) {
error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
2006-01-22 05:59:27 +00:00
if (error != 0)
nerror++;
}
}
2006-01-22 05:59:27 +00:00
error = 0;
AIO_LOCK(ki);
if (mode == LIO_WAIT) {
2006-01-22 05:59:27 +00:00
/*
 * Sleep until every queued request has finished; the "- 1" discounts
 * the extra reference taken above.
 */
while (lj->lioj_count - 1 != lj->lioj_finished_count) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki),
2006-01-22 05:59:27 +00:00
PRIBIO | PCATCH, "aiospn", 0);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
} else {
/*
 * LIO_NOWAIT: if the whole batch has already finished, post the
 * kevent and/or signal from here.
 */
if (lj->lioj_count - 1 == lj->lioj_finished_count) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(p, &lj->lioj_signal,
&lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
}
2006-01-22 05:59:27 +00:00
/* Drop the extra reference; tear the list job down if it was the last. */
lj->lioj_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
2006-01-22 05:59:27 +00:00
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
2006-01-22 05:59:27 +00:00
uma_zfree(aiolio_zone, lj);
} else
AIO_UNLOCK(ki);
2006-01-22 05:59:27 +00:00
/* A queueing failure takes precedence over the wait result. */
if (nerror)
return (EIO);
return (error);
}
/* syscall - list directed I/O (REALTIME) */
int
sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent osig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
/* syscall - list directed I/O (REALTIME) */
int
sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig, sizeof(sig));
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
nent, sigp, &aiocb_ops);
free(acb_list, M_LIO);
return (error);
}
1997-11-30 04:36:31 +00:00
/*
2006-01-22 05:59:27 +00:00
* Called from interrupt thread for physio, we should return as fast
* as possible, so we schedule a biohelper task.
1997-11-30 04:36:31 +00:00
*/
static void
aio_physwakeup(struct buf *bp)
{
1997-11-30 04:36:31 +00:00
struct aiocblist *aiocbe;
aiocbe = (struct aiocblist *)bp->b_caller1;
2006-01-22 05:59:27 +00:00
taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
}
1997-11-30 04:36:31 +00:00
2006-01-22 05:59:27 +00:00
/*
* Task routine to perform heavy tasks, process wakeup, and signals.
*/
static void
biohelper(void *context, int pending)
{
struct aiocblist *aiocbe = context;
struct buf *bp;
struct proc *userp;
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
int nblks;
1997-11-30 04:36:31 +00:00
2006-01-22 05:59:27 +00:00
bp = aiocbe->bp;
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
aiocbe->uaiocb._aiocb_private.error = 0;
if (bp->b_ioflags & BIO_ERROR)
aiocbe->uaiocb._aiocb_private.error = bp->b_error;
nblks = btodb(aiocbe->uaiocb.aio_nbytes);
if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
aiocbe->outputcharge += nblks;
else
aiocbe->inputcharge += nblks;
aiocbe->bp = NULL;
TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
ki->kaio_buffer_count--;
2006-01-22 05:59:27 +00:00
aio_bio_done_notify(userp, aiocbe, DONE_BUF);
AIO_UNLOCK(ki);
1997-11-30 04:36:31 +00:00
2006-01-22 05:59:27 +00:00
/* Release mapping into kernel space. */
vunmapbuf(bp);
relpbuf(bp, NULL);
atomic_subtract_int(&num_buf_aio, 1);
}
/*
 * syscall - wait for the next completion of an aio request
 *
 * Sleeps until the process's done queue is non-empty, then reaps its
 * first entry: the user aiocb pointer is stored through 'aiocbp', status
 * goes to td_retval[0], and error/status are written back to the user
 * control block through 'ops'.
 *
 * Returns EINVAL for a malformed timeout, the msleep() error when the wait
 * ends without a completion, and otherwise the error code recorded for the
 * reaped request.
 */
static int
kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
struct timespec *ts, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
2006-01-22 05:59:27 +00:00
struct aiocblist *cb;
struct aiocb *uuaiocb;
int error, status, timo;
2004-08-13 17:43:53 +00:00
/* Clear the caller's result slot first so every failure path leaves NULL. */
ops->store_aiocb(aiocbp, NULL);
timo = 0;
if (ts) {
if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
2006-01-22 05:59:27 +00:00
error = 0;
cb = NULL;
AIO_LOCK(ki);
2006-01-22 05:59:27 +00:00
while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2006-01-22 05:59:27 +00:00
"aiowc", timo);
/*
 * ERESTART is turned into EINTR only for a timed wait; an untimed wait
 * keeps ERESTART.
 */
if (timo && error == ERESTART)
2006-01-22 05:59:27 +00:00
error = EINTR;
if (error)
break;
}
2006-01-22 05:59:27 +00:00
if (cb != NULL) {
MPASS(cb->jobstate == JOBST_JOBFINISHED);
/* Snapshot what must outlive the kernel entry before freeing it. */
uuaiocb = cb->uuaiocb;
status = cb->uaiocb._aiocb_private.status;
error = cb->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
td->td_ru.ru_oublock += cb->outputcharge;
2006-01-22 05:59:27 +00:00
cb->outputcharge = 0;
} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
td->td_ru.ru_inblock += cb->inputcharge;
2006-01-22 05:59:27 +00:00
cb->inputcharge = 0;
}
aio_free_entry(cb);
AIO_UNLOCK(ki);
/* User memory is written only after the lock has been dropped. */
ops->store_aiocb(aiocbp, uuaiocb);
ops->store_error(uuaiocb, error);
ops->store_status(uuaiocb, status);
2006-01-22 05:59:27 +00:00
} else
AIO_UNLOCK(ki);
2006-01-22 05:59:27 +00:00
return (error);
}
2006-03-23 08:46:42 +00:00
int
sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
{
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
}
static int
kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
struct aiocb_ops *ops)
2006-03-23 08:46:42 +00:00
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
if (op != O_SYNC) /* XXX lack of O_DSYNC */
2006-03-23 08:46:42 +00:00
return (EINVAL);
ki = p->p_aioinfo;
if (ki == NULL)
aio_init_aioinfo(p);
return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
}
/* aio_fsync system call: forwards to kern_aio_fsync() with aiocb_ops. */
int
sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
{
	struct aiocb *ujob = uap->aiocbp;

	return (kern_aio_fsync(td, uap->op, ujob, &aiocb_ops));
}
/*
 * kqueue attach function for aio requests.
 *
 * kn_sdata carries a raw aiocblist pointer, which is only trusted when
 * EV_FLAG1 is set: the user cannot set that flag, so registration is
 * restricted to the kernel.  Anything else is refused with EPERM.
 */
static int
filt_aioattach(struct knote *kn)
{
	struct aiocblist *job;

	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);

	job = (struct aiocblist *)kn->kn_sdata;
	kn->kn_ptr.p_aio = job;
	kn->kn_flags &= ~EV_FLAG1;
	knlist_add(&job->klist, kn, 0);
	return (0);
}
/*
 * kqueue detach function for aio requests: under the knlist's own lock,
 * remove the knote from the request's klist unless the list is already
 * empty.
 */
static void
filt_aiodetach(struct knote *kn)
{
	struct knlist *list = &kn->kn_ptr.p_aio->klist;

	list->kl_lock(list->kl_lockarg);
	if (!knlist_empty(list))
		knlist_remove(list, kn, 1);
	list->kl_unlock(list->kl_lockarg);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2006-01-22 05:59:27 +00:00
if (aiocbe->jobstate != JOBST_JOBFINISHED)
return (0);
2004-08-13 17:43:53 +00:00
kn->kn_flags |= EV_EOF;
return (1);
}
/* kqueue attach function */
static int
filt_lioattach(struct knote *kn)
{
2006-01-22 05:59:27 +00:00
struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
/*
2006-01-22 05:59:27 +00:00
* The aioliojob pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_lio = lj;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&lj->klist, kn, 0);
return (0);
}
/*
 * kqueue detach function for list jobs: under the knlist's own lock,
 * remove the knote from the job's klist unless the list is already empty.
 */
static void
filt_liodetach(struct knote *kn)
{
	struct knlist *list = &kn->kn_ptr.p_lio->klist;

	list->kl_lock(list->kl_lockarg);
	if (!knlist_empty(list))
		knlist_remove(list, kn, 1);
	list->kl_unlock(list->kl_lockarg);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_lio(struct knote *kn, long hint)
{
struct aioliojob * lj = kn->kn_ptr.p_lio;
2006-01-22 05:59:27 +00:00
return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
}
#ifdef COMPAT_FREEBSD32
/*
 * 32-bit image of the _aiocb_private area embedded in the compat aiocb
 * layouts below.  status and error are accessed with fuword32()/suword32()
 * by the aiocb32_* accessors; kernelinfo is a 32-bit user pointer that is
 * widened with PTRIN_CP() on copyin.
 */
struct __aiocb_private32 {
int32_t status;
int32_t error;
uint32_t kernelinfo;
};
/*
 * 32-bit aiocb layout that embeds the old-style osigevent32; it is read by
 * aiocb32_copyin_old_sigevent().  aio_offset is __packed; presumably so
 * that the 64-bit field keeps the 32-bit ABI's alignment -- confirm against
 * the 32-bit userland definition.
 */
typedef struct oaiocb32 {
int aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent32 aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
} oaiocb32_t;
/*
 * 32-bit aiocb layout with the current sigevent32 at the end; it is read by
 * aiocb32_copyin() and is the layout the aiocb32_fetch_* / aiocb32_store_*
 * accessors use to reach _aiocb_private in user memory.  The __spare__
 * members are never copied by the code in this file.
 */
typedef struct aiocb32 {
int32_t aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
int __spare__[2];
uint32_t __spare2__;
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
struct sigevent32 aio_sigevent; /* Signal to deliver */
} aiocb32_t;
/*
 * Translate a 32-bit old-style sigevent into the native struct sigevent.
 *
 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are supported by AIO
 * with the old sigevent structure; any other notification type yields
 * EINVAL (sigev_notify has already been copied at that point).
 */
static int
convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
{

	CP(*osig, *nsig, sigev_notify);

	if (nsig->sigev_notify == SIGEV_NONE)
		return (0);

	if (nsig->sigev_notify == SIGEV_SIGNAL) {
		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
		return (0);
	}

	if (nsig->sigev_notify == SIGEV_KEVENT) {
		nsig->sigev_notify_kqueue =
		    osig->__sigev_u.__sigev_notify_kqueue;
		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
		return (0);
	}

	return (EINVAL);
}
static int
aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb32 job32;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_old_sigevent32(&job32.aio_sigevent,
&kjob->aio_sigevent));
}
/*
 * Translate a 32-bit sigevent into the native struct sigevent.  Only the
 * fields relevant to the notification type are copied; an unknown type
 * yields EINVAL (sigev_notify has already been copied at that point).
 */
static int
convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
{

	CP(*sig32, *sig, sigev_notify);

	if (sig->sigev_notify == SIGEV_NONE)
		return (0);

	if (sig->sigev_notify == SIGEV_THREAD_ID ||
	    sig->sigev_notify == SIGEV_SIGNAL) {
		/* Thread-directed signals carry the thread id as well. */
		if (sig->sigev_notify == SIGEV_THREAD_ID) {
			CP(*sig32, *sig, sigev_notify_thread_id);
		}
		CP(*sig32, *sig, sigev_signo);
		return (0);
	}

	if (sig->sigev_notify == SIGEV_KEVENT) {
		CP(*sig32, *sig, sigev_notify_kqueue);
		CP(*sig32, *sig, sigev_notify_kevent_flags);
		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
		return (0);
	}

	return (EINVAL);
}
static int
aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
struct aiocb32 job32;
int error;
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
}
/* Read _aiocb_private.status from a user aiocb laid out as aiocb32. */
static long
aiocb32_fetch_status(struct aiocb *ujob)
{
	struct aiocb32 *job32 = (struct aiocb32 *)ujob;

	return (fuword32(&job32->_aiocb_private.status));
}
static long
aiocb32_fetch_error(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.error));
}
/* Write _aiocb_private.status of a user aiocb laid out as aiocb32. */
static int
aiocb32_store_status(struct aiocb *ujob, long status)
{
	struct aiocb32 *job32 = (struct aiocb32 *)ujob;

	return (suword32(&job32->_aiocb_private.status, status));
}
static int
aiocb32_store_error(struct aiocb *ujob, long error)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.error, error));
}
/* Write _aiocb_private.kernelinfo of a user aiocb laid out as aiocb32. */
static int
aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
{
	struct aiocb32 *job32 = (struct aiocb32 *)ujob;

	return (suword32(&job32->_aiocb_private.kernelinfo, jobref));
}
/*
 * Store a user aiocb pointer into the 32-bit wide slot at 'ujobp' in user
 * memory.
 */
static int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
	long value = (long)ujob;

	return (suword32(ujobp, value));
}
/*
 * Accessor table used by the freebsd32_* entry points for control blocks
 * in the aiocb32 layout (current sigevent32).
 */
static struct aiocb_ops aiocb32_ops = {
.copyin = aiocb32_copyin,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
/*
 * Accessor table used by the freebsd32_o* entry points.  It differs from
 * aiocb32_ops only in .copyin, which reads the oaiocb32 layout with the
 * old-style sigevent.
 */
static struct aiocb_ops aiocb32_ops_osigevent = {
.copyin = aiocb32_copyin_old_sigevent,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
/* 32-bit compat aio_return: kern_aio_return() with the aiocb32 accessors. */
int
freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (kern_aio_return(td, ujob, &aiocb32_ops));
}
/*
 * 32-bit compat aio_suspend.
 *
 * Validates the entry count, converts the optional timespec32, copies in
 * the caller's array of 32-bit aiocb pointers, widens it in place to
 * native pointers and hands it to kern_aio_suspend().
 *
 * Returns EINVAL for an out-of-range count, a copyin() error, or whatever
 * kern_aio_suspend() returns.
 */
int
freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
{
	struct timespec32 ts32;
	struct timespec ts, *tsp;
	struct aiocb **ujoblist;
	uint32_t *ujoblist32;
	int error, i;

	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
		return (EINVAL);

	if (uap->timeout) {
		/* Get timespec struct. */
		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
			return (error);
		CP(ts32, ts, tv_sec);
		CP(ts32, ts, tv_nsec);
		tsp = &ts;
	} else
		tsp = NULL;

	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
	ujoblist32 = (uint32_t *)ujoblist;
	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
	    sizeof(ujoblist32[0]));
	if (error == 0) {
		/*
		 * The 32-bit values share storage with the native pointer
		 * array, so widen from the last valid index (nent - 1) down
		 * to 0: each store then only overwrites 32-bit slots that
		 * have already been consumed.  Starting at nent would touch
		 * a slot that was never copied in and would leave entry 0
		 * unconverted.
		 */
		for (i = uap->nent - 1; i >= 0; i--)
			ujoblist[i] = PTRIN(ujoblist32[i]);

		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
	}
	uma_zfree(aiol_zone, ujoblist);
	return (error);
}
/*
 * 32-bit compat aio_cancel: the argument structure is reinterpreted as the
 * native one and passed straight to sys_aio_cancel().
 */
int
freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
{
	struct aio_cancel_args *native = (struct aio_cancel_args *)uap;

	return (sys_aio_cancel(td, native));
}
/* 32-bit compat aio_error: kern_aio_error() with the aiocb32 accessors. */
int
freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (kern_aio_error(td, ujob, &aiocb32_ops));
}
/* 32-bit compat read request using the old-sigevent accessors. */
int
freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb32_ops_osigevent));
}
/* 32-bit compat aio_read: queue a LIO_READ with the aiocb32 accessors. */
int
freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb32_ops));
}
/* 32-bit compat write request using the old-sigevent accessors. */
int
freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb32_ops_osigevent));
}
/* 32-bit compat aio_write: queue a LIO_WRITE with the aiocb32 accessors. */
int
freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb32_ops));
}
int
freebsd32_aio_waitcomplete(struct thread *td,
struct freebsd32_aio_waitcomplete_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
&aiocb32_ops));
}
/* 32-bit compat aio_fsync: kern_aio_fsync() with the aiocb32 accessors. */
int
freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
{
	struct aiocb *ujob = (struct aiocb *)uap->aiocbp;

	return (kern_aio_fsync(td, uap->op, ujob, &aiocb32_ops));
}
int
freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent32 osig;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent32(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
int
freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct sigevent32 sig32;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig32, sizeof(sig32));
if (error)
return (error);
error = convert_sigevent32(&sig32, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops);
free(acb_list, M_LIO);
return (error);
}
#endif