2005-01-06 23:35:40 +00:00
|
|
|
/*-
|
1997-06-16 00:27:26 +00:00
|
|
|
* Copyright (c) 1997 John S. Dyson. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. John S. Dyson's name may not be used to endorse or promote products
|
|
|
|
* derived from this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* DISCLAIMER: This code isn't warranted to do anything useful. Anything
|
|
|
|
* bad that happens because of using this software isn't the responsibility
|
|
|
|
* of the author. This software is distributed AS-IS.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
1998-03-28 11:51:01 +00:00
|
|
|
* This file contains support for the POSIX 1003.1B AIO/LIO facility.
|
1997-06-16 00:27:26 +00:00
|
|
|
*/
|
|
|
|
|
2003-06-11 00:56:59 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
#include "opt_compat.h"
|
|
|
|
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
2002-02-23 11:12:57 +00:00
|
|
|
#include <sys/malloc.h>
|
2000-05-05 09:59:14 +00:00
|
|
|
#include <sys/bio.h>
|
1999-02-25 15:54:06 +00:00
|
|
|
#include <sys/buf.h>
|
2014-03-16 10:55:57 +00:00
|
|
|
#include <sys/capsicum.h>
|
2003-03-24 21:15:35 +00:00
|
|
|
#include <sys/eventhandler.h>
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <sys/sysproto.h>
|
|
|
|
#include <sys/filedesc.h>
|
|
|
|
#include <sys/kernel.h>
|
2004-05-30 20:34:58 +00:00
|
|
|
#include <sys/module.h>
|
2001-03-09 06:27:01 +00:00
|
|
|
#include <sys/kthread.h>
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <sys/fcntl.h>
|
|
|
|
#include <sys/file.h>
|
2003-04-29 13:36:06 +00:00
|
|
|
#include <sys/limits.h>
|
1997-11-18 10:02:40 +00:00
|
|
|
#include <sys/lock.h>
|
2000-10-20 07:58:15 +00:00
|
|
|
#include <sys/mutex.h>
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <sys/unistd.h>
|
2006-11-11 16:26:58 +00:00
|
|
|
#include <sys/posix4.h>
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <sys/proc.h>
|
1998-08-17 17:28:10 +00:00
|
|
|
#include <sys/resourcevar.h>
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <sys/signalvar.h>
|
2000-01-14 02:53:29 +00:00
|
|
|
#include <sys/protosw.h>
|
2013-03-09 02:32:23 +00:00
|
|
|
#include <sys/rwlock.h>
|
2006-01-22 05:59:27 +00:00
|
|
|
#include <sys/sema.h>
|
|
|
|
#include <sys/socket.h>
|
2000-01-14 02:53:29 +00:00
|
|
|
#include <sys/socketvar.h>
|
2001-12-29 07:13:47 +00:00
|
|
|
#include <sys/syscall.h>
|
|
|
|
#include <sys/sysent.h>
|
1997-10-09 04:14:41 +00:00
|
|
|
#include <sys/sysctl.h>
|
2002-03-25 21:52:04 +00:00
|
|
|
#include <sys/sx.h>
|
2006-01-22 05:59:27 +00:00
|
|
|
#include <sys/taskqueue.h>
|
1997-11-29 01:33:10 +00:00
|
|
|
#include <sys/vnode.h>
|
|
|
|
#include <sys/conf.h>
|
2000-04-16 18:53:38 +00:00
|
|
|
#include <sys/event.h>
|
2006-03-23 08:46:42 +00:00
|
|
|
#include <sys/mount.h>
|
2015-04-22 18:11:34 +00:00
|
|
|
#include <geom/geom.h>
|
1997-06-16 00:27:26 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
#include <machine/atomic.h>
|
|
|
|
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <vm/vm.h>
|
2015-04-22 18:11:34 +00:00
|
|
|
#include <vm/vm_page.h>
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <vm/vm_extern.h>
|
1997-07-06 02:40:43 +00:00
|
|
|
#include <vm/pmap.h>
|
|
|
|
#include <vm/vm_map.h>
|
2006-03-23 08:46:42 +00:00
|
|
|
#include <vm/vm_object.h>
|
2002-03-20 04:09:59 +00:00
|
|
|
#include <vm/uma.h>
|
1997-06-16 00:27:26 +00:00
|
|
|
#include <sys/aio.h>
|
1997-07-17 04:49:43 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/*
|
|
|
|
* Counter for allocating reference ids to new jobs. Wrapped to 1 on
|
2006-03-23 08:46:42 +00:00
|
|
|
* overflow. (XXX will be removed soon.)
|
2002-03-05 15:38:49 +00:00
|
|
|
*/
|
2006-03-23 08:46:42 +00:00
|
|
|
static u_long jobrefid;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2006-03-23 08:46:42 +00:00
|
|
|
/*
|
|
|
|
* Counter for aio_fsync.
|
|
|
|
*/
|
|
|
|
static uint64_t jobseqno;
|
|
|
|
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
#ifndef MAX_AIO_PER_PROC
|
1997-07-06 02:40:43 +00:00
|
|
|
#define MAX_AIO_PER_PROC 32
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef MAX_AIO_QUEUE_PER_PROC
|
1997-07-06 02:40:43 +00:00
|
|
|
#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef MAX_AIO_QUEUE
|
1997-07-06 02:40:43 +00:00
|
|
|
#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef MAX_BUF_AIO
|
2000-01-14 02:53:29 +00:00
|
|
|
#define MAX_BUF_AIO 16
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
#endif
|
|
|
|
|
2008-02-01 11:59:14 +00:00
|
|
|
FEATURE(aio, "Asynchronous I/O");
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
|
|
|
|
|
2016-01-26 21:24:49 +00:00
|
|
|
static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
|
|
|
|
"Async IO management");
|
1997-10-09 04:14:41 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
static int enable_aio_unsafe = 0;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
|
|
|
|
"Permit asynchronous IO on all file types, not just known-safe types");
|
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
static int max_aio_procs = MAX_AIO_PROCS;
|
2016-01-26 21:24:49 +00:00
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
|
|
|
|
"Maximum number of kernel processes to use for handling async IO ");
|
1997-10-09 04:14:41 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
static int num_aio_procs = 0;
|
2016-01-26 21:24:49 +00:00
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
|
|
|
|
"Number of presently active kernel processes for async IO");
|
1997-10-09 04:14:41 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/*
|
|
|
|
* The code will adjust the actual number of AIO processes towards this
|
|
|
|
* number when it gets a chance.
|
|
|
|
*/
|
|
|
|
static int target_aio_procs = TARGET_AIO_PROCS;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
|
2016-01-26 21:24:49 +00:00
|
|
|
0,
|
|
|
|
"Preferred number of ready kernel processes for async IO");
|
1997-10-09 04:14:41 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
static int max_queue_count = MAX_AIO_QUEUE;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
|
|
|
|
"Maximum number of aio requests to queue, globally");
|
1997-10-09 04:14:41 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
static int num_queue_count = 0;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
|
|
|
|
"Number of queued aio requests");
|
1997-10-09 04:14:41 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
static int num_buf_aio = 0;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
|
|
|
|
"Number of aio requests presently handled by the buf subsystem");
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2016-01-21 02:20:38 +00:00
|
|
|
/* Number of async I/O processes in the process of being started */
|
2006-01-23 02:49:34 +00:00
|
|
|
/* XXX This should be local to aio_aqueue() */
|
2002-03-05 15:38:49 +00:00
|
|
|
static int num_aio_resv_start = 0;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
static int aiod_lifetime;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
|
|
|
|
"Maximum lifetime for idle aiod");
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
static int max_aio_per_proc = MAX_AIO_PER_PROC;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
|
2016-01-26 21:24:49 +00:00
|
|
|
0,
|
|
|
|
"Maximum active aio requests per process (stored in the process)");
|
2002-03-05 15:38:49 +00:00
|
|
|
|
|
|
|
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
|
|
|
|
&max_aio_queue_per_proc, 0,
|
|
|
|
"Maximum queued aio requests per process (stored in the process)");
|
|
|
|
|
|
|
|
static int max_buf_aio = MAX_BUF_AIO;
|
|
|
|
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
|
|
|
|
"Maximum buf aio requests per process (stored in the process)");
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2005-10-30 02:12:49 +00:00
|
|
|
/*
 * Older layout of the userland AIO control block.  It carries its
 * completion notification as a struct osigevent, and is only compiled
 * when COMPAT_FREEBSD6 is defined.
 */
typedef struct oaiocb {
|
|
|
|
int aio_fildes; /* File descriptor */
|
|
|
|
off_t aio_offset; /* File offset for I/O */
|
|
|
|
volatile void *aio_buf; /* I/O buffer in process space */
|
|
|
|
size_t aio_nbytes; /* Number of bytes for I/O */
|
|
|
|
struct osigevent aio_sigevent; /* Signal to deliver */
|
|
|
|
int aio_lio_opcode; /* LIO opcode */
|
|
|
|
int aio_reqprio; /* Request priority -- ignored */
|
|
|
|
/* NOTE(review): presumably kernel-private bookkeeping; type is declared outside this view -- confirm */
struct __aiocb_private _aiocb_private;
|
|
|
|
} oaiocb_t;
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2005-10-30 02:12:49 +00:00
|
|
|
|
2006-01-24 07:24:24 +00:00
|
|
|
/*
|
2016-02-05 20:38:09 +00:00
|
|
|
* Below is a key of locks used to protect each member of struct kaiocb,
|
2006-01-24 07:24:24 +00:00
|
|
|
* aioliojob and kaioinfo and any backends.
|
|
|
|
*
|
|
|
|
* * - need not be protected
|
2006-05-09 00:10:11 +00:00
|
|
|
* a - locked by kaioinfo lock
|
2006-01-24 07:24:24 +00:00
|
|
|
* b - locked by backend lock, the backend lock can be null in some cases,
|
|
|
|
* for example, BIO belongs to this type, in this case, proc lock is
|
|
|
|
* reused.
|
|
|
|
* c - locked by aio_job_mtx, the lock for the generic file I/O backend.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2016-03-01 18:12:14 +00:00
|
|
|
* If the routine that services an AIO request blocks while running in an
|
|
|
|
* AIO kernel process it can starve other I/O requests. BIO requests
|
|
|
|
* queued via aio_qphysio() complete in GEOM and do not use AIO kernel
|
|
|
|
* processes at all. Socket I/O requests use a separate pool of
|
|
|
|
* kprocs and also force non-blocking I/O. Other file I/O requests
|
|
|
|
* use the generic fo_read/fo_write operations which can block. The
|
|
|
|
* fsync and mlock operations can also block while executing. Ideally
|
|
|
|
* none of these requests would block while executing.
|
|
|
|
*
|
|
|
|
* Note that the service routines cannot toggle O_NONBLOCK in the file
|
|
|
|
* structure directly while handling a request due to races with
|
|
|
|
* userland threads.
|
2006-01-24 07:24:24 +00:00
|
|
|
*/
|
|
|
|
|
2002-01-06 21:03:39 +00:00
|
|
|
/* jobflags */
|
2016-03-01 18:12:14 +00:00
|
|
|
#define KAIOCB_QUEUEING 0x01
|
|
|
|
#define KAIOCB_CANCELLED 0x02
|
|
|
|
#define KAIOCB_CANCELLING 0x04
|
2016-02-05 20:38:09 +00:00
|
|
|
#define KAIOCB_CHECKSYNC 0x08
|
2016-03-01 18:12:14 +00:00
|
|
|
#define KAIOCB_CLEARED 0x10
|
|
|
|
#define KAIOCB_FINISHED 0x20
|
2002-01-06 21:03:39 +00:00
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
/*
|
|
|
|
* AIO process info
|
|
|
|
*/
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
#define AIOP_FREE 0x1 /* proc on free queue */
|
|
|
|
|
2016-01-21 02:20:38 +00:00
|
|
|
/*
 * Bookkeeping for one AIO kernel process.  The letter in parentheses on
 * each member is its lock annotation: (c) means aio_job_mtx, (*) means
 * no protection is needed.
 */
struct aioproc {
|
2016-01-26 21:24:49 +00:00
|
|
|
int aioprocflags; /* (c) AIO proc flags; presumably AIOP_* values -- confirm */
|
2016-01-21 02:20:38 +00:00
|
|
|
TAILQ_ENTRY(aioproc) list; /* (c) linkage on a list of AIO processes */
|
2016-01-26 21:24:49 +00:00
|
|
|
struct proc *aioproc; /* (*) the AIO proc */
|
1997-07-06 02:40:43 +00:00
|
|
|
};
|
|
|
|
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
/*
|
|
|
|
* data-structure for lio signal management
|
|
|
|
*/
|
2006-01-22 05:59:27 +00:00
|
|
|
struct aioliojob {
|
2006-01-24 07:24:24 +00:00
|
|
|
int lioj_flags; /* (a) listio flags */
|
|
|
|
int lioj_count; /* (a) presumably number of jobs in this listio -- confirm (comment previously read "listio flags") */
|
|
|
|
int lioj_finished_count; /* (a) presumably number of those jobs that have finished -- confirm (comment previously read "listio flags") */
|
|
|
|
struct sigevent lioj_signal; /* (a) signal on all I/O done */
|
|
|
|
TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
|
2016-01-26 21:24:49 +00:00
|
|
|
struct knlist klist; /* (a) list of knotes */
|
2006-01-24 07:24:24 +00:00
|
|
|
|
|
ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
};
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
|
2005-10-12 17:51:31 +00:00
|
|
|
#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* per process aio data structure
|
|
|
|
*/
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo {
|
2016-01-26 21:24:49 +00:00
|
|
|
struct mtx kaio_mtx; /* the lock to protect this struct; taken via AIO_LOCK()/AIO_UNLOCK() */
|
2006-01-24 07:24:24 +00:00
|
|
|
int kaio_flags; /* (a) per process kaio flags */
|
|
|
|
int kaio_maxactive_count; /* (*) maximum number of AIOs */
|
|
|
|
int kaio_active_count; /* (c) number of currently used AIOs */
|
|
|
|
int kaio_qallowed_count; /* (*) maximum size of AIO queue */
|
|
|
|
int kaio_count; /* (a) size of AIO queue */
|
|
|
|
int kaio_ballowed_count; /* (*) maximum number of buffers */
|
|
|
|
int kaio_buffer_count; /* (a) number of physio buffers */
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */
|
|
|
|
TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */
|
2006-01-24 07:24:24 +00:00
|
|
|
TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */
|
|
|
|
TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */
|
2016-03-01 18:12:14 +00:00
|
|
|
TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */
|
2016-01-26 21:24:49 +00:00
|
|
|
struct task kaio_task; /* (*) task to kick aio processes */
|
2016-03-01 18:12:14 +00:00
|
|
|
struct task kaio_sync_task; /* (*) task to schedule fsync jobs */
|
1997-07-06 02:40:43 +00:00
|
|
|
};
|
|
|
|
|
2006-05-09 00:10:11 +00:00
|
|
|
#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
|
|
|
|
#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
|
|
|
|
#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
|
|
|
|
#define AIO_MTX(ki) (&(ki)->kaio_mtx)
|
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
#define KAIO_RUNDOWN 0x1 /* process is being run down */
|
2016-01-26 21:24:49 +00:00
|
|
|
#define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
/*
|
|
|
|
* Operations used to interact with userland aio control blocks.
|
|
|
|
* Different ABIs provide their own operations.
|
|
|
|
*/
|
|
|
|
struct aiocb_ops {
/*
 * NOTE(review): the per-member notes below are inferred from the member
 * names and signatures only; the implementations are outside this view.
 * Confirm against each ABI's ops table before relying on them.
 */
|
|
|
|
/* presumably copies the userland block (ujob) into the kernel copy (kjob) */
int (*copyin)(struct aiocb *ujob, struct aiocb *kjob);
|
|
|
|
/* presumably reads the status field back from the userland block */
long (*fetch_status)(struct aiocb *ujob);
|
|
|
|
/* presumably reads the error field back from the userland block */
long (*fetch_error)(struct aiocb *ujob);
|
|
|
|
/* presumably writes 'status' into the userland block */
int (*store_status)(struct aiocb *ujob, long status);
|
|
|
|
/* presumably writes 'error' into the userland block */
int (*store_error)(struct aiocb *ujob, long error);
|
|
|
|
/* presumably writes the kernel job reference 'jobref' into the userland block */
int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
|
|
|
|
/* presumably stores the pointer 'ujob' through the userland location 'ujobp' */
int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
|
|
|
|
};
|
|
|
|
|
2016-01-21 02:20:38 +00:00
|
|
|
static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */
|
2006-01-22 05:59:27 +00:00
|
|
|
static struct sema aio_newproc_sem;
|
|
|
|
static struct mtx aio_job_mtx;
|
2016-02-05 20:38:09 +00:00
|
|
|
static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */
|
2006-01-22 05:59:27 +00:00
|
|
|
static struct unrhdr *aiod_unr;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
MFP4 (with some minor changes):
Implement the linux_io_* syscalls (AIO). They are only enabled if the native
AIO code is available (either compiled in to the kernel or as a module) at
the time the functions are used. If the AIO stuff is not available there
will be a ENOSYS.
From the submitter:
---snip---
DESIGN NOTES:
1. Linux permits a process to own multiple AIO queues (distinguished by
"context"), but FreeBSD creates only one single AIO queue per process.
My code maintains a request queue (STAILQ of queue(3)) per "context",
and throws all AIO requests of all contexts owned by a process into
the single FreeBSD per-process AIO queue.
When the process calls io_destroy(2), io_getevents(2), io_submit(2) and
io_cancel(2), my code can pick out requests owned by the specified context
from the single FreeBSD per-process AIO queue according to the per-context
request queues maintained by my code.
2. The request queue maintained by my code stores contrast information between
Linux IO control blocks (struct linux_iocb) and FreeBSD IO control blocks
(struct aiocb). FreeBSD IO control block actually exists in userland memory
space, required by FreeBSD native aio_XXXXXX(2).
3. It is quite troubling that the function io_getevents() of libaio-0.3.105
needs to use Linux-specific "struct aio_ring", which is a partial mirror
of context in user space. I would rather take the address of context in
kernel as the context ID, but the io_getevents() of libaio forces me to
take the address of the "ring" in user space as the context ID.
To my surprise, one comment line in the file "io_getevents.c" of
libaio-0.3.105 reads:
Ben will hate me for this
REFERENCE:
1. Linux kernel source code: http://www.kernel.org/pub/linux/kernel/v2.6/
(include/linux/aio_abi.h, fs/aio.c)
2. Linux manual pages: http://www.kernel.org/pub/linux/docs/manpages/
(io_setup(2), io_destroy(2), io_getevents(2), io_submit(2), io_cancel(2))
3. Linux Scalability Effort: http://lse.sourceforge.net/io/aio.html
The design notes: http://lse.sourceforge.net/io/aionotes.txt
4. The package libaio, both source and binary:
http://rpmfind.net/linux/rpm2html/search.php?query=libaio
Simple transparent interface to Linux AIO system calls.
5. Libaio-oracle: http://oss.oracle.com/projects/libaio-oracle/
POSIX AIO implementation based on Linux AIO system calls (depending on
libaio).
---snip---
Submitted by: Li, Xiao <intron@intron.ac>
2006-10-15 14:22:14 +00:00
|
|
|
/* Entry points with external linkage. */
void	aio_init_aioinfo(struct proc *p);
int	aio_aqueue(struct thread *td, struct aiocb *ujob,
	    struct aioliojob *lio, int type, struct aiocb_ops *ops);

/* Module setup and process rundown. */
static int	aio_onceonly(void);
static void	aio_proc_rundown(void *arg, struct proc *p);
static void	aio_proc_rundown_exec(void *arg, struct proc *p,
		    struct image_params *imgp);

/* Job processing and queueing. */
static int	aio_free_entry(struct kaiocb *job);
static void	aio_process_rw(struct kaiocb *job);
static void	aio_process_sync(struct kaiocb *job);
static void	aio_process_mlock(struct kaiocb *job);
static void	aio_schedule_fsync(void *context, int pending);
static int	aio_queue_file(struct file *fp, struct kaiocb *job);
static int	aio_qphysio(struct proc *p, struct kaiocb *job);
static void	aio_physwakeup(struct bio *bp);
static void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);

/* Daemon management. */
static int	aio_newproc(int *);
static void	aio_daemon(void *param);
static int	aio_kick(struct proc *userp);
static void	aio_kick_nowait(struct proc *userp);
static void	aio_kick_helper(void *context, int pending);

/* kqueue filter callbacks. */
static int	filt_aioattach(struct knote *kn);
static void	filt_aiodetach(struct knote *kn);
static int	filt_aio(struct knote *kn, long hint);
static int	filt_lioattach(struct knote *kn);
static void	filt_liodetach(struct knote *kn);
static int	filt_lio(struct knote *kn, long hint);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/*
|
|
|
|
* Zones for:
|
|
|
|
* kaio Per process async io info
|
2016-01-21 02:20:38 +00:00
|
|
|
* aiop async io process data
|
2002-03-05 15:38:49 +00:00
|
|
|
* aiocb async io jobs
|
|
|
|
* aiol list io job pointer - internal to aio_suspend XXX
|
|
|
|
* aiolio list io jobs
|
|
|
|
*/
|
2002-03-20 04:09:59 +00:00
|
|
|
static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/* kqueue filters for aio */
|
2009-09-12 20:03:45 +00:00
|
|
|
static struct filterops aio_filtops = {
|
|
|
|
.f_isfd = 0,
|
|
|
|
.f_attach = filt_aioattach,
|
|
|
|
.f_detach = filt_aiodetach,
|
|
|
|
.f_event = filt_aio,
|
|
|
|
};
|
|
|
|
static struct filterops lio_filtops = {
|
|
|
|
.f_isfd = 0,
|
|
|
|
.f_attach = filt_lioattach,
|
|
|
|
.f_detach = filt_liodetach,
|
|
|
|
.f_event = filt_lio
|
|
|
|
};
|
2001-12-29 07:13:47 +00:00
|
|
|
|
2003-03-24 21:15:35 +00:00
|
|
|
/* Tags returned by the EVENTHANDLER_REGISTER() calls in aio_onceonly(). */
static eventhandler_tag exit_tag;
static eventhandler_tag exec_tag;

/* Task queue that the per-process kick and fsync tasks are drained from. */
TASKQUEUE_DEFINE_THREAD(aiod_kick);
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/*
|
|
|
|
* Main operations function for use as a kernel module.
|
|
|
|
*/
|
2001-12-29 07:13:47 +00:00
|
|
|
static int
|
|
|
|
aio_modload(struct module *module, int cmd, void *arg)
|
|
|
|
{
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
switch (cmd) {
|
|
|
|
case MOD_LOAD:
|
|
|
|
aio_onceonly();
|
|
|
|
break;
|
|
|
|
case MOD_SHUTDOWN:
|
|
|
|
break;
|
|
|
|
default:
|
2016-03-01 18:12:14 +00:00
|
|
|
error = EOPNOTSUPP;
|
2001-12-29 07:13:47 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static moduledata_t aio_mod = {
|
|
|
|
"aio",
|
|
|
|
&aio_modload,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
|
2001-12-29 07:13:47 +00:00
|
|
|
MODULE_VERSION(aio, 1);
|
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
/*
|
|
|
|
* Startup initialization
|
|
|
|
*/
|
2010-03-19 11:11:34 +00:00
|
|
|
static int
|
2001-12-29 07:13:47 +00:00
|
|
|
aio_onceonly(void)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-12-29 07:13:47 +00:00
|
|
|
|
2003-03-24 21:15:35 +00:00
|
|
|
exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
|
|
|
|
EVENTHANDLER_PRI_ANY);
|
2016-01-26 21:24:49 +00:00
|
|
|
exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
|
|
|
|
NULL, EVENTHANDLER_PRI_ANY);
|
2001-12-29 07:13:47 +00:00
|
|
|
kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
|
2005-10-12 17:51:31 +00:00
|
|
|
kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
|
1997-07-06 02:40:43 +00:00
|
|
|
TAILQ_INIT(&aio_freeproc);
|
2006-01-22 05:59:27 +00:00
|
|
|
sema_init(&aio_newproc_sem, 0, "aio_new_proc");
|
|
|
|
mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
|
1997-07-06 02:40:43 +00:00
|
|
|
TAILQ_INIT(&aio_jobs);
|
2006-01-22 05:59:27 +00:00
|
|
|
aiod_unr = new_unrhdr(1, INT_MAX, NULL);
|
2002-03-20 04:09:59 +00:00
|
|
|
kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
|
|
|
|
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
2016-01-21 02:20:38 +00:00
|
|
|
aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
|
2002-03-20 04:09:59 +00:00
|
|
|
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
2016-02-05 20:38:09 +00:00
|
|
|
aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
|
2002-03-20 04:09:59 +00:00
|
|
|
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
|
|
|
aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
|
|
|
|
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
2006-01-22 05:59:27 +00:00
|
|
|
aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
|
2002-03-20 04:09:59 +00:00
|
|
|
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
|
1997-11-29 01:33:10 +00:00
|
|
|
jobrefid = 1;
|
2016-03-09 19:05:11 +00:00
|
|
|
p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
|
2002-11-16 04:22:55 +00:00
|
|
|
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
|
2002-11-16 06:38:07 +00:00
|
|
|
p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
|
|
|
|
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
|
2010-03-19 11:11:34 +00:00
|
|
|
|
|
|
|
return (0);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Init the per-process aioinfo structure. The aioinfo limits are set
|
|
|
|
* per-process for user limit (resource) management.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
MFP4 (with some minor changes):
Implement the linux_io_* syscalls (AIO). They are only enabled if the native
AIO code is available (either compiled in to the kernel or as a module) at
the time the functions are used. If the AIO stuff is not available there
will be a ENOSYS.
From the submitter:
---snip---
DESIGN NOTES:
1. Linux permits a process to own multiple AIO queues (distinguished by
"context"), but FreeBSD creates only one single AIO queue per process.
My code maintains a request queue (STAILQ of queue(3)) per "context",
and throws all AIO requests of all contexts owned by a process into
the single FreeBSD per-process AIO queue.
When the process calls io_destroy(2), io_getevents(2), io_submit(2) and
io_cancel(2), my code can pick out requests owned by the specified context
from the single FreeBSD per-process AIO queue according to the per-context
request queues maintained by my code.
2. The request queue maintained by my code stores contrast information between
Linux IO control blocks (struct linux_iocb) and FreeBSD IO control blocks
(struct aiocb). FreeBSD IO control block actually exists in userland memory
space, required by FreeBSD native aio_XXXXXX(2).
3. It is quite troubling that the function io_getevents() of libaio-0.3.105
needs to use Linux-specific "struct aio_ring", which is a partial mirror
of context in user space. I would rather take the address of context in
kernel as the context ID, but the io_getevents() of libaio forces me to
take the address of the "ring" in user space as the context ID.
To my surprise, one comment line in the file "io_getevents.c" of
libaio-0.3.105 reads:
Ben will hate me for this
REFERENCE:
1. Linux kernel source code: http://www.kernel.org/pub/linux/kernel/v2.6/
(include/linux/aio_abi.h, fs/aio.c)
2. Linux manual pages: http://www.kernel.org/pub/linux/docs/manpages/
(io_setup(2), io_destroy(2), io_getevents(2), io_submit(2), io_cancel(2))
3. Linux Scalability Effort: http://lse.sourceforge.net/io/aio.html
The design notes: http://lse.sourceforge.net/io/aionotes.txt
4. The package libaio, both source and binary:
http://rpmfind.net/linux/rpm2html/search.php?query=libaio
Simple transparent interface to Linux AIO system calls.
5. Libaio-oracle: http://oss.oracle.com/projects/libaio-oracle/
POSIX AIO implementation based on Linux AIO system calls (depending on
libaio).
---snip---
Submitted by: Li, Xiao <intron@intron.ac>
2006-10-15 14:22:14 +00:00
|
|
|
void
|
1997-11-29 01:33:10 +00:00
|
|
|
aio_init_aioinfo(struct proc *p)
|
|
|
|
{
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo *ki;
|
2003-01-13 15:06:05 +00:00
|
|
|
|
2005-05-30 19:33:33 +00:00
|
|
|
ki = uma_zalloc(kaio_zone, M_WAITOK);
|
2015-07-06 14:09:00 +00:00
|
|
|
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
|
2005-05-30 19:33:33 +00:00
|
|
|
ki->kaio_flags = 0;
|
|
|
|
ki->kaio_maxactive_count = max_aio_per_proc;
|
|
|
|
ki->kaio_active_count = 0;
|
|
|
|
ki->kaio_qallowed_count = max_aio_queue_per_proc;
|
2006-01-22 05:59:27 +00:00
|
|
|
ki->kaio_count = 0;
|
2005-05-30 19:33:33 +00:00
|
|
|
ki->kaio_ballowed_count = max_buf_aio;
|
|
|
|
ki->kaio_buffer_count = 0;
|
2006-01-22 05:59:27 +00:00
|
|
|
TAILQ_INIT(&ki->kaio_all);
|
|
|
|
TAILQ_INIT(&ki->kaio_done);
|
2005-05-30 19:33:33 +00:00
|
|
|
TAILQ_INIT(&ki->kaio_jobqueue);
|
|
|
|
TAILQ_INIT(&ki->kaio_liojoblist);
|
2006-03-23 08:46:42 +00:00
|
|
|
TAILQ_INIT(&ki->kaio_syncqueue);
|
2016-03-01 18:12:14 +00:00
|
|
|
TAILQ_INIT(&ki->kaio_syncready);
|
2006-03-23 08:46:42 +00:00
|
|
|
TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
|
2016-03-01 18:12:14 +00:00
|
|
|
TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
|
2005-05-30 19:33:33 +00:00
|
|
|
PROC_LOCK(p);
|
1997-07-06 02:40:43 +00:00
|
|
|
if (p->p_aioinfo == NULL) {
|
|
|
|
p->p_aioinfo = ki;
|
2005-05-30 19:33:33 +00:00
|
|
|
PROC_UNLOCK(p);
|
|
|
|
} else {
|
|
|
|
PROC_UNLOCK(p);
|
2006-05-09 00:10:11 +00:00
|
|
|
mtx_destroy(&ki->kaio_mtx);
|
2005-05-30 19:33:33 +00:00
|
|
|
uma_zfree(kaio_zone, ki);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2004-08-13 17:43:53 +00:00
|
|
|
|
2008-06-21 11:34:34 +00:00
|
|
|
while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
|
2006-01-22 05:59:27 +00:00
|
|
|
aio_newproc(NULL);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
2005-11-03 05:25:26 +00:00
|
|
|
static int
|
|
|
|
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
|
|
|
|
{
|
2010-10-09 02:50:23 +00:00
|
|
|
struct thread *td;
|
|
|
|
int error;
|
2006-05-09 00:10:11 +00:00
|
|
|
|
2010-10-09 02:50:23 +00:00
|
|
|
error = sigev_findtd(p, sigev, &td);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
2005-11-03 05:25:26 +00:00
|
|
|
if (!KSI_ONQ(ksi)) {
|
2010-10-09 02:50:23 +00:00
|
|
|
ksiginfo_set_sigev(ksi, sigev);
|
2005-11-03 05:25:26 +00:00
|
|
|
ksi->ksi_code = SI_ASYNCIO;
|
|
|
|
ksi->ksi_flags |= KSI_EXT | KSI_INS;
|
2010-10-09 02:50:23 +00:00
|
|
|
tdsendsignal(p, td, ksi->ksi_signo, ksi);
|
2005-11-03 05:25:26 +00:00
|
|
|
}
|
2006-05-09 00:10:11 +00:00
|
|
|
PROC_UNLOCK(p);
|
2010-10-09 02:50:23 +00:00
|
|
|
return (error);
|
2005-11-03 05:25:26 +00:00
|
|
|
}
|
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Free a job entry. Wait for completion if it is currently active, but don't
|
|
|
|
* delay forever. If we delay, we return a flag that says that we have to
|
|
|
|
* restart the queue scan.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
2001-03-05 01:30:23 +00:00
|
|
|
static int
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_free_entry(struct kaiocb *job)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo *ki;
|
2006-01-22 05:59:27 +00:00
|
|
|
struct aioliojob *lj;
|
1997-07-06 02:40:43 +00:00
|
|
|
struct proc *p;
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
p = job->userproc;
|
2006-01-22 05:59:27 +00:00
|
|
|
MPASS(curproc == p);
|
1997-07-06 02:40:43 +00:00
|
|
|
ki = p->p_aioinfo;
|
2006-01-22 05:59:27 +00:00
|
|
|
MPASS(ki != NULL);
|
|
|
|
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK_ASSERT(ki, MA_OWNED);
|
2016-03-01 18:12:14 +00:00
|
|
|
MPASS(job->jobflags & KAIOCB_FINISHED);
|
2006-05-09 00:10:11 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
atomic_subtract_int(&num_queue_count, 1);
|
|
|
|
|
|
|
|
ki->kaio_count--;
|
|
|
|
MPASS(ki->kaio_count >= 0);
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_REMOVE(&ki->kaio_done, job, plist);
|
|
|
|
TAILQ_REMOVE(&ki->kaio_all, job, allist);
|
2006-02-26 12:56:23 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
lj = job->lio;
|
2006-01-22 05:59:27 +00:00
|
|
|
if (lj) {
|
|
|
|
lj->lioj_count--;
|
|
|
|
lj->lioj_finished_count--;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2006-01-23 02:49:34 +00:00
|
|
|
if (lj->lioj_count == 0) {
|
2006-01-22 05:59:27 +00:00
|
|
|
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
|
|
|
|
/* lio is going away, we need to destroy any knotes */
|
|
|
|
knlist_delete(&lj->klist, curthread, 1);
|
2006-05-09 00:10:11 +00:00
|
|
|
PROC_LOCK(p);
|
2006-01-22 05:59:27 +00:00
|
|
|
sigqueue_take(&lj->lioj_ksi);
|
2006-05-09 00:10:11 +00:00
|
|
|
PROC_UNLOCK(p);
|
2006-01-22 05:59:27 +00:00
|
|
|
uma_zfree(aiolio_zone, lj);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
}
|
1997-11-29 01:33:10 +00:00
|
|
|
}
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
/* job is going away, we need to destroy any knotes */
|
|
|
|
knlist_delete(&job->klist, curthread, 1);
|
2006-05-09 00:10:11 +00:00
|
|
|
PROC_LOCK(p);
|
2016-02-05 20:38:09 +00:00
|
|
|
sigqueue_take(&job->ksi);
|
2006-05-09 00:10:11 +00:00
|
|
|
PROC_UNLOCK(p);
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2005-11-08 17:43:05 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The thread argument here is used to find the owning process
|
|
|
|
* and is also passed to fo_close() which may pass it to various
|
|
|
|
* places such as devsw close() routines. Because of that, we
|
|
|
|
* need a thread pointer from the process owning the job that is
|
|
|
|
* persistent and won't disappear out from under us or move to
|
|
|
|
* another process.
|
|
|
|
*
|
|
|
|
* Currently, all the callers of this function call it to remove
|
2016-02-05 20:38:09 +00:00
|
|
|
* a kaiocb from the current process' job list either via a
|
2005-11-08 17:43:05 +00:00
|
|
|
* syscall or due to the current process calling exit() or
|
|
|
|
* execve(). Thus, we know that p == curproc. We also know that
|
|
|
|
* curthread can't exit since we are curthread.
|
|
|
|
*
|
|
|
|
* Therefore, we use curthread as the thread to pass to
|
|
|
|
* knlist_delete(). This does mean that it is possible for the
|
|
|
|
* thread pointer at close time to differ from the thread pointer
|
|
|
|
* at open time, but this is already true of file descriptors in
|
|
|
|
* a multithreaded process.
|
2001-09-12 08:38:13 +00:00
|
|
|
*/
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->fd_file)
|
|
|
|
fdrop(job->fd_file, curthread);
|
|
|
|
crfree(job->cred);
|
|
|
|
uma_zfree(aiocb_zone, job);
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2003-01-13 15:06:05 +00:00
|
|
|
return (0);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
2006-08-15 12:10:57 +00:00
|
|
|
static void
|
2016-01-26 21:24:49 +00:00
|
|
|
aio_proc_rundown_exec(void *arg, struct proc *p,
|
|
|
|
struct image_params *imgp __unused)
|
2006-08-15 12:10:57 +00:00
|
|
|
{
|
|
|
|
aio_proc_rundown(arg, p);
|
|
|
|
}
|
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
static int
|
|
|
|
aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
|
|
|
|
{
|
|
|
|
aio_cancel_fn_t *func;
|
|
|
|
int cancelled;
|
|
|
|
|
|
|
|
AIO_LOCK_ASSERT(ki, MA_OWNED);
|
|
|
|
if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
|
|
|
|
return (0);
|
|
|
|
MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
|
|
|
|
job->jobflags |= KAIOCB_CANCELLED;
|
|
|
|
|
|
|
|
func = job->cancel_fn;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there is no cancel routine, just leave the job marked as
|
|
|
|
* cancelled. The job should be in active use by a caller who
|
|
|
|
* should complete it normally or when it fails to install a
|
|
|
|
* cancel routine.
|
|
|
|
*/
|
|
|
|
if (func == NULL)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set the CANCELLING flag so that aio_complete() will defer
|
|
|
|
* completions of this job. This prevents the job from being
|
|
|
|
* freed out from under the cancel callback. After the
|
|
|
|
* callback any deferred completion (whether from the callback
|
|
|
|
* or any other source) will be completed.
|
|
|
|
*/
|
|
|
|
job->jobflags |= KAIOCB_CANCELLING;
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
func(job);
|
|
|
|
AIO_LOCK(ki);
|
|
|
|
job->jobflags &= ~KAIOCB_CANCELLING;
|
|
|
|
if (job->jobflags & KAIOCB_FINISHED) {
|
|
|
|
cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
|
|
|
|
TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
|
|
|
|
aio_bio_done_notify(p, job);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* The cancel callback might have scheduled an
|
|
|
|
* operation to cancel this request, but it is
|
|
|
|
* only counted as cancelled if the request is
|
|
|
|
* cancelled when the callback returns.
|
|
|
|
*/
|
|
|
|
cancelled = 0;
|
|
|
|
}
|
|
|
|
return (cancelled);
|
|
|
|
}
|
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
/*
|
2004-08-13 17:43:53 +00:00
|
|
|
* Rundown the jobs for a given process.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
2001-12-29 07:13:47 +00:00
|
|
|
static void
|
2003-03-24 21:15:35 +00:00
|
|
|
aio_proc_rundown(void *arg, struct proc *p)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo *ki;
|
2006-01-22 05:59:27 +00:00
|
|
|
struct aioliojob *lj;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job, *jobn;
|
2000-01-14 02:53:29 +00:00
|
|
|
|
2005-11-08 17:43:05 +00:00
|
|
|
KASSERT(curthread->td_proc == p,
|
|
|
|
("%s: called on non-curproc", __func__));
|
1997-07-06 02:40:43 +00:00
|
|
|
ki = p->p_aioinfo;
|
|
|
|
if (ki == NULL)
|
|
|
|
return;
|
|
|
|
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2006-02-26 12:56:23 +00:00
|
|
|
ki->kaio_flags |= KAIO_RUNDOWN;
|
2006-01-22 05:59:27 +00:00
|
|
|
|
|
|
|
restart:
|
1997-10-09 04:14:41 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
/*
|
2006-01-22 05:59:27 +00:00
|
|
|
* Try to cancel all pending requests. This code simulates
|
|
|
|
* aio_cancel on all pending I/O requests.
|
2000-01-14 02:53:29 +00:00
|
|
|
*/
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
|
2016-03-01 18:12:14 +00:00
|
|
|
aio_cancel_job(p, ki, job);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
/* Wait for all running I/O to be finished */
|
2016-03-01 18:12:14 +00:00
|
|
|
if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
ki->kaio_flags |= KAIO_WAKEUP;
|
2006-05-09 00:10:11 +00:00
|
|
|
msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
|
2006-01-22 05:59:27 +00:00
|
|
|
goto restart;
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
}
|
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
/* Free all completed I/O requests. */
|
2016-02-05 20:38:09 +00:00
|
|
|
while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
|
|
|
|
aio_free_entry(job);
|
2006-01-22 05:59:27 +00:00
|
|
|
|
|
|
|
while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
|
2006-01-23 02:49:34 +00:00
|
|
|
if (lj->lioj_count == 0) {
|
2000-01-14 02:53:29 +00:00
|
|
|
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
|
2006-01-22 05:59:27 +00:00
|
|
|
knlist_delete(&lj->klist, curthread, 1);
|
2006-05-09 00:10:11 +00:00
|
|
|
PROC_LOCK(p);
|
2006-01-22 05:59:27 +00:00
|
|
|
sigqueue_take(&lj->lioj_ksi);
|
2006-05-09 00:10:11 +00:00
|
|
|
PROC_UNLOCK(p);
|
2002-03-20 04:09:59 +00:00
|
|
|
uma_zfree(aiolio_zone, lj);
|
2000-01-14 02:53:29 +00:00
|
|
|
} else {
|
2006-01-23 02:49:34 +00:00
|
|
|
panic("LIO job not cleaned up: C:%d, FC:%d\n",
|
|
|
|
lj->lioj_count, lj->lioj_finished_count);
|
2000-01-14 02:53:29 +00:00
|
|
|
}
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
}
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2016-01-14 20:51:48 +00:00
|
|
|
taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
|
2016-03-01 18:12:14 +00:00
|
|
|
taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
|
2007-08-20 11:53:26 +00:00
|
|
|
mtx_destroy(&ki->kaio_mtx);
|
2002-03-20 04:09:59 +00:00
|
|
|
uma_zfree(kaio_zone, ki);
|
1997-10-09 04:14:41 +00:00
|
|
|
p->p_aioinfo = NULL;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Select a job to run (called by an AIO daemon).
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
2016-02-05 20:38:09 +00:00
|
|
|
static struct kaiocb *
|
2016-01-21 02:20:38 +00:00
|
|
|
aio_selectjob(struct aioproc *aiop)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job;
|
2000-01-14 02:53:29 +00:00
|
|
|
struct kaioinfo *ki;
|
|
|
|
struct proc *userp;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_assert(&aio_job_mtx, MA_OWNED);
|
2016-03-01 18:12:14 +00:00
|
|
|
restart:
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_FOREACH(job, &aio_jobs, list) {
|
|
|
|
userp = job->userproc;
|
1997-07-06 02:40:43 +00:00
|
|
|
ki = userp->p_aioinfo;
|
|
|
|
|
|
|
|
if (ki->kaio_active_count < ki->kaio_maxactive_count) {
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_REMOVE(&aio_jobs, job, list);
|
2016-03-01 18:12:14 +00:00
|
|
|
if (!aio_clear_cancel_function(job))
|
|
|
|
goto restart;
|
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
/* Account for currently active jobs. */
|
|
|
|
ki->kaio_active_count++;
|
|
|
|
break;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
}
|
2016-02-05 20:38:09 +00:00
|
|
|
return (job);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
2006-03-23 08:46:42 +00:00
|
|
|
/*
|
2016-01-26 21:24:49 +00:00
|
|
|
* Move all data to a permanent storage device. This code
|
|
|
|
* simulates the fsync syscall.
|
2006-03-23 08:46:42 +00:00
|
|
|
*/
|
|
|
|
static int
|
|
|
|
aio_fsync_vnode(struct thread *td, struct vnode *vp)
|
|
|
|
{
|
|
|
|
struct mount *mp;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
|
|
|
|
goto drop;
|
2008-01-10 01:10:58 +00:00
|
|
|
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
|
2006-03-23 08:46:42 +00:00
|
|
|
if (vp->v_object != NULL) {
|
2013-03-09 02:32:23 +00:00
|
|
|
VM_OBJECT_WLOCK(vp->v_object);
|
2006-03-23 08:46:42 +00:00
|
|
|
vm_object_page_clean(vp->v_object, 0, 0, 0);
|
2013-03-09 02:32:23 +00:00
|
|
|
VM_OBJECT_WUNLOCK(vp->v_object);
|
2006-03-23 08:46:42 +00:00
|
|
|
}
|
|
|
|
error = VOP_FSYNC(vp, MNT_WAIT, td);
|
|
|
|
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, 0);
|
2006-03-23 08:46:42 +00:00
|
|
|
vn_finished_write(mp);
|
|
|
|
drop:
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
/*
|
2013-06-08 13:02:43 +00:00
|
|
|
* The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
|
|
|
|
* does the I/O request for the non-physio version of the operations. The
|
|
|
|
* normal vn operations are used, and this code should work in all instances
|
|
|
|
* for every type of file, including pipes, sockets, fifos, and regular files.
|
2006-01-22 05:59:27 +00:00
|
|
|
*
|
2006-01-24 07:24:24 +00:00
|
|
|
* XXX I don't think it works well for socket, pipe, and fifo.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
2001-03-05 01:30:23 +00:00
|
|
|
static void
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_process_rw(struct kaiocb *job)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2002-11-07 20:46:37 +00:00
|
|
|
struct ucred *td_savedcred;
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td;
|
1997-07-06 02:40:43 +00:00
|
|
|
struct aiocb *cb;
|
|
|
|
struct file *fp;
|
|
|
|
struct uio auio;
|
|
|
|
struct iovec aiov;
|
2016-03-21 21:37:33 +00:00
|
|
|
ssize_t cnt;
|
2016-06-21 22:19:06 +00:00
|
|
|
long msgsnd_st, msgsnd_end;
|
|
|
|
long msgrcv_st, msgrcv_end;
|
|
|
|
long oublock_st, oublock_end;
|
|
|
|
long inblock_st, inblock_end;
|
1997-07-06 02:40:43 +00:00
|
|
|
int error;
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
|
|
|
|
job->uaiocb.aio_lio_opcode == LIO_WRITE,
|
|
|
|
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
|
2013-06-08 13:02:43 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
aio_switch_vmspace(job);
|
2001-09-12 08:38:13 +00:00
|
|
|
td = curthread;
|
2002-11-07 20:46:37 +00:00
|
|
|
td_savedcred = td->td_ucred;
|
2016-02-05 20:38:09 +00:00
|
|
|
td->td_ucred = job->cred;
|
|
|
|
cb = &job->uaiocb;
|
|
|
|
fp = job->fd_file;
|
2000-01-14 02:53:29 +00:00
|
|
|
|
2001-12-09 08:16:36 +00:00
|
|
|
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
|
1997-07-06 02:40:43 +00:00
|
|
|
aiov.iov_len = cb->aio_nbytes;
|
|
|
|
|
|
|
|
auio.uio_iov = &aiov;
|
|
|
|
auio.uio_iovcnt = 1;
|
2002-04-04 02:13:20 +00:00
|
|
|
auio.uio_offset = cb->aio_offset;
|
1997-07-06 02:40:43 +00:00
|
|
|
auio.uio_resid = cb->aio_nbytes;
|
|
|
|
cnt = cb->aio_nbytes;
|
|
|
|
auio.uio_segflg = UIO_USERSPACE;
|
2001-09-12 08:38:13 +00:00
|
|
|
auio.uio_td = td;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2016-06-21 22:19:06 +00:00
|
|
|
msgrcv_st = td->td_ru.ru_msgrcv;
|
|
|
|
msgsnd_st = td->td_ru.ru_msgsnd;
|
2007-06-01 01:12:45 +00:00
|
|
|
inblock_st = td->td_ru.ru_inblock;
|
|
|
|
oublock_st = td->td_ru.ru_oublock;
|
2016-06-21 22:19:06 +00:00
|
|
|
|
2000-11-18 21:01:04 +00:00
|
|
|
/*
|
2006-01-23 02:49:34 +00:00
|
|
|
* aio_aqueue() acquires a reference to the file that is
|
2002-04-04 02:13:20 +00:00
|
|
|
* released in aio_free_entry().
|
2000-11-18 21:01:04 +00:00
|
|
|
*/
|
1997-07-06 02:40:43 +00:00
|
|
|
if (cb->aio_lio_opcode == LIO_READ) {
|
|
|
|
auio.uio_rw = UIO_READ;
|
2007-08-20 11:53:26 +00:00
|
|
|
if (auio.uio_resid == 0)
|
|
|
|
error = 0;
|
|
|
|
else
|
|
|
|
error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
|
1997-07-06 02:40:43 +00:00
|
|
|
} else {
|
2006-01-27 08:02:25 +00:00
|
|
|
if (fp->f_type == DTYPE_VNODE)
|
|
|
|
bwillwrite();
|
1997-07-06 02:40:43 +00:00
|
|
|
auio.uio_rw = UIO_WRITE;
|
2001-09-12 08:38:13 +00:00
|
|
|
error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2016-06-21 22:19:06 +00:00
|
|
|
msgrcv_end = td->td_ru.ru_msgrcv;
|
|
|
|
msgsnd_end = td->td_ru.ru_msgsnd;
|
2007-06-01 01:12:45 +00:00
|
|
|
inblock_end = td->td_ru.ru_inblock;
|
|
|
|
oublock_end = td->td_ru.ru_oublock;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2016-06-21 22:19:06 +00:00
|
|
|
job->msgrcv = msgrcv_end - msgrcv_st;
|
|
|
|
job->msgsnd = msgsnd_end - msgsnd_st;
|
|
|
|
job->inblock = inblock_end - inblock_st;
|
|
|
|
job->outblock = oublock_end - oublock_st;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
if ((error) && (auio.uio_resid != cnt)) {
|
|
|
|
if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
|
|
|
|
error = 0;
|
2001-03-07 03:37:06 +00:00
|
|
|
if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
|
2016-03-01 18:12:14 +00:00
|
|
|
PROC_LOCK(job->userproc);
|
|
|
|
kern_psignal(job->userproc, SIGPIPE);
|
|
|
|
PROC_UNLOCK(job->userproc);
|
2001-03-07 03:37:06 +00:00
|
|
|
}
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
cnt -= auio.uio_resid;
|
2002-11-07 20:46:37 +00:00
|
|
|
td->td_ucred = td_savedcred;
|
2016-05-20 19:46:25 +00:00
|
|
|
if (error)
|
|
|
|
aio_complete(job, -1, error);
|
|
|
|
else
|
|
|
|
aio_complete(job, cnt, 0);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
2013-06-08 13:02:43 +00:00
|
|
|
static void
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_process_sync(struct kaiocb *job)
|
2013-06-08 13:02:43 +00:00
|
|
|
{
|
|
|
|
struct thread *td = curthread;
|
|
|
|
struct ucred *td_savedcred = td->td_ucred;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct file *fp = job->fd_file;
|
2013-06-08 13:02:43 +00:00
|
|
|
int error = 0;
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
|
|
|
|
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
|
2013-06-08 13:02:43 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
td->td_ucred = job->cred;
|
2013-06-08 13:02:43 +00:00
|
|
|
if (fp->f_vnode != NULL)
|
|
|
|
error = aio_fsync_vnode(td, fp->f_vnode);
|
|
|
|
td->td_ucred = td_savedcred;
|
2016-05-20 19:46:25 +00:00
|
|
|
if (error)
|
|
|
|
aio_complete(job, -1, error);
|
|
|
|
else
|
|
|
|
aio_complete(job, 0, 0);
|
2013-06-08 13:02:43 +00:00
|
|
|
}
|
|
|
|
|
2013-06-08 13:27:57 +00:00
|
|
|
static void
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_process_mlock(struct kaiocb *job)
|
2013-06-08 13:27:57 +00:00
|
|
|
{
|
2016-02-05 20:38:09 +00:00
|
|
|
struct aiocb *cb = &job->uaiocb;
|
2013-06-08 13:27:57 +00:00
|
|
|
int error;
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
|
|
|
|
("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
|
2013-06-08 13:27:57 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
aio_switch_vmspace(job);
|
2016-02-05 20:38:09 +00:00
|
|
|
error = vm_mlock(job->userproc, job->cred,
|
2013-06-08 13:27:57 +00:00
|
|
|
__DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
|
2016-05-20 19:46:25 +00:00
|
|
|
if (error)
|
|
|
|
aio_complete(job, -1, error);
|
|
|
|
else
|
|
|
|
aio_complete(job, 0, 0);
|
2013-06-08 13:27:57 +00:00
|
|
|
}
|
|
|
|
|
2005-10-12 17:51:31 +00:00
|
|
|
static void
|
2016-03-01 18:12:14 +00:00
|
|
|
aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
|
2006-01-22 05:59:27 +00:00
|
|
|
{
|
|
|
|
struct aioliojob *lj;
|
2005-10-12 17:51:31 +00:00
|
|
|
struct kaioinfo *ki;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *sjob, *sjobn;
|
2006-01-22 05:59:27 +00:00
|
|
|
int lj_done;
|
2016-03-01 18:12:14 +00:00
|
|
|
bool schedule_fsync;
|
2005-10-12 17:51:31 +00:00
|
|
|
|
|
|
|
ki = userp->p_aioinfo;
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK_ASSERT(ki, MA_OWNED);
|
2016-02-05 20:38:09 +00:00
|
|
|
lj = job->lio;
|
2005-10-12 17:51:31 +00:00
|
|
|
lj_done = 0;
|
|
|
|
if (lj) {
|
2006-01-22 05:59:27 +00:00
|
|
|
lj->lioj_finished_count++;
|
|
|
|
if (lj->lioj_count == lj->lioj_finished_count)
|
2005-10-12 17:51:31 +00:00
|
|
|
lj_done = 1;
|
|
|
|
}
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
|
2016-03-01 18:12:14 +00:00
|
|
|
MPASS(job->jobflags & KAIOCB_FINISHED);
|
2006-02-26 12:56:23 +00:00
|
|
|
|
|
|
|
if (ki->kaio_flags & KAIO_RUNDOWN)
|
|
|
|
goto notification_done;
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
|
|
|
|
job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
|
|
|
|
aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
|
2005-10-12 17:51:31 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
KNOTE_LOCKED(&job->klist, 1);
|
2005-10-12 17:51:31 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
if (lj_done) {
|
|
|
|
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
|
|
|
|
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
|
|
|
|
KNOTE_LOCKED(&lj->klist, 1);
|
|
|
|
}
|
|
|
|
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
|
|
|
|
== LIOJ_SIGNAL
|
|
|
|
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
|
|
|
|
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
|
|
|
|
aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
|
|
|
|
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
|
2005-10-12 17:51:31 +00:00
|
|
|
}
|
|
|
|
}
|
2006-02-26 12:56:23 +00:00
|
|
|
|
|
|
|
notification_done:
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->jobflags & KAIOCB_CHECKSYNC) {
|
2016-03-01 18:12:14 +00:00
|
|
|
schedule_fsync = false;
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
|
|
|
|
if (job->fd_file == sjob->fd_file &&
|
|
|
|
job->seqno < sjob->seqno) {
|
|
|
|
if (--sjob->pending == 0) {
|
|
|
|
TAILQ_REMOVE(&ki->kaio_syncqueue, sjob,
|
2016-01-26 21:24:49 +00:00
|
|
|
list);
|
2016-03-01 18:12:14 +00:00
|
|
|
if (!aio_clear_cancel_function(sjob))
|
|
|
|
continue;
|
|
|
|
TAILQ_INSERT_TAIL(&ki->kaio_syncready,
|
|
|
|
sjob, list);
|
|
|
|
schedule_fsync = true;
|
2006-03-23 08:46:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-03-01 18:12:14 +00:00
|
|
|
if (schedule_fsync)
|
|
|
|
taskqueue_enqueue(taskqueue_aiod_kick,
|
|
|
|
&ki->kaio_sync_task);
|
2006-03-23 08:46:42 +00:00
|
|
|
}
|
2006-02-26 12:56:23 +00:00
|
|
|
if (ki->kaio_flags & KAIO_WAKEUP) {
|
2006-01-22 05:59:27 +00:00
|
|
|
ki->kaio_flags &= ~KAIO_WAKEUP;
|
|
|
|
wakeup(&userp->p_aioinfo);
|
2005-10-12 17:51:31 +00:00
|
|
|
}
|
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2016-01-19 21:37:51 +00:00
|
|
|
static void
|
2016-03-01 18:12:14 +00:00
|
|
|
aio_schedule_fsync(void *context, int pending)
|
|
|
|
{
|
|
|
|
struct kaioinfo *ki;
|
|
|
|
struct kaiocb *job;
|
|
|
|
|
|
|
|
ki = context;
|
|
|
|
AIO_LOCK(ki);
|
|
|
|
while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
|
|
|
|
job = TAILQ_FIRST(&ki->kaio_syncready);
|
|
|
|
TAILQ_REMOVE(&ki->kaio_syncready, job, list);
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
aio_schedule(job, aio_process_sync);
|
|
|
|
AIO_LOCK(ki);
|
|
|
|
}
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
aio_cancel_cleared(struct kaiocb *job)
|
|
|
|
{
|
|
|
|
struct kaioinfo *ki;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The caller should hold the same queue lock held when
|
|
|
|
* aio_clear_cancel_function() was called and set this flag
|
|
|
|
* ensuring this check sees an up-to-date value. However,
|
|
|
|
* there is no way to assert that.
|
|
|
|
*/
|
|
|
|
ki = job->userproc->p_aioinfo;
|
|
|
|
return ((job->jobflags & KAIOCB_CLEARED) != 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
aio_clear_cancel_function(struct kaiocb *job)
|
|
|
|
{
|
|
|
|
struct kaioinfo *ki;
|
|
|
|
|
|
|
|
ki = job->userproc->p_aioinfo;
|
|
|
|
AIO_LOCK(ki);
|
|
|
|
MPASS(job->cancel_fn != NULL);
|
|
|
|
if (job->jobflags & KAIOCB_CANCELLING) {
|
|
|
|
job->jobflags |= KAIOCB_CLEARED;
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
return (false);
|
|
|
|
}
|
|
|
|
job->cancel_fn = NULL;
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
return (true);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
|
|
|
|
{
|
|
|
|
struct kaioinfo *ki;
|
|
|
|
|
|
|
|
ki = job->userproc->p_aioinfo;
|
|
|
|
AIO_LOCK(ki);
|
|
|
|
if (job->jobflags & KAIOCB_CANCELLED) {
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
return (false);
|
|
|
|
}
|
|
|
|
job->cancel_fn = func;
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
return (true);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
aio_complete(struct kaiocb *job, long status, int error)
|
|
|
|
{
|
|
|
|
struct kaioinfo *ki;
|
|
|
|
struct proc *userp;
|
|
|
|
|
|
|
|
job->uaiocb._aiocb_private.error = error;
|
|
|
|
job->uaiocb._aiocb_private.status = status;
|
|
|
|
|
|
|
|
userp = job->userproc;
|
|
|
|
ki = userp->p_aioinfo;
|
|
|
|
|
|
|
|
AIO_LOCK(ki);
|
|
|
|
KASSERT(!(job->jobflags & KAIOCB_FINISHED),
|
|
|
|
("duplicate aio_complete"));
|
|
|
|
job->jobflags |= KAIOCB_FINISHED;
|
|
|
|
if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
|
|
|
|
TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
|
|
|
|
aio_bio_done_notify(userp, job);
|
|
|
|
}
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
aio_cancel(struct kaiocb *job)
|
|
|
|
{
|
|
|
|
|
|
|
|
aio_complete(job, -1, ECANCELED);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_switch_vmspace(struct kaiocb *job)
|
2016-01-19 21:37:51 +00:00
|
|
|
{
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
vmspace_switch_aio(job->userproc->p_vmspace);
|
2016-01-19 21:37:51 +00:00
|
|
|
}
|
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
/*
|
2013-06-08 13:02:43 +00:00
|
|
|
* The AIO daemon, most of the actual work is done in aio_process_*,
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
* but the setup (and address space mgmt) is done in this routine.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
|
|
|
static void
|
2006-01-22 05:59:27 +00:00
|
|
|
aio_daemon(void *_id)
|
1997-07-06 02:40:43 +00:00
|
|
|
{
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job;
|
2016-01-21 02:20:38 +00:00
|
|
|
struct aioproc *aiop;
|
2000-01-14 02:53:29 +00:00
|
|
|
struct kaioinfo *ki;
|
2016-03-01 18:12:14 +00:00
|
|
|
struct proc *p;
|
2016-01-19 21:37:51 +00:00
|
|
|
struct vmspace *myvm;
|
2001-09-12 08:38:13 +00:00
|
|
|
struct thread *td = curthread;
|
2006-01-22 05:59:27 +00:00
|
|
|
int id = (intptr_t)_id;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
|
|
|
/*
|
2016-01-19 21:37:51 +00:00
|
|
|
* Grab an extra reference on the daemon's vmspace so that it
|
|
|
|
* doesn't get freed by jobs that switch to a different
|
|
|
|
* vmspace.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
2016-01-19 21:37:51 +00:00
|
|
|
p = td->td_proc;
|
|
|
|
myvm = vmspace_acquire_ref(p);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2016-01-19 21:37:51 +00:00
|
|
|
KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
|
1997-07-06 02:40:43 +00:00
|
|
|
|
1997-11-29 01:33:10 +00:00
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Allocate and ready the aio control info. There is one aiop structure
|
|
|
|
* per daemon.
|
1997-11-29 01:33:10 +00:00
|
|
|
*/
|
2003-02-19 05:47:46 +00:00
|
|
|
aiop = uma_zalloc(aiop_zone, M_WAITOK);
|
2016-01-21 02:20:38 +00:00
|
|
|
aiop->aioproc = p;
|
|
|
|
aiop->aioprocflags = 0;
|
2000-01-14 02:53:29 +00:00
|
|
|
|
1997-11-29 01:33:10 +00:00
|
|
|
/*
|
|
|
|
* Wakeup parent process. (Parent sleeps to keep from blasting away
|
2001-09-12 08:38:13 +00:00
|
|
|
* and creating too many daemons.)
|
1997-11-29 01:33:10 +00:00
|
|
|
*/
|
2006-01-22 05:59:27 +00:00
|
|
|
sema_post(&aio_newproc_sem);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_lock(&aio_job_mtx);
|
2000-01-14 02:53:29 +00:00
|
|
|
for (;;) {
|
1997-11-29 01:33:10 +00:00
|
|
|
/*
|
|
|
|
* Take daemon off of free queue
|
|
|
|
*/
|
2016-01-21 02:20:38 +00:00
|
|
|
if (aiop->aioprocflags & AIOP_FREE) {
|
1997-07-06 02:40:43 +00:00
|
|
|
TAILQ_REMOVE(&aio_freeproc, aiop, list);
|
2016-01-21 02:20:38 +00:00
|
|
|
aiop->aioprocflags &= ~AIOP_FREE;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
1997-11-29 01:33:10 +00:00
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Check for jobs.
|
1997-11-29 01:33:10 +00:00
|
|
|
*/
|
2016-02-05 20:38:09 +00:00
|
|
|
while ((job = aio_selectjob(aiop)) != NULL) {
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_unlock(&aio_job_mtx);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
ki = job->userproc->p_aioinfo;
|
|
|
|
job->handle_fn(job);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
|
2006-01-22 09:25:52 +00:00
|
|
|
mtx_lock(&aio_job_mtx);
|
|
|
|
/* Decrement the active job count. */
|
|
|
|
ki->kaio_active_count--;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
1997-11-29 01:33:10 +00:00
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Disconnect from user address space.
|
1997-11-29 01:33:10 +00:00
|
|
|
*/
|
2016-01-19 21:37:51 +00:00
|
|
|
if (p->p_vmspace != myvm) {
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_unlock(&aio_job_mtx);
|
2016-01-19 21:37:51 +00:00
|
|
|
vmspace_switch_aio(myvm);
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_lock(&aio_job_mtx);
|
|
|
|
/*
|
|
|
|
* We have to restart to avoid race, we only sleep if
|
2016-01-19 21:37:51 +00:00
|
|
|
* no job can be selected.
|
2006-01-22 05:59:27 +00:00
|
|
|
*/
|
|
|
|
continue;
|
1997-11-29 01:33:10 +00:00
|
|
|
}
|
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_assert(&aio_job_mtx, MA_OWNED);
|
|
|
|
|
1997-11-29 01:33:10 +00:00
|
|
|
TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
|
2016-01-21 02:20:38 +00:00
|
|
|
aiop->aioprocflags |= AIOP_FREE;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* If daemon is inactive for a long time, allow it to exit,
|
|
|
|
* thereby freeing resources.
|
1997-11-29 01:33:10 +00:00
|
|
|
*/
|
2016-01-21 02:20:38 +00:00
|
|
|
if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
|
2016-01-19 21:37:51 +00:00
|
|
|
aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
|
2016-01-21 02:20:38 +00:00
|
|
|
(aiop->aioprocflags & AIOP_FREE) &&
|
2016-01-19 21:37:51 +00:00
|
|
|
num_aio_procs > target_aio_procs)
|
|
|
|
break;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2016-01-19 21:37:51 +00:00
|
|
|
TAILQ_REMOVE(&aio_freeproc, aiop, list);
|
|
|
|
num_aio_procs--;
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_unlock(&aio_job_mtx);
|
2016-01-19 21:37:51 +00:00
|
|
|
uma_zfree(aiop_zone, aiop);
|
|
|
|
free_unr(aiod_unr, id);
|
|
|
|
vmspace_free(myvm);
|
|
|
|
|
|
|
|
KASSERT(p->p_vmspace == myvm,
|
|
|
|
("AIOD: bad vmspace for exiting daemon"));
|
|
|
|
KASSERT(myvm->vm_refcnt > 1,
|
|
|
|
("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
|
|
|
|
kproc_exit(0);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-01-22 05:59:27 +00:00
|
|
|
* Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
|
2000-01-14 02:53:29 +00:00
|
|
|
* AIO daemon modifies its environment itself.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
|
|
|
static int
|
2006-01-22 05:59:27 +00:00
|
|
|
aio_newproc(int *start)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
1997-07-06 02:40:43 +00:00
|
|
|
int error;
|
2001-03-09 06:27:01 +00:00
|
|
|
struct proc *p;
|
2006-01-22 05:59:27 +00:00
|
|
|
int id;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
id = alloc_unr(aiod_unr);
|
2007-10-20 23:23:23 +00:00
|
|
|
error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
|
2006-01-22 05:59:27 +00:00
|
|
|
RFNOWAIT, 0, "aiod%d", id);
|
|
|
|
if (error == 0) {
|
|
|
|
/*
|
|
|
|
* Wait until daemon is started.
|
|
|
|
*/
|
|
|
|
sema_wait(&aio_newproc_sem);
|
|
|
|
mtx_lock(&aio_job_mtx);
|
|
|
|
num_aio_procs++;
|
|
|
|
if (start != NULL)
|
2006-01-23 23:46:30 +00:00
|
|
|
(*start)--;
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_unlock(&aio_job_mtx);
|
|
|
|
} else {
|
|
|
|
free_unr(aiod_unr, id);
|
|
|
|
}
|
2003-01-13 15:06:05 +00:00
|
|
|
return (error);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
1997-11-29 01:33:10 +00:00
|
|
|
/*
|
2001-03-05 01:30:23 +00:00
|
|
|
* Try the high-performance, low-overhead physio method for eligible
|
|
|
|
* VCHR devices. This method doesn't use an aio helper thread, and
|
2004-08-13 17:43:53 +00:00
|
|
|
* thus has very low overhead.
|
2001-03-05 01:30:23 +00:00
|
|
|
*
|
2006-01-23 02:49:34 +00:00
|
|
|
* Assumes that the caller, aio_aqueue(), has incremented the file
|
2001-03-05 01:30:23 +00:00
|
|
|
* structure's reference count, preventing its deallocation for the
|
2004-08-13 17:43:53 +00:00
|
|
|
* duration of this call.
|
1997-11-29 01:33:10 +00:00
|
|
|
*/
|
2001-03-05 01:30:23 +00:00
|
|
|
static int
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_qphysio(struct proc *p, struct kaiocb *job)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
|
|
|
struct aiocb *cb;
|
|
|
|
struct file *fp;
|
2015-04-22 18:11:34 +00:00
|
|
|
struct bio *bp;
|
|
|
|
struct buf *pbuf;
|
1997-11-29 01:33:10 +00:00
|
|
|
struct vnode *vp;
|
2013-03-27 11:47:52 +00:00
|
|
|
struct cdevsw *csw;
|
|
|
|
struct cdev *dev;
|
1997-11-29 01:33:10 +00:00
|
|
|
struct kaioinfo *ki;
|
2016-03-31 17:27:30 +00:00
|
|
|
int error, ref, poff;
|
2015-04-22 18:11:34 +00:00
|
|
|
vm_prot_t prot;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
cb = &job->uaiocb;
|
|
|
|
fp = job->fd_file;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2013-06-08 13:27:57 +00:00
|
|
|
if (fp == NULL || fp->f_type != DTYPE_VNODE)
|
1999-11-07 13:09:09 +00:00
|
|
|
return (-1);
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2003-06-22 08:41:43 +00:00
|
|
|
vp = fp->f_vnode;
|
2015-04-22 18:11:34 +00:00
|
|
|
if (vp->v_type != VCHR)
|
1999-11-07 13:09:09 +00:00
|
|
|
return (-1);
|
2015-04-22 18:11:34 +00:00
|
|
|
if (vp->v_bufobj.bo_bsize == 0)
|
1999-11-07 13:09:09 +00:00
|
|
|
return (-1);
|
2015-04-22 18:11:34 +00:00
|
|
|
if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
|
1999-11-07 13:09:09 +00:00
|
|
|
return (-1);
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2013-03-27 11:47:52 +00:00
|
|
|
ref = 0;
|
|
|
|
csw = devvn_refthread(vp, &dev, &ref);
|
|
|
|
if (csw == NULL)
|
|
|
|
return (ENXIO);
|
2015-04-22 18:11:34 +00:00
|
|
|
|
|
|
|
if ((csw->d_flags & D_DISK) == 0) {
|
|
|
|
error = -1;
|
|
|
|
goto unref;
|
|
|
|
}
|
2013-03-27 11:47:52 +00:00
|
|
|
if (cb->aio_nbytes > dev->si_iosize_max) {
|
|
|
|
error = -1;
|
|
|
|
goto unref;
|
|
|
|
}
|
|
|
|
|
2015-04-22 18:11:34 +00:00
|
|
|
ki = p->p_aioinfo;
|
|
|
|
poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
|
2016-03-31 17:27:30 +00:00
|
|
|
if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
|
2015-04-22 18:11:34 +00:00
|
|
|
if (cb->aio_nbytes > MAXPHYS) {
|
|
|
|
error = -1;
|
|
|
|
goto unref;
|
|
|
|
}
|
2016-03-31 17:27:30 +00:00
|
|
|
|
|
|
|
pbuf = NULL;
|
2015-04-22 18:11:34 +00:00
|
|
|
} else {
|
|
|
|
if (cb->aio_nbytes > MAXPHYS - poff) {
|
|
|
|
error = -1;
|
|
|
|
goto unref;
|
|
|
|
}
|
|
|
|
if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
|
|
|
|
error = -1;
|
|
|
|
goto unref;
|
|
|
|
}
|
2016-03-31 17:27:30 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
|
2015-04-22 18:11:34 +00:00
|
|
|
BUF_KERNPROC(pbuf);
|
2016-03-31 17:27:30 +00:00
|
|
|
AIO_LOCK(ki);
|
2015-04-22 18:11:34 +00:00
|
|
|
ki->kaio_buffer_count++;
|
2016-03-31 17:27:30 +00:00
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
}
|
|
|
|
job->bp = bp = g_alloc_bio();
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2015-04-22 18:11:34 +00:00
|
|
|
bp->bio_length = cb->aio_nbytes;
|
|
|
|
bp->bio_bcount = cb->aio_nbytes;
|
|
|
|
bp->bio_done = aio_physwakeup;
|
|
|
|
bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
|
|
|
|
bp->bio_offset = cb->aio_offset;
|
|
|
|
bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
|
|
|
|
bp->bio_dev = dev;
|
2016-02-05 20:38:09 +00:00
|
|
|
bp->bio_caller1 = (void *)job;
|
2015-04-22 18:11:34 +00:00
|
|
|
|
|
|
|
prot = VM_PROT_READ;
|
|
|
|
if (cb->aio_lio_opcode == LIO_READ)
|
|
|
|
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
|
2016-03-31 17:27:30 +00:00
|
|
|
job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
|
2016-02-05 20:38:09 +00:00
|
|
|
(vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
|
2016-03-31 17:27:30 +00:00
|
|
|
nitems(job->pages));
|
|
|
|
if (job->npages < 0) {
|
2015-04-22 18:11:34 +00:00
|
|
|
error = EFAULT;
|
|
|
|
goto doerror;
|
|
|
|
}
|
2016-03-31 17:27:30 +00:00
|
|
|
if (pbuf != NULL) {
|
2015-04-22 18:11:34 +00:00
|
|
|
pmap_qenter((vm_offset_t)pbuf->b_data,
|
2016-02-05 20:38:09 +00:00
|
|
|
job->pages, job->npages);
|
2015-04-22 18:11:34 +00:00
|
|
|
bp->bio_data = pbuf->b_data + poff;
|
2016-03-31 17:27:30 +00:00
|
|
|
atomic_add_int(&num_buf_aio, 1);
|
2015-04-22 18:11:34 +00:00
|
|
|
} else {
|
2016-02-05 20:38:09 +00:00
|
|
|
bp->bio_ma = job->pages;
|
|
|
|
bp->bio_ma_n = job->npages;
|
2015-04-22 18:11:34 +00:00
|
|
|
bp->bio_ma_offset = poff;
|
|
|
|
bp->bio_data = unmapped_buf;
|
|
|
|
bp->bio_flags |= BIO_UNMAPPED;
|
|
|
|
}
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
/* Perform transfer. */
|
2015-04-22 18:11:34 +00:00
|
|
|
csw->d_strategy(bp);
|
2013-03-27 11:47:52 +00:00
|
|
|
dev_relthread(dev, ref);
|
2003-01-13 15:06:05 +00:00
|
|
|
return (0);
|
1997-11-29 01:33:10 +00:00
|
|
|
|
|
|
|
doerror:
|
2016-03-31 17:27:30 +00:00
|
|
|
if (pbuf != NULL) {
|
|
|
|
AIO_LOCK(ki);
|
2015-04-22 18:11:34 +00:00
|
|
|
ki->kaio_buffer_count--;
|
2016-03-31 17:27:30 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2015-04-22 18:11:34 +00:00
|
|
|
relpbuf(pbuf, NULL);
|
2016-02-05 20:38:09 +00:00
|
|
|
job->pbuf = NULL;
|
2015-04-22 18:11:34 +00:00
|
|
|
}
|
|
|
|
g_destroy_bio(bp);
|
2016-02-05 20:38:09 +00:00
|
|
|
job->bp = NULL;
|
2013-03-27 11:47:52 +00:00
|
|
|
unref:
|
|
|
|
dev_relthread(dev, ref);
|
1997-11-29 01:33:10 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
static int
|
|
|
|
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
|
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
|
|
|
|
* supported by AIO with the old sigevent structure.
|
|
|
|
*/
|
|
|
|
nsig->sigev_notify = osig->sigev_notify;
|
|
|
|
switch (nsig->sigev_notify) {
|
|
|
|
case SIGEV_NONE:
|
|
|
|
break;
|
|
|
|
case SIGEV_SIGNAL:
|
|
|
|
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
|
|
|
|
break;
|
|
|
|
case SIGEV_KEVENT:
|
|
|
|
nsig->sigev_notify_kqueue =
|
|
|
|
osig->__sigev_u.__sigev_notify_kqueue;
|
|
|
|
nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
|
|
|
|
{
|
|
|
|
struct oaiocb *ojob;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(kjob, sizeof(struct aiocb));
|
|
|
|
error = copyin(ujob, kjob, sizeof(struct oaiocb));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
ojob = (struct oaiocb *)kjob;
|
|
|
|
return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
|
|
|
|
}
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (copyin(ujob, kjob, sizeof(struct aiocb)));
|
|
|
|
}
|
|
|
|
|
|
|
|
static long
|
|
|
|
aiocb_fetch_status(struct aiocb *ujob)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (fuword(&ujob->_aiocb_private.status));
|
|
|
|
}
|
|
|
|
|
|
|
|
static long
|
|
|
|
aiocb_fetch_error(struct aiocb *ujob)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (fuword(&ujob->_aiocb_private.error));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb_store_status(struct aiocb *ujob, long status)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (suword(&ujob->_aiocb_private.status, status));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb_store_error(struct aiocb *ujob, long error)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (suword(&ujob->_aiocb_private.error, error));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Store the userspace control block address 'ujob' into the userspace
 * pointer slot 'ujobp'.  Returns the result of suword().
 */
static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
	long addr;

	addr = (long)ujob;
	return (suword(ujobp, addr));
}
|
|
|
|
|
|
|
|
static struct aiocb_ops aiocb_ops = {
|
|
|
|
.copyin = aiocb_copyin,
|
|
|
|
.fetch_status = aiocb_fetch_status,
|
|
|
|
.fetch_error = aiocb_fetch_error,
|
|
|
|
.store_status = aiocb_store_status,
|
|
|
|
.store_error = aiocb_store_error,
|
|
|
|
.store_kernelinfo = aiocb_store_kernelinfo,
|
|
|
|
.store_aiocb = aiocb_store_aiocb,
|
|
|
|
};
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
static struct aiocb_ops aiocb_ops_osigevent = {
|
|
|
|
.copyin = aiocb_copyin_old_sigevent,
|
|
|
|
.fetch_status = aiocb_fetch_status,
|
|
|
|
.fetch_error = aiocb_fetch_error,
|
|
|
|
.store_status = aiocb_store_status,
|
|
|
|
.store_error = aiocb_store_error,
|
|
|
|
.store_kernelinfo = aiocb_store_kernelinfo,
|
|
|
|
.store_aiocb = aiocb_store_aiocb,
|
|
|
|
};
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
/*
|
|
|
|
* Queue a new AIO request. Choosing either the threaded or direct physio VCHR
|
|
|
|
* technique is done in this code.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
MFP4 (with some minor changes):
Implement the linux_io_* syscalls (AIO). They are only enabled if the native
AIO code is available (either compiled in to the kernel or as a module) at
the time the functions are used. If the AIO stuff is not available there
will be a ENOSYS.
From the submitter:
---snip---
DESIGN NOTES:
1. Linux permits a process to own multiple AIO queues (distinguished by
"context"), but FreeBSD creates only one single AIO queue per process.
My code maintains a request queue (STAILQ of queue(3)) per "context",
and throws all AIO requests of all contexts owned by a process into
the single FreeBSD per-process AIO queue.
When the process calls io_destroy(2), io_getevents(2), io_submit(2) and
io_cancel(2), my code can pick out requests owned by the specified context
from the single FreeBSD per-process AIO queue according to the per-context
request queues maintained by my code.
2. The request queue maintained by my code stores contrast information between
Linux IO control blocks (struct linux_iocb) and FreeBSD IO control blocks
(struct aiocb). FreeBSD IO control block actually exists in userland memory
space, required by FreeBSD native aio_XXXXXX(2).
3. It is quite troubling that the function io_getevents() of libaio-0.3.105
needs to use Linux-specific "struct aio_ring", which is a partial mirror
of context in user space. I would rather take the address of context in
kernel as the context ID, but the io_getevents() of libaio forces me to
take the address of the "ring" in user space as the context ID.
To my surprise, one comment line in the file "io_getevents.c" of
libaio-0.3.105 reads:
Ben will hate me for this
REFERENCE:
1. Linux kernel source code: http://www.kernel.org/pub/linux/kernel/v2.6/
(include/linux/aio_abi.h, fs/aio.c)
2. Linux manual pages: http://www.kernel.org/pub/linux/docs/manpages/
(io_setup(2), io_destroy(2), io_getevents(2), io_submit(2), io_cancel(2))
3. Linux Scalability Effort: http://lse.sourceforge.net/io/aio.html
The design notes: http://lse.sourceforge.net/io/aionotes.txt
4. The package libaio, both source and binary:
http://rpmfind.net/linux/rpm2html/search.php?query=libaio
Simple transparent interface to Linux AIO system calls.
5. Libaio-oracle: http://oss.oracle.com/projects/libaio-oracle/
POSIX AIO implementation based on Linux AIO system calls (depending on
libaio).
---snip---
Submitted by: Li, Xiao <intron@intron.ac>
2006-10-15 14:22:14 +00:00
|
|
|
int
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
|
2008-12-10 20:56:19 +00:00
|
|
|
int type, struct aiocb_ops *ops)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct proc *p = td->td_proc;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
1997-07-06 02:40:43 +00:00
|
|
|
struct file *fp;
|
2016-03-01 18:12:14 +00:00
|
|
|
struct kaiocb *job;
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo *ki;
|
2000-11-21 19:36:36 +00:00
|
|
|
struct kevent kev;
|
2006-01-22 05:59:27 +00:00
|
|
|
int opcode;
|
|
|
|
int error;
|
2006-09-24 04:47:47 +00:00
|
|
|
int fd, kqfd;
|
2006-01-22 05:59:27 +00:00
|
|
|
int jid;
|
2012-02-01 02:53:06 +00:00
|
|
|
u_short evflags;
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2006-01-23 02:49:34 +00:00
|
|
|
if (p->p_aioinfo == NULL)
|
|
|
|
aio_init_aioinfo(p);
|
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
ki = p->p_aioinfo;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
ops->store_status(ujob, -1);
|
|
|
|
ops->store_error(ujob, 0);
|
|
|
|
ops->store_kernelinfo(ujob, -1);
|
2006-01-23 02:49:34 +00:00
|
|
|
|
|
|
|
if (num_queue_count >= max_queue_count ||
|
|
|
|
ki->kaio_count >= ki->kaio_qallowed_count) {
|
2016-02-05 20:38:09 +00:00
|
|
|
ops->store_error(ujob, EAGAIN);
|
2006-01-23 02:49:34 +00:00
|
|
|
return (EAGAIN);
|
|
|
|
}
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
|
|
|
|
knlist_init_mtx(&job->klist, AIO_MTX(ki));
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
error = ops->copyin(ujob, &job->uaiocb);
|
1997-07-06 02:40:43 +00:00
|
|
|
if (error) {
|
2016-02-05 20:38:09 +00:00
|
|
|
ops->store_error(ujob, error);
|
|
|
|
uma_zfree(aiocb_zone, job);
|
2003-01-13 15:06:05 +00:00
|
|
|
return (error);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2006-01-23 10:27:15 +00:00
|
|
|
|
2016-03-21 21:37:33 +00:00
|
|
|
if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
|
2016-02-05 20:38:09 +00:00
|
|
|
uma_zfree(aiocb_zone, job);
|
2012-01-26 11:59:48 +00:00
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
|
|
|
|
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
|
|
|
|
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
|
|
|
|
job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
|
|
|
|
ops->store_error(ujob, EINVAL);
|
|
|
|
uma_zfree(aiocb_zone, job);
|
2006-01-23 10:27:15 +00:00
|
|
|
return (EINVAL);
|
|
|
|
}
|
2008-12-10 20:56:19 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
|
|
|
|
job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
|
|
|
|
!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
|
|
|
|
uma_zfree(aiocb_zone, job);
|
2003-01-13 15:06:05 +00:00
|
|
|
return (EINVAL);
|
2001-04-18 22:18:39 +00:00
|
|
|
}
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
ksiginfo_init(&job->ksi);
|
2005-11-03 05:25:26 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
/* Save userspace address of the job info. */
|
2016-02-05 20:38:09 +00:00
|
|
|
job->ujob = ujob;
|
Fix error handling for VCHR type I/O. Also, fix another spl problem, and
remove alot of overly verbose debugging statements.
ioproclist {
int aioprocflags; /* AIO proc flags */
TAILQ_ENTRY(aioproclist) list; /* List of processes */
struct proc *aioproc; /* The AIO thread */
TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */
};
/*
* data-structure for lio signal management
*/
struct aio_liojob {
int lioj_flags;
int lioj_buffer_count;
int lioj_buffer_finished_count;
int lioj_queue_count;
int lioj_queue_finished_count;
struct sigevent lioj_signal; /* signal on all I/O done */
TAILQ_ENTRY (aio_liojob) lioj_list;
struct kaioinfo *lioj_ki;
};
#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
/*
* per process aio data structure
*/
struct kaioinfo {
int kaio_flags; /* per process kaio flags */
int kaio_maxactive_count; /* maximum number of AIOs */
int kaio_active_count; /* number of currently used AIOs */
int kaio_qallowed_count; /* maxiumu size of AIO queue */
int kaio_queue_count; /* size of AIO queue */
int kaio_ballowed_count; /* maximum number of buffers */
int kaio_queue_finished_count; /* number of daemon jobs finished */
int kaio_buffer_count; /* number of physio buffers */
int kaio_buffer_finished_count; /* count of I/O done */
struct proc *kaio_p; /* process that uses this kaio block */
TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */
TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */
TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */
TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */
};
#define KAIO_RUNDOWN 0x1 /* process is being run down */
#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant
event */
TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */
static void aio_init_aioinfo(struct proc *p) ;
static void aio_onceonly(void *) ;
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
static int aio_newproc(void) ;
static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ;
static void aio_physwakeup(struct buf *bp);
static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void aio_daemon(void *uproc);
SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
static vm_zone_t kaio_zone=0, aiop_zone=0,
aiocb_zone=0, aiol_zone=0, aiolio_zone=0;
/*
* Single AIOD vmspace shared amongst all of them
*/
static struct vmspace *aiovmspace = NULL;
/*
* Startup initialization
*/
void
aio_onceonly(void *na)
{
TAILQ_INIT(&aio_freeproc);
TAILQ_INIT(&aio_activeproc);
TAILQ_INIT(&aio_jobs);
TAILQ_INIT(&aio_bufjobs);
TAILQ_INIT(&aio_freejobs);
kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
aiolio_zone = zinit("AIOLIO",
AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
aiod_timeout = AIOD_TIMEOUT_DEFAULT;
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
jobrefid = 1;
}
/*
* Init the per-process aioinfo structure.
* The aioinfo limits are set per-process for user limit (resource) management.
*/
void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
if (p->p_aioinfo == NULL) {
ki = zalloc(kaio_zone);
p->p_aioinfo = ki
1997-12-01 07:01:45 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
/* Get the opcode. */
|
|
|
|
if (type != LIO_NOP)
|
2016-02-05 20:38:09 +00:00
|
|
|
job->uaiocb.aio_lio_opcode = type;
|
|
|
|
opcode = job->uaiocb.aio_lio_opcode;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2011-08-11 12:30:23 +00:00
|
|
|
/*
|
|
|
|
* Validate the opcode and fetch the file object for the specified
|
|
|
|
* file descriptor.
|
|
|
|
*
|
|
|
|
* XXXRW: Moved the opcode validation up here so that we don't
|
|
|
|
* retrieve a file descriptor without knowing what the capabiltity
|
|
|
|
* should be.
|
|
|
|
*/
|
2016-02-05 20:38:09 +00:00
|
|
|
fd = job->uaiocb.aio_fildes;
|
2005-11-08 17:43:05 +00:00
|
|
|
switch (opcode) {
|
|
|
|
case LIO_WRITE:
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
error = fget_write(td, fd,
|
|
|
|
cap_rights_init(&rights, CAP_PWRITE), &fp);
|
2005-11-08 17:43:05 +00:00
|
|
|
break;
|
|
|
|
case LIO_READ:
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
error = fget_read(td, fd,
|
|
|
|
cap_rights_init(&rights, CAP_PREAD), &fp);
|
2011-08-11 12:30:23 +00:00
|
|
|
break;
|
|
|
|
case LIO_SYNC:
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
|
2011-08-11 12:30:23 +00:00
|
|
|
break;
|
2013-06-08 13:27:57 +00:00
|
|
|
case LIO_MLOCK:
|
|
|
|
fp = NULL;
|
|
|
|
break;
|
2011-08-11 12:30:23 +00:00
|
|
|
case LIO_NOP:
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
error = fget(td, fd, cap_rights_init(&rights), &fp);
|
2005-11-08 17:43:05 +00:00
|
|
|
break;
|
|
|
|
default:
|
2011-08-11 12:30:23 +00:00
|
|
|
error = EINVAL;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2005-11-08 17:43:05 +00:00
|
|
|
if (error) {
|
2016-02-05 20:38:09 +00:00
|
|
|
uma_zfree(aiocb_zone, job);
|
|
|
|
ops->store_error(ujob, error);
|
2006-01-06 16:34:22 +00:00
|
|
|
return (error);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2006-03-23 08:46:42 +00:00
|
|
|
|
|
|
|
if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto aqueue_fail;
|
|
|
|
}
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
if (opcode != LIO_SYNC && job->uaiocb.aio_offset == -1LL) {
|
2002-04-07 07:17:59 +00:00
|
|
|
error = EINVAL;
|
|
|
|
goto aqueue_fail;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
job->fd_file = fp;
|
2006-03-23 08:46:42 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_lock(&aio_job_mtx);
|
2006-03-23 08:46:42 +00:00
|
|
|
jid = jobrefid++;
|
2016-02-05 20:38:09 +00:00
|
|
|
job->seqno = jobseqno++;
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_unlock(&aio_job_mtx);
|
2016-02-05 20:38:09 +00:00
|
|
|
error = ops->store_kernelinfo(ujob, jid);
|
2006-01-22 05:59:27 +00:00
|
|
|
if (error) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto aqueue_fail;
|
|
|
|
}
|
2016-02-05 20:38:09 +00:00
|
|
|
job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
|
2004-08-13 17:43:53 +00:00
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
if (opcode == LIO_NOP) {
|
2002-03-31 20:17:56 +00:00
|
|
|
fdrop(fp, td);
|
2016-02-05 20:38:09 +00:00
|
|
|
uma_zfree(aiocb_zone, job);
|
2003-01-13 15:06:05 +00:00
|
|
|
return (0);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
|
2005-06-04 19:16:33 +00:00
|
|
|
goto no_kqueue;
|
2016-02-05 20:38:09 +00:00
|
|
|
evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
|
2012-02-01 02:53:06 +00:00
|
|
|
if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto aqueue_fail;
|
|
|
|
}
|
2016-02-05 20:38:09 +00:00
|
|
|
kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
|
|
|
|
kev.ident = (uintptr_t)job->ujob;
|
2000-11-21 19:36:36 +00:00
|
|
|
kev.filter = EVFILT_AIO;
|
2012-02-01 02:53:06 +00:00
|
|
|
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
|
2016-02-05 20:38:09 +00:00
|
|
|
kev.data = (intptr_t)job;
|
|
|
|
kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
|
2006-09-24 04:47:47 +00:00
|
|
|
error = kqfd_register(kqfd, &kev, td, 1);
|
2016-03-01 18:12:14 +00:00
|
|
|
if (error)
|
|
|
|
goto aqueue_fail;
|
|
|
|
|
2000-11-21 19:36:36 +00:00
|
|
|
no_kqueue:
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
ops->store_error(ujob, EINPROGRESS);
|
|
|
|
job->uaiocb._aiocb_private.error = EINPROGRESS;
|
|
|
|
job->userproc = p;
|
|
|
|
job->cred = crhold(td->td_ucred);
|
2016-03-01 18:12:14 +00:00
|
|
|
job->jobflags = KAIOCB_QUEUEING;
|
2016-02-05 20:38:09 +00:00
|
|
|
job->lio = lj;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
if (opcode == LIO_MLOCK) {
|
|
|
|
aio_schedule(job, aio_process_mlock);
|
|
|
|
error = 0;
|
|
|
|
} else if (fp->f_ops->fo_aio_queue == NULL)
|
|
|
|
error = aio_queue_file(fp, job);
|
|
|
|
else
|
|
|
|
error = fo_aio_queue(fp, job);
|
|
|
|
if (error)
|
|
|
|
goto aqueue_fail;
|
2006-03-23 08:46:42 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
AIO_LOCK(ki);
|
|
|
|
job->jobflags &= ~KAIOCB_QUEUEING;
|
|
|
|
TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
|
|
|
|
ki->kaio_count++;
|
|
|
|
if (lj)
|
|
|
|
lj->lioj_count++;
|
|
|
|
atomic_add_int(&num_queue_count, 1);
|
|
|
|
if (job->jobflags & KAIOCB_FINISHED) {
|
2000-01-14 02:53:29 +00:00
|
|
|
/*
|
2016-03-01 18:12:14 +00:00
|
|
|
* The queue callback completed the request synchronously.
|
|
|
|
* The bulk of the completion is deferred in that case
|
|
|
|
* until this point.
|
2000-01-14 02:53:29 +00:00
|
|
|
*/
|
2016-03-01 18:12:14 +00:00
|
|
|
aio_bio_done_notify(p, job);
|
|
|
|
} else
|
|
|
|
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
return (0);
|
2006-01-24 07:24:24 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
aqueue_fail:
|
|
|
|
knlist_delete(&job->klist, curthread, 0);
|
|
|
|
if (fp)
|
|
|
|
fdrop(fp, td);
|
|
|
|
uma_zfree(aiocb_zone, job);
|
|
|
|
ops->store_error(ujob, error);
|
|
|
|
return (error);
|
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
static void
|
|
|
|
aio_cancel_daemon_job(struct kaiocb *job)
|
|
|
|
{
|
|
|
|
|
|
|
|
mtx_lock(&aio_job_mtx);
|
|
|
|
if (!aio_cancel_cleared(job))
|
|
|
|
TAILQ_REMOVE(&aio_jobs, job, list);
|
|
|
|
mtx_unlock(&aio_job_mtx);
|
|
|
|
aio_cancel(job);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
|
|
|
|
{
|
|
|
|
|
|
|
|
mtx_lock(&aio_job_mtx);
|
|
|
|
if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
|
|
|
|
mtx_unlock(&aio_job_mtx);
|
|
|
|
aio_cancel(job);
|
|
|
|
return;
|
2000-01-14 02:53:29 +00:00
|
|
|
}
|
2016-03-01 18:12:14 +00:00
|
|
|
job->handle_fn = func;
|
|
|
|
TAILQ_INSERT_TAIL(&aio_jobs, job, list);
|
|
|
|
aio_kick_nowait(job->userproc);
|
|
|
|
mtx_unlock(&aio_job_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
aio_cancel_sync(struct kaiocb *job)
|
|
|
|
{
|
|
|
|
struct kaioinfo *ki;
|
|
|
|
|
|
|
|
ki = job->userproc->p_aioinfo;
|
|
|
|
mtx_lock(&aio_job_mtx);
|
|
|
|
if (!aio_cancel_cleared(job))
|
|
|
|
TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
|
|
|
|
mtx_unlock(&aio_job_mtx);
|
|
|
|
aio_cancel(job);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
aio_queue_file(struct file *fp, struct kaiocb *job)
|
|
|
|
{
|
|
|
|
struct aioliojob *lj;
|
|
|
|
struct kaioinfo *ki;
|
|
|
|
struct kaiocb *job2;
|
2016-07-21 17:07:06 +00:00
|
|
|
struct vnode *vp;
|
|
|
|
struct mount *mp;
|
2016-03-01 18:12:14 +00:00
|
|
|
int error, opcode;
|
2016-07-21 17:07:06 +00:00
|
|
|
bool safe;
|
2016-03-01 18:12:14 +00:00
|
|
|
|
|
|
|
lj = job->lio;
|
|
|
|
ki = job->userproc->p_aioinfo;
|
|
|
|
opcode = job->uaiocb.aio_lio_opcode;
|
|
|
|
if (opcode == LIO_SYNC)
|
|
|
|
goto queueit;
|
2000-01-14 02:53:29 +00:00
|
|
|
|
2016-03-01 18:12:14 +00:00
|
|
|
if ((error = aio_qphysio(job->userproc, job)) == 0)
|
2000-11-18 21:01:04 +00:00
|
|
|
goto done;
|
2006-01-22 05:59:27 +00:00
|
|
|
#if 0
|
2016-03-01 18:12:14 +00:00
|
|
|
/*
|
|
|
|
* XXX: This means qphysio() failed with EFAULT. The current
|
|
|
|
* behavior is to retry the operation via fo_read/fo_write.
|
|
|
|
* Wouldn't it be better to just complete the request with an
|
|
|
|
* error here?
|
|
|
|
*/
|
|
|
|
if (error > 0)
|
2000-11-18 21:01:04 +00:00
|
|
|
goto done;
|
2006-01-22 05:59:27 +00:00
|
|
|
#endif
|
2006-03-23 08:46:42 +00:00
|
|
|
queueit:
|
2016-07-21 17:07:06 +00:00
|
|
|
safe = false;
|
|
|
|
if (fp->f_type == DTYPE_VNODE) {
|
|
|
|
vp = fp->f_vnode;
|
|
|
|
if (vp->v_type == VREG || vp->v_type == VDIR) {
|
|
|
|
mp = fp->f_vnode->v_mount;
|
|
|
|
if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
|
|
|
|
safe = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!(safe || enable_aio_unsafe))
|
2016-03-01 18:12:14 +00:00
|
|
|
return (EOPNOTSUPP);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
|
2006-03-23 08:46:42 +00:00
|
|
|
if (opcode == LIO_SYNC) {
|
2016-03-01 18:12:14 +00:00
|
|
|
AIO_LOCK(ki);
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
|
|
|
|
if (job2->fd_file == job->fd_file &&
|
|
|
|
job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
|
|
|
|
job2->seqno < job->seqno) {
|
|
|
|
job2->jobflags |= KAIOCB_CHECKSYNC;
|
|
|
|
job->pending++;
|
2006-03-24 00:50:06 +00:00
|
|
|
}
|
|
|
|
}
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->pending != 0) {
|
2016-03-01 18:12:14 +00:00
|
|
|
if (!aio_set_cancel_function(job, aio_cancel_sync)) {
|
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
aio_cancel(job);
|
|
|
|
return (0);
|
|
|
|
}
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2016-03-01 18:12:14 +00:00
|
|
|
return (0);
|
2006-03-24 00:50:06 +00:00
|
|
|
}
|
2016-03-01 18:12:14 +00:00
|
|
|
AIO_UNLOCK(ki);
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (opcode) {
|
|
|
|
case LIO_READ:
|
|
|
|
case LIO_WRITE:
|
|
|
|
aio_schedule(job, aio_process_rw);
|
|
|
|
error = 0;
|
|
|
|
break;
|
|
|
|
case LIO_SYNC:
|
|
|
|
aio_schedule(job, aio_process_sync);
|
|
|
|
error = 0;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
error = EINVAL;
|
2006-03-23 08:46:42 +00:00
|
|
|
}
|
|
|
|
done:
|
|
|
|
return (error);
|
|
|
|
}
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2006-03-23 08:46:42 +00:00
|
|
|
static void
|
|
|
|
aio_kick_nowait(struct proc *userp)
|
|
|
|
{
|
|
|
|
struct kaioinfo *ki = userp->p_aioinfo;
|
2016-01-21 02:20:38 +00:00
|
|
|
struct aioproc *aiop;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2006-03-23 08:46:42 +00:00
|
|
|
mtx_assert(&aio_job_mtx, MA_OWNED);
|
|
|
|
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
|
|
|
|
TAILQ_REMOVE(&aio_freeproc, aiop, list);
|
2016-01-21 02:20:38 +00:00
|
|
|
aiop->aioprocflags &= ~AIOP_FREE;
|
|
|
|
wakeup(aiop->aioproc);
|
2016-01-26 21:24:49 +00:00
|
|
|
} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
|
|
|
|
ki->kaio_active_count + num_aio_resv_start <
|
|
|
|
ki->kaio_maxactive_count) {
|
2016-01-14 20:51:48 +00:00
|
|
|
taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
|
2006-03-23 08:46:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-03-24 00:50:06 +00:00
|
|
|
static int
|
2006-03-23 08:46:42 +00:00
|
|
|
aio_kick(struct proc *userp)
|
|
|
|
{
|
|
|
|
struct kaioinfo *ki = userp->p_aioinfo;
|
2016-01-21 02:20:38 +00:00
|
|
|
struct aioproc *aiop;
|
2006-03-24 00:50:06 +00:00
|
|
|
int error, ret = 0;
|
2006-03-23 08:46:42 +00:00
|
|
|
|
|
|
|
mtx_assert(&aio_job_mtx, MA_OWNED);
|
2002-01-14 07:26:33 +00:00
|
|
|
retryproc:
|
1999-01-27 21:50:00 +00:00
|
|
|
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
|
1997-07-06 02:40:43 +00:00
|
|
|
TAILQ_REMOVE(&aio_freeproc, aiop, list);
|
2016-01-21 02:20:38 +00:00
|
|
|
aiop->aioprocflags &= ~AIOP_FREE;
|
|
|
|
wakeup(aiop->aioproc);
|
2016-01-26 21:24:49 +00:00
|
|
|
} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
|
|
|
|
ki->kaio_active_count + num_aio_resv_start <
|
|
|
|
ki->kaio_maxactive_count) {
|
1997-11-29 01:33:10 +00:00
|
|
|
num_aio_resv_start++;
|
2006-01-22 05:59:27 +00:00
|
|
|
mtx_unlock(&aio_job_mtx);
|
|
|
|
error = aio_newproc(&num_aio_resv_start);
|
|
|
|
mtx_lock(&aio_job_mtx);
|
|
|
|
if (error) {
|
|
|
|
num_aio_resv_start--;
|
2005-11-08 17:43:05 +00:00
|
|
|
goto retryproc;
|
2006-01-22 05:59:27 +00:00
|
|
|
}
|
2006-03-24 00:50:06 +00:00
|
|
|
} else {
|
|
|
|
ret = -1;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2006-03-24 00:50:06 +00:00
|
|
|
return (ret);
|
2006-03-23 08:46:42 +00:00
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2006-03-23 08:46:42 +00:00
|
|
|
static void
|
|
|
|
aio_kick_helper(void *context, int pending)
|
|
|
|
{
|
|
|
|
struct proc *userp = context;
|
|
|
|
|
|
|
|
mtx_lock(&aio_job_mtx);
|
2006-03-24 00:50:06 +00:00
|
|
|
while (--pending >= 0) {
|
|
|
|
if (aio_kick(userp))
|
|
|
|
break;
|
|
|
|
}
|
2006-03-23 08:46:42 +00:00
|
|
|
mtx_unlock(&aio_job_mtx);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Support the aio_return system call, as a side-effect, kernel resources are
|
|
|
|
* released.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
2008-12-10 20:56:19 +00:00
|
|
|
static int
|
2016-02-05 20:38:09 +00:00
|
|
|
kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct proc *p = td->td_proc;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job;
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo *ki;
|
2016-03-21 21:37:33 +00:00
|
|
|
long status, error;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2002-04-08 04:57:56 +00:00
|
|
|
ki = p->p_aioinfo;
|
|
|
|
if (ki == NULL)
|
2003-01-13 15:06:05 +00:00
|
|
|
return (EINVAL);
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_FOREACH(job, &ki->kaio_done, plist) {
|
|
|
|
if (job->ujob == ujob)
|
2002-04-08 04:57:56 +00:00
|
|
|
break;
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
}
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job != NULL) {
|
2016-03-01 18:12:14 +00:00
|
|
|
MPASS(job->jobflags & KAIOCB_FINISHED);
|
2016-02-05 20:38:09 +00:00
|
|
|
status = job->uaiocb._aiocb_private.status;
|
|
|
|
error = job->uaiocb._aiocb_private.error;
|
2006-01-22 05:59:27 +00:00
|
|
|
td->td_retval[0] = status;
|
2016-06-21 22:19:06 +00:00
|
|
|
td->td_ru.ru_oublock += job->outblock;
|
|
|
|
td->td_ru.ru_inblock += job->inblock;
|
|
|
|
td->td_ru.ru_msgsnd += job->msgsnd;
|
|
|
|
td->td_ru.ru_msgrcv += job->msgrcv;
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_free_entry(job);
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2016-02-05 20:38:09 +00:00
|
|
|
ops->store_error(ujob, error);
|
|
|
|
ops->store_status(ujob, status);
|
2006-01-26 08:37:02 +00:00
|
|
|
} else {
|
2006-01-22 05:59:27 +00:00
|
|
|
error = EINVAL;
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2006-01-26 08:37:02 +00:00
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
return (error);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_aio_return(struct thread *td, struct aio_return_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
|
|
|
|
}
|
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Allow a process to wakeup when any of the I/O requests are completed.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
2008-12-10 20:56:19 +00:00
|
|
|
static int
|
|
|
|
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
|
|
|
|
struct timespec *ts)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct proc *p = td->td_proc;
|
1997-11-07 08:53:44 +00:00
|
|
|
struct timeval atv;
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo *ki;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *firstjob, *job;
|
2008-12-10 20:56:19 +00:00
|
|
|
int error, i, timo;
|
1997-11-29 01:33:10 +00:00
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
timo = 0;
|
2008-12-10 20:56:19 +00:00
|
|
|
if (ts) {
|
|
|
|
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
|
1997-07-06 02:40:43 +00:00
|
|
|
return (EINVAL);
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
TIMESPEC_TO_TIMEVAL(&atv, ts);
|
1997-07-06 02:40:43 +00:00
|
|
|
if (itimerfix(&atv))
|
|
|
|
return (EINVAL);
|
1998-03-30 09:56:58 +00:00
|
|
|
timo = tvtohz(&atv);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ki = p->p_aioinfo;
|
|
|
|
if (ki == NULL)
|
2003-01-13 15:06:05 +00:00
|
|
|
return (EAGAIN);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
if (njoblist == 0)
|
2003-01-13 15:06:05 +00:00
|
|
|
return (0);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2000-01-14 02:53:29 +00:00
|
|
|
for (;;) {
|
2016-02-05 20:38:09 +00:00
|
|
|
firstjob = NULL;
|
2006-01-22 05:59:27 +00:00
|
|
|
error = 0;
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_FOREACH(job, &ki->kaio_all, allist) {
|
2000-01-14 02:53:29 +00:00
|
|
|
for (i = 0; i < njoblist; i++) {
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->ujob == ujoblist[i]) {
|
|
|
|
if (firstjob == NULL)
|
|
|
|
firstjob = job;
|
2016-03-01 18:12:14 +00:00
|
|
|
if (job->jobflags & KAIOCB_FINISHED)
|
2006-01-22 05:59:27 +00:00
|
|
|
goto RETURN;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
/* All tasks were finished. */
|
2016-02-05 20:38:09 +00:00
|
|
|
if (firstjob == NULL)
|
2006-01-22 05:59:27 +00:00
|
|
|
break;
|
1997-07-06 02:40:43 +00:00
|
|
|
|
1997-11-29 01:33:10 +00:00
|
|
|
ki->kaio_flags |= KAIO_WAKEUP;
|
2006-05-09 00:10:11 +00:00
|
|
|
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
|
2006-01-22 05:59:27 +00:00
|
|
|
"aiospn", timo);
|
|
|
|
if (error == ERESTART)
|
|
|
|
error = EINTR;
|
|
|
|
if (error)
|
|
|
|
break;
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
RETURN:
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2008-12-10 20:56:19 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
struct timespec ts, *tsp;
|
|
|
|
struct aiocb **ujoblist;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
if (uap->timeout) {
|
|
|
|
/* Get timespec struct. */
|
|
|
|
if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
|
|
|
|
return (error);
|
|
|
|
tsp = &ts;
|
|
|
|
} else
|
|
|
|
tsp = NULL;
|
|
|
|
|
|
|
|
ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
|
|
|
|
error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
|
|
|
|
if (error == 0)
|
|
|
|
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
|
2006-01-22 05:59:27 +00:00
|
|
|
uma_zfree(aiol_zone, ujoblist);
|
|
|
|
return (error);
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
1997-06-16 00:27:26 +00:00
|
|
|
|
|
|
|
/*
|
2000-02-23 07:44:25 +00:00
|
|
|
* aio_cancel cancels any non-physio aio operations not currently in
|
|
|
|
* progress.
|
1997-06-16 00:27:26 +00:00
|
|
|
*/
|
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct proc *p = td->td_proc;
|
2000-02-23 07:44:25 +00:00
|
|
|
struct kaioinfo *ki;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job, *jobn;
|
2000-02-23 07:44:25 +00:00
|
|
|
struct file *fp;
|
2015-07-05 19:05:16 +00:00
|
|
|
cap_rights_t rights;
|
2006-01-22 05:59:27 +00:00
|
|
|
int error;
|
|
|
|
int cancelled = 0;
|
|
|
|
int notcancelled = 0;
|
2000-02-23 07:44:25 +00:00
|
|
|
struct vnode *vp;
|
|
|
|
|
2005-11-08 17:43:05 +00:00
|
|
|
/* Lookup file object. */
|
2015-07-05 19:05:16 +00:00
|
|
|
error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
|
2005-11-08 17:43:05 +00:00
|
|
|
if (error)
|
|
|
|
return (error);
|
2000-02-23 07:44:25 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
ki = p->p_aioinfo;
|
|
|
|
if (ki == NULL)
|
|
|
|
goto done;
|
|
|
|
|
2004-08-13 17:43:53 +00:00
|
|
|
if (fp->f_type == DTYPE_VNODE) {
|
2003-06-22 08:41:43 +00:00
|
|
|
vp = fp->f_vnode;
|
2006-01-22 05:59:27 +00:00
|
|
|
if (vn_isdisk(vp, &error)) {
|
2005-11-08 17:43:05 +00:00
|
|
|
fdrop(fp, td);
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = AIO_NOTCANCELED;
|
2004-08-13 17:43:53 +00:00
|
|
|
return (0);
|
2000-02-23 07:44:25 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
|
|
|
|
if ((uap->fd == job->uaiocb.aio_fildes) &&
|
2006-01-22 05:59:27 +00:00
|
|
|
((uap->aiocbp == NULL) ||
|
2016-02-05 20:38:09 +00:00
|
|
|
(uap->aiocbp == job->ujob))) {
|
2016-03-01 18:12:14 +00:00
|
|
|
if (aio_cancel_job(p, ki, job)) {
|
2006-01-22 05:59:27 +00:00
|
|
|
cancelled++;
|
2000-02-23 07:44:25 +00:00
|
|
|
} else {
|
|
|
|
notcancelled++;
|
|
|
|
}
|
2006-01-24 07:24:24 +00:00
|
|
|
if (uap->aiocbp != NULL)
|
|
|
|
break;
|
2000-02-23 07:44:25 +00:00
|
|
|
}
|
|
|
|
}
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2002-08-11 19:04:17 +00:00
|
|
|
done:
|
2005-11-08 17:43:05 +00:00
|
|
|
fdrop(fp, td);
|
2006-01-24 07:24:24 +00:00
|
|
|
|
|
|
|
if (uap->aiocbp != NULL) {
|
|
|
|
if (cancelled) {
|
|
|
|
td->td_retval[0] = AIO_CANCELED;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-02-23 07:44:25 +00:00
|
|
|
if (notcancelled) {
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = AIO_NOTCANCELED;
|
2003-01-13 15:06:05 +00:00
|
|
|
return (0);
|
2000-02-23 07:44:25 +00:00
|
|
|
}
|
2006-01-24 07:24:24 +00:00
|
|
|
|
2000-02-23 07:44:25 +00:00
|
|
|
if (cancelled) {
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = AIO_CANCELED;
|
2003-01-13 15:06:05 +00:00
|
|
|
return (0);
|
2000-02-23 07:44:25 +00:00
|
|
|
}
|
2006-01-24 07:24:24 +00:00
|
|
|
|
2001-09-12 08:38:13 +00:00
|
|
|
td->td_retval[0] = AIO_ALLDONE;
|
2000-02-23 07:44:25 +00:00
|
|
|
|
2003-01-13 15:06:05 +00:00
|
|
|
return (0);
|
1997-06-16 00:27:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2007-03-05 13:10:58 +00:00
|
|
|
* aio_error is implemented in the kernel level for compatibility purposes
|
|
|
|
* only. For a user mode async implementation, it would be best to do it in
|
|
|
|
* a userland subroutine.
|
1997-06-16 00:27:26 +00:00
|
|
|
*/
|
2008-12-10 20:56:19 +00:00
|
|
|
static int
|
2016-02-05 20:38:09 +00:00
|
|
|
kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct proc *p = td->td_proc;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job;
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo *ki;
|
2006-01-22 05:59:27 +00:00
|
|
|
int status;
|
1997-06-16 00:27:26 +00:00
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
ki = p->p_aioinfo;
|
2006-01-22 05:59:27 +00:00
|
|
|
if (ki == NULL) {
|
|
|
|
td->td_retval[0] = EINVAL;
|
|
|
|
return (0);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
}
|
|
|
|
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2016-02-05 20:38:09 +00:00
|
|
|
TAILQ_FOREACH(job, &ki->kaio_all, allist) {
|
|
|
|
if (job->ujob == ujob) {
|
2016-03-01 18:12:14 +00:00
|
|
|
if (job->jobflags & KAIOCB_FINISHED)
|
2006-01-22 05:59:27 +00:00
|
|
|
td->td_retval[0] =
|
2016-02-05 20:38:09 +00:00
|
|
|
job->uaiocb._aiocb_private.error;
|
2006-01-22 05:59:27 +00:00
|
|
|
else
|
|
|
|
td->td_retval[0] = EINPROGRESS;
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2003-01-13 15:06:05 +00:00
|
|
|
return (0);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
}
|
|
|
|
}
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
|
1997-07-06 02:40:43 +00:00
|
|
|
/*
|
2006-01-23 02:49:34 +00:00
|
|
|
* Hack for failure of aio_aqueue.
|
1997-07-06 02:40:43 +00:00
|
|
|
*/
|
2016-02-05 20:38:09 +00:00
|
|
|
status = ops->fetch_status(ujob);
|
2006-01-22 05:59:27 +00:00
|
|
|
if (status == -1) {
|
2016-02-05 20:38:09 +00:00
|
|
|
td->td_retval[0] = ops->fetch_error(ujob);
|
2006-01-22 05:59:27 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
td->td_retval[0] = EINVAL;
|
|
|
|
return (0);
|
1997-06-16 00:27:26 +00:00
|
|
|
}
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_aio_error(struct thread *td, struct aio_error_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
|
|
|
|
}
|
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/* syscall - asynchronous read from a file (REALTIME) */
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
/*
 * FreeBSD 6 compatibility entry point for aio_read: queues an LIO_READ
 * request through aio_aqueue() using the aiocb_ops_osigevent operations.
 */
int
freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
{
	struct aiocb *ujob;

	ujob = (struct aiocb *)uap->aiocbp;
	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb_ops_osigevent));
}
#endif
|
2005-10-30 02:12:49 +00:00
|
|
|
|
1997-06-16 00:27:26 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_aio_read(struct thread *td, struct aio_read_args *uap)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-12-29 07:13:47 +00:00
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
|
1997-06-16 00:27:26 +00:00
|
|
|
}
|
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/* syscall - asynchronous write to a file (REALTIME) */
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
/*
 * FreeBSD 6 compatibility entry point for aio_write: queues an LIO_WRITE
 * request through aio_aqueue() using the aiocb_ops_osigevent operations.
 */
int
freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
{
	struct aiocb *ujob;

	ujob = (struct aiocb *)uap->aiocbp;
	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb_ops_osigevent));
}
#endif
|
2005-10-30 02:12:49 +00:00
|
|
|
|
1997-06-16 00:27:26 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_aio_write(struct thread *td, struct aio_write_args *uap)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-12-29 07:13:47 +00:00
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
|
2005-10-30 02:12:49 +00:00
|
|
|
}
|
|
|
|
|
2013-06-08 13:27:57 +00:00
|
|
|
int
|
|
|
|
sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
|
|
|
|
}
|
|
|
|
|
2005-10-30 02:12:49 +00:00
|
|
|
static int
|
2008-12-10 20:56:19 +00:00
|
|
|
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
|
|
|
|
struct aiocb **acb_list, int nent, struct sigevent *sig,
|
|
|
|
struct aiocb_ops *ops)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct proc *p = td->td_proc;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct aiocb *job;
|
1997-07-06 02:40:43 +00:00
|
|
|
struct kaioinfo *ki;
|
2006-01-22 05:59:27 +00:00
|
|
|
struct aioliojob *lj;
|
2005-10-12 17:51:31 +00:00
|
|
|
struct kevent kev;
|
2006-01-22 05:59:27 +00:00
|
|
|
int error;
|
1997-11-29 01:33:10 +00:00
|
|
|
int nerror;
|
1997-06-16 00:27:26 +00:00
|
|
|
int i;
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
|
2003-01-13 15:06:05 +00:00
|
|
|
return (EINVAL);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2003-01-12 09:40:23 +00:00
|
|
|
if (nent < 0 || nent > AIO_LISTIO_MAX)
|
2003-01-13 15:06:05 +00:00
|
|
|
return (EINVAL);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
if (p->p_aioinfo == NULL)
|
1997-07-06 02:40:43 +00:00
|
|
|
aio_init_aioinfo(p);
|
|
|
|
|
|
|
|
ki = p->p_aioinfo;
|
|
|
|
|
2003-02-19 05:47:46 +00:00
|
|
|
lj = uma_zalloc(aiolio_zone, M_WAITOK);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
lj->lioj_flags = 0;
|
2006-01-22 05:59:27 +00:00
|
|
|
lj->lioj_count = 0;
|
|
|
|
lj->lioj_finished_count = 0;
|
2009-06-10 20:59:32 +00:00
|
|
|
knlist_init_mtx(&lj->klist, AIO_MTX(ki));
|
2005-11-03 05:25:26 +00:00
|
|
|
ksiginfo_init(&lj->lioj_ksi);
|
2005-10-12 17:51:31 +00:00
|
|
|
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
/*
|
2000-01-14 02:53:29 +00:00
|
|
|
* Setup signal.
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
*/
|
2008-12-10 20:56:19 +00:00
|
|
|
if (sig && (mode == LIO_NOWAIT)) {
|
|
|
|
bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
|
2005-10-12 17:51:31 +00:00
|
|
|
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
|
|
|
|
/* Assume only new style KEVENT */
|
|
|
|
kev.filter = EVFILT_LIO;
|
|
|
|
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
|
2008-12-10 20:56:19 +00:00
|
|
|
kev.ident = (uintptr_t)uacb_list; /* something unique */
|
2005-10-12 17:51:31 +00:00
|
|
|
kev.data = (intptr_t)lj;
|
2006-01-22 05:59:27 +00:00
|
|
|
/* pass user defined sigval data */
|
|
|
|
kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
|
2006-09-24 04:47:47 +00:00
|
|
|
error = kqfd_register(
|
|
|
|
lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
|
2005-10-12 17:51:31 +00:00
|
|
|
if (error) {
|
|
|
|
uma_zfree(aiolio_zone, lj);
|
|
|
|
return (error);
|
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
|
|
|
|
;
|
2006-01-23 10:27:15 +00:00
|
|
|
} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
|
|
|
|
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
|
|
|
|
if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
|
|
|
|
uma_zfree(aiolio_zone, lj);
|
|
|
|
return EINVAL;
|
|
|
|
}
|
|
|
|
lj->lioj_flags |= LIOJ_SIGNAL;
|
|
|
|
} else {
|
2002-03-20 04:09:59 +00:00
|
|
|
uma_zfree(aiolio_zone, lj);
|
2005-10-12 17:51:31 +00:00
|
|
|
return EINVAL;
|
2001-04-18 22:18:39 +00:00
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
}
|
2005-10-12 17:51:31 +00:00
|
|
|
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2001-04-18 22:18:39 +00:00
|
|
|
TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
|
2006-01-22 05:59:27 +00:00
|
|
|
/*
|
|
|
|
* Add extra aiocb count to avoid the lio to be freed
|
|
|
|
* by other threads doing aio_waitcomplete or aio_return,
|
|
|
|
* and prevent event from being sent until we have queued
|
|
|
|
* all tasks.
|
|
|
|
*/
|
|
|
|
lj->lioj_count = 1;
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2000-01-14 02:53:29 +00:00
|
|
|
/*
|
|
|
|
* Get pointers to the list of I/O requests.
|
|
|
|
*/
|
1997-11-29 01:33:10 +00:00
|
|
|
nerror = 0;
|
2008-12-10 20:56:19 +00:00
|
|
|
for (i = 0; i < nent; i++) {
|
2016-02-05 20:38:09 +00:00
|
|
|
job = acb_list[i];
|
|
|
|
if (job != NULL) {
|
|
|
|
error = aio_aqueue(td, job, lj, LIO_NOP, ops);
|
2006-01-22 05:59:27 +00:00
|
|
|
if (error != 0)
|
1997-11-29 01:33:10 +00:00
|
|
|
nerror++;
|
|
|
|
}
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
error = 0;
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2008-12-10 20:56:19 +00:00
|
|
|
if (mode == LIO_WAIT) {
|
2006-01-22 05:59:27 +00:00
|
|
|
while (lj->lioj_count - 1 != lj->lioj_finished_count) {
|
1997-11-29 01:33:10 +00:00
|
|
|
ki->kaio_flags |= KAIO_WAKEUP;
|
2006-05-09 00:10:11 +00:00
|
|
|
error = msleep(&p->p_aioinfo, AIO_MTX(ki),
|
2006-01-22 05:59:27 +00:00
|
|
|
PRIBIO | PCATCH, "aiospn", 0);
|
|
|
|
if (error == ERESTART)
|
|
|
|
error = EINTR;
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (lj->lioj_count - 1 == lj->lioj_finished_count) {
|
|
|
|
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
|
|
|
|
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
|
|
|
|
KNOTE_LOCKED(&lj->klist, 1);
|
|
|
|
}
|
|
|
|
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
|
|
|
|
== LIOJ_SIGNAL
|
|
|
|
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
|
|
|
|
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
|
|
|
|
aio_sendsig(p, &lj->lioj_signal,
|
|
|
|
&lj->lioj_ksi);
|
|
|
|
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
|
|
|
|
}
|
1997-07-06 02:40:43 +00:00
|
|
|
}
|
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
lj->lioj_count--;
|
|
|
|
if (lj->lioj_count == 0) {
|
|
|
|
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
|
|
|
|
knlist_delete(&lj->klist, curthread, 1);
|
2006-05-09 00:10:11 +00:00
|
|
|
PROC_LOCK(p);
|
2006-01-22 05:59:27 +00:00
|
|
|
sigqueue_take(&lj->lioj_ksi);
|
|
|
|
PROC_UNLOCK(p);
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2006-01-22 05:59:27 +00:00
|
|
|
uma_zfree(aiolio_zone, lj);
|
|
|
|
} else
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
1997-07-06 02:40:43 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
if (nerror)
|
|
|
|
return (EIO);
|
|
|
|
return (error);
|
1997-06-16 00:27:26 +00:00
|
|
|
}
|
1997-11-29 01:33:10 +00:00
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
/* syscall - list directed I/O (REALTIME) */
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
int
|
2016-03-09 19:05:11 +00:00
|
|
|
freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
struct aiocb **acb_list;
|
|
|
|
struct sigevent *sigp, sig;
|
|
|
|
struct osigevent osig;
|
|
|
|
int error, nent;
|
|
|
|
|
|
|
|
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
nent = uap->nent;
|
|
|
|
if (nent < 0 || nent > AIO_LISTIO_MAX)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
|
|
|
|
error = copyin(uap->sig, &osig, sizeof(osig));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
error = convert_old_sigevent(&osig, &sig);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
sigp = &sig;
|
|
|
|
} else
|
|
|
|
sigp = NULL;
|
|
|
|
|
|
|
|
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
|
|
|
|
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
|
|
|
|
if (error == 0)
|
|
|
|
error = kern_lio_listio(td, uap->mode,
|
|
|
|
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
|
|
|
|
&aiocb_ops_osigevent);
|
|
|
|
free(acb_list, M_LIO);
|
|
|
|
return (error);
|
|
|
|
}
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
/* syscall - list directed I/O (REALTIME) */
|
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
struct aiocb **acb_list;
|
|
|
|
struct sigevent *sigp, sig;
|
|
|
|
int error, nent;
|
|
|
|
|
|
|
|
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
nent = uap->nent;
|
|
|
|
if (nent < 0 || nent > AIO_LISTIO_MAX)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
|
|
|
|
error = copyin(uap->sig, &sig, sizeof(sig));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
sigp = &sig;
|
|
|
|
} else
|
|
|
|
sigp = NULL;
|
|
|
|
|
|
|
|
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
|
|
|
|
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
|
|
|
|
if (error == 0)
|
|
|
|
error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
|
|
|
|
nent, sigp, &aiocb_ops);
|
|
|
|
free(acb_list, M_LIO);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1997-11-29 01:33:10 +00:00
|
|
|
static void
|
2015-04-22 18:11:34 +00:00
|
|
|
aio_physwakeup(struct bio *bp)
|
1997-11-29 01:33:10 +00:00
|
|
|
{
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
|
2006-01-22 05:59:27 +00:00
|
|
|
struct proc *userp;
|
2006-02-26 12:56:23 +00:00
|
|
|
struct kaioinfo *ki;
|
2016-03-01 18:12:14 +00:00
|
|
|
size_t nbytes;
|
|
|
|
int error, nblks;
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
|
2015-04-22 18:11:34 +00:00
|
|
|
/* Release mapping into kernel space. */
|
2016-03-01 18:12:14 +00:00
|
|
|
userp = job->userproc;
|
|
|
|
ki = userp->p_aioinfo;
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->pbuf) {
|
|
|
|
pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
|
|
|
|
relpbuf(job->pbuf, NULL);
|
|
|
|
job->pbuf = NULL;
|
2015-04-22 18:11:34 +00:00
|
|
|
atomic_subtract_int(&num_buf_aio, 1);
|
2016-03-01 18:12:14 +00:00
|
|
|
AIO_LOCK(ki);
|
|
|
|
ki->kaio_buffer_count--;
|
|
|
|
AIO_UNLOCK(ki);
|
2015-04-22 18:11:34 +00:00
|
|
|
}
|
2016-02-05 20:38:09 +00:00
|
|
|
vm_page_unhold_pages(job->pages, job->npages);
|
2015-04-22 18:11:34 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
bp = job->bp;
|
|
|
|
job->bp = NULL;
|
2016-03-01 18:12:14 +00:00
|
|
|
nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
|
|
|
|
error = 0;
|
2015-04-22 18:11:34 +00:00
|
|
|
if (bp->bio_flags & BIO_ERROR)
|
2016-03-01 18:12:14 +00:00
|
|
|
error = bp->bio_error;
|
|
|
|
nblks = btodb(nbytes);
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
|
2016-06-21 22:19:06 +00:00
|
|
|
job->outblock += nblks;
|
2006-01-22 05:59:27 +00:00
|
|
|
else
|
2016-06-21 22:19:06 +00:00
|
|
|
job->inblock += nblks;
|
2016-03-01 18:12:14 +00:00
|
|
|
|
2016-05-20 19:46:25 +00:00
|
|
|
if (error)
|
|
|
|
aio_complete(job, -1, error);
|
|
|
|
else
|
|
|
|
aio_complete(job, nbytes, 0);
|
Finish up the vast majority of the AIO/LIO functionality. Proper signal
support was missing in the previous version of the AIO code. More
tunables added, and very efficient support for VCHR files has been added.
Kernel threads are not used for VCHR files, all work for such files is
done for the requesting process directly. Some attempt has been made to
charge the requesting process for resource utilization, but more work
is needed. aio_fsync is still missing (but the original fsync system
call can be used for now.) aio_cancel is essentially a noop, but that
is okay per POSIX. More aio_cancel functionality can be added later,
if it is found to be needed.
The functions implemented include:
aio_read, aio_write, lio_listio, aio_error, aio_return,
aio_cancel, aio_suspend.
The code has been implemented to support the POSIX spec 1003.1b
(formerly known as POSIX 1003.4 spec) features of the above. The
async I/O features are truly async, with the VCHR mode of operation
being essentially the same as physio (for appropriate files) for
maximum efficiency. This code also supports the signal capability,
is highly tunable, allowing management of resource usage, and
has been written to allow a per process usage quota.
Both the O'Reilly POSIX.4 book and the actual POSIX 1003.1b document
were the reference specs used. Any filedescriptor can be used with
these new system calls. I know of no exceptions where these
system calls will not work. (TTY's will also probably work.)
1997-11-30 04:36:31 +00:00
|
|
|
|
2015-04-22 18:11:34 +00:00
|
|
|
g_destroy_bio(bp);
|
1997-11-29 01:33:10 +00:00
|
|
|
}
|
2000-01-14 02:53:29 +00:00
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/* syscall - wait for the next completion of an aio request */
|
2008-12-10 20:56:19 +00:00
|
|
|
static int
|
2016-02-05 20:38:09 +00:00
|
|
|
kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
|
2008-12-10 20:56:19 +00:00
|
|
|
struct timespec *ts, struct aiocb_ops *ops)
|
2000-01-14 02:53:29 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct proc *p = td->td_proc;
|
2000-01-14 02:53:29 +00:00
|
|
|
struct timeval atv;
|
|
|
|
struct kaioinfo *ki;
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job;
|
|
|
|
struct aiocb *ujob;
|
2016-03-21 21:37:33 +00:00
|
|
|
long error, status;
|
|
|
|
int timo;
|
2004-08-13 17:43:53 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
ops->store_aiocb(ujobp, NULL);
|
2000-02-23 07:44:25 +00:00
|
|
|
|
2015-10-25 18:48:09 +00:00
|
|
|
if (ts == NULL) {
|
|
|
|
timo = 0;
|
|
|
|
} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
|
|
|
|
timo = -1;
|
|
|
|
} else {
|
2008-12-10 20:56:19 +00:00
|
|
|
if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
|
2000-01-14 02:53:29 +00:00
|
|
|
return (EINVAL);
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
TIMESPEC_TO_TIMEVAL(&atv, ts);
|
2000-01-14 02:53:29 +00:00
|
|
|
if (itimerfix(&atv))
|
|
|
|
return (EINVAL);
|
|
|
|
timo = tvtohz(&atv);
|
|
|
|
}
|
|
|
|
|
2006-01-15 01:55:45 +00:00
|
|
|
if (p->p_aioinfo == NULL)
|
2005-11-08 23:48:32 +00:00
|
|
|
aio_init_aioinfo(p);
|
2006-01-15 01:55:45 +00:00
|
|
|
ki = p->p_aioinfo;
|
2000-01-14 02:53:29 +00:00
|
|
|
|
2006-01-22 05:59:27 +00:00
|
|
|
error = 0;
|
2016-02-05 20:38:09 +00:00
|
|
|
job = NULL;
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_LOCK(ki);
|
2016-02-05 20:38:09 +00:00
|
|
|
while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
|
2015-10-25 18:48:09 +00:00
|
|
|
if (timo == -1) {
|
|
|
|
error = EWOULDBLOCK;
|
|
|
|
break;
|
|
|
|
}
|
2000-01-14 02:53:29 +00:00
|
|
|
ki->kaio_flags |= KAIO_WAKEUP;
|
2006-05-09 00:10:11 +00:00
|
|
|
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
|
2006-01-22 05:59:27 +00:00
|
|
|
"aiowc", timo);
|
2006-02-26 12:56:23 +00:00
|
|
|
if (timo && error == ERESTART)
|
2006-01-22 05:59:27 +00:00
|
|
|
error = EINTR;
|
|
|
|
if (error)
|
|
|
|
break;
|
2000-01-14 02:53:29 +00:00
|
|
|
}
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
if (job != NULL) {
|
2016-03-01 18:12:14 +00:00
|
|
|
MPASS(job->jobflags & KAIOCB_FINISHED);
|
2016-02-05 20:38:09 +00:00
|
|
|
ujob = job->ujob;
|
|
|
|
status = job->uaiocb._aiocb_private.status;
|
|
|
|
error = job->uaiocb._aiocb_private.error;
|
2006-01-22 05:59:27 +00:00
|
|
|
td->td_retval[0] = status;
|
2016-06-21 22:19:06 +00:00
|
|
|
td->td_ru.ru_oublock += job->outblock;
|
|
|
|
td->td_ru.ru_inblock += job->inblock;
|
|
|
|
td->td_ru.ru_msgsnd += job->msgsnd;
|
|
|
|
td->td_ru.ru_msgrcv += job->msgrcv;
|
2016-02-05 20:38:09 +00:00
|
|
|
aio_free_entry(job);
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2016-02-05 20:38:09 +00:00
|
|
|
ops->store_aiocb(ujobp, ujob);
|
|
|
|
ops->store_error(ujob, error);
|
|
|
|
ops->store_status(ujob, status);
|
2006-01-22 05:59:27 +00:00
|
|
|
} else
|
2006-05-09 00:10:11 +00:00
|
|
|
AIO_UNLOCK(ki);
|
2006-01-22 05:59:27 +00:00
|
|
|
|
|
|
|
return (error);
|
2000-01-14 02:53:29 +00:00
|
|
|
}
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2006-03-23 08:46:42 +00:00
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
struct timespec ts, *tsp;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (uap->timeout) {
|
|
|
|
/* Get timespec struct. */
|
|
|
|
error = copyin(uap->timeout, &ts, sizeof(ts));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
tsp = &ts;
|
|
|
|
} else
|
|
|
|
tsp = NULL;
|
|
|
|
|
|
|
|
return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2016-02-05 20:38:09 +00:00
|
|
|
kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
|
2008-12-10 20:56:19 +00:00
|
|
|
struct aiocb_ops *ops)
|
2006-03-23 08:46:42 +00:00
|
|
|
{
|
|
|
|
struct proc *p = td->td_proc;
|
|
|
|
struct kaioinfo *ki;
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
if (op != O_SYNC) /* XXX lack of O_DSYNC */
|
2006-03-23 08:46:42 +00:00
|
|
|
return (EINVAL);
|
|
|
|
ki = p->p_aioinfo;
|
|
|
|
if (ki == NULL)
|
|
|
|
aio_init_aioinfo(p);
|
2016-02-05 20:38:09 +00:00
|
|
|
return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
|
2008-12-10 20:56:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
|
2006-03-23 08:46:42 +00:00
|
|
|
}
|
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/* kqueue attach function */
|
2000-04-16 18:53:38 +00:00
|
|
|
static int
|
|
|
|
filt_aioattach(struct knote *kn)
|
|
|
|
{
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job = (struct kaiocb *)kn->kn_sdata;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
|
|
|
/*
|
2016-02-05 20:38:09 +00:00
|
|
|
* The job pointer must be validated before using it, so
|
2000-04-16 18:53:38 +00:00
|
|
|
* registration is restricted to the kernel; the user cannot
|
|
|
|
* set EV_FLAG1.
|
|
|
|
*/
|
|
|
|
if ((kn->kn_flags & EV_FLAG1) == 0)
|
|
|
|
return (EPERM);
|
2016-02-05 20:38:09 +00:00
|
|
|
kn->kn_ptr.p_aio = job;
|
2000-04-16 18:53:38 +00:00
|
|
|
kn->kn_flags &= ~EV_FLAG1;
|
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
knlist_add(&job->klist, kn, 0);
|
2000-04-16 18:53:38 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/* kqueue detach function */
|
2000-04-16 18:53:38 +00:00
|
|
|
static void
|
|
|
|
filt_aiodetach(struct knote *kn)
|
|
|
|
{
|
2012-01-30 19:19:22 +00:00
|
|
|
struct knlist *knl;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2012-01-30 19:19:22 +00:00
|
|
|
knl = &kn->kn_ptr.p_aio->klist;
|
|
|
|
knl->kl_lock(knl->kl_lockarg);
|
|
|
|
if (!knlist_empty(knl))
|
|
|
|
knlist_remove(knl, kn, 1);
|
|
|
|
knl->kl_unlock(knl->kl_lockarg);
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
2002-03-05 15:38:49 +00:00
|
|
|
/* kqueue filter function */
|
2000-04-16 18:53:38 +00:00
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_aio(struct knote *kn, long hint)
|
|
|
|
{
|
2016-02-05 20:38:09 +00:00
|
|
|
struct kaiocb *job = kn->kn_ptr.p_aio;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2016-02-05 20:38:09 +00:00
|
|
|
kn->kn_data = job->uaiocb._aiocb_private.error;
|
2016-03-01 18:12:14 +00:00
|
|
|
if (!(job->jobflags & KAIOCB_FINISHED))
|
2000-04-16 18:53:38 +00:00
|
|
|
return (0);
|
2004-08-13 17:43:53 +00:00
|
|
|
kn->kn_flags |= EV_EOF;
|
2000-04-16 18:53:38 +00:00
|
|
|
return (1);
|
|
|
|
}
|
2005-10-12 17:51:31 +00:00
|
|
|
|
|
|
|
/* kqueue attach function */
|
|
|
|
static int
|
|
|
|
filt_lioattach(struct knote *kn)
|
|
|
|
{
|
2006-01-22 05:59:27 +00:00
|
|
|
struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
|
2005-10-12 17:51:31 +00:00
|
|
|
|
|
|
|
/*
|
2006-01-22 05:59:27 +00:00
|
|
|
* The aioliojob pointer must be validated before using it, so
|
2005-10-12 17:51:31 +00:00
|
|
|
* registration is restricted to the kernel; the user cannot
|
|
|
|
* set EV_FLAG1.
|
|
|
|
*/
|
|
|
|
if ((kn->kn_flags & EV_FLAG1) == 0)
|
|
|
|
return (EPERM);
|
2008-01-24 17:10:19 +00:00
|
|
|
kn->kn_ptr.p_lio = lj;
|
2005-10-12 17:51:31 +00:00
|
|
|
kn->kn_flags &= ~EV_FLAG1;
|
|
|
|
|
|
|
|
knlist_add(&lj->klist, kn, 0);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* kqueue detach function */
|
|
|
|
static void
|
|
|
|
filt_liodetach(struct knote *kn)
|
|
|
|
{
|
2012-01-30 19:19:22 +00:00
|
|
|
struct knlist *knl;
|
2005-10-12 17:51:31 +00:00
|
|
|
|
2012-01-30 19:19:22 +00:00
|
|
|
knl = &kn->kn_ptr.p_lio->klist;
|
|
|
|
knl->kl_lock(knl->kl_lockarg);
|
|
|
|
if (!knlist_empty(knl))
|
|
|
|
knlist_remove(knl, kn, 1);
|
|
|
|
knl->kl_unlock(knl->kl_lockarg);
|
2005-10-12 17:51:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* kqueue filter function */
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_lio(struct knote *kn, long hint)
|
|
|
|
{
|
2008-01-24 17:10:19 +00:00
|
|
|
struct aioliojob * lj = kn->kn_ptr.p_lio;
|
2006-01-22 05:59:27 +00:00
|
|
|
|
2005-10-12 17:51:31 +00:00
|
|
|
return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
|
|
|
|
}
|
2008-12-10 20:56:19 +00:00
|
|
|
|
2010-03-11 14:49:06 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
2016-03-09 19:05:11 +00:00
|
|
|
#include <sys/mount.h>
|
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <compat/freebsd32/freebsd32.h>
|
|
|
|
#include <compat/freebsd32/freebsd32_proto.h>
|
|
|
|
#include <compat/freebsd32/freebsd32_signal.h>
|
|
|
|
#include <compat/freebsd32/freebsd32_syscall.h>
|
|
|
|
#include <compat/freebsd32/freebsd32_util.h>
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
/*
 * 32-bit image of an aiocb's _aiocb_private member: two signed 32-bit
 * result words followed by a 32-bit kernelinfo value.
 */
struct __aiocb_private32 {
	int32_t		status;
	int32_t		error;
	uint32_t	kernelinfo;
};
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
typedef struct oaiocb32 {
|
|
|
|
int aio_fildes; /* File descriptor */
|
|
|
|
uint64_t aio_offset __packed; /* File offset for I/O */
|
|
|
|
uint32_t aio_buf; /* I/O buffer in process space */
|
|
|
|
uint32_t aio_nbytes; /* Number of bytes for I/O */
|
|
|
|
struct osigevent32 aio_sigevent; /* Signal to deliver */
|
|
|
|
int aio_lio_opcode; /* LIO opcode */
|
|
|
|
int aio_reqprio; /* Request priority -- ignored */
|
|
|
|
struct __aiocb_private32 _aiocb_private;
|
|
|
|
} oaiocb32_t;
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
typedef struct aiocb32 {
|
|
|
|
int32_t aio_fildes; /* File descriptor */
|
|
|
|
uint64_t aio_offset __packed; /* File offset for I/O */
|
|
|
|
uint32_t aio_buf; /* I/O buffer in process space */
|
|
|
|
uint32_t aio_nbytes; /* Number of bytes for I/O */
|
|
|
|
int __spare__[2];
|
|
|
|
uint32_t __spare2__;
|
|
|
|
int aio_lio_opcode; /* LIO opcode */
|
|
|
|
int aio_reqprio; /* Request priority -- ignored */
|
2016-01-26 21:24:49 +00:00
|
|
|
struct __aiocb_private32 _aiocb_private;
|
|
|
|
struct sigevent32 aio_sigevent; /* Signal to deliver */
|
2008-12-10 20:56:19 +00:00
|
|
|
} aiocb32_t;
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
static int
|
|
|
|
convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
|
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
|
|
|
|
* supported by AIO with the old sigevent structure.
|
|
|
|
*/
|
|
|
|
CP(*osig, *nsig, sigev_notify);
|
|
|
|
switch (nsig->sigev_notify) {
|
|
|
|
case SIGEV_NONE:
|
|
|
|
break;
|
|
|
|
case SIGEV_SIGNAL:
|
|
|
|
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
|
|
|
|
break;
|
|
|
|
case SIGEV_KEVENT:
|
|
|
|
nsig->sigev_notify_kqueue =
|
|
|
|
osig->__sigev_u.__sigev_notify_kqueue;
|
|
|
|
PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
|
|
|
|
{
|
|
|
|
struct oaiocb32 job32;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
bzero(kjob, sizeof(struct aiocb));
|
|
|
|
error = copyin(ujob, &job32, sizeof(job32));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
CP(job32, *kjob, aio_fildes);
|
|
|
|
CP(job32, *kjob, aio_offset);
|
|
|
|
PTRIN_CP(job32, *kjob, aio_buf);
|
|
|
|
CP(job32, *kjob, aio_nbytes);
|
|
|
|
CP(job32, *kjob, aio_lio_opcode);
|
|
|
|
CP(job32, *kjob, aio_reqprio);
|
|
|
|
CP(job32, *kjob, _aiocb_private.status);
|
|
|
|
CP(job32, *kjob, _aiocb_private.error);
|
|
|
|
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
|
|
|
|
return (convert_old_sigevent32(&job32.aio_sigevent,
|
|
|
|
&kjob->aio_sigevent));
|
|
|
|
}
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
|
|
|
|
{
|
|
|
|
struct aiocb32 job32;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = copyin(ujob, &job32, sizeof(job32));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
CP(job32, *kjob, aio_fildes);
|
|
|
|
CP(job32, *kjob, aio_offset);
|
|
|
|
PTRIN_CP(job32, *kjob, aio_buf);
|
|
|
|
CP(job32, *kjob, aio_nbytes);
|
|
|
|
CP(job32, *kjob, aio_lio_opcode);
|
|
|
|
CP(job32, *kjob, aio_reqprio);
|
|
|
|
CP(job32, *kjob, _aiocb_private.status);
|
|
|
|
CP(job32, *kjob, _aiocb_private.error);
|
|
|
|
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
|
|
|
|
return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
|
|
|
|
}
|
|
|
|
|
|
|
|
static long
|
|
|
|
aiocb32_fetch_status(struct aiocb *ujob)
|
|
|
|
{
|
|
|
|
struct aiocb32 *ujob32;
|
|
|
|
|
|
|
|
ujob32 = (struct aiocb32 *)ujob;
|
|
|
|
return (fuword32(&ujob32->_aiocb_private.status));
|
|
|
|
}
|
|
|
|
|
|
|
|
static long
|
|
|
|
aiocb32_fetch_error(struct aiocb *ujob)
|
|
|
|
{
|
|
|
|
struct aiocb32 *ujob32;
|
|
|
|
|
|
|
|
ujob32 = (struct aiocb32 *)ujob;
|
|
|
|
return (fuword32(&ujob32->_aiocb_private.error));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb32_store_status(struct aiocb *ujob, long status)
|
|
|
|
{
|
|
|
|
struct aiocb32 *ujob32;
|
|
|
|
|
|
|
|
ujob32 = (struct aiocb32 *)ujob;
|
|
|
|
return (suword32(&ujob32->_aiocb_private.status, status));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb32_store_error(struct aiocb *ujob, long error)
|
|
|
|
{
|
|
|
|
struct aiocb32 *ujob32;
|
|
|
|
|
|
|
|
ujob32 = (struct aiocb32 *)ujob;
|
|
|
|
return (suword32(&ujob32->_aiocb_private.error, error));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
|
|
|
|
{
|
|
|
|
struct aiocb32 *ujob32;
|
|
|
|
|
|
|
|
ujob32 = (struct aiocb32 *)ujob;
|
|
|
|
return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
	long ref;

	/* The user's slot is 32 bits wide, so store the pointer as a word. */
	ref = (long)ujob;
	return (suword32(ujobp, ref));
}
|
|
|
|
|
|
|
|
static struct aiocb_ops aiocb32_ops = {
|
|
|
|
.copyin = aiocb32_copyin,
|
|
|
|
.fetch_status = aiocb32_fetch_status,
|
|
|
|
.fetch_error = aiocb32_fetch_error,
|
|
|
|
.store_status = aiocb32_store_status,
|
|
|
|
.store_error = aiocb32_store_error,
|
|
|
|
.store_kernelinfo = aiocb32_store_kernelinfo,
|
|
|
|
.store_aiocb = aiocb32_store_aiocb,
|
|
|
|
};
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
static struct aiocb_ops aiocb32_ops_osigevent = {
|
|
|
|
.copyin = aiocb32_copyin_old_sigevent,
|
|
|
|
.fetch_status = aiocb32_fetch_status,
|
|
|
|
.fetch_error = aiocb32_fetch_error,
|
|
|
|
.store_status = aiocb32_store_status,
|
|
|
|
.store_error = aiocb32_store_error,
|
|
|
|
.store_kernelinfo = aiocb32_store_kernelinfo,
|
|
|
|
.store_aiocb = aiocb32_store_aiocb,
|
|
|
|
};
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
int
|
|
|
|
freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
|
|
|
|
{
|
|
|
|
struct timespec32 ts32;
|
|
|
|
struct timespec ts, *tsp;
|
|
|
|
struct aiocb **ujoblist;
|
|
|
|
uint32_t *ujoblist32;
|
|
|
|
int error, i;
|
|
|
|
|
|
|
|
if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
if (uap->timeout) {
|
|
|
|
/* Get timespec struct. */
|
|
|
|
if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
|
|
|
|
return (error);
|
|
|
|
CP(ts32, ts, tv_sec);
|
|
|
|
CP(ts32, ts, tv_nsec);
|
|
|
|
tsp = &ts;
|
|
|
|
} else
|
|
|
|
tsp = NULL;
|
|
|
|
|
|
|
|
ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
|
|
|
|
ujoblist32 = (uint32_t *)ujoblist;
|
|
|
|
error = copyin(uap->aiocbp, ujoblist32, uap->nent *
|
|
|
|
sizeof(ujoblist32[0]));
|
|
|
|
if (error == 0) {
|
|
|
|
for (i = uap->nent; i > 0; i--)
|
|
|
|
ujoblist[i] = PTRIN(ujoblist32[i]);
|
|
|
|
|
|
|
|
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
|
|
|
|
}
|
|
|
|
uma_zfree(aiol_zone, ujoblist);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
|
|
|
|
}
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
int
|
2016-03-09 19:05:11 +00:00
|
|
|
freebsd6_freebsd32_aio_read(struct thread *td,
|
|
|
|
struct freebsd6_freebsd32_aio_read_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
|
|
|
|
&aiocb32_ops_osigevent));
|
|
|
|
}
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
int
|
|
|
|
freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
|
|
|
|
&aiocb32_ops));
|
|
|
|
}
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
int
|
2016-03-09 19:05:11 +00:00
|
|
|
freebsd6_freebsd32_aio_write(struct thread *td,
|
|
|
|
struct freebsd6_freebsd32_aio_write_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
|
|
|
|
&aiocb32_ops_osigevent));
|
|
|
|
}
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
int
|
|
|
|
freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
|
|
|
|
&aiocb32_ops));
|
|
|
|
}
|
|
|
|
|
2013-06-08 13:27:57 +00:00
|
|
|
int
|
|
|
|
freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
|
|
|
|
&aiocb32_ops));
|
|
|
|
}
|
|
|
|
|
2008-12-10 20:56:19 +00:00
|
|
|
int
|
|
|
|
freebsd32_aio_waitcomplete(struct thread *td,
|
|
|
|
struct freebsd32_aio_waitcomplete_args *uap)
|
|
|
|
{
|
2009-01-23 13:23:17 +00:00
|
|
|
struct timespec32 ts32;
|
2008-12-10 20:56:19 +00:00
|
|
|
struct timespec ts, *tsp;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (uap->timeout) {
|
|
|
|
/* Get timespec struct. */
|
|
|
|
error = copyin(uap->timeout, &ts32, sizeof(ts32));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
CP(ts32, ts, tv_sec);
|
|
|
|
CP(ts32, ts, tv_nsec);
|
|
|
|
tsp = &ts;
|
|
|
|
} else
|
|
|
|
tsp = NULL;
|
|
|
|
|
|
|
|
return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
|
|
|
|
&aiocb32_ops));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
|
|
|
|
&aiocb32_ops));
|
|
|
|
}
|
|
|
|
|
2016-03-09 19:05:11 +00:00
|
|
|
#ifdef COMPAT_FREEBSD6
|
2008-12-10 20:56:19 +00:00
|
|
|
int
|
2016-03-09 19:05:11 +00:00
|
|
|
freebsd6_freebsd32_lio_listio(struct thread *td,
|
|
|
|
struct freebsd6_freebsd32_lio_listio_args *uap)
|
2008-12-10 20:56:19 +00:00
|
|
|
{
|
|
|
|
struct aiocb **acb_list;
|
|
|
|
struct sigevent *sigp, sig;
|
|
|
|
struct osigevent32 osig;
|
|
|
|
uint32_t *acb_list32;
|
|
|
|
int error, i, nent;
|
|
|
|
|
|
|
|
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
nent = uap->nent;
|
|
|
|
if (nent < 0 || nent > AIO_LISTIO_MAX)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
|
|
|
|
error = copyin(uap->sig, &osig, sizeof(osig));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
error = convert_old_sigevent32(&osig, &sig);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
sigp = &sig;
|
|
|
|
} else
|
|
|
|
sigp = NULL;
|
|
|
|
|
|
|
|
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
|
|
|
|
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
|
|
|
|
if (error) {
|
|
|
|
free(acb_list32, M_LIO);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
|
|
|
|
for (i = 0; i < nent; i++)
|
|
|
|
acb_list[i] = PTRIN(acb_list32[i]);
|
|
|
|
free(acb_list32, M_LIO);
|
|
|
|
|
|
|
|
error = kern_lio_listio(td, uap->mode,
|
|
|
|
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
|
|
|
|
&aiocb32_ops_osigevent);
|
|
|
|
free(acb_list, M_LIO);
|
|
|
|
return (error);
|
|
|
|
}
|
2016-03-09 19:05:11 +00:00
|
|
|
#endif
|
2008-12-10 20:56:19 +00:00
|
|
|
|
|
|
|
int
|
|
|
|
freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
|
|
|
|
{
|
|
|
|
struct aiocb **acb_list;
|
|
|
|
struct sigevent *sigp, sig;
|
|
|
|
struct sigevent32 sig32;
|
|
|
|
uint32_t *acb_list32;
|
|
|
|
int error, i, nent;
|
|
|
|
|
|
|
|
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
nent = uap->nent;
|
|
|
|
if (nent < 0 || nent > AIO_LISTIO_MAX)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
|
|
|
|
error = copyin(uap->sig, &sig32, sizeof(sig32));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
error = convert_sigevent32(&sig32, &sig);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
sigp = &sig;
|
|
|
|
} else
|
|
|
|
sigp = NULL;
|
|
|
|
|
|
|
|
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
|
|
|
|
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
|
|
|
|
if (error) {
|
|
|
|
free(acb_list32, M_LIO);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
|
|
|
|
for (i = 0; i < nent; i++)
|
|
|
|
acb_list[i] = PTRIN(acb_list32[i]);
|
|
|
|
free(acb_list32, M_LIO);
|
|
|
|
|
|
|
|
error = kern_lio_listio(td, uap->mode,
|
|
|
|
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
|
|
|
|
&aiocb32_ops);
|
|
|
|
free(acb_list, M_LIO);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|