This patch implements O_DIRECT about 80% of the way. It takes a patchset

Tor created a while ago, removes the raw I/O piece (that has cache coherency
problems), and adds a buffer cache / VM freeing piece.

Essentially this patch causes O_DIRECT I/O to not be left in the cache, but
does not prevent it from going through the cache, hence the 80%.  For
the last 20% we need a method by which the I/O can be issued directly to
a buffer supplied by the user process and bypass the buffer cache entirely,
but still maintain cache coherency.

I also have the code working under -stable but the changes made to sys/file.h
may not be MFCable, so an MFC is not on the table yet.

Submitted by:	tegge, dillon
This commit is contained in:
Matthew Dillon 2001-05-24 07:22:27 +00:00
parent e8f64f5ebf
commit ac8f990bde
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=77115
12 changed files with 100 additions and 15 deletions

View File

@ -147,6 +147,11 @@ corresponds to the
.Dv O_APPEND
flag of
.Xr open 2 .
.It Dv O_DIRECT
Minimize or eliminate the cache effects of reading and writing. The system
will attempt to avoid caching the data you read or write. If it cannot
avoid caching the data, it will minimize the impact the data has on the cache.
Use of this flag can drastically reduce performance if not used with care.
.It Dv O_ASYNC
Enable the
.Dv SIGIO

View File

@ -83,6 +83,7 @@ O_TRUNC truncate size to 0
O_EXCL error if create and file exists
O_SHLOCK atomically obtain a shared lock
O_EXLOCK atomically obtain an exclusive lock
O_DIRECT eliminate or reduce cache effects
O_FSYNC synchronous writes
O_NOFOLLOW do not follow symlinks
.Ed
@ -150,6 +151,12 @@ If creating a file with
the request for the lock will never fail
(provided that the underlying filesystem supports locking).
.Pp
.Dv O_DIRECT may be used to
minimize or eliminate the cache effects of reading and writing. The system
will attempt to avoid caching the data you read or write. If it cannot
avoid caching the data, it will minimize the impact the data has on the cache.
Use of this flag can drastically reduce performance if not used with care.
.Pp
If successful,
.Fn open
returns a non-negative integer, termed a file descriptor.

View File

@ -1249,7 +1249,7 @@ brelse(struct buf * bp)
/* unlock */
BUF_UNLOCK(bp);
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
bp->b_ioflags &= ~BIO_ORDERED;
if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
panic("brelse: not dirty");
@ -1264,6 +1264,8 @@ brelse(struct buf * bp)
* biodone() to requeue an async I/O on completion. It is also used when
* known good buffers need to be requeued but we think we may need the data
* again soon.
*
* XXX we should be able to leave the B_RELBUF hint set on completion.
*/
void
bqrelse(struct buf * bp)
@ -1355,12 +1357,15 @@ vfs_vmio_release(bp)
vm_page_flag_clear(m, PG_ZERO);
/*
* Might as well free the page if we can and it has
* no valid data.
* no valid data. We also free the page if the
* buffer was used for direct I/O
*/
if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
} else if (bp->b_flags & B_DIRECT) {
vm_page_try_to_free(m);
} else if (vm_page_count_severe()) {
vm_page_try_to_cache(m);
}

View File

@ -505,6 +505,15 @@ cluster_callback(bp)
tbp->b_dirtyoff = tbp->b_dirtyend = 0;
tbp->b_flags &= ~B_INVAL;
tbp->b_ioflags &= ~BIO_ERROR;
/*
* XXX the bdwrite()/bqrelse() issued during
* cluster building clears B_RELBUF (see bqrelse()
* comment). If direct I/O was specified, we have
* to restore it here to allow the buffer and VM
* to be freed.
*/
if (tbp->b_flags & B_DIRECT)
tbp->b_flags |= B_RELBUF;
}
bufdone(tbp);
}

View File

@ -352,6 +352,8 @@ vn_read(fp, uio, cred, flags, p)
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
VOP_LEASE(vp, p, cred, LEASE_READ);
vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
if ((flags & FOF_OFFSET) == 0)
@ -393,6 +395,8 @@ vn_write(fp, uio, cred, flags, p)
ioflag |= IO_APPEND;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
if ((fp->f_flag & O_FSYNC) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
ioflag |= IO_SYNC;

View File

@ -187,13 +187,17 @@ struct buf {
* The buffer's data is always PAGE_SIZE aligned even
* if b_bufsize and b_bcount are not. ( b_bufsize is
* always at least DEV_BSIZE aligned, though ).
*
*
* B_DIRECT Hint that we should attempt to completely free
* the pages underlying the buffer. B_DIRECT is
* sticky until the buffer is released and typically
* only has an effect when B_RELBUF is also set.
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
#define B_UNUSED0 0x00000008 /* Old B_BAD */
#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */
#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
#define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */
@ -225,7 +229,7 @@ struct buf {
"\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \
"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
"\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \
"\10delwri\7call\6cache\4bad\3async\2needcommit\1age"
"\10delwri\7call\6cache\4direct\3async\2needcommit\1age"
/*
* These flags are kept in b_xflags.

View File

@ -98,15 +98,18 @@
/* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
#define O_NOCTTY 0x8000 /* don't assign controlling terminal */
/* Attempt to bypass buffer cache */
#define O_DIRECT 0x00010000
#ifdef _KERNEL
/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
#define FFLAGS(oflags) ((oflags) + 1)
#define OFLAGS(fflags) ((fflags) - 1)
/* bits to save after open */
#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK)
#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
/* bits settable by fcntl(F_SETFL, ...) */
#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM)
#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
#endif
/*

View File

@ -56,7 +56,7 @@ struct knote;
*/
struct file {
LIST_ENTRY(file) f_list;/* list of active files */
short f_flag; /* see fcntl.h */
short f_FILLER3; /* (old f_flag) */
#define DTYPE_VNODE 1 /* file */
#define DTYPE_SOCKET 2 /* communications endpoint */
#define DTYPE_PIPE 3 /* pipe */
@ -93,6 +93,7 @@ struct file {
*/
off_t f_offset;
caddr_t f_data; /* vnode or socket */
u_int f_flag; /* see fcntl.h */
};
#ifdef MALLOC_DECLARE

View File

@ -220,6 +220,7 @@ struct vattr {
#define IO_VMIO 0x20 /* data already in VMIO space */
#define IO_INVAL 0x40 /* invalidate after I/O */
#define IO_ASYNC 0x80 /* bawrite rather then bdwrite */
#define IO_DIRECT 0x100 /* attempt to bypass buffer cache */
/*
* Modes. Some values same as Ixxx entries from inode.h for now.

View File

@ -286,6 +286,15 @@ READ(ap)
break;
}
/*
* If IO_DIRECT then set B_DIRECT for the buffer. This
* will cause us to attempt to release the buffer later on
* and will cause the buffer cache to attempt to free the
* underlying pages.
*/
if (ioflag & IO_DIRECT)
bp->b_flags |= B_DIRECT;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
@ -328,12 +337,12 @@ READ(ap)
if (error)
break;
if ((ioflag & IO_VMIO) &&
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
(LIST_FIRST(&bp->b_dep) == NULL)) {
/*
* If there are no dependencies, and
* it's VMIO, then we don't need the buf,
* mark it available for freeing. The VM has the data.
* If there are no dependencies, and it's VMIO,
* then we don't need the buf, mark it available
* for freeing. The VM has the data.
*/
bp->b_flags |= B_RELBUF;
brelse(bp);
@ -355,7 +364,7 @@ READ(ap)
* so it must have come from a 'break' statement
*/
if (bp != NULL) {
if ((ioflag & IO_VMIO) &&
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
(LIST_FIRST(&bp->b_dep) == NULL)) {
bp->b_flags |= B_RELBUF;
brelse(bp);
@ -514,6 +523,8 @@ WRITE(ap)
ap->a_cred, flags, &bp);
if (error != 0)
break;
if (ioflag & IO_DIRECT)
bp->b_flags |= B_DIRECT;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
@ -526,10 +537,18 @@ WRITE(ap)
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
if ((ioflag & IO_VMIO) &&
(LIST_FIRST(&bp->b_dep) == NULL))
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
(LIST_FIRST(&bp->b_dep) == NULL)) {
bp->b_flags |= B_RELBUF;
}
/*
* If IO_SYNC each buffer is written synchronously. Otherwise
* if we have a severe page deficiency write the buffer
* asynchronously. Otherwise try to cluster, and if that
* doesn't do it then either do an async write (if O_DIRECT),
* or a delayed write (if not).
*/
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
} else if (vm_page_count_severe() ||
@ -544,6 +563,9 @@ WRITE(ap)
} else {
bawrite(bp);
}
} else if (ioflag & IO_DIRECT) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);

View File

@ -1303,6 +1303,29 @@ vm_page_try_to_cache(vm_page_t m)
return(1);
}
/*
* vm_page_try_to_free()
*
* Attempt to free the page. If we cannot free it, we do nothing.
* 1 is returned on success, 0 on failure.
*/
int
vm_page_try_to_free(m)
vm_page_t m;
{
/*
 * Refuse to free a page that is dirty, held, busied, wired, or
 * flagged PG_BUSY/PG_UNMANAGED -- freeing in any of those states
 * would yank the page out from under an active user.
 */
if (m->dirty || m->hold_count || m->busy || m->wire_count ||
(m->flags & (PG_BUSY|PG_UNMANAGED))) {
return(0);
}
/*
 * vm_page_test_dirty() may set m->dirty (presumably from the pmap
 * modified bit -- NOTE(review): confirm), so re-check afterwards.
 */
vm_page_test_dirty(m);
if (m->dirty)
return(0);
/* Safe to free: busy the page, revoke all mappings, release it. */
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
return(1);
}
/*
* vm_page_cache
*

View File

@ -421,6 +421,7 @@ vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
int vm_page_try_to_cache __P((vm_page_t));
int vm_page_try_to_free __P((vm_page_t));
void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));