Sync with several changes in UFS/FFS:

- 77115: Implement support for O_DIRECT.
- 98425: Fix a performance issue introduced in 70131 that was causing
  reads before writes even when writing full blocks.
- 98658: Rename the BALLOC flags from B_* to BA_* to avoid confusion with
  the struct buf B_ flags.
- 100344: Merge the BA_ and IO_ flags so that they may both be used in
  the same flags word. This merger is possible by assigning the IO_ flags
  to the low sixteen bits and the BA_ flags to the high sixteen bits.
- 105422: Fix a file-rewrite performance case.
- 129545: Implement IO_INVAL in VOP_WRITE() by marking the buffer as
  "no cache".
- Re-add the DOINGASYNC() macro and use it to control asynchronous writes.
  Change i-node updates to honor DOINGASYNC() instead of always being
  synchronous.
- Use a PRIV_VFS_RETAINSUGID check instead of checking cr_uid against 0
  directly when deciding whether or not to clear suid and sgid bits.

Submitted by:	Pedro F. Giffuni  giffunip at yahoo
This commit is contained in:
jhb 2011-04-28 14:27:17 +00:00
parent bc631ee68a
commit 574178d5e6
8 changed files with 168 additions and 69 deletions

View File

@ -41,7 +41,7 @@
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/ucred.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <fs/ext2fs/inode.h>
@ -143,7 +143,7 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
return (error);
bp = getblk(vp, lbn, nsize, 0, 0, 0);
bp->b_blkno = fsbtodb(fs, newb);
if (flags & B_CLRBUF)
if (flags & BA_CLRBUF)
vfs_bio_clrbuf(bp);
}
ip->i_db[lbn] = dbtofsb(fs, bp->b_blkno);
@ -235,7 +235,7 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
* If required, write synchronously, otherwise use
* delayed write.
*/
if (flags & B_SYNC) {
if (flags & IO_SYNC) {
bwrite(bp);
} else {
if (bp->b_bufsize == fs->e2fs_bsize)
@ -258,14 +258,14 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
nb = newb;
nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & B_CLRBUF)
if (flags & BA_CLRBUF)
vfs_bio_clrbuf(nbp);
bap[indirs[i].in_off] = nb;
/*
* If required, write synchronously, otherwise use
* delayed write.
*/
if (flags & B_SYNC) {
if (flags & IO_SYNC) {
bwrite(bp);
} else {
if (bp->b_bufsize == fs->e2fs_bsize)
@ -276,8 +276,15 @@ ext2_balloc(ip, lbn, size, cred, bpp, flags)
return (0);
}
brelse(bp);
if (flags & B_CLRBUF) {
error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp);
if (flags & BA_CLRBUF) {
int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
error = cluster_read(vp, ip->i_size, lbn,
(int)fs->e2fs_bsize, NOCRED,
MAXBSIZE, seqcount, &nbp);
} else {
error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp);
}
if (error) {
brelse(nbp);
return (error);

View File

@ -81,11 +81,13 @@ int ext2_checkpath(struct inode *, struct inode *, struct ucred *);
int cg_has_sb(int i);
int ext2_inactive(struct vop_inactive_args *);
/* Flags to low-level allocation routines. */
#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */
#define B_SYNC 0x02 /* Do all allocations synchronously. */
#define B_METAONLY 0x04 /* Return indirect block buffer. */
#define B_NOWAIT 0x08 /* do not sleep to await lock */
/* Flags to low-level allocation routines.
* The low 16-bits are reserved for IO_ flags from vnode.h.
*/
#define BA_CLRBUF 0x00010000 /* Clear invalid areas of buffer. */
#define BA_SEQMASK 0x7F000000 /* Bits holding seq heuristic. */
#define BA_SEQSHIFT 24
#define BA_SEQMAX 0x7F
extern struct vop_vector ext2_vnodeops;
extern struct vop_vector ext2_fifoops;

View File

@ -92,7 +92,7 @@ ext2_update(vp, waitfor)
}
ext2_i2ei(ip, (struct ext2fs_dinode *)((char *)bp->b_data +
EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)));
if (waitfor && (vp->v_mount->mnt_kern_flag & MNTK_ASYNC) == 0)
if (waitfor && !DOINGASYNC(vp))
return (bwrite(bp));
else {
bdwrite(bp);
@ -125,7 +125,7 @@ ext2_truncate(vp, length, flags, cred, td)
struct buf *bp;
int offset, size, level;
long count, nblocks, blocksreleased = 0;
int aflags, error, i, allerror;
int error, i, allerror;
off_t osize;
oip = VTOI(ovp);
@ -164,10 +164,8 @@ ext2_truncate(vp, length, flags, cred, td)
vnode_pager_setsize(ovp, length);
offset = blkoff(fs, length - 1);
lbn = lblkno(fs, length - 1);
aflags = B_CLRBUF;
if (flags & IO_SYNC)
aflags |= B_SYNC;
error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, aflags);
flags |= BA_CLRBUF;
error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags);
if (error) {
vnode_pager_setsize(vp, osize);
return (error);
@ -175,9 +173,9 @@ ext2_truncate(vp, length, flags, cred, td)
oip->i_size = length;
if (bp->b_bufsize == fs->e2fs_bsize)
bp->b_flags |= B_CLUSTEROK;
if (aflags & B_SYNC)
if (flags & IO_SYNC)
bwrite(bp);
else if (ovp->v_mount->mnt_flag & MNT_ASYNC)
else if (DOINGASYNC(ovp))
bdwrite(bp);
else
bawrite(bp);
@ -197,10 +195,8 @@ ext2_truncate(vp, length, flags, cred, td)
oip->i_size = length;
} else {
lbn = lblkno(fs, length);
aflags = B_CLRBUF;
if (flags & IO_SYNC)
aflags |= B_SYNC;
error = ext2_balloc(oip, lbn, offset, cred, &bp, aflags);
flags |= BA_CLRBUF;
error = ext2_balloc(oip, lbn, offset, cred, &bp, flags);
if (error)
return (error);
oip->i_size = length;
@ -209,9 +205,9 @@ ext2_truncate(vp, length, flags, cred, td)
allocbuf(bp, size);
if (bp->b_bufsize == fs->e2fs_bsize)
bp->b_flags |= B_CLUSTEROK;
if (aflags & B_SYNC)
if (flags & IO_SYNC)
bwrite(bp);
else if (ovp->v_mount->mnt_flag & MNT_ASYNC)
else if (DOINGASYNC(ovp))
bdwrite(bp);
else
bawrite(bp);

View File

@ -890,7 +890,12 @@ ext2_direnter(ip, dvp, cnp)
ep = (struct ext2fs_direct_2 *)((char *)ep + dsize);
}
bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize);
error = bwrite(bp);
if (DOINGASYNC(dvp)) {
bdwrite(bp);
error = 0;
} else {
error = bwrite(bp);
}
dp->i_flag |= IN_CHANGE | IN_UPDATE;
if (!error && dp->i_endoff && dp->i_endoff < dp->i_size)
error = ext2_truncate(dvp, (off_t)dp->i_endoff, IO_SYNC,
@ -947,7 +952,10 @@ ext2_dirremove(dvp, cnp)
else
rep = (struct ext2fs_direct_2 *)((char *)ep + ep->e2d_reclen);
ep->e2d_reclen += rep->e2d_reclen;
error = bwrite(bp);
if (DOINGASYNC(dvp) && dp->i_count != 0)
bdwrite(bp);
else
error = bwrite(bp);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
return (error);
}

View File

@ -45,6 +45,15 @@
#define WRITE ext2_write
#define WRITE_S "ext2_write"
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include "opt_directio.h"
/*
* Vnode op for reading.
*/
@ -66,15 +75,16 @@ READ(ap)
off_t bytesinfile;
long size, xfersize, blkoffset;
int error, orig_resid, seqcount;
seqcount = ap->a_ioflag >> IO_SEQSHIFT;
u_short mode;
int ioflag;
vp = ap->a_vp;
ip = VTOI(vp);
mode = ip->i_mode;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
#ifdef DIAGNOSTIC
seqcount = ap->a_ioflag >> IO_SEQSHIFT;
ip = VTOI(vp);
#ifdef INVARIANTS
if (uio->uio_rw != UIO_READ)
panic("%s: mode", READ_S);
@ -90,8 +100,10 @@ READ(ap)
return (0);
KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0"));
fs = ip->I_FS;
if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->e2fs_maxfilesize)
return (EOVERFLOW);
if (uio->uio_offset < ip->i_size &&
uio->uio_offset >= fs->e2fs_maxfilesize)
return (EOVERFLOW);
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
break;
@ -109,8 +121,8 @@ READ(ap)
if (lblktosize(fs, nextlbn) >= ip->i_size)
error = bread(vp, lbn, size, NOCRED, &bp);
else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0)
error = cluster_read(vp, ip->i_size, lbn, size,
NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
error = cluster_read(vp, ip->i_size, lbn, size,
NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
else if (seqcount > 1) {
int nextsize = BLKSIZE(fs, ip, nextlbn);
error = breadn(vp, lbn,
@ -123,6 +135,15 @@ READ(ap)
break;
}
/*
* If IO_DIRECT then set B_DIRECT for the buffer. This
* will cause us to attempt to release the buffer later on
* and will cause the buffer cache to attempt to free the
* underlying pages.
*/
if (ioflag & IO_DIRECT)
bp->b_flags |= B_DIRECT;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
@ -141,10 +162,42 @@ READ(ap)
if (error)
break;
bqrelse(bp);
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
(LIST_FIRST(&bp->b_dep) == NULL)) {
/*
* If there are no dependencies, and it's VMIO,
* then we don't need the buf, mark it available
* for freeing. The VM has the data.
*/
bp->b_flags |= B_RELBUF;
brelse(bp);
} else {
/*
* Otherwise let whoever
* made the request take care of
* freeing it. We just queue
* it onto another list.
*/
bqrelse(bp);
}
}
if (bp != NULL)
bqrelse(bp);
/*
* This can only happen in the case of an error
* because the loop above resets bp to NULL on each iteration
* and on normal completion has not set a new value into it.
* so it must have come from a 'break' statement
*/
if (bp != NULL) {
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
(LIST_FIRST(&bp->b_dep) == NULL)) {
bp->b_flags |= B_RELBUF;
brelse(bp);
} else {
bqrelse(bp);
}
}
if ((error == 0 || uio->uio_resid != orig_resid) &&
(vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
ip->i_flag |= IN_ACCESS;
@ -173,12 +226,13 @@ WRITE(ap)
int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize;
ioflag = ap->a_ioflag;
seqcount = ioflag >> IO_SEQSHIFT;
uio = ap->a_uio;
vp = ap->a_vp;
seqcount = ioflag >> IO_SEQSHIFT;
ip = VTOI(vp);
#ifdef DIAGNOSTIC
#ifdef INVARIANTS
if (uio->uio_rw != UIO_WRITE)
panic("%s: mode", WRITE_S);
#endif
@ -217,7 +271,12 @@ WRITE(ap)
resid = uio->uio_resid;
osize = ip->i_size;
flags = ioflag & IO_SYNC ? B_SYNC : 0;
if (seqcount > BA_SEQMAX)
flags = BA_SEQMAX << BA_SEQSHIFT;
else
flags = seqcount << BA_SEQSHIFT;
if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
flags |= IO_SYNC;
for (error = 0; uio->uio_resid > 0;) {
lbn = lblkno(fs, uio->uio_offset);
@ -228,17 +287,30 @@ WRITE(ap)
if (uio->uio_offset + xfersize > ip->i_size)
vnode_pager_setsize(vp, uio->uio_offset + xfersize);
/*
* Avoid a data-consistency race between write() and mmap()
* by ensuring that newly allocated blocks are zeroed. The
* race can occur even in the case where the write covers
* the entire block.
*/
flags |= B_CLRBUF;
/*
* We must perform a read-before-write if the transfer size
* does not cover the entire buffer.
*/
if (fs->e2fs_bsize > xfersize)
flags |= BA_CLRBUF;
else
flags &= ~BA_CLRBUF;
error = ext2_balloc(ip, lbn, blkoffset + xfersize,
ap->a_cred, &bp, flags);
ap->a_cred, &bp, flags);
if (error != 0)
break;
/*
* If the buffer is not valid and we did not clear garbage
* out above, we have to do so here even though the write
* covers the entire buffer in order to avoid a mmap()/write
* race where another process may see the garbage prior to
* the uiomove() for a write replacing it.
*/
if ((bp->b_flags & B_CACHE) == 0 && fs->e2fs_bsize <= xfersize)
vfs_bio_clrbuf(bp);
if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
bp->b_flags |= B_NOCACHE;
if (uio->uio_offset + xfersize > ip->i_size)
ip->i_size = uio->uio_offset + xfersize;
size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
@ -247,12 +319,25 @@ WRITE(ap)
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
if ((ioflag & IO_VMIO) &&
LIST_FIRST(&bp->b_dep) == NULL) /* in ext2fs? */
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
(LIST_EMPTY(&bp->b_dep))) { /* in ext2fs? */
bp->b_flags |= B_RELBUF;
}
/*
* If IO_SYNC each buffer is written synchronously. Otherwise
* if we have a severe page deficiency write the buffer
* asynchronously. Otherwise try to cluster, and if that
* doesn't do it then either do an async write (if O_DIRECT),
* or a delayed write (if not).
*/
if (ioflag & IO_SYNC) {
(void)bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
(ioflag & IO_ASYNC)) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else if (xfersize + blkoffset == fs->e2fs_fsize) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
bp->b_flags |= B_CLUSTEROK;
@ -260,6 +345,9 @@ WRITE(ap)
} else {
bawrite(bp);
}
} else if (ioflag & IO_DIRECT) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);
@ -271,18 +359,13 @@ WRITE(ap)
* If we successfully wrote any data, and we are not the superuser
* we clear the setuid and setgid bits as a precaution against
* tampering.
* XXX too late, the tamperer may have opened the file while we
* were writing the data (or before).
* XXX too early, if (error && ioflag & IO_UNIT) then we will
* unwrite the data.
*/
if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
ip->i_mode &= ~(ISUID | ISGID);
if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
ap->a_cred) {
if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
ip->i_mode &= ~(ISUID | ISGID);
}
if (error) {
/*
* XXX should truncate to the last successfully written
* data if the uiomove() failed.
*/
if (ioflag & IO_UNIT) {
(void)ext2_truncate(vp, osize,
ioflag & IO_SYNC, ap->a_cred, uio->uio_td);

View File

@ -738,7 +738,7 @@ ext2_link(ap)
}
ip->i_nlink++;
ip->i_flag |= IN_CHANGE;
error = ext2_update(vp, 1);
error = ext2_update(vp, !DOINGASYNC(vp));
if (!error)
error = ext2_direnter(ip, tdvp, cnp);
if (error) {
@ -884,7 +884,7 @@ abortit:
*/
ip->i_nlink++;
ip->i_flag |= IN_CHANGE;
if ((error = ext2_update(fvp, 1)) != 0) {
if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) {
VOP_UNLOCK(fvp, 0);
goto bad;
}
@ -943,7 +943,7 @@ abortit:
}
dp->i_nlink++;
dp->i_flag |= IN_CHANGE;
error = ext2_update(tdvp, 1);
error = ext2_update(tdvp, !DOINGASYNC(tdvp));
if (error)
goto bad;
}
@ -1211,7 +1211,7 @@ ext2_mkdir(ap)
*/
dp->i_nlink++;
dp->i_flag |= IN_CHANGE;
error = ext2_update(dvp, 1);
error = ext2_update(dvp, !DOINGASYNC(dvp));
if (error)
goto bad;
@ -1655,7 +1655,7 @@ ext2_makeinode(mode, dvp, vpp, cnp)
/*
* Make sure inode goes to disk before directory entry.
*/
error = ext2_update(tvp, 1);
error = ext2_update(tvp, !DOINGASYNC(tvp));
if (error)
goto bad;
error = ext2_direnter(ip, dvp, cnp);

View File

@ -158,6 +158,9 @@ struct indir {
#define VTOI(vp) ((struct inode *)(vp)->v_data)
#define ITOV(ip) ((ip)->i_vnode)
/* Check whether the MNTK_ASYNC flag has been set for a mount point */
#define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
/* This overlays the fid structure (see mount.h). */
struct ufid {
uint16_t ufid_len; /* Length of structure. */

View File

@ -2,7 +2,7 @@
.PATH: ${.CURDIR}/../../fs/ext2fs
KMOD= ext2fs
SRCS= opt_ddb.h opt_quota.h opt_suiddir.h vnode_if.h \
SRCS= opt_ddb.h opt_directio.h opt_quota.h opt_suiddir.h vnode_if.h \
ext2_alloc.c ext2_balloc.c ext2_bmap.c ext2_inode.c \
ext2_inode_cnv.c ext2_lookup.c ext2_subr.c ext2_vfsops.c \
ext2_vnops.c