Move UFS from DEVFS backing to GEOM backing.

This eliminates a bunch of vnode overhead (approx. 1-2 % speed
improvement) and gives us more control over access to the storage
device.

Access counts on the underlying device are now correctly tracked, and it
is therefore possible to read-only mount the same disk device multiple
times:
	syv# mount -p
	/dev/md0        /var    ufs rw  2 2
	/dev/ad0        /mnt    ufs ro  1 1
	/dev/ad0        /mnt2   ufs ro  1 1
	/dev/ad0        /mnt3   ufs ro  1 1

Since UFS/FFS is not a synchronously consistent filesystem (i.e., it caches
things in RAM), this is not possible with read-write mounts, and the system
will correctly reject the attempt.

Details:

	Add a geom consumer and a bufobj pointer to ufsmount.
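
	For reference, the new ufsmount members and where they get
	filled in (a condensed sketch drawn from the hunks below, not
	a literal patch):

		struct ufsmount {
			...
			struct g_consumer *um_cp;	/* GEOM consumer for the device */
			struct bufobj *um_bo;		/* bufobj of the device vnode */
			...
		};

		/* In ffs_mountfs(), once g_vfs_open() has succeeded: */
		ump->um_cp = cp;
		ump->um_bo = &devvp->v_bufobj;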

	Eliminate the vnode argument from softdep_disk_prewrite().
	Pick the vnode out of bp->b_vp for now.  Eventually we
	should find it through bp->b_bufobj->bo_private.

	In the mount code, use g_vfs_open() once we have used
	VOP_ACCESS() to check permissions.
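
	The open itself now goes through GEOM after the VOP_ACCESS()
	permission check; condensed from the ffs_mountfs() hunk below
	(not the literal patch):

		DROP_GIANT();
		g_topology_lock();
		error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
		g_topology_unlock();
		PICKUP_GIANT();
		VOP_UNLOCK(devvp, 0, td);
		if (error)
			return (error);

	The last argument is the write access requested from GEOM, so
	a read-only mount does not ask for write access on the consumer.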

	When upgrading and downgrading between r/o and r/w, do the
	right thing with the GEOM access counts.  Remove all the
	workarounds for not being able to do this with VOP_OPEN().
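
	Roughly (sketch only; in the ffs_omount() hunks below the
	g_access() calls are made with Giant dropped and the GEOM
	topology lock held):

		/* Downgrading r/w -> r/o: give back the write count. */
		g_access(ump->um_cp, 0, -1, 0);

		/* Upgrading r/o -> r/w: take a write count, plus the
		   exclusive count if this is the root mount, which
		   dropped it at mount time (see the next item). */
		if (ump->um_cp->ace == 0)
			error = g_access(ump->um_cp, 0, 1, 1);
		else
			error = g_access(ump->um_cp, 0, 1, 0);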

	If we are the root mount, drop the exclusive access count
	until we upgrade to r/w.  This allows fsck of the root
	filesystem and MNT_RELOAD to work correctly.
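
	Sketch (matching the ffs_mountfs() hunk below):

		/* Read-only mount of the root filesystem: let go of the
		   exclusive count so fsck can write to the raw device. */
		if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS))
			error = g_access(cp, 0, 0, -1);

	The exclusive count is taken back, together with the write
	count, when the root filesystem is later upgraded to r/w as
	shown in the previous item.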

	Set bo_private to the GEOM consumer on the device bufobj.

	Change the ffs_ops->strategy function to call g_vfs_strategy().

	In ufs_strategy() directly call the strategy on the disk
	bufobj.  Same in rawread.

	In ffs_fsync() we will no longer see VCHR device nodes, so
	remove the code which synced the filesystem mounted on the
	device in case we got there.  I'm not sure this code made
	sense in the first place, since we would have taken the
	specfs route on such a vnode.

	Redo the highly bogus readblock() function in the snapshot
	code to something slightly less bogus: Constructing an uio
	and using physio was really quite a detour.  Instead just
	fill in a bio and ship it down.

Poul-Henning Kamp	2004-10-29 10:15:56 +00:00
parent 4d13ab3da2
commit 4392001125
10 changed files with 98 additions and 126 deletions


@ -1728,9 +1728,6 @@ ffs_blkfree(fs, devvp, bno, size, inum)
return;
}
#ifdef DIAGNOSTIC
if (dev->si_mountpoint &&
(dev->si_mountpoint->mnt_kern_flag & MNTK_SUSPENDED))
panic("ffs_blkfree: deallocation on suspended filesystem");
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",


@ -118,7 +118,7 @@ void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t,
struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *);
void softdep_fsync_mountdev(struct vnode *);
int softdep_sync_metadata(struct vop_fsync_args *);
int softdep_disk_prewrite(struct vnode *vp, struct buf *bp);
int softdep_disk_prewrite(struct buf *bp);
/* XXX incorrectly moved to mount.h - should be indirect function */
#if 0
int softdep_fsync(struct vnode *vp);


@ -248,15 +248,11 @@ ffs_rawread_readahead(struct vnode *vp,
if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
bp->b_bufsize = bp->b_bcount;
bp->b_dev = dp->v_rdev;
if (vmapbuf(bp) < 0)
return EFAULT;
if (dp->v_type == VCHR)
(void) VOP_SPECSTRATEGY(dp, bp);
else
(void) VOP_STRATEGY(dp, bp);
dp->v_bufobj.bo_ops->bop_strategy(&dp->v_bufobj, bp);
return 0;
}


@ -52,6 +52,8 @@ __FBSDID("$FreeBSD$");
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <geom/geom.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
@ -2119,19 +2121,21 @@ readblock(vp, bp, lbn)
struct buf *bp;
ufs2_daddr_t lbn;
{
struct uio auio;
struct iovec aiov;
struct thread *td = curthread;
struct inode *ip = VTOI(vp);
struct bio *bip;
aiov.iov_base = bp->b_data;
aiov.iov_len = bp->b_bcount;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
auio.uio_resid = bp->b_bcount;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
return (physio(ip->i_devvp->v_rdev, &auio, 0));
bip = g_alloc_bio();
bip->bio_cmd = BIO_READ;
bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
bip->bio_data = bp->b_data;
bip->bio_length = bp->b_bcount;
g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
do
msleep(bip, NULL, PRIBIO, "snaprdb", hz/10);
while (!(bip->bio_flags & BIO_DONE));
bp->b_error = bip->bio_error;
g_destroy_bio(bip);
return (bp->b_error);
}


@ -3416,9 +3416,10 @@ handle_workitem_freefile(freefile)
}
int
softdep_disk_prewrite(struct vnode *vp, struct buf *bp)
softdep_disk_prewrite(struct buf *bp)
{
int error;
struct vnode *vp = bp->b_vp;
KASSERT(bp->b_iocmd == BIO_WRITE,
("softdep_disk_prewrite on non-BIO_WRITE buffer"));
@ -4983,17 +4984,8 @@ softdep_sync_metadata(ap)
struct worklist *wk;
int i, error, waitfor;
/*
* Check whether this vnode is involved in a filesystem
* that is doing soft dependency processing.
*/
if (!vn_isdisk(vp, NULL)) {
if (!DOINGSOFTDEP(vp))
return (0);
} else
if (vp->v_rdev->si_mountpoint == NULL ||
(vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP) == 0)
return (0);
if (!DOINGSOFTDEP(vp))
return (0);
/*
* Ensure that any direct block dependencies have been cleared.
*/
@ -5222,18 +5214,6 @@ loop:
VI_UNLOCK(vp);
FREE_LOCK(&lk);
/*
* If we are trying to sync a block device, some of its buffers may
* contain metadata that cannot be written until the contents of some
* partially written files have been written to disk. The only easy
* way to accomplish this is to sync the entire filesystem (luckily
* this happens rarely).
*/
if (vn_isdisk(vp, NULL) &&
vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
(error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_cred,
ap->a_td)) != 0)
return (error);
return (0);
}
@ -5912,6 +5892,8 @@ getdirtybuf(bpp, mtx, waitfor)
/*
* Wait for pending output on a vnode to complete.
* Must be called with vnode lock and interlock locked.
*
* XXX: Should just be a call to bufobj_wwait().
*/
static void
drain_output(vp, islocked)


@ -66,6 +66,9 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <vm/vm_page.h>
#include <geom/geom.h>
#include <geom/geom_vfs.h>
uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
static int ffs_sbupdate(struct ufsmount *, int);
@ -240,6 +243,11 @@ ffs_omount(struct mount *mp, char *path, caddr_t data, struct thread *td)
return (error);
}
vn_finished_write(mp);
DROP_GIANT();
g_topology_lock();
g_access(ump->um_cp, 0, -1, 0);
g_topology_unlock();
PICKUP_GIANT();
}
if ((mp->mnt_flag & MNT_RELOAD) &&
(error = ffs_reload(mp, td)) != 0)
@ -258,6 +266,20 @@ ffs_omount(struct mount *mp, char *path, caddr_t data, struct thread *td)
}
VOP_UNLOCK(devvp, 0, td);
}
DROP_GIANT();
g_topology_lock();
/*
* If we're the root device, we may not have an E count
* yet, get it now.
*/
if (ump->um_cp->ace == 0)
error = g_access(ump->um_cp, 0, 1, 1);
else
error = g_access(ump->um_cp, 0, 1, 0);
g_topology_unlock();
PICKUP_GIANT();
if (error)
return (error);
fs->fs_flags &= ~FS_UNCLEAN;
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
@ -350,8 +372,7 @@ ffs_omount(struct mount *mp, char *path, caddr_t data, struct thread *td)
* then it's not correct.
*/
if (devvp != ump->um_devvp &&
devvp->v_rdev != ump->um_devvp->v_rdev)
if (devvp->v_rdev != ump->um_devvp->v_rdev)
error = EINVAL; /* needs translation */
vrele(devvp);
if (error)
@ -412,7 +433,6 @@ ffs_reload(struct mount *mp, struct thread *td)
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
if (vinvalbuf(devvp, 0, td->td_ucred, td, 0, 0) != 0)
panic("ffs_reload: dirty1");
vfs_object_create(devvp, td, td->td_ucred);
VOP_UNLOCK(devvp, 0, td);
@ -552,45 +572,45 @@ ffs_mountfs(devvp, mp, td)
int32_t *lp;
struct ucred *cred;
size_t strsize;
struct g_consumer *cp;
dev = devvp->v_rdev;
cred = td ? td->td_ucred : NOCRED;
vfs_object_create(devvp, td, cred);
ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if 0
/*
* Disallow multiple mounts of the same device.
* Disallow mounting of a device that is currently in use
* (except for root, which might share swap device for miniroot).
* Flush out any old buffers remaining from a previous use.
* XXX: check filesystem permissions, they may be more strict
* XXX: than what geom enforces.
* XXX: But since we're root, they wouldn't matter, would they ?
*/
error = vfs_mountedon(devvp);
if (error)
return (error);
if (vcount(devvp) > 1)
return (EBUSY);
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
error = vinvalbuf(devvp, V_SAVE, cred, td, 0, 0);
error = VOP_ACCESS(devvp, ronly ? FREAD : FREAD | FWRITE, FSCRED, td);
if (error) {
VOP_UNLOCK(devvp, 0, td);
return (error);
}
#endif
DROP_GIANT();
g_topology_lock();
error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
#if 0
/*
* Note that it is optional that the backing device be VMIOed. This
* increases the opportunity for metadata caching.
*/
vfs_object_create(devvp, td, cred);
ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
/*
* XXX: open the device with read and write access even if only
* read access is needed now. Write access is needed if the
* filesystem is ever mounted read/write, and we don't change the
* access mode for remounts.
*/
#ifdef notyet
error = VOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, FSCRED, td, -1);
#else
error = VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, td, -1);
#endif
/*
* If we are a root mount, drop the E flag so fsck can do its magic.
* We will pick it up again when we remount R/W.
*/
if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS))
error = g_access(cp, 0, 0, -1);
g_topology_unlock();
PICKUP_GIANT();
VOP_UNLOCK(devvp, 0, td);
if (error)
return (error);
@ -599,6 +619,7 @@ ffs_mountfs(devvp, mp, td)
if (mp->mnt_iosize_max > MAXPHYS)
mp->mnt_iosize_max = MAXPHYS;
devvp->v_bufobj.bo_private = cp;
devvp->v_bufobj.bo_ops = &ffs_ops;
bp = NULL;
@ -663,6 +684,8 @@ ffs_mountfs(devvp, mp, td)
fs->fs_pendinginodes = 0;
}
ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
ump->um_cp = cp;
ump->um_bo = &devvp->v_bufobj;
ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK);
if (fs->fs_magic == FS_UFS1_MAGIC) {
ump->um_fstype = UFS1;
@ -751,8 +774,6 @@ ffs_mountfs(devvp, mp, td)
#ifdef UFS_EXTATTR
ufs_extattr_uepm_init(&ump->um_extattr);
#endif
devvp->v_rdev->si_mountpoint = mp;
/*
* Set FS local "last mounted on" information (NULL pad)
*/
@ -804,15 +825,15 @@ ffs_mountfs(devvp, mp, td)
#endif /* !UFS_EXTATTR */
return (0);
out:
devvp->v_rdev->si_mountpoint = NULL;
if (bp)
brelse(bp);
/* XXX: see comment above VOP_OPEN. */
#ifdef notyet
(void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD | FWRITE, cred, td);
#else
(void)VOP_CLOSE(devvp, FREAD | FWRITE, cred, td);
#endif
if (cp != NULL) {
DROP_GIANT();
g_topology_lock();
g_wither_geom_close(cp->geom, ENXIO);
g_topology_unlock();
PICKUP_GIANT();
}
if (ump) {
free(ump->um_fs, M_UFSMNT);
free(ump, M_UFSMNT);
@ -964,16 +985,12 @@ ffs_unmount(mp, mntflags, td)
return (error);
}
}
ump->um_devvp->v_rdev->si_mountpoint = NULL;
vinvalbuf(ump->um_devvp, V_SAVE, NOCRED, td, 0, 0);
/* XXX: see comment above VOP_OPEN. */
#ifdef notyet
error = VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD | FWRITE,
NOCRED, td);
#else
error = VOP_CLOSE(ump->um_devvp, FREAD | FWRITE, NOCRED, td);
#endif
DROP_GIANT();
g_topology_lock();
g_wither_geom_close(ump->um_cp->geom, ENXIO);
g_topology_unlock();
PICKUP_GIANT();
vrele(ump->um_devvp);
free(fs->fs_csp, M_UFSMNT);
free(fs, M_UFSMNT);
@ -1533,24 +1550,10 @@ ffs_ifree(struct ufsmount *ump, struct inode *ip)
static void
ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
{
int i = 0;
struct vnode *vp;
vp = bp->b_vp;
#if 0
KASSERT(vp == bo->bo_vnode, ("Inconsistent vnode bufstrategy"));
KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
#endif
if (vp->v_type == VCHR) {
#ifdef SOFTUPDATES
if (bp->b_iocmd == BIO_WRITE && softdep_disk_prewrite(bp->b_vp, bp))
return;
if (bp->b_iocmd == BIO_WRITE && softdep_disk_prewrite(bp))
return;
#endif
i = VOP_SPECSTRATEGY(vp, bp);
} else {
i = VOP_STRATEGY(vp, bp);
}
KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
g_vfs_strategy(bo, bp);
}


@ -178,14 +178,7 @@ ffs_fsync(ap)
ufs_lbn_t lbn;
wait = (ap->a_waitfor == MNT_WAIT);
if (vn_isdisk(vp, NULL)) {
lbn = INT_MAX;
if (vp->v_rdev->si_mountpoint != NULL &&
(vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
softdep_fsync_mountdev(vp);
} else {
lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
}
lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
/*
* Flush all dirty buffers associated with a vnode.
@ -225,8 +218,6 @@ loop:
VI_UNLOCK(vp);
if ((bp->b_flags & B_DELWRI) == 0)
panic("ffs_fsync: not dirty");
if (vp != bp->b_vp)
panic("ffs_fsync: vp != vp->b_vp");
/*
* If this is a synchronous flush request, or it is not a
* file or device, start the write on this buffer immediatly.
@ -1212,7 +1203,7 @@ ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td
}
/*
* Vnode extattr strategy routine for special devices and fifos.
* Vnode extattr strategy routine for fifos.
*
* We need to check for a read or write of the external attributes.
* Otherwise we just fall through and do the usual thing.


@ -123,6 +123,7 @@ struct inode {
#define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */
#define i_devvp i_ump->um_devvp
#define i_umbufobj i_ump->um_bo
#define i_dirhash i_un.dirhash
#define i_snapblklist i_un.snapblklist
#define i_din1 dinode_u.din1


@ -1927,6 +1927,7 @@ ufs_strategy(ap)
{
struct buf *bp = ap->a_bp;
struct vnode *vp = ap->a_vp;
struct bufobj *bo;
struct inode *ip;
ufs2_daddr_t blkno;
int error;
@ -1948,14 +1949,9 @@ ufs_strategy(ap)
bufdone(bp);
return (0);
}
vp = ip->i_devvp;
bp->b_dev = vp->v_rdev;
bp->b_iooffset = dbtob(bp->b_blkno);
#ifdef SOFTUPDATES
if (bp->b_iocmd == BIO_WRITE && softdep_disk_prewrite(vp, bp))
return (0);
#endif
VOP_SPECSTRATEGY(vp, bp);
bo = ip->i_umbufobj;
bo->bo_ops->bop_strategy(bo, bp);
return (0);
}


@ -60,6 +60,8 @@ struct ufs_extattr_per_mount;
struct ufsmount {
struct mount *um_mountp; /* filesystem vfs structure */
struct cdev *um_dev; /* device mounted */
struct g_consumer *um_cp;
struct bufobj *um_bo; /* Buffer cache object */
struct vnode *um_devvp; /* block device mounted vnode */
u_long um_fstype; /* type of filesystem */
struct fs *um_fs; /* pointer to superblock */