Extend the KPI to lock and unlock f_offset member of struct file. It

now fully encapsulates all accesses to f_offset, and extends f_offset
locking to other consumers that need it, in particular, to lseek() and
variants of getdirentries().

Ensure that on 32bit architectures f_offset, which is 64bit quantity,
always read and written under the mtxpool protection. This fixes
apparently easy to trigger race when parallel lseek()s or lseek() and
read/write could destroy file offset.

The already broken ABI emulations, including iBCS and SysV, are not
converted (yet).

Tested by:	pho
No objections from:	jhb
MFC after:    3 weeks
This commit is contained in:
Konstantin Belousov 2012-07-02 21:01:03 +00:00
parent 9f1dae79da
commit c5c1199c83
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=238029
7 changed files with 133 additions and 70 deletions

View File

@ -357,15 +357,16 @@ getdents_common(struct thread *td, struct linux_getdents64_args *args,
return (EBADF);
}
off = foffset_lock(fp, 0);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, off, 0);
fdrop(fp, td);
return (EINVAL);
}
off = fp->f_offset;
buflen = max(LINUX_DIRBLKSIZ, nbytes);
buflen = min(buflen, MAXBSIZE);
@ -514,7 +515,6 @@ getdents_common(struct thread *td, struct linux_getdents64_args *args,
goto eof;
}
fp->f_offset = off;
if (justone)
nbytes = resid + linuxreclen;
@ -527,6 +527,7 @@ getdents_common(struct thread *td, struct linux_getdents64_args *args,
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, off, 0);
fdrop(fp, td);
free(buf, M_TEMP);
free(lbuf, M_TEMP);

View File

@ -1170,18 +1170,14 @@ devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, st
if (ioflag & O_DIRECT)
ioflag |= IO_DIRECT;
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
error = dsw->d_read(dev, uio, ioflag);
if (uio->uio_resid != resid || (error == 0 && resid != 0))
vfs_timestamp(&dev->si_atime);
td->td_fpop = fpop;
dev_relthread(dev, ref);
if ((flags & FOF_OFFSET) == 0)
fp->f_offset = uio->uio_offset;
fp->f_nextoff = uio->uio_offset;
foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
return (error);
}
@ -1648,8 +1644,7 @@ devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, s
ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
if (ioflag & O_DIRECT)
ioflag |= IO_DIRECT;
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
resid = uio->uio_resid;
@ -1661,9 +1656,7 @@ devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, s
td->td_fpop = fpop;
dev_relthread(dev, ref);
if ((flags & FOF_OFFSET) == 0)
fp->f_offset = uio->uio_offset;
fp->f_nextoff = uio->uio_offset;
foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
return (error);
}

View File

@ -465,6 +465,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
int vfslocked;
u_int old, new;
uint64_t bsize;
off_t foffset;
vfslocked = 0;
error = 0;
@ -606,14 +607,15 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
}
flp = (struct flock *)arg;
if (flp->l_whence == SEEK_CUR) {
if (fp->f_offset < 0 ||
foffset = foffset_get(fp);
if (foffset < 0 ||
(flp->l_start > 0 &&
fp->f_offset > OFF_MAX - flp->l_start)) {
foffset > OFF_MAX - flp->l_start)) {
FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
break;
}
flp->l_start += fp->f_offset;
flp->l_start += foffset;
}
/*
@ -727,15 +729,16 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
break;
}
if (flp->l_whence == SEEK_CUR) {
foffset = foffset_get(fp);
if ((flp->l_start > 0 &&
fp->f_offset > OFF_MAX - flp->l_start) ||
foffset > OFF_MAX - flp->l_start) ||
(flp->l_start < 0 &&
fp->f_offset < OFF_MIN - flp->l_start)) {
foffset < OFF_MIN - flp->l_start)) {
FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
break;
}
flp->l_start += fp->f_offset;
flp->l_start += foffset;
}
/*
* VOP_ADVLOCK() may block.
@ -2810,7 +2813,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)
xf.xf_type = fp->f_type;
xf.xf_count = fp->f_count;
xf.xf_msgcount = 0;
xf.xf_offset = fp->f_offset;
xf.xf_offset = foffset_get(fp);
xf.xf_flag = fp->f_flag;
error = SYSCTL_OUT(req, &xf, sizeof(xf));
if (error)
@ -3015,7 +3018,7 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
kif->kf_flags |= KF_FLAG_DIRECT;
if (fp->f_flag & FHASLOCK)
kif->kf_flags |= KF_FLAG_HASLOCK;
kif->kf_offset = fp->f_offset;
kif->kf_offset = foffset_get(fp);
if (vp != NULL) {
vref(vp);
switch (vp->v_type) {
@ -3359,7 +3362,7 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
}
refcnt = fp->f_count;
fflags = fp->f_flag;
offset = fp->f_offset;
offset = foffset_get(fp);
/*
* Create sysctl entry.

View File

@ -1981,7 +1981,7 @@ sys_lseek(td, uap)
struct file *fp;
struct vnode *vp;
struct vattr vattr;
off_t offset, size;
off_t foffset, offset, size;
int error, noneg;
int vfslocked;
@ -1993,18 +1993,19 @@ sys_lseek(td, uap)
return (ESPIPE);
}
vp = fp->f_vnode;
foffset = foffset_lock(fp, 0);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
noneg = (vp->v_type != VCHR);
offset = uap->offset;
switch (uap->whence) {
case L_INCR:
if (noneg &&
(fp->f_offset < 0 ||
(offset > 0 && fp->f_offset > OFF_MAX - offset))) {
(foffset < 0 ||
(offset > 0 && foffset > OFF_MAX - offset))) {
error = EOVERFLOW;
break;
}
offset += fp->f_offset;
offset += foffset;
break;
case L_XTND:
vn_lock(vp, LK_SHARED | LK_RETRY);
@ -2044,12 +2045,12 @@ sys_lseek(td, uap)
error = EINVAL;
if (error != 0)
goto drop;
fp->f_offset = offset;
VFS_KNOTE_UNLOCKED(vp, 0);
*(off_t *)(td->td_retval) = fp->f_offset;
*(off_t *)(td->td_retval) = offset;
drop:
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
return (error);
}
@ -3982,6 +3983,7 @@ kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
caddr_t dirbuf;
int error, eofflag, readcnt, vfslocked;
long loff;
off_t foffset;
/* XXX arbitrary sanity limit on `count'. */
if (uap->count > 64 * 1024)
@ -3994,10 +3996,12 @@ kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
return (EBADF);
}
vp = fp->f_vnode;
foffset = foffset_lock(fp, 0);
unionread:
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, foffset, 0);
fdrop(fp, td);
return (EINVAL);
}
@ -4010,12 +4014,13 @@ kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
auio.uio_td = td;
auio.uio_resid = uap->count;
vn_lock(vp, LK_SHARED | LK_RETRY);
loff = auio.uio_offset = fp->f_offset;
loff = auio.uio_offset = foffset;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, foffset, FOF_NOUPDATE);
fdrop(fp, td);
return (error);
}
@ -4024,7 +4029,7 @@ kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
if (vp->v_mount->mnt_maxsymlinklen <= 0) {
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
NULL, NULL);
fp->f_offset = auio.uio_offset;
foffset = auio.uio_offset;
} else
# endif
{
@ -4036,7 +4041,7 @@ kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
kiov.iov_base = dirbuf;
error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
NULL, NULL);
fp->f_offset = kuio.uio_offset;
foffset = kuio.uio_offset;
if (error == 0) {
readcnt = uap->count - kuio.uio_resid;
edp = (struct dirent *)&dirbuf[readcnt];
@ -4074,6 +4079,7 @@ kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, foffset, 0);
fdrop(fp, td);
return (error);
}
@ -4085,13 +4091,14 @@ kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
VREF(vp);
fp->f_vnode = vp;
fp->f_data = vp;
fp->f_offset = 0;
foffset = 0;
vput(tvp);
VFS_UNLOCK_GIANT(vfslocked);
goto unionread;
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, foffset, 0);
fdrop(fp, td);
td->td_retval[0] = uap->count - auio.uio_resid;
if (error == 0)
@ -4144,6 +4151,7 @@ kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
int vfslocked;
long loff;
int error, eofflag;
off_t foffset;
AUDIT_ARG_FD(fd);
if (count > IOSIZE_MAX)
@ -4157,6 +4165,7 @@ kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
return (EBADF);
}
vp = fp->f_vnode;
foffset = foffset_lock(fp, 0);
unionread:
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
@ -4173,14 +4182,14 @@ kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
auio.uio_td = td;
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
loff = auio.uio_offset = fp->f_offset;
loff = auio.uio_offset = foffset;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error == 0)
#endif
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
NULL);
fp->f_offset = auio.uio_offset;
foffset = auio.uio_offset;
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
@ -4194,7 +4203,7 @@ kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
VREF(vp);
fp->f_vnode = vp;
fp->f_data = vp;
fp->f_offset = 0;
foffset = 0;
vput(tvp);
VFS_UNLOCK_GIANT(vfslocked);
goto unionread;
@ -4206,6 +4215,7 @@ kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
*residp = auio.uio_resid;
td->td_retval[0] = count - auio.uio_resid;
fail:
foffset_unlock(fp, foffset, 0);
fdrop(fp, td);
return (error);
}

View File

@ -527,13 +527,22 @@ vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
return (error);
}
static void
foffset_lock(struct file *fp, struct uio *uio, int flags)
off_t
foffset_lock(struct file *fp, int flags)
{
struct mtx *mtxp;
off_t res;
if ((flags & FOF_OFFSET) != 0)
return;
KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
#if OFF_MAX <= LONG_MAX
/*
* Caller only wants the current f_offset value. Assume that
* the long and shorter integer types reads are atomic.
*/
if ((flags & FOF_NOLOCK) != 0)
return (fp->f_offset);
#endif
/*
* According to McKusick the vn lock was protecting f_offset here.
@ -541,14 +550,66 @@ foffset_lock(struct file *fp, struct uio *uio, int flags)
*/
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
while (fp->f_vnread_flags & FOFFSET_LOCKED) {
fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
"vnread offlock", 0);
if ((flags & FOF_NOLOCK) == 0) {
while (fp->f_vnread_flags & FOFFSET_LOCKED) {
fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
"vofflock", 0);
}
fp->f_vnread_flags |= FOFFSET_LOCKED;
}
fp->f_vnread_flags |= FOFFSET_LOCKED;
uio->uio_offset = fp->f_offset;
res = fp->f_offset;
mtx_unlock(mtxp);
return (res);
}
void
foffset_unlock(struct file *fp, off_t val, int flags)
{
struct mtx *mtxp;
KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
#if OFF_MAX <= LONG_MAX
if ((flags & FOF_NOLOCK) != 0) {
if ((flags & FOF_NOUPDATE) == 0)
fp->f_offset = val;
if ((flags & FOF_NEXTOFF) != 0)
fp->f_nextoff = val;
return;
}
#endif
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
if ((flags & FOF_NOUPDATE) == 0)
fp->f_offset = val;
if ((flags & FOF_NEXTOFF) != 0)
fp->f_nextoff = val;
if ((flags & FOF_NOLOCK) == 0) {
KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
("Lost FOFFSET_LOCKED"));
if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
wakeup(&fp->f_vnread_flags);
fp->f_vnread_flags = 0;
}
mtx_unlock(mtxp);
}
void
foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
{
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = foffset_lock(fp, flags);
}
void
foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
{
if ((flags & FOF_OFFSET) == 0)
foffset_unlock(fp, uio->uio_offset, flags);
}
static int
@ -570,23 +631,6 @@ get_advice(struct file *fp, struct uio *uio)
return (ret);
}
static void
foffset_unlock(struct file *fp, struct uio *uio, int flags)
{
struct mtx *mtxp;
if ((flags & FOF_OFFSET) != 0)
return;
fp->f_offset = uio->uio_offset;
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
wakeup(&fp->f_vnread_flags);
fp->f_vnread_flags = 0;
mtx_unlock(mtxp);
}
/*
* File table vnode read routine.
*/
@ -865,7 +909,7 @@ vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
else
doio = vn_write;
vp = fp->f_vnode;
foffset_lock(fp, uio, flags);
foffset_lock_uio(fp, uio, flags);
if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
((mp = vp->v_mount) != NULL &&
@ -982,7 +1026,7 @@ vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
vn_rangelock_unlock(vp, rl_cookie);
free(uio_clone, M_IOV);
out_last:
foffset_unlock(fp, uio, flags);
foffset_unlock_uio(fp, uio, flags);
return (error);
}

View File

@ -72,10 +72,25 @@ struct socket;
struct file;
struct ucred;
#define FOF_OFFSET 0x01 /* Use the offset in uio argument */
#define FOF_NOLOCK 0x02 /* Do not take FOFFSET_LOCK */
#define FOF_NEXTOFF 0x04 /* Also update f_nextoff */
#define FOF_NOUPDATE 0x10 /* Do not update f_offset */
off_t foffset_lock(struct file *fp, int flags);
void foffset_lock_uio(struct file *fp, struct uio *uio, int flags);
void foffset_unlock(struct file *fp, off_t val, int flags);
void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags);
static inline off_t
foffset_get(struct file *fp)
{
return (foffset_lock(fp, FOF_NOLOCK));
}
typedef int fo_rdwr_t(struct file *fp, struct uio *uio,
struct ucred *active_cred, int flags,
struct thread *td);
#define FOF_OFFSET 1 /* Use the offset in uio argument */
typedef int fo_truncate_t(struct file *fp, off_t length,
struct ucred *active_cred, struct thread *td);
typedef int fo_ioctl_t(struct file *fp, u_long com, void *data,

View File

@ -2865,10 +2865,9 @@ buffered_write(fp, uio, active_cred, flags, td)
if (ip->i_devvp != devvp)
return (EINVAL);
fs = ip->i_fs;
foffset_lock_uio(fp, uio, flags);
vfslocked = VFS_LOCK_GIANT(ip->i_vnode->v_mount);
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
#ifdef DEBUG
if (fsckcmds) {
printf("%s: buffered write for block %jd\n",
@ -2893,11 +2892,9 @@ buffered_write(fp, uio, active_cred, flags, td)
goto out;
}
error = bwrite(bp);
if ((flags & FOF_OFFSET) == 0)
fp->f_offset = uio->uio_offset;
fp->f_nextoff = uio->uio_offset;
out:
VOP_UNLOCK(devvp, 0);
VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF);
return (error);
}