- Implement a lockless file descriptor lookup algorithm in

fget_unlocked().
 - Save old file descriptor tables created on expansion until
   the entire descriptor table is freed so that pointers may be
   followed without regard for expanders.
 - Mark the file zone as NOFREE so we may attempt to reference
   potentially freed files.
 - Convert several fget_locked() users to fget_unlocked().  This
   requires us to manage reference counts explicitly but reduces
   locking overhead in the common case.
This commit is contained in:
Jeff Roberson 2009-05-14 03:24:22 +00:00
parent b40ce02a2f
commit bf422e5f27
6 changed files with 117 additions and 77 deletions

View File

@ -124,12 +124,24 @@ static void fdused(struct filedesc *fdp, int fd);
*/
#define OFILESIZE (sizeof(struct file *) + sizeof(char))
/*
* Storage to hold unused ofiles that need to be reclaimed.
*/
struct freetable {
struct file **ft_table;
SLIST_ENTRY(freetable) ft_next;
};
/*
* Basic allocation of descriptors:
* one of the above, plus arrays for NDFILE descriptors.
*/
struct filedesc0 {
struct filedesc fd_fd;
/*
* ofiles which need to be reclaimed on free.
*/
SLIST_HEAD(,freetable) fd_free;
/*
* These arrays are used when the number of open files is
* <= NDFILE, and are then pointed to by the pointers above.
@ -1268,7 +1280,10 @@ fpathconf(struct thread *td, struct fpathconf_args *uap)
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
struct filedesc0 *fdp0;
struct freetable *fo;
struct file **ntable;
struct file **otable;
char *nfileflags;
int nnfiles, onfiles;
NDSLOTTYPE *nmap;
@ -1287,7 +1302,7 @@ fdgrowtable(struct filedesc *fdp, int nfd)
/* allocate a new table and (if required) new bitmaps */
FILEDESC_XUNLOCK(fdp);
ntable = malloc(nnfiles * OFILESIZE,
ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
M_FILEDESC, M_ZERO | M_WAITOK);
nfileflags = (char *)&ntable[nnfiles];
if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
@ -1311,10 +1326,20 @@ fdgrowtable(struct filedesc *fdp, int nfd)
}
bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
if (onfiles > NDFILE)
free(fdp->fd_ofiles, M_FILEDESC);
fdp->fd_ofiles = ntable;
otable = fdp->fd_ofiles;
fdp->fd_ofileflags = nfileflags;
fdp->fd_ofiles = ntable;
/*
* We must preserve ofiles until the process exits because we can't
* be certain that no threads have references to the old table via
* _fget().
*/
if (onfiles > NDFILE) {
fo = (struct freetable *)&otable[onfiles];
fdp0 = (struct filedesc0 *)fdp;
fo->ft_table = otable;
SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
}
if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
@ -1512,6 +1537,8 @@ fdhold(struct proc *p)
static void
fddrop(struct filedesc *fdp)
{
struct filedesc0 *fdp0;
struct freetable *ft;
int i;
mtx_lock(&fdesc_mtx);
@ -1521,6 +1548,11 @@ fddrop(struct filedesc *fdp)
return;
FILEDESC_LOCK_DESTROY(fdp);
fdp0 = (struct filedesc0 *)fdp;
while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
free(ft->ft_table, M_FILEDESC);
}
free(fdp, M_FILEDESC);
}
@ -2022,6 +2054,38 @@ finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
}
struct file *
fget_unlocked(struct filedesc *fdp, int fd)
{
struct file *fp;
u_int count;
if (fd < 0 || fd >= fdp->fd_nfiles)
return (NULL);
/*
* Fetch the descriptor locklessly. We avoid fdrop() races by
* never raising a refcount above 0. To accomplish this we have
* to use a cmpset loop rather than an atomic_add. The descriptor
* must be re-verified once we acquire a reference to be certain
* that the identity is still correct and we did not lose a race
* due to preemption.
*/
for (;;) {
fp = fdp->fd_ofiles[fd];
if (fp == NULL)
break;
count = fp->f_count;
if (count == 0)
continue;
if (atomic_cmpset_int(&fp->f_count, count, count + 1) != 1)
continue;
if (fp == ((struct file *volatile*)fdp->fd_ofiles)[fd])
break;
fdrop(fp, curthread);
}
return (fp);
}
/*
* Extract the file pointer associated with the specified descriptor for the
@ -2030,16 +2094,12 @@ finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
* If the descriptor doesn't exist or doesn't match 'flags', EBADF is
* returned.
*
* If 'hold' is set (non-zero) the file's refcount will be bumped on return.
* It should be dropped with fdrop(). If it is not set, then the refcount
* will not be bumped however the thread's filedesc struct will be returned
* locked (for fgetsock).
*
* If an error occured the non-zero error is returned and *fpp is set to
* NULL. Otherwise *fpp is set and zero is returned.
* NULL. Otherwise *fpp is held and set and zero is returned. Caller is
* responsible for fdrop().
*/
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
_fget(struct thread *td, int fd, struct file **fpp, int flags)
{
struct filedesc *fdp;
struct file *fp;
@ -2047,29 +2107,22 @@ _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
*fpp = NULL;
if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
return (EBADF);
FILEDESC_SLOCK(fdp);
if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
FILEDESC_SUNLOCK(fdp);
if ((fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
if (fp->f_ops == &badfileops) {
fdrop(fp, td);
return (EBADF);
}
/*
* FREAD and FWRITE failure return EBADF as per POSIX.
*
* Only one flag, or 0, may be specified.
*/
if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
FILEDESC_SUNLOCK(fdp);
if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
(flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
fdrop(fp, td);
return (EBADF);
}
if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
FILEDESC_SUNLOCK(fdp);
return (EBADF);
}
if (hold) {
fhold(fp);
FILEDESC_SUNLOCK(fdp);
}
*fpp = fp;
return (0);
}
@ -2078,21 +2131,21 @@ int
fget(struct thread *td, int fd, struct file **fpp)
{
return(_fget(td, fd, fpp, 0, 1));
return(_fget(td, fd, fpp, 0));
}
int
fget_read(struct thread *td, int fd, struct file **fpp)
{
return(_fget(td, fd, fpp, FREAD, 1));
return(_fget(td, fd, fpp, FREAD));
}
int
fget_write(struct thread *td, int fd, struct file **fpp)
{
return(_fget(td, fd, fpp, FWRITE, 1));
return(_fget(td, fd, fpp, FWRITE));
}
/*
@ -2109,7 +2162,7 @@ _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
int error;
*vpp = NULL;
if ((error = _fget(td, fd, &fp, flags, 0)) != 0)
if ((error = _fget(td, fd, &fp, flags)) != 0)
return (error);
if (fp->f_vnode == NULL) {
error = EINVAL;
@ -2117,7 +2170,8 @@ _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
*vpp = fp->f_vnode;
vref(*vpp);
}
FILEDESC_SUNLOCK(td->td_proc->p_fd);
fdrop(fp, td);
return (error);
}
@ -2164,7 +2218,7 @@ fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
*spp = NULL;
if (fflagp != NULL)
*fflagp = 0;
if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
if ((error = _fget(td, fd, &fp, 0)) != 0)
return (error);
if (fp->f_type != DTYPE_SOCKET) {
error = ENOTSOCK;
@ -2176,7 +2230,8 @@ fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
soref(*spp);
SOCK_UNLOCK(*spp);
}
FILEDESC_SUNLOCK(td->td_proc->p_fd);
fdrop(fp, td);
return (error);
}
@ -3147,7 +3202,7 @@ filelistinit(void *dummy)
{
file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
}

View File

@ -988,7 +988,6 @@ selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
fdp = td->td_proc->p_fd;
stp = td->td_sel;
n = 0;
FILEDESC_SLOCK(fdp);
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
fd = (int)(uintptr_t)sfp->sf_cookie;
si = sfp->sf_si;
@ -996,17 +995,15 @@ selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
/* If the selinfo wasn't cleared the event didn't fire. */
if (si != NULL)
continue;
if ((fp = fget_locked(fdp, fd)) == NULL) {
FILEDESC_SUNLOCK(fdp);
if ((fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
}
idx = fd / NFDBITS;
bit = (fd_mask)1 << (fd % NFDBITS);
ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
fdrop(fp, td);
if (ev != 0)
n += selsetbits(ibits, obits, idx, bit, ev);
}
FILEDESC_SUNLOCK(fdp);
stp->st_flags = 0;
td->td_retval[0] = n;
return (0);
@ -1030,7 +1027,6 @@ selscan(td, ibits, obits, nfd)
fdp = td->td_proc->p_fd;
n = 0;
FILEDESC_SLOCK(fdp);
for (idx = 0, fd = 0; fd < nfd; idx++) {
end = imin(fd + NFDBITS, nfd);
for (bit = 1; fd < end; bit <<= 1, fd++) {
@ -1038,18 +1034,16 @@ selscan(td, ibits, obits, nfd)
flags = selflags(ibits, idx, bit);
if (flags == 0)
continue;
if ((fp = fget_locked(fdp, fd)) == NULL) {
FILEDESC_SUNLOCK(fdp);
if ((fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
}
selfdalloc(td, (void *)(uintptr_t)fd);
ev = fo_poll(fp, flags, td->td_ucred, td);
fdrop(fp, td);
if (ev != 0)
n += selsetbits(ibits, obits, idx, bit, ev);
}
}
FILEDESC_SUNLOCK(fdp);
td->td_retval[0] = n;
return (0);
}

View File

@ -1720,10 +1720,13 @@ ttyhook_register(struct tty **rtp, struct proc *p, int fd,
/* Validate the file descriptor. */
if ((fdp = p->p_fd) == NULL)
return (EBADF);
FILEDESC_SLOCK(fdp);
if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
FILEDESC_SUNLOCK(fdp);
fp = fget_unlocked(fdp, fd);
if (fp == NULL)
return (EBADF);
if (fp->f_ops == &badfileops) {
error = EBADF;
goto done1;
}
/* Make sure the vnode is bound to a character device. */
@ -1763,7 +1766,7 @@ ttyhook_register(struct tty **rtp, struct proc *p, int fd,
done3: tty_unlock(tp);
done2: dev_relthread(dev);
done1: FILEDESC_SUNLOCK(fdp);
done1: fdrop(fp, curthread);
return (error);
}

View File

@ -122,24 +122,17 @@ getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
int error;
fp = NULL;
if (fdp == NULL)
if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) {
error = EBADF;
else {
FILEDESC_SLOCK(fdp);
fp = fget_locked(fdp, fd);
if (fp == NULL)
error = EBADF;
else if (fp->f_type != DTYPE_SOCKET) {
} else if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, curthread);
fp = NULL;
error = ENOTSOCK;
} else {
fhold(fp);
if (fflagp != NULL)
*fflagp = fp->f_flag;
error = 0;
}
FILEDESC_SUNLOCK(fdp);
}
*fpp = fp;
return (error);
}

View File

@ -4235,22 +4235,13 @@ getvnode(fdp, fd, fpp)
int error;
struct file *fp;
error = 0;
fp = NULL;
if (fdp == NULL)
error = EBADF;
else {
FILEDESC_SLOCK(fdp);
if ((u_int)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL)
if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL)
error = EBADF;
else if (fp->f_vnode == NULL) {
fp = NULL;
error = EINVAL;
} else {
fhold(fp);
error = 0;
}
FILEDESC_SUNLOCK(fdp);
fdrop(fp, curthread);
}
*fpp = fp;
return (error);

View File

@ -129,6 +129,10 @@ int getvnode(struct filedesc *fdp, int fd, struct file **fpp);
void mountcheckdirs(struct vnode *olddp, struct vnode *newdp);
void setugidsafety(struct thread *td);
/* Return a referenced file from an unlocked descriptor. */
struct file *fget_unlocked(struct filedesc *fdp, int fd);
/* Requires a FILEDESC_{S,X}LOCK held and returns without a ref. */
static __inline struct file *
fget_locked(struct filedesc *fdp, int fd)
{