Fix linux_destroy_dev() behaviour when there are still files open from
the cdev being destroyed.

Currently linux_destroy_dev() waits for the reference count on the
linux cdev to drain, and each open file holds a reference.  In
practice this means that linux_destroy_dev() is blocked until all
userspace processes that have the cdev open close it or exit.  FreeBSD
devfs does not have this problem, because the device refcount only
prevents freeing of the cdev memory, while a separate 'active methods'
counter blocks destroy_dev() until all threads leave the cdevsw
methods.  After that, attempts to enter the cdevsw methods are refused
with an error.

Implement a somewhat similar mechanism for LinuxKPI cdevs.  Demote the
cdev refcount to only mean a hold on the linux cdev memory.  Add a
siref count that tracks both the number of threads inside the cdev
methods and a single-bit indicator that the cdev is being destroyed.
In the latter case, the call is redirected to a dummy cdev.
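
In outline, the gate added by this change behaves as sketched below.
This is a simplified restatement of the linux_get_fop()/linux_drop_fop()
helpers and the drain loop from the diff, with hypothetical names
(gate_enter()/gate_leave()/gate_drain()) and only the siref field shown:

#include <sys/param.h>
#include <sys/systm.h>          /* pause() */
#include <sys/kernel.h>         /* hz */
#include <machine/atomic.h>

#define LDEV_SI_DTR     0x0001  /* destroy requested */
#define LDEV_SI_REF     0x0002  /* one thread inside the fops */

struct siref_gate {
        volatile u_int  siref;
};

/*
 * Entering a file_operations method: take an SI_REF, unless the cdev
 * is being destroyed, in which case the caller falls back to the dummy
 * cdev.  Returns 1 on success, 0 when destruction is in progress.
 */
static int
gate_enter(struct siref_gate *g)
{
        u_int siref;

        for (siref = g->siref;;) {
                if ((siref & LDEV_SI_DTR) != 0)
                        return (0);
                if (atomic_fcmpset_int(&g->siref, &siref,
                    siref + LDEV_SI_REF) != 0)
                        return (1);
        }
}

/* Leaving the method: drop the SI_REF taken above. */
static void
gate_leave(struct siref_gate *g)
{
        atomic_subtract_int(&g->siref, LDEV_SI_REF);
}

/*
 * linux_destroy_dev() side: publish the destroy bit, then wait for
 * in-flight method calls to drain before calling destroy_dev().
 */
static void
gate_drain(struct siref_gate *g)
{
        atomic_set_int(&g->siref, LDEV_SI_DTR);
        while ((atomic_load_int(&g->siref) & ~LDEV_SI_DTR) != 0)
                pause("ldevdtr", hz / 4);
}

The important property is that draining only waits for method calls
already in flight; new calls after the LDEV_SI_DTR bit is set are
diverted to the dummy cdev instead of taking a reference.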

Reviewed by:	markj
Discussed with:	hselasky
Tested by:	zeising
MFC after:	1 week
Sponsored by:	Mellanox Technologies
Differential revision:	https://reviews.freebsd.org/D18606
Konstantin Belousov 2018-12-30 15:46:45 +00:00
parent e5a3393a15
commit f823a36e83
2 changed files with 186 additions and 83 deletions

@ -52,7 +52,8 @@ struct linux_cdev {
struct cdev *cdev;
dev_t dev;
const struct file_operations *ops;
atomic_long_t refs;
u_int refs;
u_int siref;
};
static inline void
@ -61,7 +62,7 @@ cdev_init(struct linux_cdev *cdev, const struct file_operations *ops)
kobject_init(&cdev->kobj, &linux_cdev_static_ktype);
cdev->ops = ops;
atomic_long_set(&cdev->refs, 0);
cdev->refs = 1;
}
static inline struct linux_cdev *
@ -70,8 +71,9 @@ cdev_alloc(void)
struct linux_cdev *cdev;
cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK);
if (cdev)
if (cdev != NULL)
kobject_init(&cdev->kobj, &linux_cdev_ktype);
cdev->refs = 1;
return (cdev);
}

@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/sglist.h>
#include <sys/sleepqueue.h>
#include <sys/refcount.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bus.h>
@ -100,6 +101,7 @@ MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat");
#undef cdev
#define RB_ROOT(head) (head)->rbh_root
static void linux_cdev_deref(struct linux_cdev *ldev);
static struct vm_area_struct *linux_cdev_handle_find(void *handle);
struct kobject linux_class_root;
@ -691,6 +693,52 @@ zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
return (0);
}
static struct file_operations dummy_ldev_ops = {
/* XXXKIB */
};
static struct linux_cdev dummy_ldev = {
.ops = &dummy_ldev_ops,
};
#define LDEV_SI_DTR 0x0001
#define LDEV_SI_REF 0x0002
static void
linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
struct linux_cdev **dev)
{
struct linux_cdev *ldev;
u_int siref;
ldev = filp->f_cdev;
*fop = filp->f_op;
if (ldev != NULL) {
for (siref = ldev->siref;;) {
if ((siref & LDEV_SI_DTR) != 0) {
ldev = &dummy_ldev;
siref = ldev->siref;
*fop = ldev->ops;
MPASS((ldev->siref & LDEV_SI_DTR) == 0);
} else if (atomic_fcmpset_int(&ldev->siref, &siref,
siref + LDEV_SI_REF)) {
break;
}
}
}
*dev = ldev;
}
static void
linux_drop_fop(struct linux_cdev *ldev)
{
if (ldev == NULL)
return;
MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
}
#define OPW(fp,td,code) ({ \
struct file *__fpop; \
__typeof(code) __retval; \
@ -703,10 +751,12 @@ zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
})
static int
linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *file)
linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
struct file *file)
{
struct linux_cdev *ldev;
struct linux_file *filp;
const struct file_operations *fop;
int error;
ldev = dev->si_drv1;
@ -718,20 +768,17 @@ linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *f
filp->f_flags = file->f_flag;
filp->f_vnode = file->f_vnode;
filp->_file = file;
refcount_acquire(&ldev->refs);
filp->f_cdev = ldev;
linux_set_current(td);
linux_get_fop(filp, &fop, &ldev);
/* get a reference on the Linux character device */
if (atomic_long_add_unless(&ldev->refs, 1, -1L) == 0) {
kfree(filp);
return (EINVAL);
}
if (filp->f_op->open) {
error = -filp->f_op->open(file->f_vnode, filp);
if (error) {
atomic_long_dec(&ldev->refs);
if (fop->open != NULL) {
error = -fop->open(file->f_vnode, filp);
if (error != 0) {
linux_drop_fop(ldev);
linux_cdev_deref(filp->f_cdev);
kfree(filp);
return (error);
}
@ -742,6 +789,7 @@ linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *f
/* release the file from devfs */
finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
linux_drop_fop(ldev);
return (ENXIO);
}
@ -877,7 +925,8 @@ linux_get_error(struct task_struct *task, int error)
static int
linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
u_long cmd, caddr_t data, struct thread *td)
const struct file_operations *fop, u_long cmd, caddr_t data,
struct thread *td)
{
struct task_struct *task = current;
unsigned size;
@ -902,20 +951,28 @@ linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
#if defined(__amd64__)
if (td->td_proc->p_elf_machine == EM_386) {
/* try the compat IOCTL handler first */
if (filp->f_op->compat_ioctl != NULL)
error = -OPW(fp, td, filp->f_op->compat_ioctl(filp, cmd, (u_long)data));
else
if (fop->compat_ioctl != NULL) {
error = -OPW(fp, td, fop->compat_ioctl(filp,
cmd, (u_long)data));
} else {
error = ENOTTY;
}
/* fallback to the regular IOCTL handler, if any */
if (error == ENOTTY && filp->f_op->unlocked_ioctl != NULL)
error = -OPW(fp, td, filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data));
if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
error = -OPW(fp, td, fop->unlocked_ioctl(filp,
cmd, (u_long)data));
}
} else
#endif
if (filp->f_op->unlocked_ioctl != NULL)
error = -OPW(fp, td, filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data));
else
error = ENOTTY;
{
if (fop->unlocked_ioctl != NULL) {
error = -OPW(fp, td, fop->unlocked_ioctl(filp,
cmd, (u_long)data));
} else {
error = ENOTTY;
}
}
if (size > 0) {
task->bsd_ioctl_data = NULL;
task->bsd_ioctl_len = 0;
@ -1084,30 +1141,36 @@ static struct filterops linux_dev_kqfiltops_write = {
static void
linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
{
struct thread *td;
const struct file_operations *fop;
struct linux_cdev *ldev;
int temp;
if (filp->f_kqflags & kqflags) {
struct thread *td = curthread;
if ((filp->f_kqflags & kqflags) == 0)
return;
/* get the latest polling state */
temp = OPW(filp->_file, td, filp->f_op->poll(filp, NULL));
td = curthread;
spin_lock(&filp->f_kqlock);
/* clear kqflags */
filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
LINUX_KQ_FLAG_NEED_WRITE);
/* update kqflags */
if (temp & (POLLIN | POLLOUT)) {
if (temp & POLLIN)
filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
if (temp & POLLOUT)
filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;
linux_get_fop(filp, &fop, &ldev);
/* get the latest polling state */
temp = OPW(filp->_file, td, fop->poll(filp, NULL));
linux_drop_fop(ldev);
/* make sure the "knote" gets woken up */
KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
}
spin_unlock(&filp->f_kqlock);
spin_lock(&filp->f_kqlock);
/* clear kqflags */
filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
LINUX_KQ_FLAG_NEED_WRITE);
/* update kqflags */
if ((temp & (POLLIN | POLLOUT)) != 0) {
if ((temp & POLLIN) != 0)
filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
if ((temp & POLLOUT) != 0)
filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;
/* make sure the "knote" gets woken up */
KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
}
spin_unlock(&filp->f_kqlock);
}
static int
@ -1156,9 +1219,9 @@ linux_file_kqfilter(struct file *file, struct knote *kn)
}
static int
linux_file_mmap_single(struct file *fp, vm_ooffset_t *offset,
vm_size_t size, struct vm_object **object, int nprot,
struct thread *td)
linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
int nprot, struct thread *td)
{
struct task_struct *task;
struct vm_area_struct *vmap;
@ -1170,7 +1233,7 @@ linux_file_mmap_single(struct file *fp, vm_ooffset_t *offset,
filp = (struct linux_file *)fp->f_data;
filp->f_flags = fp->f_flag;
if (filp->f_op->mmap == NULL)
if (fop->mmap == NULL)
return (EOPNOTSUPP);
linux_set_current(td);
@ -1200,7 +1263,7 @@ linux_file_mmap_single(struct file *fp, vm_ooffset_t *offset,
if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
error = linux_get_error(task, EINTR);
} else {
error = -OPW(fp, td, filp->f_op->mmap(filp, vmap));
error = -OPW(fp, td, fop->mmap(filp, vmap));
error = linux_get_error(task, error);
up_write(&vmap->vm_mm->mmap_sem);
}
@ -1319,6 +1382,8 @@ linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
ssize_t bytes;
int error;
@ -1331,8 +1396,10 @@ linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
if (uio->uio_resid > DEVFS_IOSIZE_MAX)
return (EINVAL);
linux_set_current(td);
if (filp->f_op->read) {
bytes = OPW(file, td, filp->f_op->read(filp, uio->uio_iov->iov_base,
linux_get_fop(filp, &fop, &ldev);
if (fop->read != NULL) {
bytes = OPW(file, td, fop->read(filp,
uio->uio_iov->iov_base,
uio->uio_iov->iov_len, &uio->uio_offset));
if (bytes >= 0) {
uio->uio_iov->iov_base =
@ -1347,6 +1414,7 @@ linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
/* update kqfilter status, if any */
linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
linux_drop_fop(ldev);
return (error);
}
@ -1356,10 +1424,11 @@ linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
ssize_t bytes;
int error;
error = 0;
filp = (struct linux_file *)file->f_data;
filp->f_flags = file->f_flag;
/* XXX no support for I/O vectors currently */
@ -1368,14 +1437,17 @@ linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
if (uio->uio_resid > DEVFS_IOSIZE_MAX)
return (EINVAL);
linux_set_current(td);
if (filp->f_op->write) {
bytes = OPW(file, td, filp->f_op->write(filp, uio->uio_iov->iov_base,
linux_get_fop(filp, &fop, &ldev);
if (fop->write != NULL) {
bytes = OPW(file, td, fop->write(filp,
uio->uio_iov->iov_base,
uio->uio_iov->iov_len, &uio->uio_offset));
if (bytes >= 0) {
uio->uio_iov->iov_base =
((uint8_t *)uio->uio_iov->iov_base) + bytes;
uio->uio_iov->iov_len -= bytes;
uio->uio_resid -= bytes;
error = 0;
} else {
error = linux_get_error(current, -bytes);
}
@ -1385,6 +1457,8 @@ linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
/* update kqfilter status, if any */
linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);
linux_drop_fop(ldev);
return (error);
}
@ -1393,16 +1467,21 @@ linux_file_poll(struct file *file, int events, struct ucred *active_cred,
struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
int revents;
filp = (struct linux_file *)file->f_data;
filp->f_flags = file->f_flag;
linux_set_current(td);
if (filp->f_op->poll != NULL)
revents = OPW(file, td, filp->f_op->poll(filp, LINUX_POLL_TABLE_NORMAL)) & events;
else
linux_get_fop(filp, &fop, &ldev);
if (fop->poll != NULL) {
revents = OPW(file, td, fop->poll(filp,
LINUX_POLL_TABLE_NORMAL)) & events;
} else {
revents = 0;
}
linux_drop_fop(ldev);
return (revents);
}
@ -1410,23 +1489,28 @@ static int
linux_file_close(struct file *file, struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
int error;
filp = (struct linux_file *)file->f_data;
KASSERT(file_count(filp) == 0, ("File refcount(%d) is not zero", file_count(filp)));
KASSERT(file_count(filp) == 0,
("File refcount(%d) is not zero", file_count(filp)));
error = 0;
filp->f_flags = file->f_flag;
linux_set_current(td);
linux_poll_wait_dequeue(filp);
error = -OPW(file, td, filp->f_op->release(filp->f_vnode, filp));
linux_get_fop(filp, &fop, &ldev);
if (fop->release != NULL)
error = -OPW(file, td, fop->release(filp->f_vnode, filp));
funsetown(&filp->f_sigio);
if (filp->f_vnode != NULL)
vdrop(filp->f_vnode);
if (filp->f_cdev != NULL) {
/* put a reference on the Linux character device */
atomic_long_dec(&filp->f_cdev->refs);
}
linux_drop_fop(ldev);
if (filp->f_cdev != NULL)
linux_cdev_deref(filp->f_cdev);
kfree(filp);
return (error);
@ -1437,27 +1521,30 @@ linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
int error;
error = 0;
filp = (struct linux_file *)fp->f_data;
filp->f_flags = fp->f_flag;
error = 0;
linux_get_fop(filp, &fop, &ldev);
linux_set_current(td);
switch (cmd) {
case FIONBIO:
break;
case FIOASYNC:
if (filp->f_op->fasync == NULL)
if (fop->fasync == NULL)
break;
error = -OPW(fp, td, filp->f_op->fasync(0, filp, fp->f_flag & FASYNC));
error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
break;
case FIOSETOWN:
error = fsetown(*(int *)data, &filp->f_sigio);
if (error == 0) {
if (filp->f_op->fasync == NULL)
if (fop->fasync == NULL)
break;
error = -OPW(fp, td, filp->f_op->fasync(0, filp,
error = -OPW(fp, td, fop->fasync(0, filp,
fp->f_flag & FASYNC));
}
break;
@ -1465,16 +1552,17 @@ linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
*(int *)data = fgetown(&filp->f_sigio);
break;
default:
error = linux_file_ioctl_sub(fp, filp, cmd, data, td);
error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
break;
}
linux_drop_fop(ldev);
return (error);
}
static int
linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
vm_prot_t *maxprotp, int *flagsp, struct file *fp,
vm_ooffset_t *foff, vm_object_t *objp)
vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
{
/*
* Character devices do not provide private mappings
@ -1486,7 +1574,8 @@ linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
if ((*flagsp & (MAP_PRIVATE | MAP_COPY)) != 0)
return (EINVAL);
return (linux_file_mmap_single(fp, foff, objsize, objp, (int)prot, td));
return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
(int)prot, td));
}
static int
@ -1495,6 +1584,8 @@ linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size
struct thread *td)
{
struct linux_file *filp;
const struct file_operations *fop;
struct linux_cdev *ldev;
struct mount *mp;
struct vnode *vp;
vm_object_t object;
@ -1541,15 +1632,18 @@ linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size
}
maxprot &= cap_maxprot;
error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp, &foff,
&object);
linux_get_fop(filp, &fop, &ldev);
error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp,
&foff, fop, &object);
if (error != 0)
return (error);
goto out;
error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
foff, FALSE, td);
if (error != 0)
vm_object_deallocate(object);
out:
linux_drop_fop(ldev);
return (error);
}
@ -1970,6 +2064,14 @@ linux_completion_done(struct completion *c)
return (isdone);
}
static void
linux_cdev_deref(struct linux_cdev *ldev)
{
if (refcount_release(&ldev->refs))
kfree(ldev);
}
static void
linux_cdev_release(struct kobject *kobj)
{
@ -1979,7 +2081,7 @@ linux_cdev_release(struct kobject *kobj)
cdev = container_of(kobj, struct linux_cdev, kobj);
parent = kobj->parent;
linux_destroy_dev(cdev);
kfree(cdev);
linux_cdev_deref(cdev);
kobject_put(parent);
}
@ -1996,20 +2098,19 @@ linux_cdev_static_release(struct kobject *kobj)
}
void
linux_destroy_dev(struct linux_cdev *cdev)
linux_destroy_dev(struct linux_cdev *ldev)
{
if (cdev->cdev == NULL)
if (ldev->cdev == NULL)
return;
atomic_long_dec(&cdev->refs);
MPASS((ldev->siref & LDEV_SI_DTR) == 0);
atomic_set_int(&ldev->siref, LDEV_SI_DTR);
while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
pause("ldevdtr", hz / 4);
/* wait for all open files to be closed */
while (atomic_long_read(&cdev->refs) != -1L)
pause("ldevdrn", hz);
destroy_dev(cdev->cdev);
cdev->cdev = NULL;
destroy_dev(ldev->cdev);
ldev->cdev = NULL;
}
const struct kobj_type linux_cdev_ktype = {