freebsd-nq/sys/compat/linux/linux_futex.c
Roman Divacky 4732e446fb Implement robust futexes. Most of the code is modelled after
what Linux does, because robust futexes are mostly a
userspace mechanism which we cannot alter. Two syscalls maintain a
pointer to a userspace list, and when a process exits a routine
walks this list, waking up processes sleeping on futexes
from that list.

Reviewed by:	kib (mentor)
MFC after:	1 month
2008-05-13 20:01:27 +00:00
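For context, a rough sketch of the userspace side these two syscalls serve is shown below. It is illustrative only: the structures are the Linux robust-list ABI from <linux/futex.h> (mirrored by struct linux_robust_list_head used in this file), the helper name register_robust_list is invented, and in practice the registration is done by the threading library, not by applications. An empty list is a single element pointing at itself, which is exactly the termination condition release_futexes() below checks for.

/* Illustrative only: how a Linux process registers its robust list. */
#include <linux/futex.h>	/* struct robust_list_head */
#include <sys/syscall.h>
#include <unistd.h>

static struct robust_list_head head = {
	.list = { .next = &head.list },	/* empty circular list */
	.futex_offset = 0,		/* distance from a list entry to its futex word */
	.pending_list = NULL,		/* entry being locked/unlocked right now */
};

int
register_robust_list(void)
{
	/* The kernel just remembers this pointer; see linux_set_robust_list(). */
	return (syscall(SYS_set_robust_list, &head, sizeof(head)));
}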


/* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
/*-
* Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Emmanuel Dreyfus
* 4. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#if 0
__KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
#endif
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/imgact.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/malloc.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_futex.h>
struct futex;
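/*
 * One waiting_proc is queued on the futex its thread sleeps on;
 * wp_new_futex is set by futex_wake() when the sleeper is requeued to
 * another futex, in which case futex_sleep() goes back to sleep there.
 */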
struct waiting_proc {
struct thread *wp_t;
struct futex *wp_new_futex;
TAILQ_ENTRY(waiting_proc) wp_list;
};
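/*
 * A futex is identified by its userspace address.  It is reference
 * counted and stays on the global futex_list for as long as someone
 * holds a reference obtained from futex_get().
 */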
struct futex {
void *f_uaddr;
int f_refcount;
LIST_ENTRY(futex) f_list;
TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc;
};
LIST_HEAD(futex_list, futex) futex_list;
struct sx futex_sx; /* this protects the LIST of futexes */
#define FUTEX_LOCK sx_xlock(&futex_sx)
#define FUTEX_UNLOCK sx_xunlock(&futex_sx)
#define FUTEX_LOCKED 1
#define FUTEX_UNLOCKED 0
#define FUTEX_SYSTEM_LOCK mtx_lock(&Giant)
#define FUTEX_SYSTEM_UNLOCK mtx_unlock(&Giant)
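/*
 * futex_sx only protects the global list of futexes; FUTEX_SYSTEM_LOCK
 * (Giant) serializes whole futex operations so that, for example, a
 * FUTEX_WAKE cannot slip in between the userspace value check and the
 * sleep in FUTEX_WAIT.
 */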
static struct futex *futex_get(void *, int);
static void futex_put(struct futex *);
static int futex_sleep(struct futex *, struct thread *, unsigned long);
static int futex_wake(struct futex *, int, struct futex *, int);
static int futex_atomic_op(struct thread *td, int encoded_op, caddr_t uaddr);
/* support.s */
int futex_xchgl(int oparg, caddr_t uaddr, int *oldval);
int futex_addl(int oparg, caddr_t uaddr, int *oldval);
int futex_orl(int oparg, caddr_t uaddr, int *oldval);
int futex_andl(int oparg, caddr_t uaddr, int *oldval);
int futex_xorl(int oparg, caddr_t uaddr, int *oldval);
int
linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
{
int val;
int ret;
struct l_timespec timeout = {0, 0};
int error = 0;
struct futex *f;
struct futex *newf;
int timeout_hz;
struct timeval tv = {0, 0};
struct futex *f2;
int op_ret;
#ifdef DEBUG
if (ldebug(sys_futex))
printf(ARGS(futex, "%p, %i, %i, *, %p, %i"), args->uaddr, args->op,
args->val, args->uaddr2, args->val3);
#endif
	/*
	 * Our implementation provides only private futexes.  Most
	 * applications should be using private futexes but don't say so,
	 * therefore we treat all futexes as private by clearing the
	 * FUTEX_PRIVATE_FLAG.  This works in most cases (i.e. when
	 * futexes are not shared via a file descriptor or between
	 * different processes).
	 */
args->op = (args->op & ~LINUX_FUTEX_PRIVATE_FLAG);
switch (args->op) {
case LINUX_FUTEX_WAIT:
FUTEX_SYSTEM_LOCK;
if ((error = copyin(args->uaddr,
&val, sizeof(val))) != 0) {
FUTEX_SYSTEM_UNLOCK;
return error;
}
if (val != args->val) {
FUTEX_SYSTEM_UNLOCK;
return EWOULDBLOCK;
}
if (args->timeout != NULL) {
if ((error = copyin(args->timeout,
&timeout, sizeof(timeout))) != 0) {
FUTEX_SYSTEM_UNLOCK;
return error;
}
}
#ifdef DEBUG
if (ldebug(sys_futex))
printf("FUTEX_WAIT %d: val = %d, uaddr = %p, "
"*uaddr = %d, timeout = %d.%09lu\n",
td->td_proc->p_pid, args->val,
args->uaddr, val, timeout.tv_sec,
(unsigned long)timeout.tv_nsec);
#endif
tv.tv_usec = timeout.tv_sec * 1000000 + timeout.tv_nsec / 1000;
timeout_hz = tvtohz(&tv);
if (timeout.tv_sec == 0 && timeout.tv_nsec == 0)
timeout_hz = 0;
		/*
		 * If the user process requests a non-zero timeout,
		 * make sure we do not turn it into an infinite
		 * timeout because timeout_hz rounds down to zero.
		 *
		 * We use a minimal timeout of 1/hz.  Maybe it would
		 * make sense to just return ETIMEDOUT without sleeping.
		 */
if (((timeout.tv_sec != 0) || (timeout.tv_nsec != 0)) &&
(timeout_hz == 0))
timeout_hz = 1;
f = futex_get(args->uaddr, FUTEX_UNLOCKED);
ret = futex_sleep(f, td, timeout_hz);
futex_put(f);
#ifdef DEBUG
if (ldebug(sys_futex))
printf("FUTEX_WAIT %d: uaddr = %p, "
"ret = %d\n", td->td_proc->p_pid, args->uaddr, ret);
#endif
FUTEX_SYSTEM_UNLOCK;
switch (ret) {
case EWOULDBLOCK: /* timeout */
return ETIMEDOUT;
break;
case EINTR: /* signal */
return EINTR;
break;
case 0: /* FUTEX_WAKE received */
#ifdef DEBUG
if (ldebug(sys_futex))
printf("FUTEX_WAIT %d: uaddr = %p, "
"got FUTEX_WAKE\n",
td->td_proc->p_pid, args->uaddr);
#endif
return 0;
break;
default:
#ifdef DEBUG
if (ldebug(sys_futex))
printf("FUTEX_WAIT: unexpected ret = %d\n",
ret);
#endif
break;
}
/* NOTREACHED */
break;
case LINUX_FUTEX_WAKE:
FUTEX_SYSTEM_LOCK;
/*
* XXX: Linux is able to cope with different addresses
* corresponding to the same mapped memory in the sleeping
* and waker process(es).
*/
#ifdef DEBUG
if (ldebug(sys_futex))
printf("FUTEX_WAKE %d: uaddr = %p, val = %d\n",
td->td_proc->p_pid, args->uaddr, args->val);
#endif
f = futex_get(args->uaddr, FUTEX_UNLOCKED);
td->td_retval[0] = futex_wake(f, args->val, NULL, 0);
futex_put(f);
FUTEX_SYSTEM_UNLOCK;
break;
case LINUX_FUTEX_CMP_REQUEUE:
FUTEX_SYSTEM_LOCK;
if ((error = copyin(args->uaddr,
&val, sizeof(val))) != 0) {
FUTEX_SYSTEM_UNLOCK;
return error;
}
if (val != args->val3) {
FUTEX_SYSTEM_UNLOCK;
return EAGAIN;
}
f = futex_get(args->uaddr, FUTEX_UNLOCKED);
newf = futex_get(args->uaddr2, FUTEX_UNLOCKED);
td->td_retval[0] = futex_wake(f, args->val, newf,
(int)(unsigned long)args->timeout);
futex_put(f);
futex_put(newf);
FUTEX_SYSTEM_UNLOCK;
break;
case LINUX_FUTEX_REQUEUE:
FUTEX_SYSTEM_LOCK;
f = futex_get(args->uaddr, FUTEX_UNLOCKED);
newf = futex_get(args->uaddr2, FUTEX_UNLOCKED);
td->td_retval[0] = futex_wake(f, args->val, newf,
(int)(unsigned long)args->timeout);
futex_put(f);
futex_put(newf);
FUTEX_SYSTEM_UNLOCK;
break;
case LINUX_FUTEX_FD:
#ifdef DEBUG
printf("linux_sys_futex: unimplemented op %d\n",
args->op);
#endif
return (ENOSYS);
case LINUX_FUTEX_WAKE_OP:
FUTEX_SYSTEM_LOCK;
#ifdef DEBUG
if (ldebug(sys_futex))
printf("FUTEX_WAKE_OP: %d: uaddr = %p, op = %d, "
"val = %x, uaddr2 = %p, val3 = %x\n",
td->td_proc->p_pid, args->uaddr, args->op,
args->val, args->uaddr2, args->val3);
#endif
f = futex_get(args->uaddr, FUTEX_UNLOCKED);
f2 = futex_get(args->uaddr2, FUTEX_UNLOCKED);
		/*
		 * This function returns a positive number as its result
		 * and a negative number on error.
		 */
op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
#ifdef DEBUG
if (ldebug(sys_futex))
printf("futex_atomic_op ret %d\n", op_ret);
#endif
if (op_ret < 0) {
			/* XXX: We don't handle the EFAULT case yet. */
if (op_ret != -EFAULT) {
futex_put(f);
futex_put(f2);
FUTEX_SYSTEM_UNLOCK;
return (-op_ret);
}
futex_put(f);
futex_put(f2);
FUTEX_SYSTEM_UNLOCK;
return (EFAULT);
}
ret = futex_wake(f, args->val, NULL, 0);
futex_put(f);
if (op_ret > 0) {
op_ret = 0;
			/*
			 * Linux abuses the timespec parameter slot: for
			 * FUTEX_WAKE_OP it carries the number of waiters
			 * to wake on the second futex.
			 */
op_ret += futex_wake(f2,
(int)(unsigned long)args->timeout, NULL, 0);
ret += op_ret;
}
futex_put(f2);
td->td_retval[0] = ret;
FUTEX_SYSTEM_UNLOCK;
break;
case LINUX_FUTEX_LOCK_PI:
/* not yet implemented */
return (ENOSYS);
case LINUX_FUTEX_UNLOCK_PI:
/* not yet implemented */
return (ENOSYS);
case LINUX_FUTEX_TRYLOCK_PI:
/* not yet implemented */
return (ENOSYS);
default:
printf("linux_sys_futex: unknown op %d\n",
args->op);
return (ENOSYS);
}
return (0);
}
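/*
 * Look up the futex for a given userspace address in the global list,
 * or create it if it does not exist yet, and take a reference on it.
 * "locked" tells whether the caller already holds futex_sx.
 */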
static struct futex *
futex_get(void *uaddr, int locked)
{
struct futex *f;
if (locked == FUTEX_UNLOCKED)
FUTEX_LOCK;
LIST_FOREACH(f, &futex_list, f_list) {
if (f->f_uaddr == uaddr) {
f->f_refcount++;
if (locked == FUTEX_UNLOCKED)
FUTEX_UNLOCK;
return f;
}
}
f = malloc(sizeof(*f), M_LINUX, M_WAITOK);
f->f_uaddr = uaddr;
f->f_refcount = 1;
TAILQ_INIT(&f->f_waiting_proc);
LIST_INSERT_HEAD(&futex_list, f, f_list);
if (locked == FUTEX_UNLOCKED)
FUTEX_UNLOCK;
return f;
}
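/*
 * Drop a reference on a futex; when the last reference goes away the
 * futex is unlinked from the global list and freed.
 */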
static void
futex_put(struct futex *f)
{
FUTEX_LOCK;
f->f_refcount--;
if (f->f_refcount == 0) {
LIST_REMOVE(f, f_list);
free(f, M_LINUX);
}
FUTEX_UNLOCK;
return;
}
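/*
 * Put the current thread to sleep on futex f for at most "timeout"
 * ticks.  If futex_wake() requeued us (wp_new_futex was set), go back
 * to sleep on the new futex and drop its extra reference afterwards.
 */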
static int
futex_sleep(struct futex *f, struct thread *td, unsigned long timeout)
{
struct waiting_proc *wp;
int ret;
wp = malloc(sizeof(*wp), M_LINUX, M_WAITOK);
wp->wp_t = td;
wp->wp_new_futex = NULL;
FUTEX_LOCK;
TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list);
FUTEX_UNLOCK;
#ifdef DEBUG
if (ldebug(sys_futex))
printf("FUTEX --> %d tlseep timeout = %ld\n",
td->td_proc->p_pid, timeout);
#endif
ret = tsleep(wp, PCATCH | PZERO, "linuxfutex", timeout);
#ifdef DEBUG
if (ldebug(sys_futex))
printf("FUTEX -> %d tsleep returns %d\n",
td->td_proc->p_pid, ret);
#endif
FUTEX_LOCK;
TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
FUTEX_UNLOCK;
/* if we got woken up in futex_wake */
if ((ret == 0) && (wp->wp_new_futex != NULL)) {
/* suspend us on the new futex */
ret = futex_sleep(wp->wp_new_futex, td, timeout);
/* and release the old one */
futex_put(wp->wp_new_futex);
}
free(wp, M_LINUX);
return ret;
}
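/*
 * Wake threads sleeping on futex f.  If a requeue target newf is
 * given, waiters beyond the wakeup count are requeued onto newf: they
 * get wp_new_futex set and are woken so that futex_sleep() puts them
 * back to sleep on the new futex.
 */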
static int
futex_wake(struct futex *f, int n, struct futex *newf, int n2)
{
struct waiting_proc *wp;
int count;
	/*
	 * Linux is very strange: it wakes up N threads for
	 * all operations BUT the requeue ones, where it is N+1;
	 * mimic this.
	 */
count = newf ? 0 : 1;
FUTEX_LOCK;
TAILQ_FOREACH(wp, &f->f_waiting_proc, wp_list) {
if (count <= n) {
wakeup_one(wp);
count++;
} else {
if (newf != NULL) {
/* futex_put called after tsleep */
wp->wp_new_futex = futex_get(newf->f_uaddr,
FUTEX_LOCKED);
wakeup_one(wp);
if (count - n >= n2)
break;
}
}
}
FUTEX_UNLOCK;
return count;
}
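/*
 * Decode and execute the operation encoded in val3 of FUTEX_WAKE_OP:
 * bits 28-30 select the arithmetic op (bit 31 means oparg is a shift
 * count), bits 24-27 the comparison, bits 12-23 oparg and bits 0-11
 * cmparg, both sign-extended.  Returns the comparison result (>= 0)
 * or a negative errno on failure.
 */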
static int
futex_atomic_op(struct thread *td, int encoded_op, caddr_t uaddr)
{
int op = (encoded_op >> 28) & 7;
int cmp = (encoded_op >> 24) & 15;
int oparg = (encoded_op << 8) >> 20;
int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
oparg = 1 << oparg;
#ifdef DEBUG
if (ldebug(sys_futex))
printf("futex_atomic_op: op = %d, cmp = %d, oparg = %x, "
"cmparg = %x, uaddr = %p\n",
op, cmp, oparg, cmparg, uaddr);
#endif
/* XXX: linux verifies access here and returns EFAULT */
switch (op) {
case FUTEX_OP_SET:
ret = futex_xchgl(oparg, uaddr, &oldval);
break;
case FUTEX_OP_ADD:
ret = futex_addl(oparg, uaddr, &oldval);
break;
case FUTEX_OP_OR:
ret = futex_orl(oparg, uaddr, &oldval);
break;
case FUTEX_OP_ANDN:
ret = futex_andl(~oparg, uaddr, &oldval);
break;
case FUTEX_OP_XOR:
ret = futex_xorl(oparg, uaddr, &oldval);
break;
default:
ret = -ENOSYS;
break;
}
if (ret)
return (ret);
switch (cmp) {
case FUTEX_OP_CMP_EQ:
return (oldval == cmparg);
case FUTEX_OP_CMP_NE:
return (oldval != cmparg);
case FUTEX_OP_CMP_LT:
return (oldval < cmparg);
case FUTEX_OP_CMP_GE:
return (oldval >= cmparg);
case FUTEX_OP_CMP_LE:
return (oldval <= cmparg);
case FUTEX_OP_CMP_GT:
return (oldval > cmparg);
default:
return (-ENOSYS);
}
}
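/*
 * Record the userspace robust list head for this process; it is
 * walked by release_futexes() when the process exits.
 */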
int
linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
{
struct linux_emuldata *em;
#ifdef DEBUG
if (ldebug(set_robust_list))
printf(ARGS(set_robust_list, ""));
#endif
if (args->len != sizeof(struct linux_robust_list_head))
return (EINVAL);
em = em_find(td->td_proc, EMUL_DOLOCK);
em->robust_futexes = args->head;
EMUL_UNLOCK(&emul_lock);
return (0);
}
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
struct linux_emuldata *em;
struct linux_robust_list_head *head;
l_size_t len = sizeof(struct linux_robust_list_head);
int error = 0;
#ifdef DEBUG
if (ldebug(get_robust_list))
printf(ARGS(get_robust_list, ""));
#endif
if (!args->pid) {
em = em_find(td->td_proc, EMUL_DONTLOCK);
head = em->robust_futexes;
} else {
struct proc *p;
p = pfind(args->pid);
if (p == NULL)
return (ESRCH);
em = em_find(p, EMUL_DONTLOCK);
/* XXX: ptrace? */
if (priv_check(td, PRIV_CRED_SETUID) ||
priv_check(td, PRIV_CRED_SETEUID) ||
p_candebug(td, p))
return (EPERM);
head = em->robust_futexes;
PROC_UNLOCK(p);
}
error = copyout(&len, args->len, sizeof(l_size_t));
if (error)
return (EFAULT);
error = copyout(head, args->head, sizeof(struct linux_robust_list_head));
return (error);
}
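/*
 * A thread that held the robust futex at uaddr has died.  If the
 * futex word still names that thread as owner, mark it
 * FUTEX_OWNER_DIED with a compare-and-swap and wake one waiter so it
 * can recover the lock.
 */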
static int
handle_futex_death(void *uaddr, pid_t pid, int pi)
{
int uval, nval, mval;
struct futex *f;
retry:
if (copyin(uaddr, &uval, 4))
return (EFAULT);
if ((uval & FUTEX_TID_MASK) == pid) {
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
nval = casuword32(uaddr, uval, mval);
if (nval == -1)
return (EFAULT);
if (nval != uval)
goto retry;
if (!pi && (uval & FUTEX_WAITERS)) {
f = futex_get(uaddr, FUTEX_UNLOCKED);
futex_wake(f, 1, NULL, 0);
}
}
return (0);
}
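/*
 * Fetch a robust-list pointer from userspace.  The low bit of the
 * stored value flags a PI futex and is returned separately in *pi;
 * the remaining bits are the entry address.
 */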
static int
fetch_robust_entry(struct linux_robust_list **entry,
struct linux_robust_list **head, int *pi)
{
l_ulong uentry;
if (copyin((const void *)head, &uentry, sizeof(l_ulong)))
return (EFAULT);
*entry = (void *)(uentry & ~1UL);
*pi = uentry & 1;
return (0);
}
/* Walk the list of robust futexes, releasing them. */
void
release_futexes(struct proc *p)
{
struct linux_robust_list_head *head = NULL;
struct linux_robust_list *entry, *next_entry, *pending;
unsigned int limit = 2048, pi, next_pi, pip;
struct linux_emuldata *em;
l_ulong futex_offset;
int rc;
em = em_find(p, EMUL_DONTLOCK);
head = em->robust_futexes;
if (head == NULL)
return;
if (fetch_robust_entry(&entry, &head->list.next, &pi))
return;
if (copyin(&head->futex_offset, &futex_offset, sizeof(l_ulong)))
return;
if (fetch_robust_entry(&pending, &head->pending_list, &pip))
return;
while (entry != &head->list) {
rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
if (entry != pending)
if (handle_futex_death((char *)entry + futex_offset,
p->p_pid, pi))
return;
if (rc)
return;
entry = next_entry;
pi = next_pi;
if (!--limit)
break;
sched_relinquish(curthread);
}
if (pending)
handle_futex_death((char *) pending + futex_offset,
p->p_pid, pip);
}