The change in r236456 (atomic_store_rel not locked) exposed a bug

in the ithread code where we could lose ithread interrupts if
intr_event_schedule_thread() is called while the ithread is already
running.  Effectively memory writes could be ordered incorrectly
such that the unatomic write of 0 to ithd->it_need (in ithread_loop)
could be delayed until after it was set to be triggered again by
intr_event_schedule_thread().

This was particularly a big problem for CAM because CAM optimizes
scheduling of ithreads such that it only signals camisr() when it
queues to an empty queue.  This means that additional completion
events would not unstick CAM and quickly lead to a complete lockup
of the CAM stack.

To fix this use atomics in all places where ithd->it_need is accessed.

Submitted by: delphij, mav
Obtained from: TrueOS, iXsystems
MFC After: 1 week
This commit is contained in:
Alfred Perlstein 2013-07-04 05:53:05 +00:00
parent a38e2a64dc
commit 3eebd44d0c

View File

@ -841,7 +841,7 @@ intr_event_remove_handler(void *cookie)
* again and remove this handler if it has already passed
* it on the list.
*/
ie->ie_thread->it_need = 1;
atomic_store_rel_int(&ie->ie_thread->it_need, 1);
} else
TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
thread_unlock(ie->ie_thread->it_thread);
@ -912,7 +912,7 @@ intr_event_schedule_thread(struct intr_event *ie)
* running. Then, lock the thread and see if we actually need to
* put it on the runqueue.
*/
it->it_need = 1;
atomic_store_rel_int(&it->it_need, 1);
thread_lock(td);
if (TD_AWAITING_INTR(td)) {
CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
@ -990,7 +990,7 @@ intr_event_remove_handler(void *cookie)
* again and remove this handler if it has already passed
* it on the list.
*/
it->it_need = 1;
atomic_store_rel_int(&it->it_need, 1);
} else
TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
thread_unlock(it->it_thread);
@ -1066,7 +1066,7 @@ intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it)
* running. Then, lock the thread and see if we actually need to
* put it on the runqueue.
*/
it->it_need = 1;
atomic_store_rel_int(&it->it_need, 1);
thread_lock(td);
if (TD_AWAITING_INTR(td)) {
CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
@ -1247,7 +1247,7 @@ intr_event_execute_handlers(struct proc *p, struct intr_event *ie)
* interrupt threads always invoke all of their handlers.
*/
if (ie->ie_flags & IE_SOFT) {
if (!ih->ih_need)
if (atomic_load_acq_int(&ih->ih_need) == 0)
continue;
else
atomic_store_rel_int(&ih->ih_need, 0);
@ -1349,7 +1349,7 @@ ithread_loop(void *arg)
* we are running, it will set it_need to note that we
* should make another pass.
*/
while (ithd->it_need) {
while (atomic_load_acq_int(&ithd->it_need) != 0) {
/*
* This might need a full read and write barrier
* to make sure that this write posts before any
@ -1368,7 +1368,8 @@ ithread_loop(void *arg)
* set again, so we have to check it again.
*/
thread_lock(td);
if (!ithd->it_need && !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
!(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
TD_SET_IWAIT(td);
ie->ie_count = 0;
mi_switch(SW_VOL | SWT_IWAIT, NULL);
@ -1529,7 +1530,7 @@ ithread_loop(void *arg)
* we are running, it will set it_need to note that we
* should make another pass.
*/
while (ithd->it_need) {
while (atomic_load_acq_int(&ithd->it_need) != 0) {
/*
* This might need a full read and write barrier
* to make sure that this write posts before any
@ -1551,7 +1552,8 @@ ithread_loop(void *arg)
* set again, so we have to check it again.
*/
thread_lock(td);
if (!ithd->it_need && !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
!(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
TD_SET_IWAIT(td);
ie->ie_count = 0;
mi_switch(SW_VOL | SWT_IWAIT, NULL);