Fix a possible deadlock where the worker process sends an event to the main process

while the main process sends a control message to the worker process, but the worker
process hasn't started its control thread yet, because it is waiting for a reply from
the main process.

The fix is to start the control thread before sending any events.

Reported and fix suggested by:	Mikolaj Golub <to.my.trociny@gmail.com>
MFC after:	3 days
This commit is contained in:
Pawel Jakub Dawidek 2010-09-22 19:03:11 +00:00
parent 0c24d8e2a1
commit 8b70e6ae9c
2 changed files with 26 additions and 8 deletions

View File

@ -807,10 +807,20 @@ hastd_primary(struct hast_resource *res)
proto_send(res->hr_event, NULL, 0);
init_local(res);
if (real_remote(res) && init_remote(res, NULL, NULL))
sync_start();
init_ggate(res);
init_environment(res);
/*
* Create the control thread before sending any event to the parent,
* as we can deadlock when parent sends control request to worker,
* but worker has no control thread started yet, so parent waits.
* In the meantime worker sends an event to the parent, but parent
* is unable to handle the event, because it waits for control
* request response.
*/
error = pthread_create(&td, NULL, ctrl_thread, res);
assert(error == 0);
if (real_remote(res) && init_remote(res, NULL, NULL))
sync_start();
error = pthread_create(&td, NULL, ggate_recv_thread, res);
assert(error == 0);
error = pthread_create(&td, NULL, local_send_thread, res);
@ -823,8 +833,6 @@ hastd_primary(struct hast_resource *res)
assert(error == 0);
error = pthread_create(&td, NULL, sync_thread, res);
assert(error == 0);
error = pthread_create(&td, NULL, ctrl_thread, res);
assert(error == 0);
(void)guard_thread(res);
}

View File

@ -393,17 +393,27 @@ hastd_secondary(struct hast_resource *res, struct nv *nvin)
pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
init_local(res);
init_remote(res, nvin);
init_environment();
/*
* Create the control thread before sending any event to the parent,
* as we can deadlock when parent sends control request to worker,
* but worker has no control thread started yet, so parent waits.
* In the meantime worker sends an event to the parent, but parent
* is unable to handle the event, because it waits for control
* request response.
*/
error = pthread_create(&td, NULL, ctrl_thread, res);
assert(error == 0);
init_remote(res, nvin);
event_send(res, EVENT_CONNECT);
error = pthread_create(&td, NULL, recv_thread, res);
assert(error == 0);
error = pthread_create(&td, NULL, disk_thread, res);
assert(error == 0);
error = pthread_create(&td, NULL, send_thread, res);
assert(error == 0);
(void)ctrl_thread(res);
(void)send_thread(res);
}
static void