From 8b70e6ae9c216673b1db0f9605419ac90d6882a0 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 22 Sep 2010 19:03:11 +0000 Subject: [PATCH] Fix possible deadlock where worker process sends an event to the main process while the main process sends control message to the worker process, but worker process hasn't started control thread yet, because it waits for reply from the main process. The fix is to start the control thread before sending any events. Reported and fix suggested by: Mikolaj Golub MFC after: 3 days --- sbin/hastd/primary.c | 16 ++++++++++++---- sbin/hastd/secondary.c | 18 ++++++++++++++---- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c index 0b2402add1a7..5d6896e7e0cc 100644 --- a/sbin/hastd/primary.c +++ b/sbin/hastd/primary.c @@ -807,10 +807,20 @@ hastd_primary(struct hast_resource *res) proto_send(res->hr_event, NULL, 0); init_local(res); - if (real_remote(res) && init_remote(res, NULL, NULL)) - sync_start(); init_ggate(res); init_environment(res); + /* + * Create the control thread before sending any event to the parent, + * as we can deadlock when parent sends control request to worker, + * but worker has no control thread started yet, so parent waits. + * In the meantime worker sends an event to the parent, but parent + * is unable to handle the event, because it waits for control + * request response. + */ + error = pthread_create(&td, NULL, ctrl_thread, res); + assert(error == 0); + if (real_remote(res) && init_remote(res, NULL, NULL)) + sync_start(); error = pthread_create(&td, NULL, ggate_recv_thread, res); assert(error == 0); error = pthread_create(&td, NULL, local_send_thread, res); @@ -823,8 +833,6 @@ hastd_primary(struct hast_resource *res) assert(error == 0); error = pthread_create(&td, NULL, sync_thread, res); assert(error == 0); - error = pthread_create(&td, NULL, ctrl_thread, res); - assert(error == 0); (void)guard_thread(res); } diff --git a/sbin/hastd/secondary.c b/sbin/hastd/secondary.c index 6f56239be1ce..9e1e53760852 100644 --- a/sbin/hastd/secondary.c +++ b/sbin/hastd/secondary.c @@ -393,17 +393,27 @@ hastd_secondary(struct hast_resource *res, struct nv *nvin) pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); init_local(res); - init_remote(res, nvin); init_environment(); + + /* + * Create the control thread before sending any event to the parent, + * as we can deadlock when parent sends control request to worker, + * but worker has no control thread started yet, so parent waits. + * In the meantime worker sends an event to the parent, but parent + * is unable to handle the event, because it waits for control + * request response. + */ + error = pthread_create(&td, NULL, ctrl_thread, res); + assert(error == 0); + + init_remote(res, nvin); event_send(res, EVENT_CONNECT); error = pthread_create(&td, NULL, recv_thread, res); assert(error == 0); error = pthread_create(&td, NULL, disk_thread, res); assert(error == 0); - error = pthread_create(&td, NULL, send_thread, res); - assert(error == 0); - (void)ctrl_thread(res); + (void)send_thread(res); } static void