A crash reported on freebsd-fs@ on Sep. 23, 2011 under the subject

heading "kernel panics with RPCSEC_GSS" appears to be caused by a
corrupted tailq list for the client structure. Looking at the code, calls
to the function svc_rpc_gss_forget_client() were done in an SMP unsafe
manner, with the svc_rpc_gss_lock only being acquired in the function
and not before it. As such, when multiple threads called
svc_rpc_gss_forget_client() concurrently, it could try and remove the
same client structure from the tailq lists multiple times.
The patch fixes this by moving the critical code into a separate
function called svc_rpc_gss_forget_client_locked(), which must be
called with the lock held. For the one case where the caller would
have no interest in the lock, svc_rpc_gss_forget_client() was retained,
but a loop was added to check that the client structure is still in
the tailq lists before removing it, to make it safe for multiple
concurrent calls.

Tested by:	clinton.adams at gmail.com (earlier version)
Reviewed by:	zkirsch
MFC after:	3 days
This commit is contained in:
Rick Macklem 2011-10-07 01:15:04 +00:00
parent bf3a36cc7b
commit 5328a32e58

View File

@ -608,6 +608,22 @@ svc_rpc_gss_release_client(struct svc_rpc_gss_client *client)
svc_rpc_gss_destroy_client(client);
}
/*
* Remove a client from our global lists.
* Must be called with svc_rpc_gss_lock held.
*/
static void
svc_rpc_gss_forget_client_locked(struct svc_rpc_gss_client *client)
{
struct svc_rpc_gss_client_list *list;
sx_assert(&svc_rpc_gss_lock, SX_XLOCKED);
list = &svc_rpc_gss_client_hash[client->cl_id.ci_id % CLIENT_HASH_SIZE];
TAILQ_REMOVE(list, client, cl_link);
TAILQ_REMOVE(&svc_rpc_gss_clients, client, cl_alllink);
svc_rpc_gss_client_count--;
}
/*
* Remove a client from our global lists and free it if we can.
*/
@ -615,21 +631,30 @@ static void
svc_rpc_gss_forget_client(struct svc_rpc_gss_client *client)
{
struct svc_rpc_gss_client_list *list;
struct svc_rpc_gss_client *tclient;
list = &svc_rpc_gss_client_hash[client->cl_id.ci_id % CLIENT_HASH_SIZE];
sx_xlock(&svc_rpc_gss_lock);
TAILQ_REMOVE(list, client, cl_link);
TAILQ_REMOVE(&svc_rpc_gss_clients, client, cl_alllink);
svc_rpc_gss_client_count--;
TAILQ_FOREACH(tclient, list, cl_link) {
/*
* Make sure this client has not already been removed
* from the lists by svc_rpc_gss_forget_client() or
* svc_rpc_gss_forget_client_locked() already.
*/
if (client == tclient) {
svc_rpc_gss_forget_client_locked(client);
sx_xunlock(&svc_rpc_gss_lock);
svc_rpc_gss_release_client(client);
return;
}
}
sx_xunlock(&svc_rpc_gss_lock);
svc_rpc_gss_release_client(client);
}
static void
svc_rpc_gss_timeout_clients(void)
{
struct svc_rpc_gss_client *client;
struct svc_rpc_gss_client *nclient;
time_t now = time_uptime;
rpc_gss_log_debug("in svc_rpc_gss_timeout_clients()");
@ -638,16 +663,29 @@ svc_rpc_gss_timeout_clients(void)
* First enforce the max client limit. We keep
* svc_rpc_gss_clients in LRU order.
*/
while (svc_rpc_gss_client_count > CLIENT_MAX)
svc_rpc_gss_forget_client(TAILQ_LAST(&svc_rpc_gss_clients,
svc_rpc_gss_client_list));
TAILQ_FOREACH_SAFE(client, &svc_rpc_gss_clients, cl_alllink, nclient) {
sx_xlock(&svc_rpc_gss_lock);
client = TAILQ_LAST(&svc_rpc_gss_clients, svc_rpc_gss_client_list);
while (svc_rpc_gss_client_count > CLIENT_MAX && client != NULL) {
svc_rpc_gss_forget_client_locked(client);
sx_xunlock(&svc_rpc_gss_lock);
svc_rpc_gss_release_client(client);
sx_xlock(&svc_rpc_gss_lock);
client = TAILQ_LAST(&svc_rpc_gss_clients,
svc_rpc_gss_client_list);
}
again:
TAILQ_FOREACH(client, &svc_rpc_gss_clients, cl_alllink) {
if (client->cl_state == CLIENT_STALE
|| now > client->cl_expiration) {
svc_rpc_gss_forget_client_locked(client);
sx_xunlock(&svc_rpc_gss_lock);
rpc_gss_log_debug("expiring client %p", client);
svc_rpc_gss_forget_client(client);
svc_rpc_gss_release_client(client);
sx_xlock(&svc_rpc_gss_lock);
goto again;
}
}
sx_xunlock(&svc_rpc_gss_lock);
}
#ifdef DEBUG