In rare cases, a SATA drive can stop responding to commands and trigger a
reset device task request from the driver.  If the drive fails to respond
with a signature FIS, the driver would previously get into an endless retry
loop, stalling all I/O to the drive and keeping user processes stranded.
Instead, fail the I/O and invalidate the device if the task management
command times out.  This is controllable with the loader tunable
hw.isci.fail_on_task_timeout and the per-device sysctl
dev.isci.0.fail_on_task_timeout.

The default for these is 1.

Reviewed by:	jimharris
Obtained from:	Netflix, Inc.
MFC after:	2 days
This commit is contained in:
Scott Long 2014-06-30 01:01:54 +00:00
parent 58cf99d20d
commit 3da2a91a57
4 changed files with 40 additions and 3 deletions

View File

@ -164,6 +164,7 @@ struct ISCI_CONTROLLER
uint32_t initial_discovery_mask; uint32_t initial_discovery_mask;
BOOL is_frozen; BOOL is_frozen;
BOOL release_queued_ccbs; BOOL release_queued_ccbs;
BOOL fail_on_task_timeout;
uint8_t *remote_device_memory; uint8_t *remote_device_memory;
struct ISCI_MEMORY cached_controller_memory; struct ISCI_MEMORY cached_controller_memory;
struct ISCI_MEMORY uncached_controller_memory; struct ISCI_MEMORY uncached_controller_memory;

View File

@ -300,6 +300,8 @@ SCI_STATUS isci_controller_initialize(struct ISCI_CONTROLLER *controller)
SCI_CONTROLLER_HANDLE_T scic_controller_handle; SCI_CONTROLLER_HANDLE_T scic_controller_handle;
char led_name[64]; char led_name[64];
unsigned long tunable; unsigned long tunable;
uint32_t io_shortage;
uint32_t fail_on_timeout;
int i; int i;
scic_controller_handle = scic_controller_handle =
@ -365,10 +367,12 @@ SCI_STATUS isci_controller_initialize(struct ISCI_CONTROLLER *controller)
* this io_shortage parameter, which will tell CAM that we have a * this io_shortage parameter, which will tell CAM that we have a
* large queue depth than we really do. * large queue depth than we really do.
*/ */
uint32_t io_shortage = 0; io_shortage = 0;
TUNABLE_INT_FETCH("hw.isci.io_shortage", &io_shortage); TUNABLE_INT_FETCH("hw.isci.io_shortage", &io_shortage);
controller->sim_queue_depth += io_shortage; controller->sim_queue_depth += io_shortage;
fail_on_timeout = 1;
TUNABLE_INT_FETCH("hw.isci.fail_on_task_timeout", &fail_on_timeout);
/* Attach to CAM using xpt_bus_register now, then immediately freeze /* Attach to CAM using xpt_bus_register now, then immediately freeze
* the simq. It will get released later when initial domain discovery * the simq. It will get released later when initial domain discovery
* is complete. * is complete.

View File

@ -222,6 +222,24 @@ isci_sysctl_log_frozen_lun_masks(SYSCTL_HANDLER_ARGS)
return (0); return (0);
} }
static int
isci_sysctl_fail_on_task_timeout(SYSCTL_HANDLER_ARGS)
{
struct isci_softc *isci = (struct isci_softc *)arg1;
int32_t fail_on_timeout = 0;
int error, i;
error = sysctl_handle_int(oidp, &fail_on_timeout, 0, req);
if (error || fail_on_timeout == 0)
return (error);
for (i = 0; i < isci->controller_count; i++)
isci->controllers[i].fail_on_task_timeout = fail_on_timeout;
return (0);
}
void isci_sysctl_initialize(struct isci_softc *isci) void isci_sysctl_initialize(struct isci_softc *isci)
{ {
struct sysctl_ctx_list *sysctl_ctx = device_get_sysctl_ctx(isci->device); struct sysctl_ctx_list *sysctl_ctx = device_get_sysctl_ctx(isci->device);
@ -259,5 +277,10 @@ void isci_sysctl_initialize(struct isci_softc *isci)
"log_frozen_lun_masks", CTLTYPE_UINT| CTLFLAG_RW, isci, 0, "log_frozen_lun_masks", CTLTYPE_UINT| CTLFLAG_RW, isci, 0,
isci_sysctl_log_frozen_lun_masks, "IU", isci_sysctl_log_frozen_lun_masks, "IU",
"Log frozen lun masks to kernel log"); "Log frozen lun masks to kernel log");
SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
"fail_on_task_timeout", CTLTYPE_UINT | CTLFLAG_RW, isci, 0,
isci_sysctl_fail_on_task_timeout, "IU",
"Fail a command that has encountered a task management timeout");
} }

View File

@ -206,8 +206,17 @@ isci_task_request_complete(SCI_CONTROLLER_HANDLE_T scif_controller,
break; break;
case SCI_FAILURE_TIMEOUT: case SCI_FAILURE_TIMEOUT:
retry_task = TRUE; if (isci_controller->fail_on_task_timeout) {
isci_log_message(0, "ISCI", "task timeout - retrying\n"); retry_task = FALSE;
isci_log_message(0, "ISCI",
"task timeout - not retrying\n");
scif_cb_domain_device_removed(isci_controller,
isci_remote_device->domain, isci_remote_device);
} else {
retry_task = TRUE;
isci_log_message(0, "ISCI",
"task timeout - retrying\n");
}
break; break;
case SCI_TASK_FAILURE: case SCI_TASK_FAILURE: