First, fix PR 167226:

ThunderBolt cards cannot read sectors >= 2^32, or >= 2^21 with a
6-byte CDB; fixed by the supplied patch.
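
The underlying problem is in the CDB selection in
mfi_tbolt_build_cdb() (see the mfi_tbolt.c hunk below): the 6- and
10-byte CDB paths were chosen without checking the upper 32 bits of
the LBA.  A condensed sketch of the corrected logic, with the
surrounding CDB construction omitted:

	/*
	 * A 6-byte CDB holds a 21-bit LBA and a 10-byte CDB a 32-bit
	 * LBA, so neither may be used when lba_hi (the upper 32 bits
	 * of the 64-bit LBA) is nonzero.
	 */
	if (lba_hi == 0 && (num_lba <= 0xFF) && (lba_lo <= 0x1FFFFF))
		cdb_len = 6;	/* READ(6)/WRITE(6) */
	else if (lba_hi == 0 && (num_lba <= 0xFFFF))
		cdb_len = 10;	/* READ(10)/WRITE(10) */
	else
		cdb_len = 16;	/* READ(16)/WRITE(16): full 64-bit LBA */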

Second, the bigger change: fix RAID operations on ThunderBolt-based
cards, such as physically removing a disk from a RAID volume and
replacing it.  In the current situation the RAID firmware
effectively hangs waiting for an acknowledgement from the driver.
This is due to the firmware's support for the driver directly
accessing the RAID from under the firmware.  That is an interesting
feature, but one the FreeBSD driver does not use.  However, once the
firmware detects that the driver has attached, it expects the driver
to synchronize LD's with it.  If the driver does not sync, the
management part of the firmware hangs waiting for it, so a pulled
drive is still listed as present.

The fix for this problem isn't extremely difficult.  However,
figuring out why some of the code was the way it was, and then
redoing it, was involved.  Not having a spec made it harder to
figure out.  The existing driver would send a
MFI_DCMD_LD_MAP_GET_INFO command in write mode to acknowledge
an LD state change.  In read mode the same command returns the RAID
map from the firmware.  The FreeBSD driver doesn't use the map
currently; that could be added in the future with the appropriate
structures.  To simplify things, get the current LD state and then
build the MFI_DCMD_LD_MAP_GET_INFO/write command so that it sends
an acknowledgement for each LD.  The map would probably state
which LD's changed, so the driver could probably acknowledge only
the LD's that changed instead of all of them; acknowledging all of
them doesn't seem to be a problem.  Once a
MFI_DCMD_LD_MAP_GET_INFO/write command has been sent to the
firmware, it completes later, when a change to the LD's happens, so
it behaves very much like an AEN command returning when something
has happened.  When the MFI_DCMD_LD_MAP_GET_INFO/write command
completes, we refire the syncing of the LD state.  This needs to be
done as an event so that MFI_DCMD_LD_GET_LIST can wait for that
command to complete before the next MFI_DCMD_LD_MAP_GET_INFO/write
is issued.
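
In outline, the resulting cycle, as implemented in
mfi_tbolt_sync_map_info(), mfi_sync_map_complete() and
mfi_handle_map_sync() in the mfi_tbolt.c hunk below:

	/*
	 * mfi_tbolt_sync_map_info():
	 *   MFI_DCMD_LD_GET_LIST (polled)   - fetch the current LD list
	 *   MFI_DCMD_LD_MAP_GET_INFO/write  - acknowledge every LD; the
	 *                                     command stays pending in
	 *                                     the firmware
	 * mfi_sync_map_complete():          - cm_complete call-back; runs
	 *   taskqueue_enqueue(...)            when an LD change (or an
	 *                                     abort) completes the command
	 * mfi_handle_map_sync():            - task context, may sleep
	 *   mfi_tbolt_sync_map_info()       - re-arm the cycle
	 */
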
The prior code didn't use the call-back function; instead it tried
to intercept the MFI_DCMD_LD_MAP_GET_INFO/write command while
processing an interrupt, which added a bunch of complexity to the
interrupt handler.  Using the call-back mechanism that other
commands already use got rid of this need, so the interrupt handler
is greatly simplified.  It seems that even commands that shouldn't
be acknowledged end up in the interrupt handler.  To deal with
this, code was added to check whether a command is actually on the
busy queue.  The old behavior might have contributed to the
interrupt storm seen without MSI enabled on these cards.
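
The busy-queue check as it appears in the new
mfi_tbolt_complete_cmd() (copied from the mfi_tbolt.c hunk below); a
command is only completed if it is actually found on the busy queue:

	/* remove command from busy queue if not polled */
	TAILQ_FOREACH(cmd_mfi_check, &sc->mfi_busy, cm_link) {
		if (cmd_mfi_check == cmd_mfi) {
			mfi_remove_busy(cmd_mfi);
			cmd_mfi->cm_error = 0;
			mfi_complete(sc, cmd_mfi);
			break;
		}
	}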

Note that MFI_DCMD_LD_MAP_GET_INFO/read returns right away.

It would be interesting to see what other complexity could be
removed from the ThunderBolt driver that really isn't needed in our
mode of operation.  Letting the RAID firmware do all of the I/O to
the disks is a lot faster since it can use its caches.  It also
greatly simplifies what the driver has to do, and avoids potential
bugs when the driver and firmware are not in sync.

Simplify the aen_abort/cm_map_abort handling by moving the flags
into the softc instead of the command structure.

This should get merged to 9 before the driver is merged to
8.

PR:		167226
Submitted by:	Petr Lampa
MFC after:	3 days
Doug Ambrisko, 2012-05-04 16:00:39 +00:00
commit ddbffe7f70 (parent bc96dccc69)
5 changed files with 225 additions and 85 deletions

sys/dev/mfi/mfi.c:

@@ -90,8 +90,6 @@ static int mfi_get_controller_info(struct mfi_softc *);
static int mfi_get_log_state(struct mfi_softc *,
struct mfi_evt_log_state **);
static int mfi_parse_entries(struct mfi_softc *, int, int);
static int mfi_dcmd_command(struct mfi_softc *, struct mfi_command **,
uint32_t, void **, size_t);
static void mfi_data_cb(void *, bus_dma_segment_t *, int, int);
static void mfi_startup(void *arg);
static void mfi_intr(void *arg);
@@ -377,6 +375,7 @@ mfi_attach(struct mfi_softc *sc)
TAILQ_INIT(&sc->mfi_syspd_tqh);
TAILQ_INIT(&sc->mfi_evt_queue);
TASK_INIT(&sc->mfi_evt_task, 0, mfi_handle_evt, sc);
TASK_INIT(&sc->mfi_map_sync_task, 0, mfi_handle_map_sync, sc);
TAILQ_INIT(&sc->mfi_aen_pids);
TAILQ_INIT(&sc->mfi_cam_ccbq);
@@ -696,7 +695,6 @@ mfi_attach(struct mfi_softc *sc)
return (EINVAL);
}
sc->mfi_enable_intr(sc);
sc->map_id = 0;
} else {
if ((error = mfi_comms_init(sc)) != 0)
return (error);
@@ -762,6 +760,10 @@ mfi_attach(struct mfi_softc *sc)
callout_reset(&sc->mfi_watchdog_callout, MFI_CMD_TIMEOUT * hz,
mfi_timeout, sc);
if (sc->mfi_flags & MFI_FLAGS_TBOLT) {
mfi_tbolt_sync_map_info(sc);
}
return (0);
}
@@ -845,7 +847,7 @@ mfi_release_command(struct mfi_command *cm)
mfi_enqueue_free(cm);
}
static int
int
mfi_dcmd_command(struct mfi_softc *sc, struct mfi_command **cmp,
uint32_t opcode, void **bufp, size_t bufsize)
{
@@ -1286,8 +1288,8 @@ mfi_shutdown(struct mfi_softc *sc)
if (sc->mfi_aen_cm != NULL)
mfi_abort(sc, sc->mfi_aen_cm);
if (sc->map_update_cmd != NULL)
mfi_abort(sc, sc->map_update_cmd);
if (sc->mfi_map_sync_cm != NULL)
mfi_abort(sc, sc->mfi_map_sync_cm);
dcmd = &cm->cm_frame->dcmd;
dcmd->header.flags = MFI_FRAME_DIR_NONE;
@@ -1664,9 +1666,9 @@ mfi_aen_complete(struct mfi_command *cm)
if (sc->mfi_aen_cm == NULL)
return;
if (sc->mfi_aen_cm->cm_aen_abort ||
if (sc->cm_aen_abort ||
hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
sc->mfi_aen_cm->cm_aen_abort = 0;
sc->cm_aen_abort = 0;
aborted = 1;
} else {
sc->mfi_aen_triggered = 1;
@@ -2385,7 +2387,9 @@ mfi_abort(struct mfi_softc *sc, struct mfi_command *cm_abort)
cm->cm_flags = MFI_CMD_POLLED;
if (sc->mfi_aen_cm)
sc->mfi_aen_cm->cm_aen_abort = 1;
sc->cm_aen_abort = 1;
if (sc->mfi_map_sync_cm)
sc->cm_map_abort = 1;
mfi_mapcmd(sc, cm);
mfi_release_command(cm);
@@ -2394,6 +2398,11 @@ mfi_abort(struct mfi_softc *sc, struct mfi_command *cm_abort)
5 * hz);
i++;
}
while (i < 5 && sc->mfi_map_sync_cm != NULL) {
msleep(&sc->mfi_map_sync_cm, &sc->mfi_io_lock, 0, "mfiabort",
5 * hz);
i++;
}
return (0);
}
@@ -3549,9 +3558,9 @@ mfi_timeout(void *data)
}
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(cm, &sc->mfi_busy, cm_link) {
if (sc->mfi_aen_cm == cm)
if (sc->mfi_aen_cm == cm || sc->mfi_map_sync_cm == cm)
continue;
if ((sc->mfi_aen_cm != cm) && (cm->cm_timestamp < deadline)) {
if (cm->cm_timestamp < deadline) {
if (sc->adpreset != 0 && sc->issuepend_done == 0) {
cm->cm_timestamp = time_uptime;
} else {

sys/dev/mfi/mfi_debug.c:

@@ -172,6 +172,9 @@ mfi_print_dcmd(struct mfi_softc *sc, device_t dev, struct mfi_command *cm)
case MFI_DCMD_CLUSTER_RESET_LD:
opcode = "CLUSTER_RESET_LD";
break;
case MFI_DCMD_LD_MAP_GET_INFO:
opcode = "LD_MAP_GET_INFO";
break;
default:
opcode = "UNKNOWN";
break;

sys/dev/mfi/mfi_tbolt.c:

@@ -82,7 +82,8 @@ map_tbolt_cmd_status(struct mfi_command *mfi_cmd, uint8_t status,
static void mfi_issue_pending_cmds_again (struct mfi_softc *sc);
static void mfi_kill_hba (struct mfi_softc *sc);
static void mfi_process_fw_state_chg_isr(void *arg);
uint8_t mfi_tbolt_get_map_info(struct mfi_softc *sc);
static void mfi_sync_map_complete(struct mfi_command *);
static void mfi_queue_map_sync(struct mfi_softc *sc);
#define MFI_FUSION_ENABLE_INTERRUPT_MASK (0x00000008)
@@ -627,10 +628,11 @@ mfi_tbolt_return_cmd(struct mfi_softc *sc, struct mfi_cmd_tbolt *cmd)
TAILQ_INSERT_TAIL(&sc->mfi_cmd_tbolt_tqh, cmd, next);
}
void mfi_tbolt_complete_cmd(struct mfi_softc *sc)
void
mfi_tbolt_complete_cmd(struct mfi_softc *sc)
{
struct mfi_mpi2_reply_header *desc, *reply_desc;
struct mfi_command *cmd_mfi; /* For MFA Cmds */
struct mfi_command *cmd_mfi, *cmd_mfi_check; /* For MFA Cmds */
struct mfi_cmd_tbolt *cmd_tbolt;
uint16_t smid;
uint8_t reply_descript_type;
@@ -657,7 +659,6 @@ void mfi_tbolt_complete_cmd(struct mfi_softc *sc)
/* Read Reply descriptor */
while ((val.u.low != 0xFFFFFFFF) && (val.u.high != 0xFFFFFFFF)) {
smid = reply_desc->SMID;
if (!smid || smid > sc->mfi_max_fw_cmds + 1) {
device_printf(sc->mfi_dev, "smid is %x. Cannot "
@@ -669,66 +670,20 @@ void mfi_tbolt_complete_cmd(struct mfi_softc *sc)
cmd_mfi = &sc->mfi_commands[cmd_tbolt->sync_cmd_idx];
scsi_io_req = cmd_tbolt->io_request;
/* Check if internal commands */
status = cmd_mfi->cm_frame->dcmd.header.cmd_status;
extStatus = cmd_mfi->cm_frame->dcmd.header.scsi_status;
map_tbolt_cmd_status(cmd_mfi, status, extStatus);
switch (scsi_io_req->Function) {
case MPI2_FUNCTION_LD_IO_REQUEST:
/* Regular Path IO. */
/* Map the Fw Error Status. */
map_tbolt_cmd_status(cmd_mfi, status,
extStatus);
if ((cmd_mfi->cm_frame->dcmd.opcode
== MFI_DCMD_LD_MAP_GET_INFO)
&& (cmd_mfi->cm_frame->dcmd.mbox[1] == 1)) {
if (cmd_mfi->cm_frame->header.cmd_status
!= 0)
device_printf(sc->mfi_dev,
"map sync failed\n");
else {
sc->map_id++;
device_printf(sc->mfi_dev,
"map sync completed\n");
mfi_release_command(cmd_mfi);
}
}
if ((cmd_mfi->cm_flags & MFI_ON_MFIQ_BUSY)
== MFI_ON_MFIQ_BUSY
&& (cmd_mfi->cm_flags & MFI_CMD_POLLED) == 0) {
/* BHARAT poll workaround */
/* remove command from busy queue if not polled */
TAILQ_FOREACH(cmd_mfi_check, &sc->mfi_busy, cm_link) {
if (cmd_mfi_check == cmd_mfi) {
mfi_remove_busy(cmd_mfi);
cmd_mfi->cm_error = 0;
mfi_complete(sc, cmd_mfi);
break;
}
mfi_tbolt_return_cmd(sc, cmd_tbolt);
break;
case MPI2_FUNCTION_PASSTHRU_IO_REQUEST:
map_tbolt_cmd_status(cmd_mfi, status, extStatus);
if ((cmd_mfi->cm_frame->dcmd.opcode
== MFI_DCMD_LD_MAP_GET_INFO)
&& (cmd_mfi->cm_frame->dcmd.mbox[1] == 1)) {
if (cmd_mfi->cm_frame->header.cmd_status != 0)
device_printf(sc->mfi_dev,
"map sync failed\n");
else {
sc->map_id++;
device_printf(sc->mfi_dev,
"map sync completed\n");
mfi_release_command(cmd_mfi);
}
}
if ((cmd_mfi->cm_flags & MFI_ON_MFIQ_BUSY)
== MFI_ON_MFIQ_BUSY
&& (cmd_mfi->cm_flags & MFI_CMD_POLLED) == 0) {
/* BHARAT poll workaround */
mfi_remove_busy(cmd_mfi);
cmd_mfi->cm_error = 0;
mfi_complete(sc, cmd_mfi);
}
mfi_tbolt_return_cmd(sc, cmd_tbolt);
break;
}
cmd_mfi->cm_error = 0;
mfi_complete(sc, cmd_mfi);
mfi_tbolt_return_cmd(sc, cmd_tbolt);
sc->last_reply_idx++;
if (sc->last_reply_idx >= sc->mfi_max_fw_cmds) {
@@ -949,7 +904,7 @@ mfi_tbolt_build_cdb(struct mfi_softc *sc, struct mfi_command *mfi_cmd,
lba_lo = mfi_cmd->cm_frame->io.lba_lo;
lba_hi = mfi_cmd->cm_frame->io.lba_hi;
if ((num_lba <= 0xFF) && (lba_lo <= 0x1FFFFF)) {
if (lba_hi == 0 && (num_lba <= 0xFF) && (lba_lo <= 0x1FFFFF)) {
if (mfi_cmd->cm_frame->header.cmd == MFI_CMD_LD_WRITE)
/* Read 6 or Write 6 */
cdb[0] = (uint8_t) (0x0A);
@@ -962,7 +917,7 @@ mfi_tbolt_build_cdb(struct mfi_softc *sc, struct mfi_command *mfi_cmd,
cdb[1] = (uint8_t) ((lba_lo >> 16) & 0x1F);
cdb_len = 6;
}
else if ((num_lba <= 0xFFFF) && (lba_lo <= 0xFFFFFFFF)) {
else if (lba_hi == 0 && (num_lba <= 0xFFFF) && (lba_lo <= 0xFFFFFFFF)) {
if (mfi_cmd->cm_frame->header.cmd == MFI_CMD_LD_WRITE)
/* Read 10 or Write 10 */
cdb[0] = (uint8_t) (0x2A);
@@ -1152,8 +1107,8 @@ mfi_tbolt_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
cm->cm_timestamp = time_uptime;
mfi_enqueue_busy(cm);
}
else {
hdr->cmd_status = 0xff;
else { /* still get interrupts for it */
hdr->cmd_status = MFI_STAT_INVALID_STATUS;
hdr->flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
}
@@ -1189,16 +1144,16 @@ mfi_tbolt_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
return 0;
/* This is a polled command, so busy-wait for it to complete. */
while (hdr->cmd_status == 0xff) {
while (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
DELAY(1000);
tm -= 1;
if (tm <= 0)
break;
break;
}
if (hdr->cmd_status == 0xff) {
if (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
device_printf(sc->mfi_dev, "Frame %p timed out "
"command 0x%X\n", hdr, cm->cm_frame->dcmd.opcode);
"command 0x%X\n", hdr, cm->cm_frame->dcmd.opcode);
return (ETIMEDOUT);
}
return 0;
@@ -1308,9 +1263,9 @@ static void mfi_process_fw_state_chg_isr(void *arg)
mfi_release_command(sc->mfi_aen_cm);
sc->mfi_aen_cm = NULL;
}
if (sc->map_update_cmd) {
mfi_release_command(sc->map_update_cmd);
sc->map_update_cmd = NULL;
if (sc->mfi_map_sync_cm) {
mfi_release_command(sc->mfi_map_sync_cm);
sc->mfi_map_sync_cm = NULL;
}
mfi_issue_pending_cmds_again(sc);
@@ -1337,3 +1292,171 @@ static void mfi_process_fw_state_chg_isr(void *arg)
}
mtx_unlock(&sc->mfi_io_lock);
}
/*
* The ThunderBolt HW has an option for the driver to directly
* access the underlying disks and operate on the RAID. To
* do this there needs to be a capability to keep the RAID controller
* and driver in sync. The FreeBSD driver does not take advantage
* of this feature since it adds a lot of complexity and slows down
* performance. Performance is gained by using the controller's
* cache etc.
*
* Even though this driver doesn't access the disks directly, an
* AEN-like command is used to inform the RAID firmware to "sync"
* with all LD's via the MFI_DCMD_LD_MAP_GET_INFO command.  This
* command in write mode will return when the RAID firmware has
* detected a change to the RAID state, for example a disk being
* removed.  Once the command returns, the driver needs to
* acknowledge this and "sync" all LD's again.  This repeats until
* we shut down, at which point the pending command needs to be
* cancelled.
*
* If this is not done right, the RAID firmware will not remove a
* pulled drive and the RAID won't go degraded, etc.  Effectively,
* it stops all RAID management functions.
*
* Doing another LD sync requires the use of an event, since the
* driver needs to do a mfi_wait_command() and can't do that in an
* interrupt thread.
*
* The driver could get the RAID state via MFI_DCMD_LD_MAP_GET_INFO
* in read mode, but that requires a bunch of structures and it is
* simpler to just do MFI_DCMD_LD_GET_LIST versus walking the RAID
* map.
*/
void
mfi_tbolt_sync_map_info(struct mfi_softc *sc)
{
int error = 0, i;
struct mfi_command *cmd;
struct mfi_dcmd_frame *dcmd;
uint32_t context = 0;
union mfi_ld_ref *ld_sync;
size_t ld_size;
struct mfi_frame_header *hdr;
struct mfi_command *cm = NULL;
struct mfi_ld_list *list = NULL;
if (sc->mfi_map_sync_cm != NULL || sc->cm_map_abort)
return;
mtx_lock(&sc->mfi_io_lock);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_LD_GET_LIST,
(void **)&list, sizeof(*list));
if (error)
goto out;
cm->cm_flags = MFI_CMD_POLLED | MFI_CMD_DATAIN;
if (mfi_wait_command(sc, cm) != 0) {
device_printf(sc->mfi_dev, "Failed to get device listing\n");
goto out;
}
hdr = &cm->cm_frame->header;
if (hdr->cmd_status != MFI_STAT_OK) {
device_printf(sc->mfi_dev, "MFI_DCMD_LD_GET_LIST failed %x\n",
hdr->cmd_status);
goto out;
}
ld_size = sizeof(*ld_sync) * list->ld_count;
mtx_unlock(&sc->mfi_io_lock);
ld_sync = (union mfi_ld_ref *) malloc(ld_size, M_MFIBUF,
M_WAITOK | M_ZERO);
for (i = 0; i < list->ld_count; i++) {
ld_sync[i].ref = list->ld_list[i].ld.ref;
}
mtx_lock(&sc->mfi_io_lock);
if ((cmd = mfi_dequeue_free(sc)) == NULL)
return;
context = cmd->cm_frame->header.context;
bzero(cmd->cm_frame, sizeof(union mfi_frame));
cmd->cm_frame->header.context = context;
dcmd = &cmd->cm_frame->dcmd;
bzero(dcmd->mbox, MFI_MBOX_SIZE);
dcmd->header.cmd = MFI_CMD_DCMD;
dcmd->header.flags = MFI_FRAME_DIR_WRITE;
dcmd->header.timeout = 0;
dcmd->header.data_len = ld_size;
dcmd->header.scsi_status = 0;
dcmd->opcode = MFI_DCMD_LD_MAP_GET_INFO;
cmd->cm_sg = &dcmd->sgl;
cmd->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cmd->cm_data = ld_sync;
cmd->cm_private = ld_sync;
cmd->cm_len = ld_size;
cmd->cm_complete = mfi_sync_map_complete;
sc->mfi_map_sync_cm = cmd;
cmd->cm_flags = MFI_CMD_DATAOUT;
cmd->cm_frame->dcmd.mbox[0] = list->ld_count;
cmd->cm_frame->dcmd.mbox[1] = MFI_DCMD_MBOX_PEND_FLAG;
if ((error = mfi_mapcmd(sc, cmd)) != 0) {
device_printf(sc->mfi_dev, "failed to send map sync\n");
return;
}
out:
if (list)
free(list, M_MFIBUF);
if (cm)
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return;
}
static void
mfi_sync_map_complete(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
struct mfi_softc *sc;
int aborted = 0;
sc = cm->cm_sc;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
hdr = &cm->cm_frame->header;
if (sc->mfi_map_sync_cm == NULL)
return;
if (sc->cm_map_abort ||
hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
sc->cm_map_abort = 0;
aborted = 1;
}
free(cm->cm_data, M_MFIBUF);
sc->mfi_map_sync_cm = NULL;
wakeup(&sc->mfi_map_sync_cm);
mfi_release_command(cm);
/* set it up again so the driver can catch more events */
if (!aborted) {
mfi_queue_map_sync(sc);
}
}
static void
mfi_queue_map_sync(struct mfi_softc *sc)
{
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
taskqueue_enqueue(taskqueue_swi, &sc->mfi_map_sync_task);
}
void
mfi_handle_map_sync(void *context, int pending)
{
struct mfi_softc *sc;
sc = context;
mfi_tbolt_sync_map_info(sc);
}

sys/dev/mfi/mfireg.h:

@@ -403,6 +403,7 @@ typedef enum {
#define MR_EVT_CTRL_HOST_BUS_SCAN_REQUESTED 0x0152
#define MR_EVT_PD_REMOVED 0x0070
#define MR_EVT_PD_INSERTED 0x005b
#define MR_EVT_LD_CHANGE 0x0051
typedef enum {
MR_LD_CACHE_WRITE_BACK = 0x01,

sys/dev/mfi/mfivar.h:

@@ -105,7 +105,6 @@ struct mfi_command {
#define MFI_ON_MFIQ_READY (1<<6)
#define MFI_ON_MFIQ_BUSY (1<<7)
#define MFI_ON_MFIQ_MASK ((1<<5)|(1<<6)|(1<<7))
int cm_aen_abort;
uint8_t retry_for_fw_reset;
void (* cm_complete)(struct mfi_command *cm);
void *cm_private;
@@ -216,9 +215,13 @@ struct mfi_softc {
TAILQ_HEAD(,mfi_evt_queue_elm) mfi_evt_queue;
struct task mfi_evt_task;
struct task mfi_map_sync_task;
TAILQ_HEAD(,mfi_aen) mfi_aen_pids;
struct mfi_command *mfi_aen_cm;
struct mfi_command *mfi_skinny_cm;
struct mfi_command *mfi_map_sync_cm;
int cm_aen_abort;
int cm_map_abort;
uint32_t mfi_aen_triggered;
uint32_t mfi_poll_waiting;
uint32_t mfi_boot_seq_num;
@@ -303,8 +306,6 @@ struct mfi_softc {
/* ThunderBolt */
uint32_t mfi_tbolt;
uint32_t MFA_enabled;
uint64_t map_id;
struct mfi_command *map_update_cmd;
/* Single Reply structure size */
uint16_t reply_size;
/* Singler message size. */
@@ -417,7 +418,10 @@ extern int mfi_tbolt_alloc_cmd(struct mfi_softc *sc);
extern int mfi_tbolt_send_frame(struct mfi_softc *sc, struct mfi_command *cm);
extern int mfi_tbolt_adp_reset(struct mfi_softc *sc);
extern int mfi_tbolt_reset(struct mfi_softc *sc);
extern int mfi_tbolt_sync_map_info(struct mfi_softc *sc);
extern void mfi_tbolt_sync_map_info(struct mfi_softc *sc);
extern void mfi_handle_map_sync(void *context, int pending);
extern int mfi_dcmd_command(struct mfi_softc *, struct mfi_command **,
uint32_t, void **, size_t);
#define MFIQ_ADD(sc, qname) \
do { \