Fix bugs in recovery path and improve cm tracking

Eliminate the TIMEDOUT state. This state really conveyed two different
concepts: I timed out during recovery (and my command got put on the
recovery queue), and I timed out diring discovery (which doesn't).
Separate those two concepts into two flags. Use the TIMEDOUT flag to
fail requests as timed out. Use the on queue flag to remove them from
the queue.

In mps_intr_locked for MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY message
type, when completing commands, ignore the ones that are not in state
INQUEUE. They were already completed as part of the recovery
process. When we complete them twice, we wind up with entries on the
free queue that are marked as busy, trigging asserts.

Reviewed by: scottl (earlier version, just for mpr)
Differential Revision: https://reviews.freebsd.org/D20785
This commit is contained in:
Warner Losh 2019-07-08 20:20:01 +00:00
parent 6529459a96
commit 8fe7bf064f
8 changed files with 60 additions and 32 deletions

View File

@ -2617,12 +2617,17 @@ mpr_intr_locked(void *data)
} else {
cm = &sc->commands[
le16toh(desc->AddressReply.SMID)];
if (cm->cm_state != MPR_CM_STATE_TIMEDOUT)
cm->cm_state = MPR_CM_STATE_BUSY;
cm->cm_reply = reply;
cm->cm_reply_data =
le32toh(desc->AddressReply.
ReplyFrameAddress);
if (cm->cm_state == MPR_CM_STATE_INQUEUE) {
cm->cm_reply = reply;
cm->cm_reply_data =
le32toh(desc->AddressReply.
ReplyFrameAddress);
} else {
mpr_dprint(sc, MPR_RECOVERY,
"Bad state for ADDRESS_REPLY status,"
" ignoring state %d cm %p\n",
cm->cm_state, cm);
}
}
break;
}

View File

@ -1692,7 +1692,7 @@ mprsas_scsiio_timeout(void *data)
* and been re-used, though this is unlikely.
*/
mpr_intr_locked(sc);
if (cm->cm_state != MPR_CM_STATE_INQUEUE) {
if (cm->cm_flags & MPR_CM_FLAGS_ON_RECOVERY) {
mprsas_log_command(cm, MPR_XINFO,
"SCSI command %p almost timed out\n", cm);
return;
@ -1721,7 +1721,7 @@ mprsas_scsiio_timeout(void *data)
* operational. if not, do a diag reset.
*/
mprsas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT);
cm->cm_state = MPR_CM_STATE_TIMEDOUT;
cm->cm_flags |= MPR_CM_FLAGS_ON_RECOVERY | MPR_CM_FLAGS_TIMEDOUT;
TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery);
if (targ->tm != NULL) {
@ -2529,9 +2529,11 @@ mprsas_scsiio_complete(struct mpr_softc *sc, struct mpr_command *cm)
TAILQ_REMOVE(&cm->cm_targ->commands, cm, cm_link);
ccb->ccb_h.status &= ~(CAM_STATUS_MASK | CAM_SIM_QUEUED);
if (cm->cm_state == MPR_CM_STATE_TIMEDOUT) {
if (cm->cm_flags & MPR_CM_FLAGS_ON_RECOVERY) {
TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery);
cm->cm_state = MPR_CM_STATE_BUSY;
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY,
("Not busy for CM_FLAGS_TIMEDOUT: %d\n", cm->cm_state));
cm->cm_flags &= ~MPR_CM_FLAGS_ON_RECOVERY;
if (cm->cm_reply != NULL)
mprsas_log_command(cm, MPR_RECOVERY,
"completed timedout cm %p ccb %p during recovery "
@ -2817,7 +2819,7 @@ mprsas_scsiio_complete(struct mpr_softc *sc, struct mpr_command *cm)
* retry counter), the only difference is what gets printed
* on the console.
*/
if (cm->cm_state == MPR_CM_STATE_TIMEDOUT)
if (cm->cm_flags & MPR_CM_FLAGS_TIMEDOUT)
mprsas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT);
else
mprsas_set_ccbstatus(ccb, CAM_REQ_ABORTED);

View File

@ -1017,7 +1017,7 @@ mprsas_add_device(struct mpr_softc *sc, u16 handle, u8 linkrate)
cm = &sc->commands[i];
if (cm->cm_flags & MPR_CM_FLAGS_SATA_ID_TIMEOUT) {
targ->timeouts++;
cm->cm_state = MPR_CM_STATE_TIMEDOUT;
cm->cm_flags |= MPR_CM_FLAGS_TIMEDOUT;
if ((targ->tm = mprsas_alloc_tm(sc)) != NULL) {
mpr_dprint(sc, MPR_INFO, "%s: sending Target "
@ -1244,9 +1244,11 @@ mprsas_ata_id_timeout(struct mpr_softc *sc, struct mpr_command *cm)
/*
* The Abort Task cannot be sent from here because the driver has not
* completed setting up targets. Instead, the command is flagged so
* that special handling will be used to send the abort.
* that special handling will be used to send the abort. Now that
* this command has timed out, it's no longer in the queue.
*/
cm->cm_flags |= MPR_CM_FLAGS_SATA_ID_TIMEOUT;
cm->cm_state = MPR_CM_STATE_BUSY;
}
static int

View File

@ -275,11 +275,12 @@ struct mpr_command {
#define MPR_CM_FLAGS_ERROR_MASK MPR_CM_FLAGS_CHAIN_FAILED
#define MPR_CM_FLAGS_USE_CCB (1 << 9)
#define MPR_CM_FLAGS_SATA_ID_TIMEOUT (1 << 10)
#define MPR_CM_FLAGS_ON_RECOVERY (1 << 12)
#define MPR_CM_FLAGS_TIMEDOUT (1 << 13)
u_int cm_state;
#define MPR_CM_STATE_FREE 0
#define MPR_CM_STATE_BUSY 1
#define MPR_CM_STATE_TIMEDOUT 2
#define MPR_CM_STATE_INQUEUE 3
#define MPR_CM_STATE_INQUEUE 2
bus_dmamap_t cm_dmamap;
struct scsi_sense_data *cm_sense;
uint64_t *nvme_error_response;

View File

@ -2479,13 +2479,24 @@ mps_intr_locked(void *data)
(MPI2_EVENT_NOTIFICATION_REPLY *)
reply);
} else {
/*
* Ignore commands not in INQUEUE state
* since they've already been completed
* via another path.
*/
cm = &sc->commands[
le16toh(desc->AddressReply.SMID)];
if (cm->cm_state != MPS_CM_STATE_TIMEDOUT)
if (cm->cm_state == MPS_CM_STATE_INQUEUE) {
cm->cm_state = MPS_CM_STATE_BUSY;
cm->cm_reply = reply;
cm->cm_reply_data = le32toh(
desc->AddressReply.ReplyFrameAddress);
cm->cm_reply = reply;
cm->cm_reply_data = le32toh(
desc->AddressReply.ReplyFrameAddress);
} else {
mps_dprint(sc, MPS_RECOVERY,
"Bad state for ADDRESS_REPLY status,"
" ignoring state %d cm %p\n",
cm->cm_state, cm);
}
}
break;
}

View File

@ -1602,7 +1602,7 @@ mpssas_scsiio_timeout(void *data)
* and been re-used, though this is unlikely.
*/
mps_intr_locked(sc);
if (cm->cm_state != MPS_CM_STATE_INQUEUE) {
if (cm->cm_flags & MPS_CM_FLAGS_ON_RECOVERY) {
mpssas_log_command(cm, MPS_XINFO,
"SCSI command %p almost timed out\n", cm);
return;
@ -1626,7 +1626,7 @@ mpssas_scsiio_timeout(void *data)
* operational. if not, do a diag reset.
*/
mpssas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT);
cm->cm_state = MPS_CM_STATE_TIMEDOUT;
cm->cm_flags |= MPS_CM_FLAGS_ON_RECOVERY | MPS_CM_FLAGS_TIMEDOUT;
TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery);
if (targ->tm != NULL) {
@ -2040,9 +2040,11 @@ mpssas_scsiio_complete(struct mps_softc *sc, struct mps_command *cm)
biotrack(ccb->csio.bio, __func__);
#endif
if (cm->cm_state == MPS_CM_STATE_TIMEDOUT) {
if (cm->cm_flags & MPS_CM_FLAGS_ON_RECOVERY) {
TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery);
cm->cm_state = MPS_CM_STATE_BUSY;
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
("Not busy for CM_FLAGS_TIMEDOUT: %d\n", cm->cm_state));
cm->cm_flags &= ~MPS_CM_FLAGS_ON_RECOVERY;
if (cm->cm_reply != NULL)
mpssas_log_command(cm, MPS_RECOVERY,
"completed timedout cm %p ccb %p during recovery "
@ -2325,7 +2327,7 @@ mpssas_scsiio_complete(struct mps_softc *sc, struct mps_command *cm)
* retry counter), the only difference is what gets printed
* on the console.
*/
if (cm->cm_state == MPS_CM_STATE_TIMEDOUT)
if (cm->cm_flags & MPS_CM_FLAGS_TIMEDOUT)
mpssas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT);
else
mpssas_set_ccbstatus(ccb, CAM_REQ_ABORTED);

View File

@ -790,7 +790,7 @@ mpssas_add_device(struct mps_softc *sc, u16 handle, u8 linkrate){
cm = &sc->commands[i];
if (cm->cm_flags & MPS_CM_FLAGS_SATA_ID_TIMEOUT) {
targ->timeouts++;
cm->cm_state = MPS_CM_STATE_TIMEDOUT;
cm->cm_flags |= MPS_CM_FLAGS_TIMEDOUT;
if ((targ->tm = mpssas_alloc_tm(sc)) != NULL) {
mps_dprint(sc, MPS_INFO, "%s: sending Target "
@ -1017,9 +1017,11 @@ mpssas_ata_id_timeout(struct mps_softc *sc, struct mps_command *cm)
/*
* The Abort Task cannot be sent from here because the driver has not
* completed setting up targets. Instead, the command is flagged so
* that special handling will be used to send the abort.
* that special handling will be used to send the abort. Now that
* this command has timed out, it's no longer in the queue.
*/
cm->cm_flags |= MPS_CM_FLAGS_SATA_ID_TIMEOUT;
cm->cm_state = MPS_CM_STATE_BUSY;
}
static int

View File

@ -242,11 +242,12 @@ struct mps_command {
#define MPS_CM_FLAGS_ERROR_MASK MPS_CM_FLAGS_CHAIN_FAILED
#define MPS_CM_FLAGS_USE_CCB (1 << 10)
#define MPS_CM_FLAGS_SATA_ID_TIMEOUT (1 << 11)
#define MPS_CM_FLAGS_ON_RECOVERY (1 << 12)
#define MPS_CM_FLAGS_TIMEDOUT (1 << 13)
u_int cm_state;
#define MPS_CM_STATE_FREE 0
#define MPS_CM_STATE_BUSY 1
#define MPS_CM_STATE_TIMEDOUT 2
#define MPS_CM_STATE_INQUEUE 3
#define MPS_CM_STATE_INQUEUE 2
bus_dmamap_t cm_dmamap;
struct scsi_sense_data *cm_sense;
TAILQ_HEAD(, mps_chain) cm_chain_list;
@ -545,7 +546,8 @@ mps_free_command(struct mps_softc *sc, struct mps_command *cm)
{
struct mps_chain *chain, *chain_temp;
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, ("state not busy\n"));
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
("state not busy: %d\n", cm->cm_state));
if (cm->cm_reply != NULL)
mps_free_reply(sc, cm->cm_reply_data);
@ -581,7 +583,7 @@ mps_alloc_command(struct mps_softc *sc)
return (NULL);
KASSERT(cm->cm_state == MPS_CM_STATE_FREE,
("mps: Allocating busy command\n"));
("mps: Allocating busy command: %d\n", cm->cm_state));
TAILQ_REMOVE(&sc->req_list, cm, cm_link);
cm->cm_state = MPS_CM_STATE_BUSY;
@ -594,7 +596,8 @@ mps_free_high_priority_command(struct mps_softc *sc, struct mps_command *cm)
{
struct mps_chain *chain, *chain_temp;
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, ("state not busy\n"));
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
("state not busy: %d\n", cm->cm_state));
if (cm->cm_reply != NULL)
mps_free_reply(sc, cm->cm_reply_data);
@ -623,7 +626,7 @@ mps_alloc_high_priority_command(struct mps_softc *sc)
return (NULL);
KASSERT(cm->cm_state == MPS_CM_STATE_FREE,
("mps: Allocating busy command\n"));
("mps: Allocating high priority busy command: %d\n", cm->cm_state));
TAILQ_REMOVE(&sc->high_priority_req_list, cm, cm_link);
cm->cm_state = MPS_CM_STATE_BUSY;