Fix mpr(4) and mps(4) state transitions and a use-after-free panic.

When the mpr(4) and mps(4) drivers probe a SATA device, they issue an
ATA Identify command (via mp{s,r}sas_get_sata_identify()) before the
target is fully setup in the driver.  The drivers wait for completion of
the identify command, and have a 5 second timeout.  If the timeout
fires, the command is marked with the SATA_ID_TIMEOUT flag so it can be
freed later.

That is where the use-after-free problem comes in.  Once the ATA
Identify times out, the driver sends a target reset, and then frees any
identify commands that have timed out.  But, once the target reset
completes, commands that were queued to the drive are returned to the
driver by the controller.

At that point, the driver (in mp{s,r}_intr_locked()) looks up the
command descriptor for that particular SMID, marks it CM_STATE_BUSY and
sends it on for completion handling.

The problem at this stage is that the command has already been freed,
and put on the free queue, so its state is CM_STATE_FREE.  If INVARIANTS
are turned on, we get a panic as soon as this command is allocated,
because its state is no longer CM_STATE_FREE, but rather CM_STATE_BUSY.

So, the solution is to not free ATA Identify commands that get stuck
until they actually return from the controller.  Hopefully this works
correctly on older firmware versions.  If not, it could result in
commands hanging around indefinitely.  But, the alternative is a
use-after-free panic or assertion (in the INVARIANTS case).

This also tightens up the state transitions between CM_STATE_FREE,
CM_STATE_BUSY and CM_STATE_INQUEUE, so that the state transitions happen
once, and we have assertions to make sure that commands are in the
correct state before transitioning to the next state.  Also, for each
state assertion, we print out the current state of the command if it is
incorrect.

mp{s,r}.c:      Add a new sysctl variable, dump_reqs_alltypes,
                that controls the behavior of the dump_reqs sysctl.
                If dump_reqs_alltypes is non-zero, it will dump
                all commands, not just the commands that are in the
                CM_STATE_INQUEUE state.  (You can see the commands
                that are in the queue by using mp{s,r}util debug
                dumpreqs.)

                Make sure that the INQUEUE -> BUSY state transition
                happens in one place, the mp{s,r}_complete_command
                routine.

mp{s,r}_sas.c:  Make sure we print the current command type in
                command state assertions.

mp{s,r}_sas_lsi.c:
                Add a new completion handler,
                mp{s,r}sas_ata_id_complete.  This completion
                handler will free data allocated for an ATA
                Identify command and free the command structure.

                In mp{s,r}_ata_id_timeout, do not set the command
                state to CM_STATE_BUSY.  The command is still in
                queue in the controller.  Since we were blocking
                waiting for this command to complete, there was
                no completion handler previously.  Set the
                completion handler, so that whenever the command
                does come back, it will get freed properly.

                Do not free ATA Identify commands that have timed
                out in mp{s,r}sas_add_device().  Wait for them
                to actually come back from the controller.

mp{s,r}var.h:   Add a dump_reqs_alltypes variable for the new
                dump_reqs_alltypes sysctl.

                Make sure we print the current state for state
                transition asserts.

This was tested in the Spectra Logic test bed (as described in the
review), as well Netflix's Open Connect fleet (where panics dropped from
a dozen or two a month to zero).

Reviewed by:		imp@ (who is handling the commit with ken's OK)
Sponsored by:		Spectra Logic
Differential Revision:	https://reviews.freebsd.org/D25476
This commit is contained in:
Kenneth D. Merry 2021-06-03 13:46:11 -06:00 committed by Warner Losh
parent cc384c67ce
commit 175ad3d003
8 changed files with 93 additions and 54 deletions

View File

@ -1141,7 +1141,8 @@ mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm)
if (++sc->io_cmds_active > sc->io_cmds_highwater)
sc->io_cmds_highwater++;
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY, ("command not busy\n"));
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY,
("command not busy, state = %u\n", cm->cm_state));
cm->cm_state = MPR_CM_STATE_INQUEUE;
if (sc->atomic_desc_capable) {
@ -1917,6 +1918,11 @@ mpr_setup_sysctl(struct mpr_softc *sc)
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT,
sc, 0, mpr_dump_reqs, "I", "Dump Active Requests");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "dump_reqs_alltypes", CTLFLAG_RW,
&sc->dump_reqs_alltypes, 0,
"dump all request types not just inqueue");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "use_phy_num", CTLFLAG_RD, &sc->use_phynum, 0,
"Use the phy number for enumeration");
@ -2101,7 +2107,7 @@ mpr_dump_reqs(SYSCTL_HANDLER_ARGS)
/* Best effort, no locking */
for (i = smid; i < numreqs; i++) {
cm = &sc->commands[i];
if (cm->cm_state != state)
if ((sc->dump_reqs_alltypes == 0) && (cm->cm_state != state))
continue;
hdr.smid = i;
hdr.state = cm->cm_state;
@ -2365,6 +2371,8 @@ mpr_complete_command(struct mpr_softc *sc, struct mpr_command *cm)
return;
}
KASSERT(cm->cm_state == MPR_CM_STATE_INQUEUE,
("command not inqueue, state = %u\n", cm->cm_state));
cm->cm_state = MPR_CM_STATE_BUSY;
if (cm->cm_flags & MPR_CM_FLAGS_POLLED)
cm->cm_flags |= MPR_CM_FLAGS_COMPLETE;
@ -2544,9 +2552,6 @@ mpr_intr_locked(void *data)
case MPI25_RPY_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO_SUCCESS:
case MPI26_RPY_DESCRIPT_FLAGS_PCIE_ENCAPSULATED_SUCCESS:
cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)];
KASSERT(cm->cm_state == MPR_CM_STATE_INQUEUE,
("command not inqueue\n"));
cm->cm_state = MPR_CM_STATE_BUSY;
cm->cm_reply = NULL;
break;
case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY:

View File

@ -1194,7 +1194,7 @@ mprsas_tm_timeout(void *data)
"out\n", tm);
KASSERT(tm->cm_state == MPR_CM_STATE_INQUEUE,
("command not inqueue\n"));
("command not inqueue, state = %u\n", tm->cm_state));
tm->cm_state = MPR_CM_STATE_BUSY;
mpr_reinit(sc);
@ -2437,7 +2437,7 @@ mprsas_scsiio_complete(struct mpr_softc *sc, struct mpr_command *cm)
if (cm->cm_flags & MPR_CM_FLAGS_ON_RECOVERY) {
TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery);
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY,
("Not busy for CM_FLAGS_TIMEDOUT: %d\n", cm->cm_state));
("Not busy for CM_FLAGS_TIMEDOUT: %u\n", cm->cm_state));
cm->cm_flags &= ~MPR_CM_FLAGS_ON_RECOVERY;
if (cm->cm_reply != NULL)
mprsas_log_command(cm, MPR_RECOVERY,

View File

@ -125,6 +125,7 @@ static int mprsas_add_pcie_device(struct mpr_softc *sc, u16 handle,
static int mprsas_get_sata_identify(struct mpr_softc *sc, u16 handle,
Mpi2SataPassthroughReply_t *mpi_reply, char *id_buffer, int sz,
u32 devinfo);
static void mprsas_ata_id_complete(struct mpr_softc *, struct mpr_command *);
static void mprsas_ata_id_timeout(struct mpr_softc *, struct mpr_command *);
int mprsas_get_sas_address_for_sata_disk(struct mpr_softc *sc,
u64 *sas_address, u16 handle, u32 device_info, u8 *is_SATA_SSD);
@ -1005,7 +1006,8 @@ mprsas_add_device(struct mpr_softc *sc, u16 handle, u8 linkrate)
* An Abort Task TM should be used instead of a Target Reset, but that
* would be much more difficult because targets have not been fully
* discovered yet, and LUN's haven't been setup. So, just reset the
* target instead of the LUN.
* target instead of the LUN. The commands should complete once
* the target has been reset.
*/
for (i = 1; i < sc->num_reqs; i++) {
cm = &sc->commands[i];
@ -1033,16 +1035,6 @@ mprsas_add_device(struct mpr_softc *sc, u16 handle, u8 linkrate)
}
}
out:
/*
* Free the commands that may not have been freed from the SATA ID call
*/
for (i = 1; i < sc->num_reqs; i++) {
cm = &sc->commands[i];
if (cm->cm_flags & MPR_CM_FLAGS_SATA_ID_TIMEOUT) {
free(cm->cm_data, M_MPR);
mpr_free_command(sc, cm);
}
}
mprsas_startup_decrement(sassc);
return (error);
}
@ -1218,8 +1210,8 @@ mprsas_get_sata_identify(struct mpr_softc *sc, u16 handle,
out:
/*
* If the SATA_ID_TIMEOUT flag has been set for this command, don't free
* it. The command and buffer will be freed after sending an Abort
* Task TM.
* it. The command and buffer will be freed after we send a Target
* Reset TM and the command comes back from the controller.
*/
if ((cm->cm_flags & MPR_CM_FLAGS_SATA_ID_TIMEOUT) == 0) {
mpr_free_command(sc, cm);
@ -1228,6 +1220,22 @@ out:
return (error);
}
/*
* This is completion handler to make sure that commands and allocated
* buffers get freed when timed out SATA ID commands finally complete after
* we've reset the target. In the normal case, we wait for the command to
* complete.
*/
static void
mprsas_ata_id_complete(struct mpr_softc *sc, struct mpr_command *cm)
{
mpr_dprint(sc, MPR_INFO, "%s ATA ID completed late cm %p sc %p\n",
__func__, cm, sc);
free(cm->cm_data, M_MPR);
mpr_free_command(sc, cm);
}
static void
mprsas_ata_id_timeout(struct mpr_softc *sc, struct mpr_command *cm)
{
@ -1242,7 +1250,12 @@ mprsas_ata_id_timeout(struct mpr_softc *sc, struct mpr_command *cm)
* this command has timed out, it's no longer in the queue.
*/
cm->cm_flags |= MPR_CM_FLAGS_SATA_ID_TIMEOUT;
cm->cm_state = MPR_CM_STATE_BUSY;
/*
* Since we will no longer be waiting for the command to complete,
* set a completion handler to make sure we free all resources.
*/
cm->cm_complete = mprsas_ata_id_complete;
}
static int

View File

@ -367,6 +367,7 @@ struct mpr_softc {
u_int enable_ssu;
int spinup_wait_time;
int use_phynum;
int dump_reqs_alltypes;
uint64_t chain_alloc_fail;
uint64_t prp_page_alloc_fail;
struct sysctl_ctx_list sysctl_ctx;
@ -617,7 +618,8 @@ mpr_free_command(struct mpr_softc *sc, struct mpr_command *cm)
struct mpr_chain *chain, *chain_temp;
struct mpr_prp_page *prp_page, *prp_page_temp;
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY, ("state not busy\n"));
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY,
("state not busy, state = %u\n", cm->cm_state));
if (cm->cm_reply != NULL)
mpr_free_reply(sc, cm->cm_reply_data);
@ -658,7 +660,7 @@ mpr_alloc_command(struct mpr_softc *sc)
return (NULL);
KASSERT(cm->cm_state == MPR_CM_STATE_FREE,
("mpr: Allocating busy command\n"));
("mpr: Allocating busy command, state = %u\n", cm->cm_state));
TAILQ_REMOVE(&sc->req_list, cm, cm_link);
cm->cm_state = MPR_CM_STATE_BUSY;
@ -671,7 +673,8 @@ mpr_free_high_priority_command(struct mpr_softc *sc, struct mpr_command *cm)
{
struct mpr_chain *chain, *chain_temp;
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY, ("state not busy\n"));
KASSERT(cm->cm_state == MPR_CM_STATE_BUSY,
("state not busy, state = %u\n", cm->cm_state));
if (cm->cm_reply != NULL)
mpr_free_reply(sc, cm->cm_reply_data);
@ -700,7 +703,7 @@ mpr_alloc_high_priority_command(struct mpr_softc *sc)
return (NULL);
KASSERT(cm->cm_state == MPR_CM_STATE_FREE,
("mpr: Allocating busy command\n"));
("mpr: Allocating busy command, state = %u\n", cm->cm_state));
TAILQ_REMOVE(&sc->high_priority_req_list, cm, cm_link);
cm->cm_state = MPR_CM_STATE_BUSY;

View File

@ -1112,7 +1112,8 @@ mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm)
rd.u.high = cm->cm_desc.Words.High;
rd.word = htole64(rd.word);
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, ("command not busy\n"));
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
("command not busy, state = %u\n", cm->cm_state));
cm->cm_state = MPS_CM_STATE_INQUEUE;
/* TODO-We may need to make below regwrite atomic */
@ -1774,6 +1775,11 @@ mps_setup_sysctl(struct mps_softc *sc)
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT,
sc, 0, mps_dump_reqs, "I", "Dump Active Requests");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "dump_reqs_alltypes", CTLFLAG_RW,
&sc->dump_reqs_alltypes, 0,
"dump all request types not just inqueue");
SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
OID_AUTO, "use_phy_num", CTLFLAG_RD, &sc->use_phynum, 0,
"Use the phy number for enumeration");
@ -1947,7 +1953,7 @@ mps_dump_reqs(SYSCTL_HANDLER_ARGS)
/* Best effort, no locking */
for (i = smid; i < numreqs; i++) {
cm = &sc->commands[i];
if (cm->cm_state != state)
if ((sc->dump_reqs_alltypes == 0) && (cm->cm_state != state))
continue;
hdr.smid = i;
hdr.state = cm->cm_state;
@ -2206,6 +2212,9 @@ mps_complete_command(struct mps_softc *sc, struct mps_command *cm)
return;
}
KASSERT(cm->cm_state == MPS_CM_STATE_INQUEUE,
("command not inqueue, state = %u\n", cm->cm_state));
cm->cm_state = MPS_CM_STATE_BUSY;
if (cm->cm_flags & MPS_CM_FLAGS_POLLED)
cm->cm_flags |= MPS_CM_FLAGS_COMPLETE;
@ -2382,9 +2391,6 @@ mps_intr_locked(void *data)
switch (flags) {
case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS:
cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)];
KASSERT(cm->cm_state == MPS_CM_STATE_INQUEUE,
("command not inqueue\n"));
cm->cm_state = MPS_CM_STATE_BUSY;
cm->cm_reply = NULL;
break;
case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY:
@ -2460,7 +2466,6 @@ mps_intr_locked(void *data)
cm = &sc->commands[
le16toh(desc->AddressReply.SMID)];
if (cm->cm_state == MPS_CM_STATE_INQUEUE) {
cm->cm_state = MPS_CM_STATE_BUSY;
cm->cm_reply = reply;
cm->cm_reply_data = le32toh(
desc->AddressReply.ReplyFrameAddress);

View File

@ -1170,7 +1170,7 @@ mpssas_tm_timeout(void *data)
"task mgmt %p timed out\n", tm);
KASSERT(tm->cm_state == MPS_CM_STATE_INQUEUE,
("command not inqueue\n"));
("command not inqueue, state = %u\n", tm->cm_state));
tm->cm_state = MPS_CM_STATE_BUSY;
mps_reinit(sc);
@ -2015,7 +2015,7 @@ mpssas_scsiio_complete(struct mps_softc *sc, struct mps_command *cm)
if (cm->cm_flags & MPS_CM_FLAGS_ON_RECOVERY) {
TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery);
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
("Not busy for CM_FLAGS_TIMEDOUT: %d\n", cm->cm_state));
("Not busy for CM_FLAGS_TIMEDOUT: %u\n", cm->cm_state));
cm->cm_flags &= ~MPS_CM_FLAGS_ON_RECOVERY;
if (cm->cm_reply != NULL)
mpssas_log_command(cm, MPS_RECOVERY,

View File

@ -123,6 +123,7 @@ static int mpssas_add_device(struct mps_softc *sc, u16 handle, u8 linkrate);
static int mpssas_get_sata_identify(struct mps_softc *sc, u16 handle,
Mpi2SataPassthroughReply_t *mpi_reply, char *id_buffer, int sz,
u32 devinfo);
static void mpssas_ata_id_complete(struct mps_softc *, struct mps_command *);
static void mpssas_ata_id_timeout(struct mps_softc *, struct mps_command *);
int mpssas_get_sas_address_for_sata_disk(struct mps_softc *sc,
u64 *sas_address, u16 handle, u32 device_info, u8 *is_SATA_SSD);
@ -780,7 +781,8 @@ mpssas_add_device(struct mps_softc *sc, u16 handle, u8 linkrate){
* An Abort Task TM should be used instead of a Target Reset, but that
* would be much more difficult because targets have not been fully
* discovered yet, and LUN's haven't been setup. So, just reset the
* target instead of the LUN.
* target instead of the LUN. The commands should complete once the
* target has been reset.
*/
for (i = 1; i < sc->num_reqs; i++) {
cm = &sc->commands[i];
@ -808,16 +810,6 @@ mpssas_add_device(struct mps_softc *sc, u16 handle, u8 linkrate){
}
}
out:
/*
* Free the commands that may not have been freed from the SATA ID call
*/
for (i = 1; i < sc->num_reqs; i++) {
cm = &sc->commands[i];
if (cm->cm_flags & MPS_CM_FLAGS_SATA_ID_TIMEOUT) {
free(cm->cm_data, M_MPT2);
mps_free_command(sc, cm);
}
}
mpssas_startup_decrement(sassc);
return (error);
}
@ -993,8 +985,8 @@ mpssas_get_sata_identify(struct mps_softc *sc, u16 handle,
out:
/*
* If the SATA_ID_TIMEOUT flag has been set for this command, don't free
* it. The command and buffer will be freed after sending an Abort
* Task TM.
* it. The command and buffer will be freed after we send a Target
* Reset TM and the command comes back from the controller.
*/
if ((cm->cm_flags & MPS_CM_FLAGS_SATA_ID_TIMEOUT) == 0) {
mps_free_command(sc, cm);
@ -1003,21 +995,41 @@ out:
return (error);
}
/*
* This is completion handler to make sure that commands and allocated
* buffers get freed when timed out SATA ID commands finally complete after
* we've reset the target. In the normal case, we wait for the command to
* complete.
*/
static void
mpssas_ata_id_complete(struct mps_softc *sc, struct mps_command *cm)
{
mps_dprint(sc, MPS_INFO, "%s ATA ID completed late cm %p sc %p\n",
__func__, cm, sc);
free(cm->cm_data, M_MPT2);
mps_free_command(sc, cm);
}
static void
mpssas_ata_id_timeout(struct mps_softc *sc, struct mps_command *cm)
{
mps_dprint(sc, MPS_INFO, "%s ATA ID command timeout cm %p sc %p\n",
__func__, cm, sc);
/*
* The Abort Task cannot be sent from here because the driver has not
* completed setting up targets. Instead, the command is flagged so
* that special handling will be used to send the abort. Now that
* this command has timed out, it's no longer in the queue.
* that special handling will be used to send a target reset.
*/
cm->cm_flags |= MPS_CM_FLAGS_SATA_ID_TIMEOUT;
cm->cm_state = MPS_CM_STATE_BUSY;
/*
* Since we will no longer be waiting for the command to complete,
* set a completion handler to make sure we free all resources.
*/
cm->cm_complete = mpssas_ata_id_complete;
}
static int

View File

@ -326,6 +326,7 @@ struct mps_softc {
u_int enable_ssu;
int spinup_wait_time;
int use_phynum;
int dump_reqs_alltypes;
uint64_t chain_alloc_fail;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
@ -548,7 +549,7 @@ mps_free_command(struct mps_softc *sc, struct mps_command *cm)
struct mps_chain *chain, *chain_temp;
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
("state not busy: %d\n", cm->cm_state));
("state not busy: %u\n", cm->cm_state));
if (cm->cm_reply != NULL)
mps_free_reply(sc, cm->cm_reply_data);
@ -584,7 +585,7 @@ mps_alloc_command(struct mps_softc *sc)
return (NULL);
KASSERT(cm->cm_state == MPS_CM_STATE_FREE,
("mps: Allocating busy command: %d\n", cm->cm_state));
("mps: Allocating busy command: %u\n", cm->cm_state));
TAILQ_REMOVE(&sc->req_list, cm, cm_link);
cm->cm_state = MPS_CM_STATE_BUSY;
@ -598,7 +599,7 @@ mps_free_high_priority_command(struct mps_softc *sc, struct mps_command *cm)
struct mps_chain *chain, *chain_temp;
KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
("state not busy: %d\n", cm->cm_state));
("state not busy: %u\n", cm->cm_state));
if (cm->cm_reply != NULL)
mps_free_reply(sc, cm->cm_reply_data);
@ -627,7 +628,7 @@ mps_alloc_high_priority_command(struct mps_softc *sc)
return (NULL);
KASSERT(cm->cm_state == MPS_CM_STATE_FREE,
("mps: Allocating high priority busy command: %d\n", cm->cm_state));
("mps: Allocating high priority busy command: %u\n", cm->cm_state));
TAILQ_REMOVE(&sc->high_priority_req_list, cm, cm_link);
cm->cm_state = MPS_CM_STATE_BUSY;