Hyper-V: storvsc: Enhance srb_status code handling.

In hv_storvsc_io_request() when coring, prevent changing of the send channel from the base channel to another one. storvsc_poll always probes on the base channel. Based upon conversations with Microsoft, changed the handling of srb_status codes. Most we should never get, others yes. All are treated as retry-able except for two. We should not get these statuses, but if we ever do, the I/O state is not known. Submitted by: Alexander Sideropoulos <Alexander.Sideropoulos@netapp.com> Reviewed by: trasz, allanjude, whu MFC after: 1 week Sponsored by: Netapp Inc Differential Revision: https://reviews.freebsd.org/D25756
svn path=/head/; revision=364984
2020-08-31 09:05:45 +00:00 · 2020-08-31 09:05:45 +00:00 · 2a0ce39d08 · 2020-12-20 02:59:44 +00:00
commit 2a0ce39d08
parent 1e9b8db9b2
2 changed files with 168 additions and 24 deletions
--- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
+++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
@ -151,6 +151,12 @@ SYSCTL_UINT(_hw_storvsc, OID_AUTO, max_io, CTLFLAG_RDTUN,
 static int hv_storvsc_chan_cnt = 0;
 SYSCTL_INT(_hw_storvsc, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
 	&hv_storvsc_chan_cnt, 0, "# of channels to use");
+#ifdef DIAGNOSTIC
+static int hv_storvsc_srb_status = -1;
+SYSCTL_INT(_hw_storvsc, OID_AUTO, srb_status,  CTLFLAG_RW,
+	&hv_storvsc_srb_status, 0, "srb_status to inject");
+TUNABLE_INT("hw_storvsc.srb_status", &hv_storvsc_srb_status);
+#endif /* DIAGNOSTIC */

 #define STORVSC_MAX_IO						\
 	vmbus_chan_prplist_nelem(hv_storvsc_ringbuffer_size,	\
@ -704,7 +710,16 @@ hv_storvsc_io_request(struct storvsc_softc *sc,
 	vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB;

 	ch_sel = (vstor_packet->u.vm_srb.lun + curcpu) % sc->hs_nchan;
-	outgoing_channel = sc->hs_sel_chan[ch_sel];
+	/*
+	 * If we are panic'ing, then we are dumping core. Since storvsc_polls
+	 * always uses sc->hs_chan, then we must send to that channel or a poll
+	 * timeout will occur.
+	 */
+	if (panicstr) {
+		outgoing_channel = sc->hs_chan;
+	} else {
+		outgoing_channel = sc->hs_sel_chan[ch_sel];
+	}

 	mtx_unlock(&request->softc->hs_lock);
 	if (request->prp_list.gpa_range.gpa_len) {
@ -2155,26 +2170,133 @@ storvsc_io_done(struct hv_storvsc_request *reqp)
 	ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
 	ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 	int srb_status = SRB_STATUS(vm_srb->srb_status);
+#ifdef DIAGNOSTIC
+	if (hv_storvsc_srb_status != -1) {
+		srb_status = SRB_STATUS(hv_storvsc_srb_status & 0x3f);
+		hv_storvsc_srb_status = -1;
+	}
+#endif /* DIAGNOSTIC */
 	if (vm_srb->scsi_status == SCSI_STATUS_OK) {
 		if (srb_status != SRB_STATUS_SUCCESS) {
-			/*
-			 * If there are errors, for example, invalid LUN,
-			 * host will inform VM through SRB status.
-			 */
-			if (bootverbose) {
-				if (srb_status == SRB_STATUS_INVALID_LUN) {
-					xpt_print(ccb->ccb_h.path,
-					    "invalid LUN %d for op: %s\n",
-					    vm_srb->lun,
-					    scsi_op_desc(cmd->opcode, NULL));
-				} else {
-					xpt_print(ccb->ccb_h.path,
-					    "Unknown SRB flag: %d for op: %s\n",
-					    srb_status,
-					    scsi_op_desc(cmd->opcode, NULL));
-				}
+			bool log_error = true;
+			switch (srb_status) {
+				case SRB_STATUS_PENDING:
+					/* We should never get this */
+					panic("storvsc_io_done: SRB_STATUS_PENDING");
+					break;
+				case SRB_STATUS_ABORTED:
+					/*
+					 * storvsc doesn't support aborts yet
+					 * but if we ever get this status
+					 * the I/O is complete - treat it as a
+					 * timeout
+					 */
+					ccb->ccb_h.status |= CAM_CMD_TIMEOUT;
+					break;
+				case SRB_STATUS_ABORT_FAILED:
+					/* We should never get this */
+					panic("storvsc_io_done: SRB_STATUS_ABORT_FAILED");
+					break;
+				case SRB_STATUS_ERROR:
+					/*
+					 * We should never get this.
+					 * Treat it as a CAM_UNREC_HBA_ERROR.
+					 * It will be retried
+					 */
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					break;
+				case SRB_STATUS_BUSY:
+					/* Host is busy. Delay and retry */
+					ccb->ccb_h.status |= CAM_BUSY;
+					break;
+				case SRB_STATUS_INVALID_REQUEST:
+				case SRB_STATUS_INVALID_PATH_ID:
+				case SRB_STATUS_NO_DEVICE:
+				case SRB_STATUS_INVALID_TARGET_ID:
+					/*
+					 * These indicate an invalid address
+					 * and really should never be seen.
+					 * A CAM_PATH_INVALID could be
+					 * used here but I want to run
+					 * down retries.  Do a CAM_BUSY
+					 * since the host might be having issues.
+					 */
+					ccb->ccb_h.status |= CAM_BUSY;
+					break;
+				case SRB_STATUS_TIMEOUT:
+				case SRB_STATUS_COMMAND_TIMEOUT:
+					/* The backend has timed this out */
+					ccb->ccb_h.status |= CAM_BUSY;
+					break;
+				/* Some old pSCSI errors below */
+				case SRB_STATUS_SELECTION_TIMEOUT:
+				case SRB_STATUS_MESSAGE_REJECTED:
+				case SRB_STATUS_PARITY_ERROR:
+				case SRB_STATUS_NO_HBA:
+				case SRB_STATUS_DATA_OVERRUN:
+				case SRB_STATUS_UNEXPECTED_BUS_FREE:
+				case SRB_STATUS_PHASE_SEQUENCE_FAILURE:
+					/*
+					 * Old pSCSI responses, should never get.
+					 * If we do treat as a CAM_UNREC_HBA_ERROR
+					 * which will be retried
+					 */
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					break;
+				case SRB_STATUS_BUS_RESET:
+					ccb->ccb_h.status |= CAM_SCSI_BUS_RESET;
+					break;
+				case SRB_STATUS_BAD_SRB_BLOCK_LENGTH:
+					/*
+					 * The request block is malformed and
+					 * I doubt it is from the guest. Just retry.
+					 */
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					break;
+				/* Not used statuses just retry */
+				case SRB_STATUS_REQUEST_FLUSHED:
+				case SRB_STATUS_BAD_FUNCTION:
+				case SRB_STATUS_NOT_POWERED:
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					break;
+				case SRB_STATUS_INVALID_LUN:
+					/*
+					 * Don't log an EMS for this response since
+					 * there is no device at this LUN. This is a
+					 * normal and expected response when a device
+					 * is detached.
+					 */
+					ccb->ccb_h.status |= CAM_DEV_NOT_THERE;
+					log_error = false;
+					break;
+				case SRB_STATUS_ERROR_RECOVERY:
+				case SRB_STATUS_LINK_DOWN:
+					/*
+					 * I don't ever expect these from
+					 * the host but if we ever get
+					 * retry after a delay
+					 */
+					ccb->ccb_h.status |= CAM_BUSY;
+					break;
+				default:
+					/*
+					 * An undefined response assert on
+					 * on debug builds else retry
+					 */
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					KASSERT(srb_status <= SRB_STATUS_LINK_DOWN,
+					    ("storvsc: %s, unexpected srb_status of 0x%x",
+					    __func__, srb_status));
+					break;
+			}
+			if (log_error) {
+				xpt_print(ccb->ccb_h.path, "The hypervisor's I/O adapter "
+					"driver received an unexpected response code 0x%x "
+					"for operation: %s. If this continues to occur, "
+					"report the condition to your hypervisor vendor so "
+					"they can rectify the issue.\n", srb_status,
+					scsi_op_desc(cmd->opcode, NULL));
 			}
-			ccb->ccb_h.status |= CAM_DEV_NOT_THERE;
 		} else {
 			ccb->ccb_h.status |= CAM_REQ_CMP;
 		}
--- a/sys/dev/hyperv/storvsc/hv_vstorage.h
+++ b/sys/dev/hyperv/storvsc/hv_vstorage.h
@ -241,12 +241,34 @@ struct vstor_packet {
 /**
 * SRB (SCSI Request Block) Status Codes
 */
-#define SRB_STATUS_PENDING		0x00
-#define SRB_STATUS_SUCCESS		0x01
-#define SRB_STATUS_ABORTED		0x02
-#define SRB_STATUS_ERROR 		0x04
-#define SRB_STATUS_DATA_OVERRUN		0x12
-#define SRB_STATUS_INVALID_LUN		0x20
+#define SRB_STATUS_PENDING                  0x00
+#define SRB_STATUS_SUCCESS                  0x01
+#define SRB_STATUS_ABORTED                  0x02
+#define SRB_STATUS_ABORT_FAILED             0x03
+#define SRB_STATUS_ERROR                    0x04
+#define SRB_STATUS_BUSY                     0x05
+#define SRB_STATUS_INVALID_REQUEST          0x06
+#define SRB_STATUS_INVALID_PATH_ID          0x07
+#define SRB_STATUS_NO_DEVICE                0x08
+#define SRB_STATUS_TIMEOUT                  0x09
+#define SRB_STATUS_SELECTION_TIMEOUT        0x0A
+#define SRB_STATUS_COMMAND_TIMEOUT          0x0B
+#define SRB_STATUS_MESSAGE_REJECTED         0x0D
+#define SRB_STATUS_BUS_RESET                0x0E
+#define SRB_STATUS_PARITY_ERROR             0x0F
+#define SRB_STATUS_REQUEST_SENSE_FAILED     0x10
+#define SRB_STATUS_NO_HBA                   0x11
+#define SRB_STATUS_DATA_OVERRUN             0x12
+#define SRB_STATUS_UNEXPECTED_BUS_FREE      0x13
+#define SRB_STATUS_PHASE_SEQUENCE_FAILURE   0x14
+#define SRB_STATUS_BAD_SRB_BLOCK_LENGTH     0x15
+#define SRB_STATUS_REQUEST_FLUSHED          0x16
+#define SRB_STATUS_INVALID_LUN              0x20
+#define SRB_STATUS_INVALID_TARGET_ID        0x21
+#define SRB_STATUS_BAD_FUNCTION             0x22
+#define SRB_STATUS_ERROR_RECOVERY           0x23
+#define SRB_STATUS_NOT_POWERED              0x24
+#define SRB_STATUS_LINK_DOWN                0x25
 /**
 * SRB Status Masks (can be combined with above status codes)
 */