Add a devctl/devd notification conduit for CAM errors that happen at the
periph level. When a relevant error is reported to the periph, some amplifying information is gathered, and the error and information are fed to devctl with the attributes / keys system=CAM, subsystem=periph. The 'type' key will be either 'error' or 'timeout', and based on this, various other keys are also populated. The purpose of this is to provide a concise mechanism for error reporting that is less noisy than the system console but higher in resolution and fidelity than simple sysctl counters. We will be using it at Netflix to populate a structured log and database to track errors and error trends across our world-wide population of drives. Submitted by: imp, scottl Approved by: kenm MFC after: 3 days Sponsored by: Netflix Differential Revision: D5943
This commit is contained in:
parent
4bd916567e
commit
a30ecd42b8
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=298004
@ -86,6 +86,7 @@ static int camperiphscsisenseerror(union ccb *ccb,
|
||||
u_int32_t *timeout,
|
||||
u_int32_t *action,
|
||||
const char **action_string);
|
||||
static void cam_periph_devctl_notify(union ccb *ccb);
|
||||
|
||||
static int nperiph_drivers;
|
||||
static int initialized = 0;
|
||||
@ -1615,7 +1616,7 @@ cam_periph_error(union ccb *ccb, cam_flags camflags,
|
||||
struct cam_periph *periph;
|
||||
const char *action_string;
|
||||
cam_status status;
|
||||
int frozen, error, openings;
|
||||
int frozen, error, openings, devctl_err;
|
||||
u_int32_t action, relsim_flags, timeout;
|
||||
|
||||
action = SSQ_PRINT_SENSE;
|
||||
@ -1624,9 +1625,26 @@ cam_periph_error(union ccb *ccb, cam_flags camflags,
|
||||
status = ccb->ccb_h.status;
|
||||
frozen = (status & CAM_DEV_QFRZN) != 0;
|
||||
status &= CAM_STATUS_MASK;
|
||||
openings = relsim_flags = timeout = 0;
|
||||
devctl_err = openings = relsim_flags = timeout = 0;
|
||||
orig_ccb = ccb;
|
||||
|
||||
/* Filter the errors that should be reported via devctl */
|
||||
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
|
||||
case CAM_CMD_TIMEOUT:
|
||||
case CAM_REQ_ABORTED:
|
||||
case CAM_REQ_CMP_ERR:
|
||||
case CAM_REQ_TERMIO:
|
||||
case CAM_UNREC_HBA_ERROR:
|
||||
case CAM_DATA_RUN_ERR:
|
||||
case CAM_SCSI_STATUS_ERROR:
|
||||
case CAM_ATA_STATUS_ERROR:
|
||||
case CAM_SMP_STATUS_ERROR:
|
||||
devctl_err++;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
switch (status) {
|
||||
case CAM_REQ_CMP:
|
||||
error = 0;
|
||||
@ -1754,6 +1772,9 @@ cam_periph_error(union ccb *ccb, cam_flags camflags,
|
||||
xpt_print(ccb->ccb_h.path, "Retrying command\n");
|
||||
}
|
||||
|
||||
if (devctl_err)
|
||||
cam_periph_devctl_notify(orig_ccb);
|
||||
|
||||
if ((action & SSQ_LOST) != 0) {
|
||||
lun_id_t lun_id;
|
||||
|
||||
@ -1824,3 +1845,80 @@ cam_periph_error(union ccb *ccb, cam_flags camflags,
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
#define CAM_PERIPH_DEVD_MSG_SIZE 256
|
||||
|
||||
static void
|
||||
cam_periph_devctl_notify(union ccb *ccb)
|
||||
{
|
||||
struct cam_periph *periph;
|
||||
struct ccb_getdev *cgd;
|
||||
struct sbuf sb;
|
||||
int serr, sk, asc, ascq;
|
||||
char *sbmsg, *type;
|
||||
|
||||
sbmsg = malloc(CAM_PERIPH_DEVD_MSG_SIZE, M_CAMPERIPH, M_NOWAIT);
|
||||
if (sbmsg == NULL)
|
||||
return;
|
||||
|
||||
sbuf_new(&sb, sbmsg, CAM_PERIPH_DEVD_MSG_SIZE, SBUF_FIXEDLEN);
|
||||
|
||||
periph = xpt_path_periph(ccb->ccb_h.path);
|
||||
sbuf_printf(&sb, "device=%s%d ", periph->periph_name,
|
||||
periph->unit_number);
|
||||
|
||||
sbuf_printf(&sb, "serial=\"");
|
||||
if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) {
|
||||
xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path,
|
||||
CAM_PRIORITY_NORMAL);
|
||||
cgd->ccb_h.func_code = XPT_GDEV_TYPE;
|
||||
xpt_action((union ccb *)cgd);
|
||||
|
||||
if (cgd->ccb_h.status == CAM_REQ_CMP)
|
||||
sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len);
|
||||
}
|
||||
sbuf_printf(&sb, "\" ");
|
||||
sbuf_printf(&sb, "cam_status=\"0x%x\" ", ccb->ccb_h.status);
|
||||
|
||||
switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
|
||||
case CAM_CMD_TIMEOUT:
|
||||
sbuf_printf(&sb, "timeout=%d ", ccb->ccb_h.timeout);
|
||||
type = "timeout";
|
||||
break;
|
||||
case CAM_SCSI_STATUS_ERROR:
|
||||
sbuf_printf(&sb, "scsi_status=%d ", ccb->csio.scsi_status);
|
||||
if (scsi_extract_sense_ccb(ccb, &serr, &sk, &asc, &ascq))
|
||||
sbuf_printf(&sb, "scsi_sense=\"%02x %02x %02x %02x\" ",
|
||||
serr, sk, asc, ascq);
|
||||
type = "error";
|
||||
break;
|
||||
case CAM_ATA_STATUS_ERROR:
|
||||
sbuf_printf(&sb, "RES=\"");
|
||||
ata_res_sbuf(&ccb->ataio.res, &sb);
|
||||
sbuf_printf(&sb, "\" ");
|
||||
type = "error";
|
||||
break;
|
||||
default:
|
||||
type = "error";
|
||||
break;
|
||||
}
|
||||
|
||||
if (ccb->ccb_h.func_code == XPT_SCSI_IO) {
|
||||
sbuf_printf(&sb, "CDB=\"");
|
||||
if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0)
|
||||
scsi_cdb_sbuf(ccb->csio.cdb_io.cdb_ptr, &sb);
|
||||
else
|
||||
scsi_cdb_sbuf(ccb->csio.cdb_io.cdb_bytes, &sb);
|
||||
sbuf_printf(&sb, "\" ");
|
||||
} else if (ccb->ccb_h.func_code == XPT_ATA_IO) {
|
||||
sbuf_printf(&sb, "ACB=\"");
|
||||
ata_cmd_sbuf(&ccb->ataio.cmd, &sb);
|
||||
sbuf_printf(&sb, "\" ");
|
||||
}
|
||||
|
||||
if (sbuf_finish(&sb) == 0)
|
||||
devctl_notify("CAM", "periph", type, sbuf_data(&sb));
|
||||
sbuf_delete(&sb);
|
||||
free(sbmsg, M_CAMPERIPH);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user