Add i/o error counters to hastd(8) and make hastctl(8) display them.

This may be useful for detecting problems with HAST disks.

Discussed with and reviewed by:	pjd
MFC after:	1 week
This commit is contained in:
Mikolaj Golub 2013-02-25 20:09:07 +00:00
parent 7323adac99
commit 2adbba660d
5 changed files with 71 additions and 1 deletions

View File

@ -351,6 +351,12 @@ control_status(struct nv *nv)
(uint64_t)nv_get_uint64(nv, "stat_flush%u", ii));
printf(" activemap updates: %ju\n",
(uint64_t)nv_get_uint64(nv, "stat_activemap_update%u", ii));
printf(" local errors: "
"read: %ju, write: %ju, delete: %ju, flush: %ju\n",
(uintmax_t)nv_get_uint64(nv, "stat_read_error%u", ii),
(uintmax_t)nv_get_uint64(nv, "stat_write_error%u", ii),
(uintmax_t)nv_get_uint64(nv, "stat_delete_error%u", ii),
(uintmax_t)nv_get_uint64(nv, "stat_flush_error%u", ii));
}
return (ret);
}

View File

@ -207,6 +207,14 @@ control_status_worker(struct hast_resource *res, struct nv *nvout,
"stat_flush%u", no);
nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_activemap_update"),
"stat_activemap_update%u", no);
nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_read_error"),
"stat_read_error%u", no);
nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_write_error"),
"stat_write_error%u", no);
nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_delete_error"),
"stat_delete_error%u", no);
nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_flush_error"),
"stat_flush_error%u", no);
end:
if (cnvin != NULL)
nv_free(cnvin);
@ -459,6 +467,16 @@ ctrl_thread(void *arg)
nv_add_uint64(nvout, res->hr_stat_flush, "stat_flush");
nv_add_uint64(nvout, res->hr_stat_activemap_update,
"stat_activemap_update");
nv_add_uint64(nvout, res->hr_stat_read_error,
"stat_read_error");
nv_add_uint64(nvout, res->hr_stat_write_error +
res->hr_stat_activemap_write_error,
"stat_write_error");
nv_add_uint64(nvout, res->hr_stat_delete_error,
"stat_delete_error");
nv_add_uint64(nvout, res->hr_stat_flush_error +
res->hr_stat_activemap_flush_error,
"stat_flush_error");
nv_add_int16(nvout, 0, "error");
break;
case CONTROL_RELOAD:

View File

@ -239,6 +239,18 @@ struct hast_resource {
uint64_t hr_stat_flush;
/* Number of activemap updates. */
uint64_t hr_stat_activemap_update;
/* Number of local read errors. */
uint64_t hr_stat_read_error;
/* Number of local write errors. */
uint64_t hr_stat_write_error;
/* Number of local delete errors. */
uint64_t hr_stat_delete_error;
/* Number of local flush errors. */
uint64_t hr_stat_flush_error;
/* Number of activemap write errors. */
uint64_t hr_stat_activemap_write_error;
/* Number of activemap flush errors. */
uint64_t hr_stat_activemap_flush_error;
/* Next resource. */
TAILQ_ENTRY(hast_resource) hr_next;

View File

@ -303,6 +303,7 @@ hast_activemap_flush(struct hast_resource *res)
if (pwrite(res->hr_localfd, buf, size, METADATA_SIZE) !=
(ssize_t)size) {
pjdlog_errno(LOG_ERR, "Unable to flush activemap to disk");
res->hr_stat_activemap_write_error++;
return (-1);
}
if (res->hr_metaflush == 1 && g_flush(res->hr_localfd) == -1) {
@ -313,6 +314,7 @@ hast_activemap_flush(struct hast_resource *res)
} else {
pjdlog_errno(LOG_ERR,
"Unable to flush disk cache on activemap update");
res->hr_stat_activemap_flush_error++;
return (-1);
}
}
@ -1936,6 +1938,22 @@ ggate_send_thread(void *arg)
"G_GATE_CMD_DONE failed");
}
}
if (hio->hio_errors[0]) {
switch (ggio->gctl_cmd) {
case BIO_READ:
res->hr_stat_read_error++;
break;
case BIO_WRITE:
res->hr_stat_write_error++;
break;
case BIO_DELETE:
res->hr_stat_delete_error++;
break;
case BIO_FLUSH:
res->hr_stat_flush_error++;
break;
}
}
pjdlog_debug(2,
"ggate_send: (%p) Moving request to the free queue.", hio);
QUEUE_INSERT2(hio, free);

View File

@ -765,6 +765,7 @@ disk_thread(void *arg)
pjdlog_errno(LOG_WARNING,
"Unable to store cleared activemap");
free(map);
res->hr_stat_activemap_write_error++;
break;
}
free(map);
@ -883,8 +884,23 @@ send_thread(void *arg)
PJDLOG_ABORT("Unexpected command (cmd=%hhu).",
hio->hio_cmd);
}
if (hio->hio_error != 0)
if (hio->hio_error != 0) {
switch (hio->hio_cmd) {
case HIO_READ:
res->hr_stat_read_error++;
break;
case HIO_WRITE:
res->hr_stat_write_error++;
break;
case HIO_DELETE:
res->hr_stat_delete_error++;
break;
case HIO_FLUSH:
res->hr_stat_flush_error++;
break;
}
nv_add_int16(nvout, hio->hio_error, "error");
}
if (hast_proto_send(res, res->hr_remoteout, nvout, data,
length) == -1) {
secondary_exit(EX_TEMPFAIL, "Unable to send reply");