bdev/nvme: Add support to get the health log for NVMe device

Add a new RPC method to get the health log of a certain NVMe
device.

Below is the example:

./scripts/rpc.py bdev_nvme_get_controller_health_info -c Nvme0
{
  "model_number": "INTEL SSDPE2KX020T8",
  "serial_number": "BTLJ72430ARH2P0BGN",
  "firmware_revision": "VDV10110",
  "traddr": "0000:08:00.0",
  "temperature_celsius": 33,
  "available_spare_percentage": 99,
  "available_spare_threshold_percentage": 10,
  "percentage_used": 2,
  "data_units_read": 1013408619,
  "data_units_written": 346792685,
  "host_read_commands": 30457773264,
  "host_write_commands": 18949677715,
  "controller_busy_time": 4979,
  "power_cycles": 49,
  "power_on_hours": 31114,
  "unsafe_shutdowns": 18,
  "media_errors": 17,
  "num_err_log_entries": 19,
  "warning_temperature_time_minutes": 0,
  "critical_composite_temperature_time_minutes": 0
}

Change-Id: I53125d2ec16cb36011571473430aece99167b803
Signed-off-by: GangCao <gang.cao@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/8806
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
This commit is contained in:
WindYu 2021-07-16 17:42:20 -04:00 committed by Tomasz Zawadzki
parent 2d629511f5
commit 49ffdc18af
4 changed files with 297 additions and 0 deletions

View File

@ -457,6 +457,7 @@ Example response:
"bdev_passthru_delete",
"bdev_nvme_apply_firmware",
"bdev_nvme_get_transport_statistics",
"bdev_nvme_get_controller_health_info",
"bdev_nvme_detach_controller",
"bdev_nvme_attach_controller",
"bdev_null_create",
@ -3341,6 +3342,62 @@ Example response:
}
~~~
## bdev_nvme_get_controller_health_info {#rpc_bdev_nvme_get_controller_health_info}
Display the health log of the specified NVMe bdev controller.
### Parameters
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
name | Required | string | Name of the NVMe bdev controller
### Response
The response is an object containing the health log information of the NVMe controller.
### Example
Example request:
~~~
{
"jsonrpc": "2.0",
"method": "bdev_nvme_get_controller_health_info",
"id": 1,
"params": {
"name": "Nvme0"
}
}
~~~
Example response:
~~~
{
"model_number": "INTEL SSDPE2KX020T8",
"serial_number": "BTLJ72430ARH2P0BGN",
"firmware_revision": "VDV10110",
"traddr": "0000:08:00.0",
"temperature_celsius": 32,
"available_spare_percentage": 99,
"available_spare_threshold_percentage": 10,
"percentage_used": 2,
"data_units_read": 1013408619,
"data_units_written": 346792685,
"host_read_commands": 30457773282,
"host_write_commands": 18949677715,
"controller_busy_time": 4979,
"power_cycles": 49,
"power_on_hours": 31118,
"unsafe_shutdowns": 18,
"media_errors": 17,
"num_err_log_entries": 19,
"warning_temperature_time_minutes": 0,
"critical_composite_temperature_time_minutes": 0
}
~~~
## bdev_rbd_register_cluster {#rpc_bdev_rbd_register_cluster}
This method is available only if SPDK was built with Ceph RBD support.

View File

@ -41,6 +41,9 @@
#include "spdk/string.h"
#include "spdk/rpc.h"
#include "spdk/util.h"
#include "spdk/env.h"
#include "spdk/nvme.h"
#include "spdk/nvme_spec.h"
#include "spdk/log.h"
#include "spdk/bdev_module.h"
@ -1145,3 +1148,217 @@ cleanup:
free_rpc_bdev_nvme_reset_controller_req(&req);
}
SPDK_RPC_REGISTER("bdev_nvme_reset_controller", rpc_bdev_nvme_reset_controller, SPDK_RPC_RUNTIME)
struct rpc_get_controller_health_info {
char *name;
};
/*
 * Per-request state carried through the asynchronous admin commands issued
 * while collecting controller health information.
 */
struct spdk_nvme_health_info_context {
/* JSON-RPC request that receives either the result or an error response. */
struct spdk_jsonrpc_request *request;
/* Controller the admin commands are submitted to. */
struct spdk_nvme_ctrlr *ctrlr;
/* Buffer handed to the get-log-page command as the payload destination. */
struct spdk_nvme_health_information_page health_page;
};
/*
 * Release the heap memory owned by a decoded request.
 *
 * Only the members are freed; the structure itself is owned by the caller.
 */
static void
free_rpc_get_controller_health_info(struct rpc_get_controller_health_info *r)
{
	/* "name" is the only dynamically allocated member. */
	free(r->name);
}
/*
 * JSON decoder table for the bdev_nvme_get_controller_health_info parameters.
 *
 * "name" is mandatory: the RPC looks the controller up by this name, and the
 * documented interface lists it as required. The trailing boolean is the
 * decoder's "optional" flag, so it must be false here; otherwise a request
 * without "name" decodes successfully and leaves the string pointer NULL.
 */
static const struct spdk_json_object_decoder rpc_get_controller_health_info_decoders[] = {
	{"name", offsetof(struct rpc_get_controller_health_info, name), spdk_json_decode_string, false},
};
/*
 * Tear down a health-info request context.
 *
 * When "response" is set, an internal-error reply is sent on the request
 * stored in the context before the context is freed. The context must not be
 * touched by the caller afterwards.
 */
static void
nvme_health_info_cleanup(struct spdk_nvme_health_info_context *context, bool response)
{
	if (response) {
		spdk_jsonrpc_send_error_response(context->request,
						 SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "Internal error.");
	}

	free(context);
}
/*
 * Emit a fixed-width identify string (model, serial, firmware) as a trimmed
 * JSON string member.
 *
 * These fields are fixed-size byte arrays that are not guaranteed to contain a
 * terminating NUL, so the read is bounded with an explicit precision instead
 * of relying on "%s" to stop on its own.
 */
static void
health_info_write_trimmed_string(struct spdk_json_write_ctx *w, const char *name,
				 const void *field, size_t field_len)
{
	char buf[128];

	snprintf(buf, sizeof(buf), "%.*s", (int)field_len, (const char *)field);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, name, buf);
}

/*
 * Completion callback for the health-information get-log-page command.
 *
 * cb_arg is the request context; cpl is the NVMe completion. On a failed
 * completion or a missing controller an internal-error response is sent.
 * Otherwise the controller identity and the contents of the health page are
 * written as the JSON-RPC result. The context is freed on every path.
 */
static void
get_health_log_page_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	/* The health page reports temperatures in Kelvin. */
	const int kelvin_offset = 273;
	size_t i;
	struct spdk_nvme_health_info_context *context = cb_arg;
	struct spdk_jsonrpc_request *request = context->request;
	struct spdk_nvme_ctrlr *ctrlr = context->ctrlr;
	const struct spdk_nvme_health_information_page *health_page = &context->health_page;
	const struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_data *cdata;
	struct spdk_json_write_ctx *w;

	if (spdk_nvme_cpl_is_error(cpl)) {
		nvme_health_info_cleanup(context, true);
		SPDK_ERRLOG("get log page failed\n");
		return;
	}

	if (ctrlr == NULL) {
		nvme_health_info_cleanup(context, true);
		SPDK_ERRLOG("ctrlr is NULL\n");
		return;
	}

	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	w = spdk_jsonrpc_begin_result(request);
	spdk_json_write_object_begin(w);

	health_info_write_trimmed_string(w, "model_number", cdata->mn, sizeof(cdata->mn));
	health_info_write_trimmed_string(w, "serial_number", cdata->sn, sizeof(cdata->sn));
	health_info_write_trimmed_string(w, "firmware_revision", cdata->fr, sizeof(cdata->fr));
	spdk_json_write_named_string(w, "traddr", trid->traddr);

	/*
	 * Written as a signed value: a reading below 273 K would otherwise wrap
	 * around to a huge unsigned number. Non-negative results are unchanged.
	 */
	spdk_json_write_named_int64(w, "temperature_celsius",
				    health_page->temperature - kelvin_offset);
	spdk_json_write_named_uint64(w, "available_spare_percentage", health_page->available_spare);
	spdk_json_write_named_uint64(w, "available_spare_threshold_percentage",
				     health_page->available_spare_threshold);
	spdk_json_write_named_uint64(w, "percentage_used", health_page->percentage_used);

	/* 128-bit counters are passed as two 64-bit halves (index 0, then index 1). */
	spdk_json_write_named_uint128(w, "data_units_read",
				      health_page->data_units_read[0], health_page->data_units_read[1]);
	spdk_json_write_named_uint128(w, "data_units_written",
				      health_page->data_units_written[0], health_page->data_units_written[1]);
	spdk_json_write_named_uint128(w, "host_read_commands",
				      health_page->host_read_commands[0], health_page->host_read_commands[1]);
	spdk_json_write_named_uint128(w, "host_write_commands",
				      health_page->host_write_commands[0], health_page->host_write_commands[1]);
	spdk_json_write_named_uint128(w, "controller_busy_time",
				      health_page->controller_busy_time[0], health_page->controller_busy_time[1]);
	spdk_json_write_named_uint128(w, "power_cycles",
				      health_page->power_cycles[0], health_page->power_cycles[1]);
	spdk_json_write_named_uint128(w, "power_on_hours",
				      health_page->power_on_hours[0], health_page->power_on_hours[1]);
	spdk_json_write_named_uint128(w, "unsafe_shutdowns",
				      health_page->unsafe_shutdowns[0], health_page->unsafe_shutdowns[1]);
	spdk_json_write_named_uint128(w, "media_errors",
				      health_page->media_errors[0], health_page->media_errors[1]);
	spdk_json_write_named_uint128(w, "num_err_log_entries",
				      health_page->num_error_info_log_entries[0],
				      health_page->num_error_info_log_entries[1]);

	spdk_json_write_named_uint64(w, "warning_temperature_time_minutes", health_page->warning_temp_time);
	spdk_json_write_named_uint64(w, "critical_composite_temperature_time_minutes",
				     health_page->critical_temp_time);

	/*
	 * A zero entry is skipped (treated as "no reading" for that sensor).
	 * NOTE(review): every sensor is written under the same key, so a
	 * controller with several sensors produces duplicate object members.
	 * The key is kept as-is here to avoid changing the RPC output format.
	 */
	for (i = 0; i < SPDK_COUNTOF(health_page->temp_sensor); i++) {
		if (health_page->temp_sensor[i] != 0) {
			spdk_json_write_named_int64(w, "temperature_sensor_celsius",
						    health_page->temp_sensor[i] - kelvin_offset);
		}
	}

	spdk_json_write_object_end(w);
	spdk_jsonrpc_end_result(request, w);

	nvme_health_info_cleanup(context, false);
}
/*
 * Submit the get-log-page command for the health information page.
 *
 * The page is read into the buffer embedded in the context and completion is
 * reported to get_health_log_page_completion(). If submission fails, an
 * internal-error response is sent and the context is freed here.
 */
static void
get_health_log_page(struct spdk_nvme_health_info_context *context)
{
	int rc;

	rc = spdk_nvme_ctrlr_cmd_get_log_page(context->ctrlr, SPDK_NVME_LOG_HEALTH_INFORMATION,
					      SPDK_NVME_GLOBAL_NS_TAG,
					      &context->health_page, sizeof(context->health_page), 0,
					      get_health_log_page_completion, context);
	if (rc == 0) {
		/* Submitted; the completion callback owns the context from here on. */
		return;
	}

	nvme_health_info_cleanup(context, true);
	SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_get_log_page() failed\n");
}
/*
 * Completion callback for the temperature-threshold get-features command.
 *
 * A failed completion ends the request with an internal-error response; a
 * successful one moves on to reading the health log page.
 */
static void
get_temperature_threshold_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	struct spdk_nvme_health_info_context *ctx = cb_arg;

	if (spdk_nvme_cpl_is_error(cpl)) {
		nvme_health_info_cleanup(ctx, true);
		SPDK_ERRLOG("feature SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD failed in completion\n");
		return;
	}

	get_health_log_page(ctx);
}
static int
get_temperature_threshold_feature(struct spdk_nvme_health_info_context *context)
{
struct spdk_nvme_cmd cmd = {};
cmd.opc = SPDK_NVME_OPC_GET_FEATURES;
cmd.cdw10 = SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD;
return spdk_nvme_ctrlr_cmd_admin_raw(context->ctrlr, &cmd, NULL, 0,
get_temperature_threshold_feature_completion, context);
}
/*
 * Start collecting health information for a controller.
 *
 * Allocates the per-request context and submits the first admin command
 * (temperature-threshold get-features). On allocation or submission failure an
 * error response is sent on the request.
 */
static void
get_controller_health_info(struct spdk_jsonrpc_request *request, struct spdk_nvme_ctrlr *ctrlr)
{
	struct spdk_nvme_health_info_context *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "Memory allocation error.");
		return;
	}

	ctx->request = request;
	ctx->ctrlr = ctrlr;

	if (get_temperature_threshold_feature(ctx) != 0) {
		/* Nothing was submitted, so the context is released here. */
		nvme_health_info_cleanup(ctx, true);
		SPDK_ERRLOG("feature SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD failed to submit\n");
	}
}
/*
 * JSON-RPC handler for "bdev_nvme_get_controller_health_info".
 *
 * Decodes the "name" parameter, looks up the NVMe controller with that name
 * and starts the asynchronous health-information query. The result (or an
 * error) is sent later from the completion path. Parameter and lookup
 * failures are answered immediately with an error response.
 */
static void
rpc_bdev_nvme_get_controller_health_info(struct spdk_jsonrpc_request *request,
		const struct spdk_json_val *params)
{
	struct rpc_get_controller_health_info req = {};
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (!params) {
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "Missing device name");
		return;
	}

	if (spdk_json_decode_object(params, rpc_get_controller_health_info_decoders,
				    SPDK_COUNTOF(rpc_get_controller_health_info_decoders), &req)) {
		SPDK_ERRLOG("spdk_json_decode_object failed\n");
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "Invalid parameters");
		goto cleanup;
	}

	/*
	 * Never let a NULL name reach the lookup or a "%s" log format: reject a
	 * params object that decoded successfully but carried no name.
	 */
	if (req.name == NULL) {
		SPDK_ERRLOG("nvme ctrlr name is missing\n");
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "Missing device name");
		goto cleanup;
	}

	nvme_ctrlr = nvme_ctrlr_get_by_name(req.name);
	if (!nvme_ctrlr) {
		SPDK_ERRLOG("nvme ctrlr name '%s' does not exist\n", req.name);
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "Device not found");
		goto cleanup;
	}

	/* The response is produced asynchronously; only the decoded request is freed here. */
	get_controller_health_info(request, nvme_ctrlr->ctrlr);

cleanup:
	free_rpc_get_controller_health_info(&req);
}
SPDK_RPC_REGISTER("bdev_nvme_get_controller_health_info",
		  rpc_bdev_nvme_get_controller_health_info, SPDK_RPC_RUNTIME)

View File

@ -964,6 +964,15 @@ if __name__ == "__main__":
help='Get bdev_nvme poll group transport statistics')
p.set_defaults(func=bdev_nvme_get_transport_statistics)
def bdev_nvme_get_controller_health_info(args):
    # Fetch the health log for the named controller and pretty-print it.
    health_info = rpc.bdev.bdev_nvme_get_controller_health_info(args.client, name=args.name)
    print_dict(health_info)

p = subparsers.add_parser(
    'bdev_nvme_get_controller_health_info',
    help='Display health log of the required NVMe bdev controller.')
p.add_argument(
    '-c', '--name',
    required=True,
    help="Name of the NVMe bdev controller. Example: Nvme0")
p.set_defaults(func=bdev_nvme_get_controller_health_info)
# iSCSI
def iscsi_set_options(args):
rpc.iscsi.iscsi_set_options(

View File

@ -1276,3 +1276,17 @@ def bdev_nvme_apply_firmware(client, bdev_name, filename):
def bdev_nvme_get_transport_statistics(client):
"""Get bdev_nvme poll group transport statistics"""
return client.call('bdev_nvme_get_transport_statistics')
def bdev_nvme_get_controller_health_info(client, name):
    """Fetch the health log of an NVMe bdev controller.

    Args:
        name: name of the NVMe bdev controller to query

    Returns:
        Whatever the 'bdev_nvme_get_controller_health_info' RPC call returns
        for that controller.
    """
    return client.call('bdev_nvme_get_controller_health_info', {'name': name})