Add port module event software counters in mlx5core.

While at it, fix up the port module event (PME) definitions to match the latest PRM.

Submitted by:	slavash@
MFC after:	3 days
Sponsored by:	Mellanox Technologies
This commit is contained in:
hselasky 2019-10-02 09:29:55 +00:00
parent 980e2c987e
commit d6e923c7f0
5 changed files with 86 additions and 27 deletions

View File

@ -537,7 +537,7 @@ enum {
MLX5_MODULE_STATUS_PLUGGED_ENABLED = 0x1,
MLX5_MODULE_STATUS_UNPLUGGED = 0x2,
MLX5_MODULE_STATUS_ERROR = 0x3,
MLX5_MODULE_STATUS_PLUGGED_DISABLED = 0x4,
MLX5_MODULE_STATUS_NUM ,
};
enum {
@ -549,7 +549,7 @@ enum {
MLX5_MODULE_EVENT_ERROR_UNSUPPORTED_CABLE = 0x5,
MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE = 0x6,
MLX5_MODULE_EVENT_ERROR_CABLE_IS_SHORTED = 0x7,
MLX5_MODULE_EVENT_ERROR_PCIE_SYSTEM_POWER_SLOT_EXCEEDED = 0xc,
MLX5_MODULE_EVENT_ERROR_NUM ,
};
struct mlx5_eqe_port_module_event {

View File

@ -569,6 +569,11 @@ struct mlx5_rl_table {
};
#endif
/*
 * Software counters for port module events (PME).
 *
 * Both arrays are indexed directly by the value reported in the event, so
 * each one is sized by the corresponding *_NUM enumerator.
 */
struct mlx5_pme_stats {
/* Events seen per module status; slot index is an MLX5_MODULE_STATUS_* value. */
u64 status_counters[MLX5_MODULE_STATUS_NUM];
/*
 * Events seen per error type; slot index is an MLX5_MODULE_EVENT_ERROR_*
 * value.  Only bumped for events whose status is MLX5_MODULE_STATUS_ERROR.
 */
u64 error_counters[MLX5_MODULE_EVENT_ERROR_NUM];
};
struct mlx5_priv {
char name[MLX5_MAX_NAME_LEN];
struct mlx5_eq_table eq_table;
@ -624,6 +629,7 @@ struct mlx5_priv {
#ifdef RATELIMIT
struct mlx5_rl_table rl_table;
#endif
struct mlx5_pme_stats pme_stats;
};
enum mlx5_device_state {

View File

@ -639,9 +639,9 @@ static const char *mlx5_port_module_event_error_type_to_string(u8 error_type)
{
switch (error_type) {
case MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED:
return "Power Budget Exceeded";
return "Power budget exceeded";
case MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX_CABLE_MODULE:
return "Long Range for non MLNX cable/module";
return "Long Range for non MLNX cable";
case MLX5_MODULE_EVENT_ERROR_BUS_STUCK:
return "Bus stuck(I2C or data shorted)";
case MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT:
@ -649,18 +649,11 @@ static const char *mlx5_port_module_event_error_type_to_string(u8 error_type)
case MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST:
return "Enforce part number list";
case MLX5_MODULE_EVENT_ERROR_UNSUPPORTED_CABLE:
return "Unsupported Cable";
return "Unknown identifier";
case MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE:
return "High Temperature";
case MLX5_MODULE_EVENT_ERROR_CABLE_IS_SHORTED:
return "Cable is shorted";
case MLX5_MODULE_EVENT_ERROR_PCIE_SYSTEM_POWER_SLOT_EXCEEDED:
return "One or more network ports have been powered "
"down due to insufficient/unadvertised power on "
"the PCIe slot. Please refer to the card's user "
"manual for power specifications or contact "
"Mellanox support.";
return "Bad or shorted cable/module";
default:
return "Unknown error type";
}
@ -686,29 +679,36 @@ static void mlx5_port_module_event(struct mlx5_core_dev *dev,
module_num = (unsigned int)module_event_eqe->module;
module_status = (unsigned int)module_event_eqe->module_status &
PORT_MODULE_EVENT_MODULE_STATUS_MASK;
PORT_MODULE_EVENT_MODULE_STATUS_MASK;
error_type = (unsigned int)module_event_eqe->error_type &
PORT_MODULE_EVENT_ERROR_TYPE_MASK;
PORT_MODULE_EVENT_ERROR_TYPE_MASK;
if (module_status < MLX5_MODULE_STATUS_NUM)
dev->priv.pme_stats.status_counters[module_status]++;
switch (module_status) {
case MLX5_MODULE_STATUS_PLUGGED_ENABLED:
device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: plugged and enabled\n", module_num);
device_printf((&pdev->dev)->bsddev,
"INFO: Module %u, status: plugged and enabled\n",
module_num);
break;
case MLX5_MODULE_STATUS_UNPLUGGED:
device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: unplugged\n", module_num);
device_printf((&pdev->dev)->bsddev,
"INFO: Module %u, status: unplugged\n", module_num);
break;
case MLX5_MODULE_STATUS_ERROR:
device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: error, %s\n", module_num, mlx5_port_module_event_error_type_to_string(error_type));
break;
case MLX5_MODULE_STATUS_PLUGGED_DISABLED:
device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: plugged but disabled\n", module_num);
device_printf((&pdev->dev)->bsddev,
"ERROR: Module %u, status: error, %s\n",
module_num,
mlx5_port_module_event_error_type_to_string(error_type));
if (error_type < MLX5_MODULE_EVENT_ERROR_NUM)
dev->priv.pme_stats.error_counters[error_type]++;
break;
default:
device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, unknown status\n", module_num);
device_printf((&pdev->dev)->bsddev,
"INFO: Module %u, unknown status\n", module_num);
}
/* store module status */
if (module_num < MLX5_MAX_PORTS)

View File

@ -1244,13 +1244,31 @@ struct mlx5_core_event_handler {
void *data);
};
/*
 * Field selector for the statistics tables below: out of the five leading
 * arguments of a table entry, keep only the fourth and fifth (the two string
 * literals), each followed by a comma, so that a whole table expands into a
 * flat, comma-separated list of strings.
 */
#define MLX5_STATS_DESC(count, type, field, name, descr, ...) name, descr,

/*
 * Table of port module event error statistics, one entry per error type.
 * Every entry is handed to the macro given as "entry" with the arguments
 * (count, type, field, "sysctl name", "sysctl description").  The first three
 * arguments are ignored by MLX5_STATS_DESC.
 *
 * init_one() pairs the i-th entry with error_counters[i], so the entries are
 * expected to stay in the same order as the MLX5_MODULE_EVENT_ERROR_* values.
 */
#define MLX5_PORT_MODULE_ERROR_STATS(entry) \
	entry(+1, u64, power_budget_exceeded, "power_budget", \
	    "Module Power Budget Exceeded") \
	entry(+1, u64, long_range, "long_range", \
	    "Module Long Range for non MLNX cable/module") \
	entry(+1, u64, bus_stuck, "bus_stuck", \
	    "Module Bus stuck(I2C or data shorted)") \
	entry(+1, u64, no_eeprom, "no_eeprom", \
	    "No EEPROM/retry timeout") \
	entry(+1, u64, enforce_part_number, "enforce_part_number", \
	    "Module Enforce part number list") \
	entry(+1, u64, unknown_id, "unknown_id", \
	    "Module Unknown identifier") \
	entry(+1, u64, high_temp, "high_temp", \
	    "Module High Temperature") \
	entry(+1, u64, cable_shorted, "cable_shorted", \
	    "Module Cable is shorted")

/*
 * Flattened view of the table above: element 2 * i is the sysctl name of the
 * i-th error statistic and element 2 * i + 1 is its description.
 */
static const char *mlx5_pme_err_desc[] = {
	MLX5_PORT_MODULE_ERROR_STATS(MLX5_STATS_DESC)
};
static int init_one(struct pci_dev *pdev,
const struct pci_device_id *id)
{
struct mlx5_core_dev *dev;
struct mlx5_priv *priv;
device_t bsddev = pdev->dev.bsddev;
int err;
int i,err;
struct sysctl_oid *pme_sysctl_node;
struct sysctl_oid *pme_err_sysctl_node;
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
priv = &dev->priv;
@ -1282,6 +1300,41 @@ static int init_one(struct pci_dev *pdev,
OID_AUTO, "power_value", CTLFLAG_RD, &dev->pwr_value, 0,
"Current power value in Watts");
pme_sysctl_node = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
SYSCTL_CHILDREN(device_get_sysctl_tree(bsddev)),
OID_AUTO, "pme_stats", CTLFLAG_RD, NULL,
"Port module event statistics");
if (pme_sysctl_node == NULL) {
err = -ENOMEM;
goto clean_sysctl_ctx;
}
pme_err_sysctl_node = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
SYSCTL_CHILDREN(pme_sysctl_node),
OID_AUTO, "errors", CTLFLAG_RD, NULL,
"Port module event error statistics");
if (pme_err_sysctl_node == NULL) {
err = -ENOMEM;
goto clean_sysctl_ctx;
}
SYSCTL_ADD_U64(&dev->sysctl_ctx,
SYSCTL_CHILDREN(pme_sysctl_node), OID_AUTO,
"module_plug", CTLFLAG_RD | CTLFLAG_MPSAFE,
&dev->priv.pme_stats.status_counters[MLX5_MODULE_STATUS_PLUGGED_ENABLED],
0, "Number of time module plugged");
SYSCTL_ADD_U64(&dev->sysctl_ctx,
SYSCTL_CHILDREN(pme_sysctl_node), OID_AUTO,
"module_unplug", CTLFLAG_RD | CTLFLAG_MPSAFE,
&dev->priv.pme_stats.status_counters[MLX5_MODULE_STATUS_UNPLUGGED],
0, "Number of time module unplugged");
for (i = 0 ; i < MLX5_MODULE_EVENT_ERROR_NUM; i++) {
SYSCTL_ADD_U64(&dev->sysctl_ctx,
SYSCTL_CHILDREN(pme_err_sysctl_node), OID_AUTO,
mlx5_pme_err_desc[2 * i], CTLFLAG_RD | CTLFLAG_MPSAFE,
&dev->priv.pme_stats.error_counters[i],
0, mlx5_pme_err_desc[2 * i + 1]);
}
INIT_LIST_HEAD(&priv->ctx_list);
spin_lock_init(&priv->ctx_lock);
mutex_init(&dev->pci_status_mutex);
@ -1320,8 +1373,9 @@ clean_health:
close_pci:
mlx5_pci_close(dev, priv);
clean_dev:
sysctl_ctx_free(&dev->sysctl_ctx);
mtx_destroy(&dev->dump_lock);
clean_sysctl_ctx:
sysctl_ctx_free(&dev->sysctl_ctx);
kfree(dev);
return err;
}

View File

@ -3389,8 +3389,7 @@ out:
}
/* Check if module is present before doing an access */
module_status = mlx5_query_module_status(priv->mdev, module_num);
if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED &&
module_status != MLX5_MODULE_STATUS_PLUGGED_DISABLED) {
if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED) {
error = EINVAL;
goto err_i2c;
}