diff --git a/sys/dev/mlx5/device.h b/sys/dev/mlx5/device.h index d308e287954c..eab769953c64 100644 --- a/sys/dev/mlx5/device.h +++ b/sys/dev/mlx5/device.h @@ -537,7 +537,7 @@ enum { MLX5_MODULE_STATUS_PLUGGED_ENABLED = 0x1, MLX5_MODULE_STATUS_UNPLUGGED = 0x2, MLX5_MODULE_STATUS_ERROR = 0x3, - MLX5_MODULE_STATUS_PLUGGED_DISABLED = 0x4, + MLX5_MODULE_STATUS_NUM , }; enum { @@ -549,7 +549,7 @@ enum { MLX5_MODULE_EVENT_ERROR_UNSUPPORTED_CABLE = 0x5, MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE = 0x6, MLX5_MODULE_EVENT_ERROR_CABLE_IS_SHORTED = 0x7, - MLX5_MODULE_EVENT_ERROR_PCIE_SYSTEM_POWER_SLOT_EXCEEDED = 0xc, + MLX5_MODULE_EVENT_ERROR_NUM , }; struct mlx5_eqe_port_module_event { diff --git a/sys/dev/mlx5/driver.h b/sys/dev/mlx5/driver.h index 44835d72741e..f65ec58122f6 100644 --- a/sys/dev/mlx5/driver.h +++ b/sys/dev/mlx5/driver.h @@ -569,6 +569,11 @@ struct mlx5_rl_table { }; #endif +struct mlx5_pme_stats { + u64 status_counters[MLX5_MODULE_STATUS_NUM]; + u64 error_counters[MLX5_MODULE_EVENT_ERROR_NUM]; +}; + struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; @@ -624,6 +629,7 @@ struct mlx5_priv { #ifdef RATELIMIT struct mlx5_rl_table rl_table; #endif + struct mlx5_pme_stats pme_stats; }; enum mlx5_device_state { diff --git a/sys/dev/mlx5/mlx5_core/mlx5_eq.c b/sys/dev/mlx5/mlx5_core/mlx5_eq.c index 793f654bcbb4..fa1c7ef1150a 100644 --- a/sys/dev/mlx5/mlx5_core/mlx5_eq.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_eq.c @@ -639,9 +639,9 @@ static const char *mlx5_port_module_event_error_type_to_string(u8 error_type) { switch (error_type) { case MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED: - return "Power Budget Exceeded"; + return "Power budget exceeded"; case MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX_CABLE_MODULE: - return "Long Range for non MLNX cable/module"; + return "Long Range for non MLNX cable"; case MLX5_MODULE_EVENT_ERROR_BUS_STUCK: return "Bus stuck(I2C or data shorted)"; case MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT: @@ -649,18 +649,11 @@ static const char *mlx5_port_module_event_error_type_to_string(u8 error_type) case MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST: return "Enforce part number list"; case MLX5_MODULE_EVENT_ERROR_UNSUPPORTED_CABLE: - return "Unsupported Cable"; + return "Unknown identifier"; case MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE: return "High Temperature"; case MLX5_MODULE_EVENT_ERROR_CABLE_IS_SHORTED: - return "Cable is shorted"; - case MLX5_MODULE_EVENT_ERROR_PCIE_SYSTEM_POWER_SLOT_EXCEEDED: - return "One or more network ports have been powered " - "down due to insufficient/unadvertised power on " - "the PCIe slot. Please refer to the card's user " - "manual for power specifications or contact " - "Mellanox support."; - + return "Bad or shorted cable/module"; default: return "Unknown error type"; } @@ -686,29 +679,36 @@ static void mlx5_port_module_event(struct mlx5_core_dev *dev, module_num = (unsigned int)module_event_eqe->module; module_status = (unsigned int)module_event_eqe->module_status & - PORT_MODULE_EVENT_MODULE_STATUS_MASK; + PORT_MODULE_EVENT_MODULE_STATUS_MASK; error_type = (unsigned int)module_event_eqe->error_type & - PORT_MODULE_EVENT_ERROR_TYPE_MASK; + PORT_MODULE_EVENT_ERROR_TYPE_MASK; + if (module_status < MLX5_MODULE_STATUS_NUM) + dev->priv.pme_stats.status_counters[module_status]++; switch (module_status) { case MLX5_MODULE_STATUS_PLUGGED_ENABLED: - device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: plugged and enabled\n", module_num); + device_printf((&pdev->dev)->bsddev, + "INFO: Module %u, status: plugged and enabled\n", + module_num); break; case MLX5_MODULE_STATUS_UNPLUGGED: - device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: unplugged\n", module_num); + device_printf((&pdev->dev)->bsddev, + "INFO: Module %u, status: unplugged\n", module_num); break; case MLX5_MODULE_STATUS_ERROR: - device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: error, %s\n", module_num, mlx5_port_module_event_error_type_to_string(error_type)); - break; - - case MLX5_MODULE_STATUS_PLUGGED_DISABLED: - device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: plugged but disabled\n", module_num); + device_printf((&pdev->dev)->bsddev, + "ERROR: Module %u, status: error, %s\n", + module_num, + mlx5_port_module_event_error_type_to_string(error_type)); + if (error_type < MLX5_MODULE_EVENT_ERROR_NUM) + dev->priv.pme_stats.error_counters[error_type]++; break; default: - device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, unknown status\n", module_num); + device_printf((&pdev->dev)->bsddev, + "INFO: Module %u, unknown status\n", module_num); } /* store module status */ if (module_num < MLX5_MAX_PORTS) diff --git a/sys/dev/mlx5/mlx5_core/mlx5_main.c b/sys/dev/mlx5/mlx5_core/mlx5_main.c index 8806a819297f..d9becbe9942c 100644 --- a/sys/dev/mlx5/mlx5_core/mlx5_main.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_main.c @@ -1244,13 +1244,31 @@ struct mlx5_core_event_handler { void *data); }; +#define MLX5_STATS_DESC(a, b, c, d, e, ...) d, e, + +#define MLX5_PORT_MODULE_ERROR_STATS(m) \ +m(+1, u64, power_budget_exceeded, "power_budget", "Module Power Budget Exceeded") \ +m(+1, u64, long_range, "long_range", "Module Long Range for non MLNX cable/module") \ +m(+1, u64, bus_stuck, "bus_stuck", "Module Bus stuck(I2C or data shorted)") \ +m(+1, u64, no_eeprom, "no_eeprom", "No EEPROM/retry timeout") \ +m(+1, u64, enforce_part_number, "enforce_part_number", "Module Enforce part number list") \ +m(+1, u64, unknown_id, "unknown_id", "Module Unknown identifier") \ +m(+1, u64, high_temp, "high_temp", "Module High Temperature") \ +m(+1, u64, cable_shorted, "cable_shorted", "Module Cable is shorted") + +static const char *mlx5_pme_err_desc[] = { + MLX5_PORT_MODULE_ERROR_STATS(MLX5_STATS_DESC) +}; + static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) { struct mlx5_core_dev *dev; struct mlx5_priv *priv; device_t bsddev = pdev->dev.bsddev; - int err; + int i,err; + struct sysctl_oid *pme_sysctl_node; + struct sysctl_oid *pme_err_sysctl_node; dev = kzalloc(sizeof(*dev), GFP_KERNEL); priv = &dev->priv; @@ -1282,6 +1300,41 @@ static int init_one(struct pci_dev *pdev, OID_AUTO, "power_value", CTLFLAG_RD, &dev->pwr_value, 0, "Current power value in Watts"); + pme_sysctl_node = SYSCTL_ADD_NODE(&dev->sysctl_ctx, + SYSCTL_CHILDREN(device_get_sysctl_tree(bsddev)), + OID_AUTO, "pme_stats", CTLFLAG_RD, NULL, + "Port module event statistics"); + if (pme_sysctl_node == NULL) { + err = -ENOMEM; + goto clean_sysctl_ctx; + } + pme_err_sysctl_node = SYSCTL_ADD_NODE(&dev->sysctl_ctx, + SYSCTL_CHILDREN(pme_sysctl_node), + OID_AUTO, "errors", CTLFLAG_RD, NULL, + "Port module event error statistics"); + if (pme_err_sysctl_node == NULL) { + err = -ENOMEM; + goto clean_sysctl_ctx; + } + SYSCTL_ADD_U64(&dev->sysctl_ctx, + SYSCTL_CHILDREN(pme_sysctl_node), OID_AUTO, + "module_plug", CTLFLAG_RD | CTLFLAG_MPSAFE, + &dev->priv.pme_stats.status_counters[MLX5_MODULE_STATUS_PLUGGED_ENABLED], + 0, "Number of time module plugged"); + SYSCTL_ADD_U64(&dev->sysctl_ctx, + SYSCTL_CHILDREN(pme_sysctl_node), OID_AUTO, + "module_unplug", CTLFLAG_RD | CTLFLAG_MPSAFE, + &dev->priv.pme_stats.status_counters[MLX5_MODULE_STATUS_UNPLUGGED], + 0, "Number of time module unplugged"); + for (i = 0 ; i < MLX5_MODULE_EVENT_ERROR_NUM; i++) { + SYSCTL_ADD_U64(&dev->sysctl_ctx, + SYSCTL_CHILDREN(pme_err_sysctl_node), OID_AUTO, + mlx5_pme_err_desc[2 * i], CTLFLAG_RD | CTLFLAG_MPSAFE, + &dev->priv.pme_stats.error_counters[i], + 0, mlx5_pme_err_desc[2 * i + 1]); + } + + INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); mutex_init(&dev->pci_status_mutex); @@ -1320,8 +1373,9 @@ static int init_one(struct pci_dev *pdev, close_pci: mlx5_pci_close(dev, priv); clean_dev: - sysctl_ctx_free(&dev->sysctl_ctx); mtx_destroy(&dev->dump_lock); +clean_sysctl_ctx: + sysctl_ctx_free(&dev->sysctl_ctx); kfree(dev); return err; } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c index c06d79e24db2..0f771250d10d 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c @@ -3389,8 +3389,7 @@ mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data) } /* Check if module is present before doing an access */ module_status = mlx5_query_module_status(priv->mdev, module_num); - if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED && - module_status != MLX5_MODULE_STATUS_PLUGGED_DISABLED) { + if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED) { error = EINVAL; goto err_i2c; }