Protect from infinite sw-reset loop in mlx5core.
Avoid an infinite software firmware reset loop that may be caused by a hardware bug by limiting the maximum number of resets. The counter between resets is reset by request for reset, and not by a successful reset. The interval between two resets can be configured via sysctl: hw.mlx5.sw_reset_timeout which is global to all mlx5 devices in the system. Submitted by: slavash@ MFC after: 3 days Sponsored by: Mellanox Technologies
This commit is contained in:
parent
4d6dbe0567
commit
0ca4d482ae
@ -536,6 +536,7 @@ struct mlx5_core_health {
|
||||
unsigned long flags;
|
||||
struct work_struct work;
|
||||
struct delayed_work recover_work;
|
||||
unsigned int last_reset_req;
|
||||
};
|
||||
|
||||
#ifdef RATELIMIT
|
||||
|
@ -64,6 +64,12 @@ SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN,
|
||||
&mlx5_fw_reset_enable, 0,
|
||||
"Enable firmware reset");
|
||||
|
||||
static unsigned int sw_reset_to = 1200;
|
||||
SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
|
||||
&sw_reset_to, 0,
|
||||
"Minimum timeout in seconds between two firmware resets");
|
||||
|
||||
|
||||
static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
|
||||
{
|
||||
int ret;
|
||||
@ -218,6 +224,32 @@ static void reset_fw_if_needed(struct mlx5_core_dev *dev)
|
||||
&dev->iseg->cmdq_addr_l_sz);
|
||||
}
|
||||
|
||||
static bool
|
||||
mlx5_health_allow_reset(struct mlx5_core_dev *dev)
|
||||
{
|
||||
struct mlx5_core_health *health = &dev->priv.health;
|
||||
unsigned int delta;
|
||||
bool ret;
|
||||
|
||||
if (health->last_reset_req != 0) {
|
||||
delta = ticks - health->last_reset_req;
|
||||
delta /= hz;
|
||||
ret = delta >= sw_reset_to;
|
||||
} else {
|
||||
ret = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* In principle, ticks may be 0. Setting it to off by one (-1)
|
||||
* to prevent certain reset in next request.
|
||||
*/
|
||||
health->last_reset_req = ticks ? : -1;
|
||||
if (!ret)
|
||||
mlx5_core_warn(dev, "Firmware reset elided due to "
|
||||
"auto-reset frequency threshold.\n");
|
||||
return (ret);
|
||||
}
|
||||
|
||||
#define MLX5_CRDUMP_WAIT_MS 60000
|
||||
#define MLX5_FW_RESET_WAIT_MS 1000
|
||||
#define MLX5_NIC_STATE_POLL_MS 5
|
||||
@ -243,7 +275,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
|
||||
if (force)
|
||||
goto err_state_done;
|
||||
|
||||
if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
|
||||
if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
|
||||
mlx5_health_allow_reset(dev)) {
|
||||
/* Get cr-dump and reset FW semaphore */
|
||||
if (mlx5_core_is_pf(dev))
|
||||
lock = lock_sem_sw_reset(dev);
|
||||
|
Loading…
Reference in New Issue
Block a user