Protect from infinite sw-reset loop in mlx5core.

Avoid an infinite software firmware reset loop that may be caused by a
hardware bug by limiting how often a reset may be performed.
The timer between resets is restarted by every reset request, and not
only by a successful reset.
The minimum interval between two resets, in seconds, can be configured via sysctl:
hw.mlx5.sw_reset_timeout
which is global to all mlx5 devices in the system.

Submitted by:	slavash@
MFC after:	3 days
Sponsored by:	Mellanox Technologies
This commit is contained in:
hselasky 2019-05-08 10:30:47 +00:00
parent 4d6dbe0567
commit 0ca4d482ae
2 changed files with 35 additions and 1 deletions

View File

@ -536,6 +536,7 @@ struct mlx5_core_health {
unsigned long flags;
struct work_struct work;
struct delayed_work recover_work;
unsigned int last_reset_req;
};
#ifdef RATELIMIT

View File

@ -64,6 +64,12 @@ SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN,
&mlx5_fw_reset_enable, 0,
"Enable firmware reset");
/*
 * Minimum spacing, in seconds, required between two firmware reset
 * requests (default 1200). The value is a single file-scope variable, so
 * it applies to every mlx5 device, and it is exported through the
 * sw_reset_timeout sysctl node under _hw_mlx5.
 */
static unsigned int sw_reset_to = 1200;
SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
&sw_reset_to, 0,
"Minimum timeout in seconds between two firmware resets");
static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
{
int ret;
@ -218,6 +224,32 @@ static void reset_fw_if_needed(struct mlx5_core_dev *dev)
&dev->iseg->cmdq_addr_l_sz);
}
static bool
mlx5_health_allow_reset(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
unsigned int delta;
bool ret;
if (health->last_reset_req != 0) {
delta = ticks - health->last_reset_req;
delta /= hz;
ret = delta >= sw_reset_to;
} else {
ret = true;
}
/*
* In principle, ticks may be 0. Setting it to off by one (-1)
* to prevent certain reset in next request.
*/
health->last_reset_req = ticks ? : -1;
if (!ret)
mlx5_core_warn(dev, "Firmware reset elided due to "
"auto-reset frequency threshold.\n");
return (ret);
}
/*
 * Wait/poll timing constants. The _MS suffix suggests the values are in
 * milliseconds; the code that consumes them is outside this view, so
 * confirm the units against the callers.
 */
#define MLX5_CRDUMP_WAIT_MS 60000
#define MLX5_FW_RESET_WAIT_MS 1000
#define MLX5_NIC_STATE_POLL_MS 5
@ -243,7 +275,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
if (force)
goto err_state_done;
if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
mlx5_health_allow_reset(dev)) {
/* Get cr-dump and reset FW semaphore */
if (mlx5_core_is_pf(dev))
lock = lock_sem_sw_reset(dev);