Reorganize health recovery in mlx5core.
- Move the semaphore locking and unlocking to the same function. - Flags are no longer needed if the reset and crdump will be done in the same function. MFC after: 3 days Submitted by: slavash@ Sponsored by: Mellanox Technologies
This commit is contained in:
parent
3c1274bd64
commit
d28b6b55ba
@ -48,8 +48,6 @@ enum {
|
||||
enum {
|
||||
MLX5_DROP_NEW_HEALTH_WORK,
|
||||
MLX5_DROP_NEW_RECOVERY_WORK,
|
||||
MLX5_SKIP_SW_RESET,
|
||||
MLX5_SW_RESET_SEM_LOCKED,
|
||||
};
|
||||
|
||||
enum {
|
||||
@ -167,7 +165,6 @@ static void reset_fw_if_needed(struct mlx5_core_dev *dev)
|
||||
{
|
||||
bool supported = (ioread32be(&dev->iseg->initializing) >>
|
||||
MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
|
||||
struct mlx5_core_health *health = &dev->priv.health;
|
||||
u32 cmdq_addr, fatal_error;
|
||||
|
||||
if (!supported)
|
||||
@ -181,8 +178,7 @@ static void reset_fw_if_needed(struct mlx5_core_dev *dev)
|
||||
fatal_error = check_fatal_sensors(dev);
|
||||
if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
|
||||
fatal_error == MLX5_SENSOR_NIC_DISABLED ||
|
||||
fatal_error == MLX5_SENSOR_NIC_SW_RESET ||
|
||||
test_bit(MLX5_SKIP_SW_RESET, &health->flags)) {
|
||||
fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
|
||||
mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.\n");
|
||||
return;
|
||||
}
|
||||
@ -197,26 +193,65 @@ static void reset_fw_if_needed(struct mlx5_core_dev *dev)
|
||||
&dev->iseg->cmdq_addr_l_sz);
|
||||
}
|
||||
|
||||
#define MLX5_CRDUMP_WAIT_MS 60000
|
||||
#define MLX5_FW_RESET_WAIT_MS 1000
|
||||
#define MLX5_NIC_STATE_POLL_MS 5
|
||||
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
|
||||
{
|
||||
unsigned long end, delay_ms = MLX5_CRDUMP_WAIT_MS;
|
||||
u32 fatal_error;
|
||||
int lock = -EBUSY;
|
||||
|
||||
mutex_lock(&dev->intf_state_mutex);
|
||||
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
|
||||
goto unlock;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!force)
|
||||
mlx5_core_err(dev, "internal state error detected\n");
|
||||
if (check_fatal_sensors(dev) || force) {
|
||||
reset_fw_if_needed(dev);
|
||||
fatal_error = check_fatal_sensors(dev);
|
||||
|
||||
if (fatal_error || force) {
|
||||
if (!force)
|
||||
mlx5_core_err(dev, "internal state error detected\n");
|
||||
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||
mlx5_trigger_cmd_completions(dev);
|
||||
}
|
||||
|
||||
mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
|
||||
if (!force)
|
||||
mlx5_core_err(dev, "system error event triggered\n");
|
||||
if (force)
|
||||
goto err_state_done;
|
||||
|
||||
if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
|
||||
if (mlx5_core_is_pf(dev))
|
||||
lock = lock_sem_sw_reset(dev, LOCK);
|
||||
|
||||
if (lock != -EBUSY) {
|
||||
reset_fw_if_needed(dev);
|
||||
delay_ms = MLX5_FW_RESET_WAIT_MS;
|
||||
}
|
||||
}
|
||||
|
||||
/* Recover from SW reset */
|
||||
end = jiffies + msecs_to_jiffies(delay_ms);
|
||||
do {
|
||||
if (sensor_nic_disabled(dev))
|
||||
break;
|
||||
|
||||
msleep(MLX5_NIC_STATE_POLL_MS);
|
||||
} while (!time_after(jiffies, end));
|
||||
|
||||
if (!sensor_nic_disabled(dev)) {
|
||||
dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
|
||||
get_nic_mode(dev), delay_ms);
|
||||
}
|
||||
|
||||
/* Release FW semaphore if you are the lock owner */
|
||||
if (!lock)
|
||||
lock_sem_sw_reset(dev, UNLOCK);
|
||||
|
||||
mlx5_core_err(dev, "system error event triggered\n");
|
||||
|
||||
err_state_done:
|
||||
mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
|
||||
unlock:
|
||||
mutex_unlock(&dev->intf_state_mutex);
|
||||
}
|
||||
@ -265,7 +300,6 @@ static void health_recover(struct work_struct *work)
|
||||
if (sensor_pci_no_comm(dev)) {
|
||||
dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
|
||||
recover = false;
|
||||
goto clear_sem;
|
||||
}
|
||||
|
||||
nic_mode = get_nic_mode(dev);
|
||||
@ -281,14 +315,6 @@ static void health_recover(struct work_struct *work)
|
||||
recover = false;
|
||||
}
|
||||
|
||||
clear_sem:
|
||||
if (test_and_clear_bit(MLX5_SW_RESET_SEM_LOCKED, &health->flags)) {
|
||||
mlx5_core_dbg(dev, "Unlocking FW reset semaphore\n");
|
||||
lock_sem_sw_reset(dev, UNLOCK);
|
||||
}
|
||||
|
||||
test_and_clear_bit(MLX5_SKIP_SW_RESET, &health->flags);
|
||||
|
||||
if (recover) {
|
||||
dev_err(&dev->pdev->dev, "starting health recovery flow\n");
|
||||
mlx5_recover_device(dev);
|
||||
@ -312,29 +338,11 @@ static void health_care(struct work_struct *work)
|
||||
struct mlx5_core_dev *dev;
|
||||
struct mlx5_priv *priv;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
health = container_of(work, struct mlx5_core_health, work);
|
||||
priv = container_of(health, struct mlx5_priv, health);
|
||||
dev = container_of(priv, struct mlx5_core_dev, priv);
|
||||
|
||||
if (mlx5_core_is_pf(dev)) {
|
||||
ret = lock_sem_sw_reset(dev, LOCK);
|
||||
if (!ret) {
|
||||
mlx5_core_warn(dev, "Locked FW reset semaphore\n");
|
||||
set_bit(MLX5_SW_RESET_SEM_LOCKED, &health->flags);
|
||||
}
|
||||
else if (ret == -EBUSY) {
|
||||
/* sw reset will be skipped only in case we detect the
|
||||
* semaphore was already taken. In case of an error
|
||||
* while taking the semaphore we prefer to issue a
|
||||
* reset since longer cr-dump time and multiple resets
|
||||
* are better than a stuck fw.
|
||||
*/
|
||||
set_bit(MLX5_SKIP_SW_RESET, &health->flags);
|
||||
}
|
||||
}
|
||||
|
||||
mlx5_core_warn(dev, "handling bad device here\n");
|
||||
mlx5_handle_bad_state(dev);
|
||||
recover_delay = msecs_to_jiffies(get_recovery_delay(dev));
|
||||
|
Loading…
x
Reference in New Issue
Block a user