Check for missing MSI-x and Tx completions in ENA

If the first MSI-x interrupt is never delivered, the timer service will detect
that and trigger a device reset.

The check for missing Tx completions was reworked so that it also
checks for missing interrupts. The number of missing Tx completions is now
compared against the threshold once, after the loop, instead of on every iteration.

Submitted by:  Michal Krawczyk <mk@semihalf.com>
Obtained from: Semihalf
Sponsored by:  Amazon, Inc.
This commit is contained in:
mw 2019-05-30 13:16:56 +00:00
parent 1cefbea456
commit 319dfd5cc6
2 changed files with 74 additions and 21 deletions

View File

@ -405,6 +405,8 @@ ena_init_io_rings_common(struct ena_adapter *adapter, struct ena_ring *ring,
ring->qid = qid;
ring->adapter = adapter;
ring->ena_dev = adapter->ena_dev;
ring->first_interrupt = false;
ring->no_interrupt_event_cnt = 0;
}
static void
@ -1773,6 +1775,9 @@ ena_handle_msix(void *arg)
ena_qid = ENA_IO_TXQ_IDX(qid);
io_cq = &adapter->ena_dev->io_cq_queues[ena_qid];
tx_ring->first_interrupt = true;
rx_ring->first_interrupt = true;
for (i = 0; i < CLEAN_BUDGET; ++i) {
/*
* If lock cannot be acquired, then deferred cleanup task was
@ -3329,13 +3334,37 @@ static void check_for_admin_com_state(struct ena_adapter *adapter)
}
static int
check_missing_comp_in_queue(struct ena_adapter *adapter,
check_for_rx_interrupt_queue(struct ena_adapter *adapter,
struct ena_ring *rx_ring)
{
/*
 * Check whether an Rx ring appears to have work pending although no
 * interrupt has ever been recorded for it.
 *
 * Returns 0 when nothing suspicious is seen or the iteration limit has
 * not been reached yet. Returns EIO after marking the adapter for reset
 * with reason ENA_REGS_RESET_MISS_INTERRUPT.
 */

/* first_interrupt is set to true by ena_handle_msix(); nothing to check. */
if (likely(rx_ring->first_interrupt))
return (0);

/*
 * NOTE(review): ena_com_cq_empty() presumably reports that the
 * completion queue has no pending entries - confirm against ena_com.
 * In that case a missing interrupt is not treated as a problem.
 */
if (ena_com_cq_empty(rx_ring->ena_com_io_cq))
return (0);

/*
 * Count this call as one more observation without an interrupt.
 * The counter is not cleared anywhere in this function.
 */
rx_ring->no_interrupt_event_cnt++;

/*
 * Strict equality: the reset is requested on the call in which the
 * counter reaches ENA_MAX_NO_INTERRUPT_ITERATIONS.
 */
if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) {
device_printf(adapter->pdev, "Potential MSIX issue on Rx side "
"Queue = %d. Reset the device\n", rx_ring->qid);
adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT;
adapter->trigger_reset = true;
return (EIO);
}
return (0);
}
static int
check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
struct ena_ring *tx_ring)
{
struct bintime curtime, time;
struct ena_tx_buffer *tx_buf;
sbintime_t time_offset;
uint32_t missed_tx = 0;
int i;
int i, rc = 0;
getbinuptime(&curtime);
@ -3347,9 +3376,24 @@ check_missing_comp_in_queue(struct ena_adapter *adapter,
time = curtime;
bintime_sub(&time, &tx_buf->timestamp);
time_offset = bttosbt(time);
if (unlikely(!tx_ring->first_interrupt &&
time_offset > 2 * adapter->missing_tx_timeout)) {
/*
* If after graceful period interrupt is still not
* received, we schedule a reset.
*/
device_printf(adapter->pdev,
"Potential MSIX issue on Tx side Queue = %d. "
"Reset the device\n", tx_ring->qid);
adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT;
adapter->trigger_reset = true;
return (EIO);
}
/* Check again if packet is still waiting */
if (unlikely(bttosbt(time) > adapter->missing_tx_timeout)) {
if (unlikely(time_offset > adapter->missing_tx_timeout)) {
if (!tx_buf->print_once)
ena_trace(ENA_WARNING, "Found a Tx that wasn't "
@ -3358,24 +3402,22 @@ check_missing_comp_in_queue(struct ena_adapter *adapter,
tx_buf->print_once = true;
missed_tx++;
counter_u64_add(tx_ring->tx_stats.missing_tx_comp, 1);
if (unlikely(missed_tx >
adapter->missing_tx_threshold)) {
device_printf(adapter->pdev,
"The number of lost tx completion "
"is above the threshold (%d > %d). "
"Reset the device\n",
missed_tx, adapter->missing_tx_threshold);
adapter->reset_reason =
ENA_REGS_RESET_MISS_TX_CMPL;
adapter->trigger_reset = true;
return (EIO);
}
}
}
return (0);
if (unlikely(missed_tx > adapter->missing_tx_threshold)) {
device_printf(adapter->pdev,
"The number of lost tx completion is above the threshold "
"(%d > %d). Reset the device\n",
missed_tx, adapter->missing_tx_threshold);
adapter->reset_reason = ENA_REGS_RESET_MISS_TX_CMPL;
adapter->trigger_reset = true;
rc = EIO;
}
counter_u64_add(tx_ring->tx_stats.missing_tx_comp, missed_tx);
return (rc);
}
/*
@ -3385,9 +3427,10 @@ check_missing_comp_in_queue(struct ena_adapter *adapter,
* transactions exceeds "missing_tx_threshold".
*/
static void
check_for_missing_tx_completions(struct ena_adapter *adapter)
check_for_missing_completions(struct ena_adapter *adapter)
{
struct ena_ring *tx_ring;
struct ena_ring *rx_ring;
int i, budget, rc;
/* Make sure the driver doesn't turn the device in other process */
@ -3406,8 +3449,13 @@ check_for_missing_tx_completions(struct ena_adapter *adapter)
for (i = adapter->next_monitored_tx_qid; i < adapter->num_queues; i++) {
tx_ring = &adapter->tx_ring[i];
rx_ring = &adapter->rx_ring[i];
rc = check_missing_comp_in_queue(adapter, tx_ring);
rc = check_missing_comp_in_tx_queue(adapter, tx_ring);
if (unlikely(rc != 0))
return;
rc = check_for_rx_interrupt_queue(adapter, rx_ring);
if (unlikely(rc != 0))
return;
@ -3516,7 +3564,7 @@ ena_timer_service(void *data)
check_for_admin_com_state(adapter);
check_for_missing_tx_completions(adapter);
check_for_missing_completions(adapter);
check_for_empty_rx_ring(adapter);

View File

@ -120,6 +120,8 @@
#define ENA_IO_IRQ_FIRST_IDX 1
#define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q))
#define ENA_MAX_NO_INTERRUPT_ITERATIONS 3
/*
* ENA device should send keep alive msg every 1 sec.
* We wait for 6 sec just to be on the safe side.
@ -241,6 +243,9 @@ struct ena_ring {
/* The maximum length the driver can push to the device (For LLQ) */
uint8_t tx_max_header_size;
bool first_interrupt;
uint16_t no_interrupt_event_cnt;
struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS];
/*