net/mlx5: introduce clock queue service routine

The service routine is invoked periodically on Rearm Queue
completion interrupts, typically once every few (1-16)
milliseconds, to track clock jitter and wander in a robust fashion.
It performs the following:

- fetches the completed CQEs from the Rearm Queue
- restarts the Rearm Queue on errors
- pushes new requests to the Rearm Queue to keep it
  continuously running and issuing cross-channel requests
  to the Clock Queue
- reads the Clock Queue CQE and caches it for use in the
  datapath (see the reader sketch after this list)
- gathers statistics to estimate clock jitter and wander
- gathers Clock Queue error statistics
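
A minimal sketch of such a datapath-side reader (a hypothetical helper,
not part of this patch; it assumes only the mlx5_dev_txpp fields added
below):

    /* Read the Clock Queue timestamp cached by the service routine.
     * A single 64-bit atomic read suffices because the service side
     * publishes it with rte_atomic64_set() plus a write barrier. */
    static inline uint64_t
    txpp_read_cached_ts(struct mlx5_dev_ctx_shared *sh)
    {
        if (unlikely(sh->txpp.sync_lost))
            return 0; /* ci/timestamp synchronization was lost. */
        return rte_atomic64_read(&sh->txpp.ts.ts);
    }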

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>
4 files changed, 375 insertions(+), 0 deletions(-)

File: drivers/net/mlx5/mlx5.h

@@ -555,6 +555,12 @@ struct mlx5_txpp_wq {
volatile uint32_t *sq_dbrec;
};
/* Tx packet pacing internal timestamp. */
struct mlx5_txpp_ts {
rte_atomic64_t ci_ts;
rte_atomic64_t ts;
};
/* Tx packet pacing structure. */
struct mlx5_dev_txpp {
pthread_mutex_t mutex; /* Pacing create/destroy mutex. */
@@ -570,6 +576,15 @@ struct mlx5_dev_txpp {
struct mlx5_txpp_wq rearm_queue; /* Rearm Queue. */
struct mlx5dv_pp *pp; /* Packet pacing context. */
uint16_t pp_id; /* Packet pacing context index. */
uint16_t ts_n; /* Number of captured timestamps. */
uint16_t ts_p; /* Pointer to statistics timestamp. */
struct mlx5_txpp_ts *tsa; /* Timestamps sliding window stats. */
struct mlx5_txpp_ts ts; /* Cached completion id/timestamp. */
uint32_t sync_lost:1; /* ci/timestamp synchronization lost. */
/* Statistics counters. */
rte_atomic32_t err_miss_int; /* Missed service interrupt. */
rte_atomic32_t err_rearm_queue; /* Rearm Queue errors. */
rte_atomic32_t err_clock_queue; /* Clock Queue errors. */
};
/*
@@ -993,5 +1008,6 @@ void mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
int mlx5_txpp_start(struct rte_eth_dev *dev);
void mlx5_txpp_stop(struct rte_eth_dev *dev);
void mlx5_txpp_interrupt_handler(void *cb_arg);
#endif /* RTE_PMD_MLX5_H_ */

File: drivers/net/mlx5/mlx5_defs.h

@@ -172,6 +172,7 @@
#define MLX5_TXDB_HEURISTIC 2
/* Tx accurate scheduling on timestamps parameters. */
#define MLX5_TXPP_WAIT_INIT_TS 1000ul /* How long to wait for a timestamp. */
#define MLX5_TXPP_CLKQ_SIZE 1
#define MLX5_TXPP_REARM ((1UL << MLX5_WQ_INDEX_WIDTH) / 4)
#define MLX5_TXPP_REARM_SQ_SIZE (((1UL << MLX5_CQ_INDEX_WIDTH) / \

File: drivers/net/mlx5/mlx5_rxtx.h

@@ -30,6 +30,7 @@
#include <rte_io.h>
#include <rte_bus_pci.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <mlx5_glue.h>
#include <mlx5_prm.h>
@@ -695,4 +696,23 @@ mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
}
/**
* Convert a timestamp from the Packet Pacing Clock Queue CQE
* HW format to a linear nanosecond counter.
*
* @param sh
* Pointer to the device shared context. Might be needed
* to convert according to the current device configuration.
* @param ts
* Timestamp from CQE to convert.
* @return
* UTC in nanoseconds
*/
static __rte_always_inline uint64_t
mlx5_txpp_convert_rx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t ts)
{
RTE_SET_USED(sh);
return (ts & UINT32_MAX) + (ts >> 32) * NS_PER_S;
}
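/*
 * Worked example (illustrative, not part of this patch): the CQE value
 * carries seconds in the upper 32 bits and nanoseconds in the lower 32
 * bits, so:
 *
 *   raw = (2ULL << 32) | 500000000ULL
 *   mlx5_txpp_convert_rx_ts(NULL, raw)
 *     == 500000000 + 2 * NS_PER_S == 2500000000 ns
 */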
#endif /* RTE_PMD_MLX5_RXTX_H_ */

File: drivers/net/mlx5/mlx5_txpp.c

@@ -1,6 +1,9 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2020 Mellanox Technologies, Ltd
*/
#include <fcntl.h>
#include <stdint.h>
#include <rte_ether.h>
#include <rte_ethdev_driver.h>
#include <rte_interrupts.h>
@@ -144,6 +147,33 @@ mlx5_txpp_destroy_clock_queue(struct mlx5_dev_ctx_shared *sh)
struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
mlx5_txpp_destroy_send_queue(wq);
if (sh->txpp.tsa) {
rte_free(sh->txpp.tsa);
sh->txpp.tsa = NULL;
}
}
static void
mlx5_txpp_doorbell_rearm_queue(struct mlx5_dev_ctx_shared *sh, uint16_t ci)
{
struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
union {
uint32_t w32[2];
uint64_t w64;
} cs;
wq->sq_ci = ci + 1;
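/* Rebuild the first control word of the WQE being rung: keep the opcode byte and patch the WQE index field (bits 8..23). */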
cs.w32[0] = rte_cpu_to_be_32(rte_be_to_cpu_32
(wq->wqes[ci & (wq->sq_size - 1)].ctrl[0]) | (ci - 1) << 8);
cs.w32[1] = wq->wqes[ci & (wq->sq_size - 1)].ctrl[1];
/* Update SQ doorbell record with new SQ ci. */
rte_compiler_barrier();
*wq->sq_dbrec = rte_cpu_to_be_32(wq->sq_ci);
/* Make sure the doorbell record is updated. */
rte_wmb();
/* Write to the doorbell register to start processing. */
__mlx5_uar_write64_relaxed(cs.w64, sh->tx_uar->reg_addr, NULL);
rte_wmb();
}
static void
@@ -433,6 +463,16 @@ mlx5_txpp_create_clock_queue(struct mlx5_dev_ctx_shared *sh)
uint32_t umem_size, umem_dbrec;
int ret;
sh->txpp.tsa = rte_zmalloc_socket(__func__,
MLX5_TXPP_REARM_SQ_SIZE *
sizeof(struct mlx5_txpp_ts),
0, sh->numa_node);
if (!sh->txpp.tsa) {
DRV_LOG(ERR, "Failed to allocate memory for CQ stats.");
return -ENOMEM;
}
sh->txpp.ts_p = 0;
sh->txpp.ts_n = 0;
/* Allocate memory buffer for CQEs and doorbell record. */
umem_size = sizeof(struct mlx5_cqe) * MLX5_TXPP_CLKQ_SIZE;
umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
@@ -562,6 +602,299 @@ error:
return ret;
}
/* Enable notification from the Rearm Queue CQ. */
static inline void
mlx5_txpp_cq_arm(struct mlx5_dev_ctx_shared *sh)
{
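/* Compose the CQ arm doorbell: sequence number, arm command and consumer index go to the high word, the CQ number to the low word. */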
struct mlx5_txpp_wq *aq = &sh->txpp.rearm_queue;
uint32_t arm_sn = aq->arm_sn << MLX5_CQ_SQN_OFFSET;
uint32_t db_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | aq->cq_ci;
uint64_t db_be = rte_cpu_to_be_64(((uint64_t)db_hi << 32) | aq->cq->id);
uint32_t *addr = RTE_PTR_ADD(sh->tx_uar->base_addr, MLX5_CQ_DOORBELL);
rte_compiler_barrier();
aq->cq_dbrec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(db_hi);
rte_wmb();
#ifdef RTE_ARCH_64
*(uint64_t *)addr = db_be;
#else
*(uint32_t *)addr = db_be;
rte_io_wmb();
*((uint32_t *)addr + 1) = db_be >> 32;
#endif
aq->arm_sn++;
}
static inline void
mlx5_atomic_read_cqe(rte_int128_t *from, rte_int128_t *ts)
{
/*
* The only CQE of the Clock Queue is being continuously
* updated by hardware at the specified rate. We have to
* read the timestamp and WQE completion index atomically.
*/
#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
rte_int128_t src;
memset(&src, 0, sizeof(src));
*ts = src;
/* if (*from == *ts) *from = src; else *ts = *from; */
rte_atomic128_cmp_exchange(from, ts, &src, 0,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
#else
rte_atomic64_t *cqe = (rte_atomic64_t *)from;
/* Power architecture does not support 16B compare-and-swap. */
for (;;) {
int64_t tm, op;
int64_t *ps;
rte_compiler_barrier();
tm = rte_atomic64_read(cqe + 0);
op = rte_atomic64_read(cqe + 1);
rte_compiler_barrier();
if (tm != rte_atomic64_read(cqe + 0))
continue;
if (op != rte_atomic64_read(cqe + 1))
continue;
ps = (int64_t *)ts;
ps[0] = tm;
ps[1] = op;
return;
}
#endif
}
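/*
 * Note on the 16B path above: with a zeroed expected value the CAS fails
 * whenever the CQE contents are non-zero, and rte_atomic128_cmp_exchange()
 * then deposits the current 16-byte contents into *ts atomically. The
 * idiom in isolation (a hypothetical helper, x86-64/ARM64 only):
 *
 *   static inline void
 *   atomic_snapshot_128(rte_int128_t *src, rte_int128_t *snap)
 *   {
 *       rte_int128_t zero;
 *
 *       memset(&zero, 0, sizeof(zero));
 *       *snap = zero;
 *       rte_atomic128_cmp_exchange(src, snap, &zero, 0,
 *                                  __ATOMIC_RELAXED, __ATOMIC_RELAXED);
 *   }
 */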
/* Stores timestamp in the cache structure to share data with datapath. */
static inline void
mlx5_txpp_cache_timestamp(struct mlx5_dev_ctx_shared *sh,
uint64_t ts, uint64_t ci)
{
ci = ci << (64 - MLX5_CQ_INDEX_WIDTH);
ci |= (ts << MLX5_CQ_INDEX_WIDTH) >> MLX5_CQ_INDEX_WIDTH;
rte_compiler_barrier();
rte_atomic64_set(&sh->txpp.ts.ts, ts);
rte_atomic64_set(&sh->txpp.ts.ci_ts, ci);
rte_wmb();
}
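/*
 * Hypothetical unpack helper (not part of this patch) mirroring the
 * packing above: the completion index occupies the top MLX5_CQ_INDEX_WIDTH
 * bits of ci_ts, the truncated timestamp the remaining low bits.
 */
static inline void
txpp_unpack_ci_ts(uint64_t ci_ts, uint64_t *ci, uint64_t *ts)
{
*ci = ci_ts >> (64 - MLX5_CQ_INDEX_WIDTH);
*ts = (ci_ts << MLX5_CQ_INDEX_WIDTH) >> MLX5_CQ_INDEX_WIDTH;
}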
/* Reads timestamp from Clock Queue CQE and stores in the cache. */
static inline void
mlx5_txpp_update_timestamp(struct mlx5_dev_ctx_shared *sh)
{
struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
union {
rte_int128_t u128;
struct mlx5_cqe_ts cts;
} to;
uint64_t ts;
uint16_t ci;
static_assert(sizeof(struct mlx5_cqe_ts) == sizeof(rte_int128_t),
"Wrong timestamp CQE part size");
mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
if (to.cts.op_own >> 4) {
DRV_LOG(DEBUG, "Clock Queue error sync lost.");
rte_atomic32_inc(&sh->txpp.err_clock_queue);
sh->txpp.sync_lost = 1;
return;
}
ci = rte_be_to_cpu_16(to.cts.wqe_counter);
ts = rte_be_to_cpu_64(to.cts.timestamp);
ts = mlx5_txpp_convert_rx_ts(sh, ts);
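/* Advance cq_ci by the number of completions seen since the last read; the 16-bit mask keeps the subtraction wrap-safe. */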
wq->cq_ci += (ci - wq->sq_ci) & UINT16_MAX;
wq->sq_ci = ci;
mlx5_txpp_cache_timestamp(sh, ts, wq->cq_ci);
}
/* Waits for the first completion on Clock Queue to init timestamp. */
static inline void
mlx5_txpp_init_timestamp(struct mlx5_dev_ctx_shared *sh)
{
struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
uint32_t wait;
sh->txpp.ts_p = 0;
sh->txpp.ts_n = 0;
for (wait = 0; wait < MLX5_TXPP_WAIT_INIT_TS; wait++) {
struct timespec onems;
mlx5_txpp_update_timestamp(sh);
if (wq->sq_ci)
return;
/* Wait one millisecond and try again. */
onems.tv_sec = 0;
onems.tv_nsec = NS_PER_S / MS_PER_S;
nanosleep(&onems, 0);
}
DRV_LOG(ERR, "Unable to initialize timestamp.");
sh->txpp.sync_lost = 1;
}
#ifdef HAVE_IBV_DEVX_EVENT
/* Gather statistics for timestamp from Clock Queue CQE. */
static inline void
mlx5_txpp_gather_timestamp(struct mlx5_dev_ctx_shared *sh)
{
/* Check whether we have a valid timestamp. */
if (!sh->txpp.clock_queue.sq_ci && !sh->txpp.ts_n)
return;
MLX5_ASSERT(sh->txpp.ts_p < MLX5_TXPP_REARM_SQ_SIZE);
sh->txpp.tsa[sh->txpp.ts_p] = sh->txpp.ts;
if (++sh->txpp.ts_p >= MLX5_TXPP_REARM_SQ_SIZE)
sh->txpp.ts_p = 0;
if (sh->txpp.ts_n < MLX5_TXPP_REARM_SQ_SIZE)
++sh->txpp.ts_n;
}
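/*
 * Illustrative only (not part of this patch): one plausible consumer of
 * the tsa[] window, taking the spread of successive timestamp deltas as
 * a crude jitter figure. Ring wrap ordering is ignored for brevity.
 */
static uint64_t
txpp_ts_spread(struct mlx5_dev_txpp *txpp)
{
uint64_t dmin = UINT64_MAX, dmax = 0;
uint16_t i;
if (txpp->ts_n < 2)
return 0;
for (i = 1; i < txpp->ts_n; i++) {
uint64_t d = (uint64_t)rte_atomic64_read(&txpp->tsa[i].ts) -
(uint64_t)rte_atomic64_read(&txpp->tsa[i - 1].ts);
dmin = RTE_MIN(dmin, d);
dmax = RTE_MAX(dmax, d);
}
return dmax - dmin;
}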
/* Handles Rearm Queue completions in periodic service. */
static __rte_always_inline void
mlx5_txpp_handle_rearm_queue(struct mlx5_dev_ctx_shared *sh)
{
struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
uint32_t cq_ci = wq->cq_ci;
bool error = false;
int ret;
do {
volatile struct mlx5_cqe *cqe;
cqe = &wq->cqes[cq_ci & (MLX5_TXPP_REARM_CQ_SIZE - 1)];
ret = check_cqe(cqe, MLX5_TXPP_REARM_CQ_SIZE, cq_ci);
switch (ret) {
case MLX5_CQE_STATUS_ERR:
error = true;
++cq_ci;
break;
case MLX5_CQE_STATUS_SW_OWN:
wq->sq_ci += 2;
++cq_ci;
break;
case MLX5_CQE_STATUS_HW_OWN:
break;
default:
MLX5_ASSERT(false);
break;
}
} while (ret != MLX5_CQE_STATUS_HW_OWN);
if (likely(cq_ci != wq->cq_ci)) {
/* Check whether we have missed interrupts. */
if (cq_ci - wq->cq_ci != 1) {
DRV_LOG(DEBUG, "Rearm Queue missed interrupt.");
rte_atomic32_inc(&sh->txpp.err_miss_int);
/* Check sync lost on wqe index. */
if (cq_ci - wq->cq_ci >=
(((1UL << MLX5_WQ_INDEX_WIDTH) /
MLX5_TXPP_REARM) - 1))
error = true;
}
/* Update doorbell record to notify hardware. */
rte_compiler_barrier();
*wq->cq_dbrec = rte_cpu_to_be_32(cq_ci);
rte_wmb();
wq->cq_ci = cq_ci;
/* Fire new requests to Rearm Queue. */
if (error) {
DRV_LOG(DEBUG, "Rearm Queue error sync lost.");
rte_atomic32_inc(&sh->txpp.err_rearm_queue);
sh->txpp.sync_lost = 1;
}
}
}
/* Handles Clock Queue completions in periodic service. */
static __rte_always_inline void
mlx5_txpp_handle_clock_queue(struct mlx5_dev_ctx_shared *sh)
{
mlx5_txpp_update_timestamp(sh);
mlx5_txpp_gather_timestamp(sh);
}
#endif
/* Invoked periodically on Rearm Queue completions. */
void
mlx5_txpp_interrupt_handler(void *cb_arg)
{
#ifndef HAVE_IBV_DEVX_EVENT
RTE_SET_USED(cb_arg);
return;
#else
struct mlx5_dev_ctx_shared *sh = cb_arg;
union {
struct mlx5dv_devx_async_event_hdr event_resp;
uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
} out;
MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
/* Process events in the loop. Only rearm completions are expected. */
while (mlx5_glue->devx_get_event
(sh->txpp.echan,
&out.event_resp,
sizeof(out.buf)) >=
(ssize_t)sizeof(out.event_resp.cookie)) {
mlx5_txpp_handle_rearm_queue(sh);
mlx5_txpp_handle_clock_queue(sh);
mlx5_txpp_cq_arm(sh);
mlx5_txpp_doorbell_rearm_queue
(sh, sh->txpp.rearm_queue.sq_ci - 1);
}
#endif /* HAVE_IBV_DEVX_EVENT */
}
static void
mlx5_txpp_stop_service(struct mlx5_dev_ctx_shared *sh)
{
if (!sh->txpp.intr_handle.fd)
return;
mlx5_intr_callback_unregister(&sh->txpp.intr_handle,
mlx5_txpp_interrupt_handler, sh);
sh->txpp.intr_handle.fd = 0;
}
/* Attaches the interrupt handler and fires the first request to the Rearm Queue. */
static int
mlx5_txpp_start_service(struct mlx5_dev_ctx_shared *sh)
{
uint16_t event_nums[1] = {0};
int flags;
int ret;
/* Attach interrupt handler to process Rearm Queue completions. */
flags = fcntl(sh->txpp.echan->fd, F_GETFL);
ret = fcntl(sh->txpp.echan->fd, F_SETFL, flags | O_NONBLOCK);
if (ret) {
DRV_LOG(ERR, "Failed to change event channel FD.");
rte_errno = errno;
return -rte_errno;
}
memset(&sh->txpp.intr_handle, 0, sizeof(sh->txpp.intr_handle));
sh->txpp.intr_handle.fd = sh->txpp.echan->fd;
sh->txpp.intr_handle.type = RTE_INTR_HANDLE_EXT;
if (rte_intr_callback_register(&sh->txpp.intr_handle,
mlx5_txpp_interrupt_handler, sh)) {
sh->txpp.intr_handle.fd = 0;
DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
return -rte_errno;
}
/* Subscribe CQ event to the event channel controlled by the driver. */
ret = mlx5_glue->devx_subscribe_devx_event(sh->txpp.echan,
sh->txpp.rearm_queue.cq->obj,
sizeof(event_nums),
event_nums, 0);
if (ret) {
DRV_LOG(ERR, "Failed to subscribe CQE event.");
rte_errno = errno;
return -errno;
}
/* Enable interrupts in the CQ. */
mlx5_txpp_cq_arm(sh);
/* Fire the first request on Rearm Queue. */
mlx5_txpp_doorbell_rearm_queue(sh, sh->txpp.rearm_queue.sq_size - 1);
mlx5_txpp_init_timestamp(sh);
return 0;
}
/*
* The routine initializes the packet pacing infrastructure:
* - allocates PP context
@@ -595,8 +928,12 @@ mlx5_txpp_create(struct mlx5_dev_ctx_shared *sh, struct mlx5_priv *priv)
ret = mlx5_txpp_create_rearm_queue(sh);
if (ret)
goto exit;
ret = mlx5_txpp_start_service(sh);
if (ret)
goto exit;
exit:
if (ret) {
mlx5_txpp_stop_service(sh);
mlx5_txpp_destroy_rearm_queue(sh);
mlx5_txpp_destroy_clock_queue(sh);
mlx5_txpp_free_pp_index(sh);
@@ -618,6 +955,7 @@ exit:
static void
mlx5_txpp_destroy(struct mlx5_dev_ctx_shared *sh)
{
mlx5_txpp_stop_service(sh);
mlx5_txpp_destroy_rearm_queue(sh);
mlx5_txpp_destroy_clock_queue(sh);
mlx5_txpp_free_pp_index(sh);