Implement hardware mlx5(4) rx timestamps.
Driver support is only provided for ConnectX4/5. The system-time timestamp is calculated based on the free-running counter timestamp provided by the hardware. The driver periodically samples the counter to calibrate it against the system clock and uses linear interpolation to convert. Stability of the crystal which drives the clock is +-50 ppm at the operational temperature, which makes the algorithm good enough. The calculation is somewhat delicate because all values are 64-bit and overflow the naive formula for linear interpolation. The calculation drops the least significant bits in advance, see the PREC shift in mlx5e_mbuf_tstmp(). Hardware stamps can be turned off by 'ifconfig mceN -hwrxtstmp'. Buggy firmware might result in small but visible errors in the reported timestamps, detectable e.g. by nonsensical (negative) RTT values for LAN pings. Reviewed by: gallatin, hselasky Sponsored by: Mellanox Technologies Differential revision: https://reviews.freebsd.org/D12638
This commit is contained in:
parent
fa3f256682
commit
ef23f141bc
@ -619,6 +619,8 @@ struct mlx5_cqe64 {
|
||||
u8 op_own;
|
||||
};
|
||||
|
||||
#define MLX5_CQE_TSTMP_PTP (1ULL << 63)
|
||||
|
||||
static inline bool get_cqe_lro_timestamp_valid(struct mlx5_cqe64 *cqe)
|
||||
{
|
||||
return (cqe->lro_tcppsh_abort_dupack >> 7) & 1;
|
||||
|
@ -650,6 +650,16 @@ struct mlx5e_flow_tables {
|
||||
struct mlx5e_flow_table inner_rss;
|
||||
};
|
||||
|
||||
#define MLX5E_TSTMP_PREC 10
|
||||
|
||||
struct mlx5e_clbr_point {
|
||||
uint64_t base_curr;
|
||||
uint64_t base_prev;
|
||||
uint64_t clbr_hw_prev;
|
||||
uint64_t clbr_hw_curr;
|
||||
u_int clbr_gen;
|
||||
};
|
||||
|
||||
struct mlx5e_priv {
|
||||
/* priv data path fields - start */
|
||||
int order_base_2_num_channels;
|
||||
@ -704,6 +714,12 @@ struct mlx5e_priv {
|
||||
int media_active_last;
|
||||
|
||||
struct callout watchdog;
|
||||
|
||||
struct callout tstmp_clbr;
|
||||
int clbr_done;
|
||||
int clbr_curr;
|
||||
struct mlx5e_clbr_point clbr_points[2];
|
||||
u_int clbr_gen;
|
||||
};
|
||||
|
||||
#define MLX5E_NET_IP_ALIGN 2
|
||||
|
@ -154,6 +154,8 @@ static const struct {
|
||||
|
||||
MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet");
|
||||
|
||||
static SYSCTL_NODE(_hw, OID_AUTO, mlx5, CTLFLAG_RW, 0, "MLX5 driver parameters");
|
||||
|
||||
static void
|
||||
mlx5e_update_carrier(struct mlx5e_priv *priv)
|
||||
{
|
||||
@ -637,6 +639,109 @@ mlx5e_disable_async_events(struct mlx5e_priv *priv)
|
||||
mtx_unlock(&priv->async_events_mtx);
|
||||
}
|
||||
|
||||
static void mlx5e_calibration_callout(void *arg);
|
||||
static int mlx5e_calibration_duration = 20;
|
||||
static int mlx5e_fast_calibration = 1;
|
||||
static int mlx5e_normal_calibration = 30;
|
||||
|
||||
static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW, 0,
|
||||
"MLX5 timestamp calibration parameteres");
|
||||
|
||||
SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN,
|
||||
&mlx5e_calibration_duration, 0,
|
||||
"Duration of initial calibration");
|
||||
SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN,
|
||||
&mlx5e_fast_calibration, 0,
|
||||
"Recalibration interval during initial calibration");
|
||||
SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN,
|
||||
&mlx5e_normal_calibration, 0,
|
||||
"Recalibration interval during normal operations");
|
||||
|
||||
/*
|
||||
* Ignites the calibration process.
|
||||
*/
|
||||
static void
|
||||
mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
|
||||
{
|
||||
|
||||
if (priv->clbr_done == 0)
|
||||
mlx5e_calibration_callout(priv);
|
||||
else
|
||||
callout_reset_curcpu(&priv->tstmp_clbr, (priv->clbr_done <
|
||||
mlx5e_calibration_duration ? mlx5e_fast_calibration :
|
||||
mlx5e_normal_calibration) * hz, mlx5e_calibration_callout,
|
||||
priv);
|
||||
}
|
||||
|
||||
/*
 * Flattens a timespec into a single unsigned 64-bit count.
 *
 * Despite the "usec" in the name, the result is in nanoseconds: the
 * seconds are scaled by 10^9 and tv_nsec is added unscaled.
 */
static uint64_t
mlx5e_timespec2usec(const struct timespec *ts)
{
	uint64_t nsec;

	nsec = (uint64_t)ts->tv_sec;
	nsec *= 1000000000;
	nsec += ts->tv_nsec;
	return (nsec);
}
|
||||
|
||||
static uint64_t
|
||||
mlx5e_hw_clock(struct mlx5e_priv *priv)
|
||||
{
|
||||
struct mlx5_init_seg *iseg;
|
||||
uint32_t hw_h, hw_h1, hw_l;
|
||||
|
||||
iseg = priv->mdev->iseg;
|
||||
do {
|
||||
hw_h = ioread32be(&iseg->internal_timer_h);
|
||||
hw_l = ioread32be(&iseg->internal_timer_l);
|
||||
hw_h1 = ioread32be(&iseg->internal_timer_h);
|
||||
} while (hw_h1 != hw_h);
|
||||
return (((uint64_t)hw_h << 32) | hw_l);
|
||||
}
|
||||
|
||||
/*
|
||||
* The calibration callout, it runs either in the context of the
|
||||
* thread which enables calibration, or in callout. It takes the
|
||||
* snapshot of system and adapter clocks, then advances the pointers to
|
||||
* the calibration point to allow rx path to read the consistent data
|
||||
* lockless.
|
||||
*/
|
||||
static void
|
||||
mlx5e_calibration_callout(void *arg)
|
||||
{
|
||||
struct mlx5e_priv *priv;
|
||||
struct mlx5e_clbr_point *next, *curr;
|
||||
struct timespec ts;
|
||||
int clbr_curr_next;
|
||||
|
||||
priv = arg;
|
||||
curr = &priv->clbr_points[priv->clbr_curr];
|
||||
clbr_curr_next = priv->clbr_curr + 1;
|
||||
if (clbr_curr_next >= nitems(priv->clbr_points))
|
||||
clbr_curr_next = 0;
|
||||
next = &priv->clbr_points[clbr_curr_next];
|
||||
|
||||
next->base_prev = curr->base_curr;
|
||||
next->clbr_hw_prev = curr->clbr_hw_curr;
|
||||
|
||||
next->clbr_hw_curr = mlx5e_hw_clock(priv);
|
||||
if (((next->clbr_hw_curr - curr->clbr_hw_prev) >> MLX5E_TSTMP_PREC) ==
|
||||
0) {
|
||||
if_printf(priv->ifp, "HW failed tstmp frozen %#jx %#jx,"
|
||||
"disabling\n", next->clbr_hw_curr, curr->clbr_hw_prev);
|
||||
priv->clbr_done = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
nanouptime(&ts);
|
||||
next->base_curr = mlx5e_timespec2usec(&ts);
|
||||
|
||||
curr->clbr_gen = 0;
|
||||
atomic_thread_fence_rel();
|
||||
priv->clbr_curr = clbr_curr_next;
|
||||
atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen));
|
||||
|
||||
if (priv->clbr_done < mlx5e_calibration_duration)
|
||||
priv->clbr_done++;
|
||||
mlx5e_reset_calibration_callout(priv);
|
||||
}
|
||||
|
||||
static const char *mlx5e_rq_stats_desc[] = {
|
||||
MLX5E_RQ_STATS(MLX5E_STATS_DESC)
|
||||
};
|
||||
@ -2693,6 +2798,16 @@ mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
|
||||
mlx5e_open_locked(ifp);
|
||||
}
|
||||
}
|
||||
if (mask & IFCAP_HWRXTSTMP) {
|
||||
ifp->if_capenable ^= IFCAP_HWRXTSTMP;
|
||||
if (ifp->if_capenable & IFCAP_HWRXTSTMP) {
|
||||
if (priv->clbr_done == 0)
|
||||
mlx5e_reset_calibration_callout(priv);
|
||||
} else {
|
||||
callout_drain(&priv->tstmp_clbr);
|
||||
priv->clbr_done = 0;
|
||||
}
|
||||
}
|
||||
out:
|
||||
PRIV_UNLOCK(priv);
|
||||
break;
|
||||
@ -3198,7 +3313,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
|
||||
ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
|
||||
ifp->if_capabilities |= IFCAP_LRO;
|
||||
ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
|
||||
ifp->if_capabilities |= IFCAP_HWSTATS;
|
||||
ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
|
||||
|
||||
/* set TSO limits so that we don't have to drop TX packets */
|
||||
ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
|
||||
@ -3347,6 +3462,13 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
|
||||
mlx5e_update_stats(priv);
|
||||
mtx_unlock(&priv->async_events_mtx);
|
||||
|
||||
SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
|
||||
OID_AUTO, "rx_clbr_done", CTLFLAG_RD,
|
||||
&priv->clbr_done, 0,
|
||||
"RX timestamps calibration state");
|
||||
callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
|
||||
mlx5e_reset_calibration_callout(priv);
|
||||
|
||||
return (priv);
|
||||
|
||||
err_dealloc_transport_domain:
|
||||
@ -3391,6 +3513,8 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
|
||||
/* stop watchdog timer */
|
||||
callout_drain(&priv->watchdog);
|
||||
|
||||
callout_drain(&priv->tstmp_clbr);
|
||||
|
||||
if (priv->vlan_attach != NULL)
|
||||
EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
|
||||
if (priv->vlan_detach != NULL)
|
||||
|
@ -179,13 +179,43 @@ mlx5e_lro_update_hdr(struct mbuf *mb, struct mlx5_cqe64 *cqe)
|
||||
/* TODO: handle tcp checksum */
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
mlx5e_mbuf_tstmp(struct mlx5e_priv *priv, uint64_t hw_tstmp)
|
||||
{
|
||||
struct mlx5e_clbr_point *cp;
|
||||
uint64_t a1, a2, res;
|
||||
u_int gen;
|
||||
|
||||
do {
|
||||
cp = &priv->clbr_points[priv->clbr_curr];
|
||||
gen = atomic_load_acq_int(&cp->clbr_gen);
|
||||
a1 = (hw_tstmp - cp->clbr_hw_prev) >> MLX5E_TSTMP_PREC;
|
||||
a2 = (cp->base_curr - cp->base_prev) >> MLX5E_TSTMP_PREC;
|
||||
res = (a1 * a2) << MLX5E_TSTMP_PREC;
|
||||
|
||||
/*
|
||||
* Divisor cannot be zero because calibration callback
|
||||
* checks for the condition and disables timestamping
|
||||
* if clock halted.
|
||||
*/
|
||||
res /= (cp->clbr_hw_curr - cp->clbr_hw_prev) >>
|
||||
MLX5E_TSTMP_PREC;
|
||||
|
||||
res += cp->base_prev;
|
||||
atomic_thread_fence_acq();
|
||||
} while (gen == 0 || gen != cp->clbr_gen);
|
||||
return (res);
|
||||
}
|
||||
|
||||
static inline void
|
||||
mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
|
||||
struct mlx5e_rq *rq, struct mbuf *mb,
|
||||
u32 cqe_bcnt)
|
||||
{
|
||||
struct ifnet *ifp = rq->ifp;
|
||||
struct mlx5e_channel *c;
|
||||
int lro_num_seg; /* HW LRO session aggregated packets counter */
|
||||
uint64_t tstmp;
|
||||
|
||||
lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
|
||||
if (lro_num_seg > 1) {
|
||||
@ -250,6 +280,21 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
|
||||
mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->vlan_info);
|
||||
mb->m_flags |= M_VLANTAG;
|
||||
}
|
||||
|
||||
c = container_of(rq, struct mlx5e_channel, rq);
|
||||
if (c->priv->clbr_done >= 2) {
|
||||
tstmp = mlx5e_mbuf_tstmp(c->priv, be64_to_cpu(cqe->timestamp));
|
||||
if ((tstmp & MLX5_CQE_TSTMP_PTP) != 0) {
|
||||
/*
|
||||
* Timestamp was taken on the packet entrance,
|
||||
* instead of the cqe generation.
|
||||
*/
|
||||
tstmp &= ~MLX5_CQE_TSTMP_PTP;
|
||||
mb->m_flags |= M_TSTMP_HPREC;
|
||||
}
|
||||
mb->m_pkthdr.rcv_tstmp = tstmp;
|
||||
mb->m_flags |= M_TSTMP;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
|
Loading…
Reference in New Issue
Block a user