Implement hardware mlx5(4) rx timestamps.

Driver support is only provided for ConnectX4/5.

The system-time timestamp is calculated from the free-running counter
timestamp provided by the hardware.  The driver periodically samples the
counter to calibrate it against the system clock and uses linear
interpolation to convert.  The stability of the crystal which drives the
clock is +-50 ppm at the operational temperature, which makes the
algorithm good enough.

The calculation is somewhat delicate because all values are 64-bit and
would overflow the naive formula for linear interpolation.  The
calculation drops the least significant bits in advance, see the
MLX5E_TSTMP_PREC shift in mlx5e_mbuf_tstmp().

Hardware stamps can be turned off by 'ifconfig mceN -hwrxtstmp'.  Buggy
firmware might result in small but visible errors in the reported
timestamps, detectable e.g. by nonsensical (negative) RTT values for
LAN pings.

Reviewed by:	gallatin, hselasky
Sponsored by:	Mellanox Technologies
Differential revision:	https://reviews.freebsd.org/D12638
This commit is contained in:
Konstantin Belousov 2017-11-29 10:04:11 +00:00
parent fa3f256682
commit ef23f141bc
4 changed files with 188 additions and 1 deletions

View File

@ -619,6 +619,8 @@ struct mlx5_cqe64 {
u8 op_own;
};
/*
 * Bit 63 of a 64-bit timestamp value.  The rx path tests this bit; when
 * it is set the timestamp is treated as taken on packet entrance rather
 * than at CQE generation, and the bit is cleared before the value is
 * stored in the mbuf.
 */
#define MLX5_CQE_TSTMP_PTP (1ULL << 63)
static inline bool get_cqe_lro_timestamp_valid(struct mlx5_cqe64 *cqe)
{
return (cqe->lro_tcppsh_abort_dupack >> 7) & 1;

View File

@ -650,6 +650,16 @@ struct mlx5e_flow_tables {
struct mlx5e_flow_table inner_rss;
};
/*
 * Number of low-order bits shifted out of the clock deltas before they
 * are multiplied during timestamp interpolation, keeping the 64-bit
 * intermediate product from overflowing.
 */
#define MLX5E_TSTMP_PREC 10
/*
 * One calibration point: a pair of (system time, adapter counter)
 * samples, the current one and the one before it.  The rx path
 * interpolates between the two samples to convert a hardware timestamp.
 */
struct mlx5e_clbr_point {
/* System uptime for the current sample, in nanoseconds. */
uint64_t base_curr;
/* System uptime of the preceding sample (its base_curr). */
uint64_t base_prev;
/* Adapter counter of the preceding sample (its clbr_hw_curr). */
uint64_t clbr_hw_prev;
/* Adapter counter read for the current sample. */
uint64_t clbr_hw_curr;
/*
 * Generation of this point.  Zero while the point is retired or being
 * rewritten, non-zero once published; the reader retries when it sees
 * zero or a changed value.
 */
u_int clbr_gen;
};
struct mlx5e_priv {
/* priv data path fields - start */
int order_base_2_num_channels;
@ -704,6 +714,12 @@ struct mlx5e_priv {
int media_active_last;
struct callout watchdog;
struct callout tstmp_clbr;
int clbr_done;
int clbr_curr;
struct mlx5e_clbr_point clbr_points[2];
u_int clbr_gen;
};
#define MLX5E_NET_IP_ALIGN 2

View File

@ -154,6 +154,8 @@ static const struct {
MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet");
static SYSCTL_NODE(_hw, OID_AUTO, mlx5, CTLFLAG_RW, 0, "MLX5 driver parameters");
static void
mlx5e_update_carrier(struct mlx5e_priv *priv)
{
@ -637,6 +639,109 @@ mlx5e_disable_async_events(struct mlx5e_priv *priv)
mtx_unlock(&priv->async_events_mtx);
}
static void mlx5e_calibration_callout(void *arg);

/*
 * Calibration tunables, exported under hw.mlx5.calibr.
 *
 * mlx5e_calibration_duration is compared against the count of completed
 * calibration passes (priv->clbr_done): while the count is below it the
 * fast interval is used, afterwards the normal one.  The two intervals
 * are multiplied by hz when the callout is armed, i.e. they are seconds.
 */
static int mlx5e_calibration_duration = 20;
static int mlx5e_fast_calibration = 1;
static int mlx5e_normal_calibration = 30;

static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW, 0,
    "MLX5 timestamp calibration parameters");

SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN,
    &mlx5e_calibration_duration, 0,
    "Duration of initial calibration");
SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN,
    &mlx5e_fast_calibration, 0,
    "Recalibration interval during initial calibration");
SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN,
    &mlx5e_normal_calibration, 0,
    "Recalibration interval during normal operations");
/*
 * Start, or re-arm, the timestamp calibration.
 *
 * When no calibration pass has completed yet (clbr_done == 0) the
 * calibration routine is run synchronously in the caller's context.
 * Otherwise the callout is re-armed via callout_reset_curcpu(), using
 * the fast interval while clbr_done is below
 * mlx5e_calibration_duration and the normal interval afterwards.
 * Intervals are in seconds and are scaled by hz here.
 */
static void
mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
{
	int interval;

	if (priv->clbr_done == 0) {
		mlx5e_calibration_callout(priv);
		return;
	}

	if (priv->clbr_done < mlx5e_calibration_duration)
		interval = mlx5e_fast_calibration;
	else
		interval = mlx5e_normal_calibration;

	callout_reset_curcpu(&priv->tstmp_clbr, interval * hz,
	    mlx5e_calibration_callout, priv);
}
/*
 * Flatten a timespec into a single 64-bit count.
 *
 * Note that, despite the "usec" in the name, the result is in
 * nanoseconds: whole seconds are scaled by 10^9 and tv_nsec is added
 * unchanged.
 */
static uint64_t
mlx5e_timespec2usec(const struct timespec *ts)
{
	uint64_t total;

	total = (uint64_t)ts->tv_sec;
	total *= 1000000000;
	total += ts->tv_nsec;
	return (total);
}
/*
 * Read the adapter's internal timer as one 64-bit value.
 *
 * The timer is read as two 32-bit halves from the initialization
 * segment.  The high half is sampled before and after the low half and
 * the read is repeated until both high samples agree, so that a carry
 * from the low into the high half between the reads cannot yield a
 * torn result.
 */
static uint64_t
mlx5e_hw_clock(struct mlx5e_priv *priv)
{
	struct mlx5_init_seg *iseg = priv->mdev->iseg;
	uint32_t hi, lo;

	for (;;) {
		hi = ioread32be(&iseg->internal_timer_h);
		lo = ioread32be(&iseg->internal_timer_l);
		if (ioread32be(&iseg->internal_timer_h) == hi)
			break;
	}
	return (((uint64_t)hi << 32) | lo);
}
/*
 * The calibration callout, it runs either in the context of the
 * thread which enables calibration, or in callout.  It takes the
 * snapshot of system and adapter clocks, then advances the pointers to
 * the calibration point to allow rx path to read the consistent data
 * lockless.
 *
 * The two entries of priv->clbr_points are used alternately: the entry
 * that is not current is filled in with the previous sample (copied
 * from the current entry) and a fresh sample, and is then published.
 *
 * If the adapter counter did not advance by at least
 * 2^MLX5E_TSTMP_PREC ticks since the previous sample, the interpolation
 * divisor used by the rx path would be zero; in that case a message is
 * logged, clbr_done is reset to 0 and the callout is not re-armed.
 *
 * NOTE(review): if mlx5e_calibration_duration is tuned to a value <= 0,
 * clbr_done is never incremented and the re-arm call below re-enters
 * this function synchronously without bound -- confirm whether the
 * tunable needs a lower limit.
 */
static void
mlx5e_calibration_callout(void *arg)
{
	struct mlx5e_priv *priv;
	struct mlx5e_clbr_point *next, *curr;
	struct timespec ts;
	int clbr_curr_next;

	priv = arg;

	/* Pick the entry following the current one, wrapping around. */
	curr = &priv->clbr_points[priv->clbr_curr];
	clbr_curr_next = priv->clbr_curr + 1;
	if (clbr_curr_next >= nitems(priv->clbr_points))
		clbr_curr_next = 0;
	next = &priv->clbr_points[clbr_curr_next];

	/* The current sample becomes the "previous" one of the new point. */
	next->base_prev = curr->base_curr;
	next->clbr_hw_prev = curr->clbr_hw_curr;

	next->clbr_hw_curr = mlx5e_hw_clock(priv);

	/*
	 * Check exactly the quantity the rx path divides by, i.e. the
	 * distance between the new point's own hardware samples.
	 */
	if (((next->clbr_hw_curr - next->clbr_hw_prev) >>
	    MLX5E_TSTMP_PREC) == 0) {
		if_printf(priv->ifp, "HW failed tstmp frozen %#jx %#jx, "
		    "disabling\n", (uintmax_t)next->clbr_hw_curr,
		    (uintmax_t)next->clbr_hw_prev);
		priv->clbr_done = 0;
		return;
	}

	nanouptime(&ts);
	next->base_curr = mlx5e_timespec2usec(&ts);

	/*
	 * Publish: retire the old point by zeroing its generation, make
	 * the new point's data visible, switch the index, and only then
	 * give the new point a non-zero generation.
	 */
	curr->clbr_gen = 0;
	atomic_thread_fence_rel();
	priv->clbr_curr = clbr_curr_next;
	atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen));

	if (priv->clbr_done < mlx5e_calibration_duration)
		priv->clbr_done++;
	mlx5e_reset_calibration_callout(priv);
}
/*
 * String table for the receive-queue statistics, generated by expanding
 * the MLX5E_RQ_STATS list with MLX5E_STATS_DESC.
 * NOTE(review): both macros are defined outside this view; the exact
 * layout of the entries should be confirmed against their definitions.
 */
static const char *mlx5e_rq_stats_desc[] = {
MLX5E_RQ_STATS(MLX5E_STATS_DESC)
};
@ -2693,6 +2798,16 @@ mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
mlx5e_open_locked(ifp);
}
}
if (mask & IFCAP_HWRXTSTMP) {
ifp->if_capenable ^= IFCAP_HWRXTSTMP;
if (ifp->if_capenable & IFCAP_HWRXTSTMP) {
if (priv->clbr_done == 0)
mlx5e_reset_calibration_callout(priv);
} else {
callout_drain(&priv->tstmp_clbr);
priv->clbr_done = 0;
}
}
out:
PRIV_UNLOCK(priv);
break;
@ -3198,7 +3313,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
ifp->if_capabilities |= IFCAP_LRO;
ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
ifp->if_capabilities |= IFCAP_HWSTATS;
ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
/* set TSO limits so that we don't have to drop TX packets */
ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
@ -3347,6 +3462,13 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
mlx5e_update_stats(priv);
mtx_unlock(&priv->async_events_mtx);
SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
OID_AUTO, "rx_clbr_done", CTLFLAG_RD,
&priv->clbr_done, 0,
"RX timestamps calibration state");
callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
mlx5e_reset_calibration_callout(priv);
return (priv);
err_dealloc_transport_domain:
@ -3391,6 +3513,8 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
/* stop watchdog timer */
callout_drain(&priv->watchdog);
callout_drain(&priv->tstmp_clbr);
if (priv->vlan_attach != NULL)
EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
if (priv->vlan_detach != NULL)

View File

@ -179,13 +179,43 @@ mlx5e_lro_update_hdr(struct mbuf *mb, struct mlx5_cqe64 *cqe)
/* TODO: handle tcp checksum */
}
/*
 * Convert a raw adapter counter value into system time by linear
 * interpolation over the current calibration point:
 *
 *   res = base_prev + (hw_tstmp - clbr_hw_prev) *
 *         (base_curr - base_prev) / (clbr_hw_curr - clbr_hw_prev)
 *
 * Each of the three deltas is shifted right by MLX5E_TSTMP_PREC before
 * use and the product is shifted back left, trading low-order precision
 * for headroom in the 64-bit multiplication.
 *
 * The calibration point is read without a lock.  Its generation is
 * loaded before the computation and compared again afterwards; the
 * whole computation is repeated while the generation is zero or has
 * changed in between.
 *
 * priv:     per-interface state holding the calibration points.
 * hw_tstmp: raw counter value to convert.
 * Returns the interpolated time, in the same unit as the base_* fields
 * of the calibration point.
 */
static uint64_t
mlx5e_mbuf_tstmp(struct mlx5e_priv *priv, uint64_t hw_tstmp)
{
struct mlx5e_clbr_point *cp;
uint64_t a1, a2, res;
u_int gen;
do {
/* Re-read the current index on every attempt; it may have moved. */
cp = &priv->clbr_points[priv->clbr_curr];
gen = atomic_load_acq_int(&cp->clbr_gen);
/* a1: scaled hardware ticks since the previous sample. */
a1 = (hw_tstmp - cp->clbr_hw_prev) >> MLX5E_TSTMP_PREC;
/* a2: scaled system time between the two samples. */
a2 = (cp->base_curr - cp->base_prev) >> MLX5E_TSTMP_PREC;
res = (a1 * a2) << MLX5E_TSTMP_PREC;
/*
* Divisor cannot be zero because calibration callback
* checks for the condition and disables timestamping
* if clock halted.
*/
res /= (cp->clbr_hw_curr - cp->clbr_hw_prev) >>
MLX5E_TSTMP_PREC;
res += cp->base_prev;
/* Order the data reads above before the generation re-check. */
atomic_thread_fence_acq();
} while (gen == 0 || gen != cp->clbr_gen);
return (res);
}
static inline void
mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
struct mlx5e_rq *rq, struct mbuf *mb,
u32 cqe_bcnt)
{
struct ifnet *ifp = rq->ifp;
struct mlx5e_channel *c;
int lro_num_seg; /* HW LRO session aggregated packets counter */
uint64_t tstmp;
lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
if (lro_num_seg > 1) {
@ -250,6 +280,21 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->vlan_info);
mb->m_flags |= M_VLANTAG;
}
c = container_of(rq, struct mlx5e_channel, rq);
if (c->priv->clbr_done >= 2) {
tstmp = mlx5e_mbuf_tstmp(c->priv, be64_to_cpu(cqe->timestamp));
if ((tstmp & MLX5_CQE_TSTMP_PTP) != 0) {
/*
* Timestamp was taken on the packet entrance,
* instead of the cqe generation.
*/
tstmp &= ~MLX5_CQE_TSTMP_PTP;
mb->m_flags |= M_TSTMP_HPREC;
}
mb->m_pkthdr.rcv_tstmp = tstmp;
mb->m_flags |= M_TSTMP;
}
}
static inline void