numam-dpdk/drivers/event/octeontx2/otx2_worker_dual.h
Harman Kalra bd992b2adc net/octeontx2: fix PTP performance
A huge drop in per-core MPPS was observed when the PTP stack is
enabled. The bottleneck is that HW serialises the transfer of all
SQEs that request timestamp capture onto the same send DMA path.
Hence only those packets which require timestamp capture should
set SETTSTAMP in the send MEM ALG.
With this patch, timestamping is done only for packets that have
PKT_TX_IEEE1588_TMST set.

Fixes: fb3ae0951abd ("net/octeontx2: support Tx")
Fixes: 8980a153006b ("event/octeontx2: support PTP for SSO")

Signed-off-by: Harman Kalra <hkalra@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
2019-07-29 12:23:22 +02:00
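
For context, a minimal sketch of the application side under this scheme:
only mbufs that carry PKT_TX_IEEE1588_TMST take the SETTSTAMP send path,
everything else keeps the fast path. The helper name and the port/queue
parameters below are illustrative, not part of this driver; the flag and
rte_eth_tx_burst() are standard DPDK.

#include <rte_ethdev.h>
#include <rte_mbuf.h>

/* Illustrative only: request a Tx timestamp for a single packet. */
static inline uint16_t
send_ptp_pkt(uint16_t port, uint16_t queue, struct rte_mbuf *m)
{
	m->ol_flags |= PKT_TX_IEEE1588_TMST;	/* opt this packet in */
	return rte_eth_tx_burst(port, queue, &m, 1);
}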

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(C) 2019 Marvell International Ltd.
*/

#ifndef __OTX2_WORKER_DUAL_H__
#define __OTX2_WORKER_DUAL_H__

#include <rte_branch_prediction.h>
#include <rte_common.h>

#include <otx2_common.h>
#include "otx2_evdev.h"

/* SSO Operations */
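/* Dual-workslot GET_WORK: return the next event from @ws and immediately
 * re-arm the paired workslot @ws_pair, so the two workslots ping-pong and
 * the GET_WORK latency is hidden behind event processing.
 */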
static __rte_always_inline uint16_t
otx2_ssogws_dual_get_work(struct otx2_ssogws_state *ws,
			  struct otx2_ssogws_state *ws_pair,
			  struct rte_event *ev, const uint32_t flags,
			  const void * const lookup_mem,
			  struct otx2_timesync_info * const tstamp)
{
	const uint64_t set_gw = BIT_ULL(16) | 1;
	union otx2_sso_event event;
	uint64_t tstamp_ptr;
	uint64_t get_work1;
	uint64_t mbuf;

	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
		rte_prefetch_non_temporal(lookup_mem);
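
	/* Wait for the previously issued GET_WORK to complete: bit 63 of the
	 * tag word is set while the request is pending. Once work arrives,
	 * kick a new GET_WORK on the paired workslot and prefetch the WQE
	 * and the mbuf header that precedes it.
	 */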
#ifdef RTE_ARCH_ARM64
	asm volatile(
			"        ldr %[tag], [%[tag_loc]]    \n"
			"        ldr %[wqp], [%[wqp_loc]]    \n"
			"        tbz %[tag], 63, done%=      \n"
			"        sevl                        \n"
			"rty%=:  wfe                         \n"
			"        ldr %[tag], [%[tag_loc]]    \n"
			"        ldr %[wqp], [%[wqp_loc]]    \n"
			"        tbnz %[tag], 63, rty%=      \n"
			"done%=: str %[gw], [%[pong]]        \n"
			"        dmb ld                      \n"
			"        prfm pldl1keep, [%[wqp], #8]\n"
			"        sub %[mbuf], %[wqp], #0x80  \n"
			"        prfm pldl1keep, [%[mbuf]]   \n"
			: [tag] "=&r" (event.get_work0),
			  [wqp] "=&r" (get_work1),
			  [mbuf] "=&r" (mbuf)
			: [tag_loc] "r" (ws->tag_op),
			  [wqp_loc] "r" (ws->wqp_op),
			  [gw] "r" (set_gw),
			  [pong] "r" (ws_pair->getwrk_op)
			);
#else
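	/* Scalar fallback: poll the tag word until the pending bit clears,
	 * then kick GET_WORK on the paired workslot and prefetch the WQE
	 * and mbuf, mirroring the asm path above.
	 */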
	event.get_work0 = otx2_read64(ws->tag_op);
	while ((BIT_ULL(63)) & event.get_work0)
		event.get_work0 = otx2_read64(ws->tag_op);
	get_work1 = otx2_read64(ws->wqp_op);
	otx2_write64(set_gw, ws_pair->getwrk_op);

	rte_prefetch0((const void *)get_work1);
	mbuf = (uint64_t)((char *)get_work1 - sizeof(struct rte_mbuf));
	rte_prefetch0((const void *)mbuf);
#endif
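
	/* Rearrange the raw GET_WORK response into the rte_event layout:
	 * move the tag type and group fields up to the sched_type and
	 * queue_id positions; the low 32 bits (tag) stay in place.
	 */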
	event.get_work0 = (event.get_work0 & (0x3ull << 32)) << 6 |
		(event.get_work0 & (0x3FFull << 36)) << 4 |
		(event.get_work0 & 0xffffffff);
	ws->cur_tt = event.sched_type;
	ws->cur_grp = event.queue_id;

	if (event.sched_type != SSO_TT_EMPTY &&
	    event.event_type == RTE_EVENT_TYPE_ETHDEV) {
		otx2_wqe_to_mbuf(get_work1, mbuf, event.sub_event_type,
				 (uint32_t) event.get_work0, flags, lookup_mem);
		/* Extract the tstamp if PTP is enabled. CGX will prepend the
		 * timestamp at the start of the packet data, and it can be
		 * derived from the ninth dword of the WQE, which holds the
		 * SG iova. rte_pktmbuf_mtod_offset could be used for this
		 * purpose, but it hurts performance, as it reads
		 * mbuf->buf_addr, which is generally not in cache on the
		 * fast path.
		 */
		tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)get_work1)
					   + OTX2_SSO_WQE_SG_PTR);
		otx2_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp, flags,
					(uint64_t *)tstamp_ptr);
		get_work1 = mbuf;
	}

	ev->event = event.get_work0;
	ev->u64 = get_work1;

	return !!get_work1;
}

static __rte_always_inline void
otx2_ssogws_dual_add_work(struct otx2_ssogws_dual *ws, const uint64_t event_ptr,
			  const uint32_t tag, const uint8_t new_tt,
			  const uint16_t grp)
{
	uint64_t add_work0;

	add_work0 = tag | ((uint64_t)(new_tt) << 32);
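	/* Post the two-word add-work descriptor (tag/tag-type word plus the
	 * event pointer) to the target group's add-work address.
	 */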
	otx2_store_pair(add_work0, event_ptr, ws->grps_base[grp]);
}
#endif