numam-dpdk/lib/pcapng/rte_pcapng.c
Stephen Hemminger d2e3c4b8a2 pcapng: record received RSS hash in pcap file
There is an option for recording RSS hash with packets in the
pcapng standard. This implements this for all received packets.

There is a corner case that cannot be addressed with the current
DPDK APIs. When using rte_flow() on some hardware it is possible
to write a flow rule that uses another hash function like XOR.
But there is no API that records this, or provides the algorithm
info on a per-packet basis.

Wireshark recently merged support for displaying the recorded hash
option (for, yet to be released, version 4.1).

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Tested-by: Ben Magistro <koncept1@gmail.com>
2022-10-27 10:29:59 +02:00

669 lines
18 KiB
C

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2019 Microsoft Corporation
*/
#include <errno.h>
#include <net/if.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <time.h>
#include <unistd.h>
#include <bus_driver.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <dev_driver.h>
#include <rte_errno.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_mbuf.h>
#include <rte_pcapng.h>
#include <rte_reciprocal.h>
#include <rte_time.h>
#include "pcapng_proto.h"
/*
 * Conversion from DPDK speed to PCAPNG: DPDK reports link speed in Mbps,
 * so multiplying by this factor yields the value stored in the interface
 * speed option (one million units per Mbps).
 */
#define PCAPNG_MBPS_SPEED 1000000ull
/*
 * Format of the capture file handle.
 * One instance is allocated per output file by rte_pcapng_fdopen() and
 * released by rte_pcapng_close().
 */
struct rte_pcapng {
int outfd; /* output file descriptor; every block is written to it */
/* DPDK port id to interface index in file (filled while writing interface blocks) */
uint32_t port_index[RTE_MAX_ETHPORTS];
};
/*
 * For converting TSC cycles to PCAPNG ns format.
 * Holds a reference point pairing a TSC reading with the wall-clock time
 * sampled at (approximately) the same instant; see pcapng_init().
 */
static struct pcapng_time {
uint64_t ns; /* CLOCK_REALTIME at the reference point, in nanoseconds */
uint64_t cycles; /* TSC value at the reference point */
uint64_t tsc_hz; /* TSC frequency; zero means pcapng_init() has not run yet */
struct rte_reciprocal_u64 tsc_hz_inverse; /* precomputed reciprocal of tsc_hz for division */
} pcapng_time;
/*
 * Record the reference point used to translate TSC cycles into wall-clock
 * nanoseconds. The TSC is sampled on both sides of the clock_gettime()
 * call and the midpoint is paired with the CLOCK_REALTIME reading.
 */
static inline void
pcapng_init(void)
{
	struct timespec now;
	uint64_t tsc_before, tsc_after;

	tsc_before = rte_get_tsc_cycles();
	clock_gettime(CLOCK_REALTIME, &now);
	tsc_after = rte_get_tsc_cycles();

	pcapng_time.cycles = (tsc_before + tsc_after) / 2;
	pcapng_time.ns = rte_timespec_to_ns(&now);
	pcapng_time.tsc_hz = rte_get_tsc_hz();
	pcapng_time.tsc_hz_inverse = rte_reciprocal_value_u64(pcapng_time.tsc_hz);
}
/*
 * Convert a TSC cycle count into a PCAPNG timestamp.
 * PCAPNG timestamps are in nanoseconds.
 *
 * The reference point (pcapng_time) is captured lazily on the first call.
 *
 * @param cycles  TSC value sampled when the event occurred.
 * @return        Wall-clock time of the event in nanoseconds.
 */
static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
{
	uint64_t delta, secs;

	if (!pcapng_time.tsc_hz)
		pcapng_init();

	/*
	 * The sample may predate the reference point. This always happens on
	 * the very first call, because the caller read the TSC before
	 * pcapng_init() did. An unsigned subtraction would wrap to a huge
	 * forward delta and permanently skew the reference, so convert such
	 * samples backwards from the reference and leave the reference alone.
	 */
	if (unlikely(cycles < pcapng_time.cycles)) {
		uint64_t behind = pcapng_time.cycles - cycles;

		secs = rte_reciprocal_divide_u64(behind, &pcapng_time.tsc_hz_inverse);
		behind -= secs * pcapng_time.tsc_hz;

		/* Clamp instead of underflowing for absurdly old samples. */
		if (unlikely(secs >= pcapng_time.ns / NSEC_PER_SEC))
			return 0;

		return pcapng_time.ns - secs * NSEC_PER_SEC -
			rte_reciprocal_divide_u64(behind * NSEC_PER_SEC,
						  &pcapng_time.tsc_hz_inverse);
	}

	/* In essence the calculation is:
	 *   delta = (cycles - pcapng_time.cycles) * NSEC_PER_SEC / rte_get_tsc_hz()
	 * but this overflows within 4 to 8 seconds depending on TSC frequency.
	 * Instead, if delta >= pcapng_time.tsc_hz:
	 *   Increase pcapng_time.ns and pcapng_time.cycles by the number of
	 *   whole seconds in delta and reduce delta accordingly.
	 * delta will therefore always lie in the interval [0, pcapng_time.tsc_hz),
	 * which will not overflow when multiplied by NSEC_PER_SEC provided the
	 * TSC frequency < approx 18.4GHz.
	 *
	 * Currently all TSCs operate below 5GHz.
	 */
	delta = cycles - pcapng_time.cycles;
	if (unlikely(delta >= pcapng_time.tsc_hz)) {
		if (likely(delta < pcapng_time.tsc_hz * 2)) {
			/* common case: exactly one second elapsed, no division needed */
			delta -= pcapng_time.tsc_hz;
			pcapng_time.cycles += pcapng_time.tsc_hz;
			pcapng_time.ns += NSEC_PER_SEC;
		} else {
			secs = rte_reciprocal_divide_u64(delta, &pcapng_time.tsc_hz_inverse);
			delta -= secs * pcapng_time.tsc_hz;
			pcapng_time.cycles += secs * pcapng_time.tsc_hz;
			pcapng_time.ns += secs * NSEC_PER_SEC;
		}
	}

	return pcapng_time.ns + rte_reciprocal_divide_u64(delta * NSEC_PER_SEC,
							  &pcapng_time.tsc_hz_inverse);
}
/*
 * Size in bytes that an option with a payload of 'len' bytes occupies in
 * the file: the option header plus the payload, rounded up to a multiple
 * of 32 bits.
 */
static uint16_t pcapng_optlen(uint16_t len)
{
	size_t unpadded = sizeof(struct pcapng_option) + len;

	return RTE_ALIGN(unpadded, sizeof(uint32_t));
}
/*
 * Build a TLV option at 'popt' and return the location of the next one.
 *
 * @param popt  Where the option is written; the caller must have reserved
 *              pcapng_optlen(len) bytes there.
 * @param code  Option code.
 * @param data  Option payload; may be NULL when len is 0 (e.g. OPT_END).
 * @param len   Payload length in bytes (stored unpadded in the option header).
 * @return      Pointer just past this option, including its padding.
 *
 * Pad bytes after the payload are not written here.
 */
static struct pcapng_option *
pcapng_add_option(struct pcapng_option *popt, uint16_t code,
		  const void *data, uint16_t len)
{
	popt->code = code;
	popt->length = len;
	/* memcpy() with a NULL source is undefined even for a zero length */
	if (len > 0)
		memcpy(popt->data, data, len);

	return (struct pcapng_option *)((uint8_t *)popt + pcapng_optlen(len));
}
/*
 * Emit the section header block that must open every capture file.
 *
 * Each of os, hw, app and comment is optional (NULL to omit) and, when
 * given, is stored as a string option. Returns the result of write()
 * for the block, or -1 when the temporary buffer cannot be allocated.
 */
static int
pcapng_section_block(rte_pcapng_t *self,
		     const char *os, const char *hw,
		     const char *app, const char *comment)
{
	struct pcapng_section_header *shb;
	struct pcapng_option *cursor;
	uint32_t total;
	ssize_t written;
	void *block;

	/* fixed header + supplied options + OPT_END + trailing block length */
	total = sizeof(*shb);
	if (hw != NULL)
		total += pcapng_optlen(strlen(hw));
	if (os != NULL)
		total += pcapng_optlen(strlen(os));
	if (app != NULL)
		total += pcapng_optlen(strlen(app));
	if (comment != NULL)
		total += pcapng_optlen(strlen(comment));
	total += pcapng_optlen(0);
	total += sizeof(uint32_t);

	/* zeroed allocation, so option padding and unset fields are zero */
	block = calloc(1, total);
	if (block == NULL)
		return -1;

	shb = block;
	shb->block_type = PCAPNG_SECTION_BLOCK;
	shb->block_length = total;
	shb->byte_order_magic = PCAPNG_BYTE_ORDER_MAGIC;
	shb->major_version = PCAPNG_MAJOR_VERS;
	shb->minor_version = PCAPNG_MINOR_VERS;
	shb->section_length = UINT64_MAX;

	/* variable length options follow the fixed header */
	cursor = (struct pcapng_option *)(shb + 1);
	if (comment != NULL)
		cursor = pcapng_add_option(cursor, PCAPNG_OPT_COMMENT,
					   comment, strlen(comment));
	if (hw != NULL)
		cursor = pcapng_add_option(cursor, PCAPNG_SHB_HARDWARE,
					   hw, strlen(hw));
	if (os != NULL)
		cursor = pcapng_add_option(cursor, PCAPNG_SHB_OS,
					   os, strlen(os));
	if (app != NULL)
		cursor = pcapng_add_option(cursor, PCAPNG_SHB_USERAPPL,
					   app, strlen(app));

	/* the option list is terminated by OPT_END */
	cursor = pcapng_add_option(cursor, PCAPNG_OPT_END, NULL, 0);

	/* the block ends with a second copy of its length */
	memcpy(cursor, &shb->block_length, sizeof(uint32_t));

	written = write(self->outfd, block, total);
	free(block);

	return written;
}
/*
 * Write an interface description block for a DPDK port.
 *
 * The block carries the timestamp resolution and an interface name, plus
 * the MAC address, link speed and a hardware string when those can be
 * obtained from the port.
 *
 * @param self  Capture handle; the block is written to self->outfd.
 * @param port  DPDK port id to describe.
 * @return      Result of write() for the block, or -1 when the port's
 *              device info cannot be read.
 */
static int
pcapng_add_interface(rte_pcapng_t *self, uint16_t port)
{
	struct pcapng_interface_block *hdr;
	struct rte_eth_dev_info dev_info;
	struct rte_ether_addr *ea, macaddr;
	const struct rte_device *dev;
	struct rte_eth_link link;
	struct pcapng_option *opt;
	const uint8_t tsresol = 9;	/* nanosecond resolution */
	uint32_t len;
	void *buf;
	char ifname[IF_NAMESIZE];
	char ifhw[256];
	uint64_t speed = 0;

	if (rte_eth_dev_info_get(port, &dev_info) < 0)
		return -1;

	/* make something like an interface name */
	if (if_indextoname(dev_info.if_index, ifname) == NULL)
		snprintf(ifname, IF_NAMESIZE, "dpdk:%u", port);

	/* make a useful device hardware string */
	dev = dev_info.device;
	if (dev)
		snprintf(ifhw, sizeof(ifhw),
			 "%s-%s", dev->bus->name, dev->name);

	/* DPDK reports in units of Mbps */
	if (rte_eth_link_get(port, &link) == 0 &&
	    link.link_status == RTE_ETH_LINK_UP)
		speed = link.link_speed * PCAPNG_MBPS_SPEED;

	if (rte_eth_macaddr_get(port, &macaddr) < 0)
		ea = NULL;
	else
		ea = &macaddr;

	/* Compute length of interface block options */
	len = sizeof(*hdr);

	len += pcapng_optlen(sizeof(tsresol));	/* timestamp */
	len += pcapng_optlen(strlen(ifname));	/* ifname */

	if (ea)
		len += pcapng_optlen(RTE_ETHER_ADDR_LEN); /* macaddr */
	if (speed != 0)
		len += pcapng_optlen(sizeof(uint64_t));
	if (dev)
		len += pcapng_optlen(strlen(ifhw));

	len += pcapng_optlen(0);
	len += sizeof(uint32_t);

	buf = alloca(len);
	if (!buf)
		return -1;

	/*
	 * Options are padded to 32 bits but only their payload is copied in;
	 * clear the buffer so the pad bytes are zero instead of stale stack
	 * contents.
	 */
	memset(buf, 0, len);

	hdr = (struct pcapng_interface_block *)buf;
	*hdr = (struct pcapng_interface_block) {
		.block_type = PCAPNG_INTERFACE_BLOCK,
		.link_type = 1,		/* DLT_EN10MB - Ethernet */
		.block_length = len,
	};

	opt = (struct pcapng_option *)(hdr + 1);
	opt = pcapng_add_option(opt, PCAPNG_IFB_TSRESOL,
				&tsresol, sizeof(tsresol));
	opt = pcapng_add_option(opt, PCAPNG_IFB_NAME,
				ifname, strlen(ifname));
	if (ea)
		opt = pcapng_add_option(opt, PCAPNG_IFB_MACADDR,
					ea, RTE_ETHER_ADDR_LEN);
	if (speed != 0)
		opt = pcapng_add_option(opt, PCAPNG_IFB_SPEED,
					&speed, sizeof(uint64_t));
	if (dev)
		opt = pcapng_add_option(opt, PCAPNG_IFB_HARDWARE,
					ifhw, strlen(ifhw));
	opt = pcapng_add_option(opt, PCAPNG_OPT_END, NULL, 0);

	/* clone block_length after options */
	memcpy(opt, &hdr->block_length, sizeof(uint32_t));

	return write(self->outfd, buf, len);
}
/*
 * Emit one interface block per available DPDK port at the start of the
 * file, remembering which file interface index each port was given.
 * Returns 0 on success or -1 as soon as one interface block fails.
 */
static int
pcapng_interfaces(rte_pcapng_t *self)
{
	uint16_t next_index = 0;
	uint16_t port;

	RTE_ETH_FOREACH_DEV(port) {
		/* The list of ports in pcapng needs to be contiguous */
		self->port_index[port] = next_index;
		next_index++;

		if (pcapng_add_interface(self, port) < 0)
			return -1;
	}

	return 0;
}
/*
 * Write an Interface statistics block at the end of capture.
 *
 * @param self        Capture handle.
 * @param port_id     DPDK port the statistics belong to; checked with
 *                    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL).
 * @param comment     Optional comment string, NULL to omit.
 * @param start_time  Capture start time option, 0 to omit.
 * @param end_time    Capture end time option, 0 to omit.
 * @param ifrecv      Received packet count option, UINT64_MAX to omit.
 * @param ifdrop      Dropped packet count option, UINT64_MAX to omit.
 * @return            Result of write() for the block.
 *
 * The block is timestamped with the current TSC converted to nanoseconds.
 */
ssize_t
rte_pcapng_write_stats(rte_pcapng_t *self, uint16_t port_id,
		       const char *comment,
		       uint64_t start_time, uint64_t end_time,
		       uint64_t ifrecv, uint64_t ifdrop)
{
	struct pcapng_statistics *hdr;
	struct pcapng_option *opt;
	uint32_t optlen, len;
	uint8_t *buf;
	uint64_t ns;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	optlen = 0;

	if (ifrecv != UINT64_MAX)
		optlen += pcapng_optlen(sizeof(ifrecv));
	if (ifdrop != UINT64_MAX)
		optlen += pcapng_optlen(sizeof(ifdrop));
	if (start_time != 0)
		optlen += pcapng_optlen(sizeof(start_time));
	if (end_time != 0)
		optlen += pcapng_optlen(sizeof(end_time));
	if (comment)
		optlen += pcapng_optlen(strlen(comment));
	/* OPT_END is only needed when at least one option is present */
	if (optlen != 0)
		optlen += pcapng_optlen(0);

	len = sizeof(*hdr) + optlen + sizeof(uint32_t);
	buf = alloca(len);
	if (buf == NULL)
		return -1;

	/*
	 * The comment option is padded to 32 bits but only its text is
	 * copied in; clear the buffer so pad bytes are zero rather than
	 * stale stack contents.
	 */
	memset(buf, 0, len);

	hdr = (struct pcapng_statistics *)buf;
	opt = (struct pcapng_option *)(hdr + 1);

	if (comment)
		opt = pcapng_add_option(opt, PCAPNG_OPT_COMMENT,
					comment, strlen(comment));
	if (start_time != 0)
		opt = pcapng_add_option(opt, PCAPNG_ISB_STARTTIME,
					&start_time, sizeof(start_time));
	if (end_time != 0)
		opt = pcapng_add_option(opt, PCAPNG_ISB_ENDTIME,
					&end_time, sizeof(end_time));
	if (ifrecv != UINT64_MAX)
		opt = pcapng_add_option(opt, PCAPNG_ISB_IFRECV,
					&ifrecv, sizeof(ifrecv));
	if (ifdrop != UINT64_MAX)
		opt = pcapng_add_option(opt, PCAPNG_ISB_IFDROP,
					&ifdrop, sizeof(ifdrop));
	if (optlen != 0)
		opt = pcapng_add_option(opt, PCAPNG_OPT_END, NULL, 0);

	hdr->block_type = PCAPNG_INTERFACE_STATS_BLOCK;
	hdr->block_length = len;
	hdr->interface_id = self->port_index[port_id];

	ns = pcapng_tsc_to_ns(rte_get_tsc_cycles());
	hdr->timestamp_hi = ns >> 32;
	hdr->timestamp_lo = (uint32_t)ns;

	/* clone block_length after option */
	memcpy(opt, &len, sizeof(uint32_t));

	return write(self->outfd, buf, len);
}
/*
 * Size an mbuf must have to hold a captured packet of up to 'length'
 * bytes together with everything rte_pcapng_copy() adds around it.
 *
 * @param length  Maximum number of packet bytes captured (snap length).
 * @return        Required size in bytes.
 */
uint32_t
rte_pcapng_mbuf_size(uint32_t length)
{
	/* The VLAN and EPB header must fit in the mbuf headroom. */
	RTE_ASSERT(sizeof(struct pcapng_enhance_packet_block) +
		   sizeof(struct rte_vlan_hdr) <= RTE_PKTMBUF_HEADROOM);

	/*
	 * The flags, queue and (for received packets carrying an RSS hash)
	 * hash options are appended after the padded packet data, followed
	 * by the trailing block length. Without room for the hash option a
	 * full-length packet with a hash could not be completed.
	 */
	return sizeof(struct rte_mbuf)
		+ RTE_ALIGN(length, sizeof(uint32_t))
		+ pcapng_optlen(sizeof(uint32_t)) /* flag option */
		+ pcapng_optlen(sizeof(uint32_t)) /* queue option */
		+ pcapng_optlen(sizeof(uint8_t) + sizeof(uint32_t)) /* hash option */
		+ sizeof(uint32_t);		  /* length */
}
/*
 * Insert a VLAN-style tag after the Ethernet addresses of a packet.
 * A more general form of rte_vlan_insert(): the caller chooses the outer
 * ether type, so both 802.1Q and QinQ tags can be produced.
 *
 * Returns 0 on success, -EINVAL when the mbuf is indirect, shared or too
 * short to hold an Ethernet header, and -ENOSPC when no room can be
 * prepended for the tag.
 */
static int
pcapng_vlan_insert(struct rte_mbuf *m, uint16_t ether_type, uint16_t tci)
{
	struct rte_ether_hdr *old_eth, *new_eth;
	struct rte_vlan_hdr *tag;

	/* only a private, direct mbuf may be modified in place */
	if (!RTE_MBUF_DIRECT(m) || rte_mbuf_refcnt_read(m) > 1)
		return -EINVAL;

	if (rte_pktmbuf_data_len(m) < sizeof(*old_eth))
		return -EINVAL;

	/* remember where the header is before the data start moves */
	old_eth = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	new_eth = (struct rte_ether_hdr *)
		rte_pktmbuf_prepend(m, sizeof(struct rte_vlan_hdr));
	if (new_eth == NULL)
		return -ENOSPC;

	/* slide the two MAC addresses forward; the old ether type stays put */
	memmove(new_eth, old_eth, 2 * RTE_ETHER_ADDR_LEN);
	new_eth->ether_type = rte_cpu_to_be_16(ether_type);

	tag = (struct rte_vlan_hdr *)(new_eth + 1);
	tag->vlan_tci = rte_cpu_to_be_16(tci);

	return 0;
}
/*
* The mbufs created use the Pcapng standard enhanced packet block.
*
* 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 0 | Block Type = 0x00000006 |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 4 | Block Total Length |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 8 | Interface ID |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 12 | Timestamp (High) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 16 | Timestamp (Low) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 20 | Captured Packet Length |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 24 | Original Packet Length |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 28 / /
* / Packet Data /
* / variable length, padded to 32 bits /
* / /
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Option Code = 0x0002 | Option Length = 0x004 |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Flags (direction) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Option Code = 0x0006 | Option Length = 0x004 |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Queue id |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Block Total Length |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
/*
 * Make a copy of original mbuf with pcapng header and options.
 *
 * The copy is laid out as a complete Enhanced Packet Block: EPB header,
 * packet data padded to 32 bits, options, trailing block length.
 *
 * @param port_id    DPDK port the packet was seen on; stored in the copy's
 *                   'port' field (only validated when
 *                   RTE_LIBRTE_ETHDEV_DEBUG is defined).
 * @param queue      Queue id recorded in the queue option.
 * @param md         Packet to snapshot; not modified.
 * @param mp         Mempool the copy is allocated from.
 * @param length     Maximum number of bytes of packet data to copy.
 * @param cycles     TSC value to use as the packet timestamp.
 * @param direction  Whether the packet was received or transmitted.
 * @return           The new mbuf, or NULL if the copy could not be made or
 *                   there was no room for the added header and options.
 */
struct rte_mbuf *
rte_pcapng_copy(uint16_t port_id, uint32_t queue,
		const struct rte_mbuf *md,
		struct rte_mempool *mp,
		uint32_t length, uint64_t cycles,
		enum rte_pcapng_direction direction)
{
	struct pcapng_enhance_packet_block *epb;
	uint32_t orig_len, data_len, padding, flags;
	struct pcapng_option *opt;
	uint16_t optlen;
	struct rte_mbuf *mc;
	uint64_t ns;
	bool rss_hash;

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, NULL);
#endif
	ns = pcapng_tsc_to_ns(cycles);

	orig_len = rte_pktmbuf_pkt_len(md);

	/* Take snapshot of the data */
	mc = rte_pktmbuf_copy(md, mp, 0, length);
	if (unlikely(mc == NULL))
		return NULL;

	/* Expand any offloaded VLAN information */
	if ((direction == RTE_PCAPNG_DIRECTION_IN &&
	     (md->ol_flags & RTE_MBUF_F_RX_VLAN_STRIPPED)) ||
	    (direction == RTE_PCAPNG_DIRECTION_OUT &&
	     (md->ol_flags & RTE_MBUF_F_TX_VLAN))) {
		if (pcapng_vlan_insert(mc, RTE_ETHER_TYPE_VLAN,
				       md->vlan_tci) != 0)
			goto fail;
	}

	if ((direction == RTE_PCAPNG_DIRECTION_IN &&
	     (md->ol_flags & RTE_MBUF_F_RX_QINQ_STRIPPED)) ||
	    (direction == RTE_PCAPNG_DIRECTION_OUT &&
	     (md->ol_flags & RTE_MBUF_F_TX_QINQ))) {
		if (pcapng_vlan_insert(mc, RTE_ETHER_TYPE_QINQ,
				       md->vlan_tci_outer) != 0)
			goto fail;
	}

	/* record HASH on incoming packets */
	rss_hash = (direction == RTE_PCAPNG_DIRECTION_IN &&
		    (md->ol_flags & RTE_MBUF_F_RX_RSS_HASH));

	/*
	 * Pad the packet to 32 bit boundary.
	 * Use the total packet length, not the first segment's length, so
	 * that the captured length is right even if the copy is chained.
	 */
	data_len = rte_pktmbuf_pkt_len(mc);
	padding = RTE_ALIGN(data_len, sizeof(uint32_t)) - data_len;
	if (padding > 0) {
		void *tail = rte_pktmbuf_append(mc, padding);

		if (tail == NULL)
			goto fail;
		memset(tail, 0, padding);
	}

	optlen = pcapng_optlen(sizeof(flags));
	optlen += pcapng_optlen(sizeof(queue));
	if (rss_hash)
		optlen += pcapng_optlen(sizeof(uint8_t) + sizeof(uint32_t));

	/* reserve trailing options and block length */
	opt = (struct pcapng_option *)
		rte_pktmbuf_append(mc, optlen + sizeof(uint32_t));
	if (unlikely(opt == NULL))
		goto fail;

	/*
	 * The 5 byte hash option is padded to 8 bytes; clear the reserved
	 * area so the pad bytes are zero instead of stale mbuf contents.
	 */
	memset(opt, 0, optlen + sizeof(uint32_t));

	switch (direction) {
	case RTE_PCAPNG_DIRECTION_IN:
		flags = PCAPNG_IFB_INBOUND;
		break;
	case RTE_PCAPNG_DIRECTION_OUT:
		flags = PCAPNG_IFB_OUTBOUND;
		break;
	default:
		flags = 0;
		break;
	}

	opt = pcapng_add_option(opt, PCAPNG_EPB_FLAGS,
				&flags, sizeof(flags));

	opt = pcapng_add_option(opt, PCAPNG_EPB_QUEUE,
				&queue, sizeof(queue));

	if (rss_hash) {
		uint8_t hash_opt[5];

		/* The algorithm could be something else if
		 * using rte_flow_action_rss; but the current API does not
		 * have a way for ethdev to report this on a per-packet basis.
		 */
		hash_opt[0] = PCAPNG_HASH_TOEPLITZ;

		memcpy(&hash_opt[1], &md->hash.rss, sizeof(uint32_t));
		opt = pcapng_add_option(opt, PCAPNG_EPB_HASH,
					&hash_opt, sizeof(hash_opt));
	}

	/* Note: END_OPT is not necessary here. Wireshark doesn't do it. */

	/* Add PCAPNG packet header */
	epb = (struct pcapng_enhance_packet_block *)
		rte_pktmbuf_prepend(mc, sizeof(*epb));
	if (unlikely(epb == NULL))
		goto fail;

	epb->block_type = PCAPNG_ENHANCED_PACKET_BLOCK;
	/* the block spans every segment of the copy */
	epb->block_length = rte_pktmbuf_pkt_len(mc);

	/* Interface index is filled in later during write */
	mc->port = port_id;

	epb->timestamp_hi = ns >> 32;
	epb->timestamp_lo = (uint32_t)ns;
	epb->capture_length = data_len;
	epb->original_length = orig_len;

	/* set trailer of block length */
	*(uint32_t *)opt = epb->block_length;

	return mc;

fail:
	rte_pktmbuf_free(mc);
	return NULL;
}
/*
 * Write pre-formatted packets to file.
 *
 * Every mbuf must start with an Enhanced Packet Block header whose block
 * length equals the mbuf's total packet length.
 *
 * @param self     Capture handle.
 * @param pkts     Packets to write.
 * @param nb_pkts  Number of entries in pkts.
 * @return         Total bytes reported written by writev(), or -1 with
 *                 rte_errno set (EINVAL for a malformed mbuf, otherwise
 *                 the errno from writev()).
 */
ssize_t
rte_pcapng_write_packets(rte_pcapng_t *self,
			 struct rte_mbuf *pkts[], uint16_t nb_pkts)
{
	struct iovec iov[IOV_MAX];
	unsigned int i, cnt = 0;
	ssize_t ret, total = 0;

	for (i = 0; i < nb_pkts; i++) {
		struct rte_mbuf *m = pkts[i];
		struct pcapng_enhance_packet_block *epb;

		/*
		 * Reject mbufs that cannot be handled safely: a first
		 * segment too short to hold the EPB header, a port number
		 * outside the port_index table, or more segments than the
		 * iovec array can hold.
		 */
		if (unlikely(rte_pktmbuf_data_len(m) < sizeof(*epb) ||
			     m->port >= sizeof(self->port_index) /
					sizeof(self->port_index[0]) ||
			     m->nb_segs > IOV_MAX)) {
			rte_errno = EINVAL;
			return -1;
		}

		/* sanity check that is really a pcapng mbuf */
		epb = rte_pktmbuf_mtod(m, struct pcapng_enhance_packet_block *);
		if (unlikely(epb->block_type != PCAPNG_ENHANCED_PACKET_BLOCK ||
			     epb->block_length != rte_pktmbuf_pkt_len(m))) {
			rte_errno = EINVAL;
			return -1;
		}

		/*
		 * Handle case of highly fragmented and large burst size:
		 * flush what has been gathered so far when this packet's
		 * segments would not fit in the remaining iovec slots.
		 */
		if (unlikely(cnt + m->nb_segs >= IOV_MAX)) {
			ret = writev(self->outfd, iov, cnt);
			if (unlikely(ret < 0)) {
				rte_errno = errno;
				return -1;
			}
			total += ret;
			cnt = 0;
		}

		/*
		 * The DPDK port is recorded during pcapng_copy.
		 * Map that to PCAPNG interface in file.
		 */
		epb->interface_id = self->port_index[m->port];
		do {
			iov[cnt].iov_base = rte_pktmbuf_mtod(m, void *);
			iov[cnt].iov_len = rte_pktmbuf_data_len(m);
			++cnt;
		} while ((m = m->next));
	}

	ret = writev(self->outfd, iov, cnt);
	if (unlikely(ret < 0)) {
		rte_errno = errno;
		return -1;
	}
	return total + ret;
}
/*
 * Create new pcapng writer handle.
 *
 * Writes the section header block and one interface block per DPDK port
 * to 'fd' before returning.
 *
 * @param fd        Open file descriptor the capture is written to.
 * @param osname    Optional operating system string, NULL to omit.
 * @param hardware  Optional hardware string, NULL to omit.
 * @param appname   Optional application name, NULL to omit.
 * @param comment   Optional comment, NULL to omit.
 * @return          New handle, or NULL on failure (rte_errno is set to
 *                  ENOMEM when the handle itself cannot be allocated).
 *                  'fd' is not closed on failure.
 */
rte_pcapng_t *
rte_pcapng_fdopen(int fd,
		  const char *osname, const char *hardware,
		  const char *appname, const char *comment)
{
	rte_pcapng_t *self;

	/*
	 * Zero the handle: port_index[] is only filled in for ports that
	 * exist right now, yet it is indexed later by whatever port an mbuf
	 * carries, so the remaining entries must not be indeterminate.
	 */
	self = calloc(1, sizeof(*self));
	if (!self) {
		rte_errno = ENOMEM;
		return NULL;
	}

	self->outfd = fd;

	if (pcapng_section_block(self, osname, hardware, appname, comment) < 0)
		goto fail;

	if (pcapng_interfaces(self) < 0)
		goto fail;

	return self;
fail:
	free(self);
	return NULL;
}
/*
 * Close the capture file and release the handle.
 *
 * @param self  Handle returned by rte_pcapng_fdopen(); NULL is accepted
 *              and ignored, like free().
 */
void
rte_pcapng_close(rte_pcapng_t *self)
{
	if (self == NULL)
		return;

	close(self->outfd);
	free(self);
}