Performance improvements for octe(4):

- Distribute RX load across multiple cores, if present. This reverts
  r217212, which is no longer relevant (I think because of the newer
  SDK).
- Use newer APIs for pinning taskqueue entries to specific cores.
- Deepen RX buffers.

This more than doubles NAT forwarding throughput on my EdgeRouter Lite with a
typical packet mixture, from 90 Mbps to over 200 Mbps. The result matches
forwarding throughput in Linux without the UBNT hardware offload on the same
hardware, and thus likely reflects hardware limits.

Reviewed by:	jhibbits
Nathan Whitehorn 2019-02-10 20:13:59 +00:00
parent 3c25d4ea3c
commit f68992cf66
3 changed files with 14 additions and 19 deletions
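
The "newer APIs" mentioned above are taskqueue_start_threads_cpuset(9) used
together with CPU_FOREACH() and CPU_SETOF(), which bind one taskqueue thread to
each core; the hunk in cvm_oct_rx_initialize() below is the actual change. A
minimal sketch of the pattern, with illustrative names (example_tq,
example_start) that are not part of the driver:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/priority.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>

static struct taskqueue *example_tq;	/* illustrative; the driver uses cvm_oct_taskq */

static void
example_start(void)
{
	int cpu;

	example_tq = taskqueue_create_fast("example", M_NOWAIT,
	    taskqueue_thread_enqueue, &example_tq);

	/* Start one thread per CPU and pin each thread to its own core. */
	CPU_FOREACH(cpu) {
		cpuset_t mask;

		CPU_SETOF(cpu, &mask);
		taskqueue_start_threads_cpuset(&example_tq, 1, PI_NET,
		    &mask, "example taskq");
	}
}

Because each thread is permanently bound to a core, the RX task no longer needs
sched_pin()/sched_unpin(), and dropping the cvm_oct_rx_active gate lets the
task run on several cores at once.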

sys/mips/cavium/octe/ethernet-defines.h

@@ -38,14 +38,14 @@ AND WITH ALL FAULTS AND CAVIUM NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR W
* the driver uses the default from below.
*/
-#define INTERRUPT_LIMIT 10000 /* Max interrupts per second per core */
+#define INTERRUPT_LIMIT 1000 /* Max interrupts per second per core */
/*#define INTERRUPT_LIMIT 0 *//* Don't limit the number of interrupts */
#define USE_RED 1 /* Enable Random Early Dropping under load */
#define USE_10MBPS_PREAMBLE_WORKAROUND 1 /* Allow SW based preamble removal at 10Mbps to workaround PHYs giving us bad preambles */
#define DONT_WRITEBACK(x) (x) /* Use this to have all FPA frees also tell the L2 not to write data to memory */
/*#define DONT_WRITEBACK(x) 0 *//* Use this to not have FPA frees control L2 */
-#define MAX_RX_PACKETS 120 /* Maximum number of packets to process per interrupt. */
+#define MAX_RX_PACKETS 1024 /* Maximum number of packets to process per interrupt. */
#define MAX_OUT_QUEUE_DEPTH 1000
#define FAU_NUM_PACKET_BUFFERS_TO_FREE (CVMX_FAU_REG_END - sizeof(uint32_t))
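
As a rough back-of-envelope check of the two changed limits: the old settings
budget up to 10000 interrupts/s x 120 packets per pass = 1.2M packets/s per
core before the RX task must re-enqueue itself, while the new settings budget
about 1000 x 1024 = 1.02M with a tenth of the interrupt rate. Since the task
re-enqueues itself whenever it drains MAX_RX_PACKETS in one pass, this is a
pacing budget rather than a hard throughput cap.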

sys/mips/cavium/octe/ethernet-rx.c

@@ -57,8 +57,6 @@ extern struct ifnet *cvm_oct_device[];
static struct task cvm_oct_task;
static struct taskqueue *cvm_oct_taskq;
-static int cvm_oct_rx_active;
/**
* Interrupt handler. The interrupt occurs whenever the POW
* transitions from 0->1 packets in our group.
@@ -77,10 +75,9 @@ int cvm_oct_do_interrupt(void *dev_id)
cvmx_write_csr(CVMX_POW_WQ_INT, 0x10001<<pow_receive_group);
/*
-* Schedule task if there isn't one running.
+* Schedule task.
*/
-if (atomic_cmpset_int(&cvm_oct_rx_active, 0, 1))
-	taskqueue_enqueue(cvm_oct_taskq, &cvm_oct_task);
+taskqueue_enqueue(cvm_oct_taskq, &cvm_oct_task);
return FILTER_HANDLED;
}
@@ -172,7 +169,6 @@ void cvm_oct_tasklet_rx(void *context, int pending)
int num_freed;
int packet_not_copied;
-sched_pin();
coreid = cvmx_get_core_num();
/* Prefetch cvm_oct_device since we know we need it soon */
@@ -343,12 +339,6 @@ void cvm_oct_tasklet_rx(void *context, int pending)
*/
if (INTERRUPT_LIMIT != 0 && rx_count == MAX_RX_PACKETS) {
taskqueue_enqueue(cvm_oct_taskq, &cvm_oct_task);
-} else {
-/*
- * No more packets, all done.
- */
-if (!atomic_cmpset_int(&cvm_oct_rx_active, 1, 0))
-panic("%s: inconsistent rx active state.", __func__);
}
/* Restore the original POW group mask */
@@ -370,20 +360,25 @@ void cvm_oct_tasklet_rx(void *context, int pending)
number_to_free - num_freed);
}
}
-sched_unpin();
}
void cvm_oct_rx_initialize(void)
{
+int cpu;
TASK_INIT(&cvm_oct_task, 0, cvm_oct_tasklet_rx, NULL);
cvm_oct_taskq = taskqueue_create_fast("oct_rx", M_NOWAIT,
taskqueue_thread_enqueue,
&cvm_oct_taskq);
-taskqueue_start_threads(&cvm_oct_taskq, min(mp_ncpus, MAXCPU),
-PI_NET, "octe taskq");
+CPU_FOREACH(cpu) {
+cpuset_t cpu_mask;
+CPU_SETOF(cpu, &cpu_mask);
+taskqueue_start_threads_cpuset(&cvm_oct_taskq, 1, PI_NET,
+&cpu_mask, "octe taskq");
+}
}
void cvm_oct_rx_shutdown(void)

sys/mips/cavium/octe/ethernet.c

@@ -61,7 +61,7 @@ __FBSDID("$FreeBSD$");
#if defined(CONFIG_CAVIUM_OCTEON_NUM_PACKET_BUFFERS) && CONFIG_CAVIUM_OCTEON_NUM_PACKET_BUFFERS
int num_packet_buffers = CONFIG_CAVIUM_OCTEON_NUM_PACKET_BUFFERS;
#else
-int num_packet_buffers = 1024;
+int num_packet_buffers = 2048;
#endif
TUNABLE_INT("hw.octe.num_packet_buffers", &num_packet_buffers);
/*
@@ -448,7 +448,7 @@ int cvm_oct_init_module(device_t bus)
if (INTERRUPT_LIMIT) {
/* Set the POW timer rate to give an interrupt at most INTERRUPT_LIMIT times per second */
-cvmx_write_csr(CVMX_POW_WQ_INT_PC, cvmx_clock_get_rate(CVMX_CLOCK_CORE)/(INTERRUPT_LIMIT*16*256)<<8);
+cvmx_write_csr(CVMX_POW_WQ_INT_PC, cvmx_clock_get_rate(CVMX_CLOCK_CORE)/((INTERRUPT_LIMIT+1)*16*256)<<8);
/* Enable POW timer interrupt. It will count when there are packets available */
cvmx_write_csr(CVMX_POW_WQ_INT_THRX(pow_receive_group), 0x1ful<<24);
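
As a worked example of the pacing write above, assuming (illustratively) a
500 MHz core clock and the new INTERRUPT_LIMIT of 1000:
500000000 / ((1000 + 1) * 16 * 256) = 121, which the << 8 then shifts into the
threshold field of CVMX_POW_WQ_INT_PC. The divisor suggests the POW period
counter advances once every 16 * 256 core cycles, so it reaches that threshold
roughly INTERRUPT_LIMIT times per second, i.e. at most about 1000 of these
timer interrupts per second.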