Improve flexibility of ioat_test / ioatcontrol(8)

The test logic now preallocates memory before running the test.

The buffer size is now configurable.  Post-copy verification is
configurable.  The number of copies to chain into one transaction (one
interrupt) is configurable.

A 'duration' mode is added, which repeats the test until the duration
has elapsed, reporting the B/s and transactions completed.

ioatcontrol.8 has been updated to document the new arguments.

Initial limits (on this particular Broadwell-DE) (and when the
interrupts are working) seem to be: 256 interrupts/sec or ~6 GB/s,
whichever limit is more restrictive.

Unfortunately, it seems the interrupt-reset handling on Broadwell isn't
working as intended.  That will be fixed in a later commit.

Sponsored by:	EMC / Isilon Storage Division
This commit is contained in:
Conrad Meyer 2015-10-22 04:38:05 +00:00
parent b81eee4a22
commit 7c69db50df
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=289733
5 changed files with 387 additions and 87 deletions

View File

@ -51,18 +51,28 @@ __FBSDID("$FreeBSD$");
#include "ioat_internal.h"
#include "ioat_test.h"
#ifndef time_after
#define time_after(a,b) ((long)(b) - (long)(a) < 0)
#endif
MALLOC_DEFINE(M_IOAT_TEST, "ioat_test", "ioat test allocations");
#define IOAT_TEST_SIZE 0x40000
#define IOAT_MAX_BUFS 8
#define IOAT_MAX_BUFS 256
/*
 * One unit of test work: a chain of 'depth' copies, of which only the last
 * one requests a completion interrupt.  Buffers are used in pairs: buf[2*i]
 * is the source and buf[2*i+1] the destination of the i-th copy in the chain.
 */
struct test_transaction {
	/*
	 * Number of buffers allocated for this transaction.  This has to be
	 * able to represent IOAT_MAX_BUFS itself; an 8-bit field wraps to 0
	 * once the limit is 256.
	 */
	uint32_t num_buffers;
	void *buf[IOAT_MAX_BUFS];
	/* Size of every buffer in buf[], in bytes. */
	uint32_t length;
	/* Number of copies chained into this transaction. */
	uint32_t depth;
	/* Test this transaction belongs to. */
	struct ioat_test *test;
	/* Linkage on the owning test's free_q or pend_q. */
	TAILQ_ENTRY(test_transaction) entry;
};
#define IT_LOCK() mtx_lock(&ioat_test_lk)
#define IT_UNLOCK() mtx_unlock(&ioat_test_lk)
#define IT_ASSERT() mtx_assert(&ioat_test_lk, MA_OWNED)
static struct mtx ioat_test_lk;
MTX_SYSINIT(ioat_test_lk, &ioat_test_lk, "test coordination mtx", MTX_DEF);
static int g_thread_index = 1;
static struct cdev *g_ioat_cdev = NULL;
@ -73,7 +83,7 @@ ioat_test_transaction_destroy(struct test_transaction *tx)
for (i = 0; i < IOAT_MAX_BUFS; i++) {
if (tx->buf[i] != NULL) {
contigfree(tx->buf[i], IOAT_TEST_SIZE, M_IOAT_TEST);
contigfree(tx->buf[i], tx->length, M_IOAT_TEST);
tx->buf[i] = NULL;
}
}
@ -82,17 +92,16 @@ ioat_test_transaction_destroy(struct test_transaction *tx)
}
static struct
test_transaction *ioat_test_transaction_create(uint8_t num_buffers,
test_transaction *ioat_test_transaction_create(unsigned num_buffers,
uint32_t buffer_size)
{
struct test_transaction *tx;
int i;
unsigned i;
tx = malloc(sizeof(struct test_transaction), M_IOAT_TEST, M_NOWAIT | M_ZERO);
tx = malloc(sizeof(*tx), M_IOAT_TEST, M_NOWAIT | M_ZERO);
if (tx == NULL)
return (NULL);
tx->num_buffers = num_buffers;
tx->length = buffer_size;
for (i = 0; i < num_buffers; i++) {
@ -107,6 +116,18 @@ test_transaction *ioat_test_transaction_create(uint8_t num_buffers,
return (tx);
}
/*
 * Compare the source and destination buffer of every copy in the
 * transaction's chain.  Returns true only when all 'depth' pairs hold
 * identical contents; stops at the first pair that differs.
 */
static bool
ioat_compare_ok(struct test_transaction *tx)
{
	uint32_t pair;

	pair = 0;
	while (pair < tx->depth) {
		const void *src = tx->buf[2 * pair];
		const void *dst = tx->buf[2 * pair + 1];

		if (memcmp(src, dst, tx->length) != 0)
			return (false);
		pair++;
	}
	return (true);
}
static void
ioat_dma_test_callback(void *arg)
{
@ -116,82 +137,195 @@ ioat_dma_test_callback(void *arg)
tx = arg;
test = tx->test;
if (memcmp(tx->buf[0], tx->buf[1], tx->length) != 0) {
if (test->verify && !ioat_compare_ok(tx)) {
ioat_log_message(0, "miscompare found\n");
test->status = IOAT_TEST_MISCOMPARE;
atomic_add_32(&test->status[IOAT_TEST_MISCOMPARE], tx->depth);
} else if (!test->too_late)
atomic_add_32(&test->status[IOAT_TEST_OK], tx->depth);
IT_LOCK();
TAILQ_REMOVE(&test->pend_q, tx, entry);
TAILQ_INSERT_TAIL(&test->free_q, tx, entry);
wakeup(&test->free_q);
IT_UNLOCK();
}
static int
ioat_test_prealloc_memory(struct ioat_test *test, int index)
{
uint32_t i, j, k;
struct test_transaction *tx;
for (i = 0; i < test->transactions; i++) {
tx = ioat_test_transaction_create(test->chain_depth * 2,
test->buffer_size);
if (tx == NULL) {
ioat_log_message(0, "tx == NULL - memory exhausted\n");
test->status[IOAT_TEST_NO_MEMORY]++;
return (ENOMEM);
}
TAILQ_INSERT_HEAD(&test->free_q, tx, entry);
tx->test = test;
tx->depth = test->chain_depth;
/* fill in source buffers */
for (j = 0; j < (tx->length / sizeof(uint32_t)); j++) {
uint32_t val = j + (index << 28);
for (k = 0; k < test->chain_depth; k++) {
((uint32_t *)tx->buf[2*k])[j] = ~val;
((uint32_t *)tx->buf[2*k+1])[j] = val;
}
}
}
atomic_add_32(&test->num_completions, 1);
ioat_test_transaction_destroy(tx);
if (test->num_completions == test->num_loops)
wakeup(test);
return (0);
}
static void
ioat_test_release_memory(struct ioat_test *test)
{
struct test_transaction *tx, *s;
TAILQ_FOREACH_SAFE(tx, &test->free_q, entry, s)
ioat_test_transaction_destroy(tx);
TAILQ_INIT(&test->free_q);
TAILQ_FOREACH_SAFE(tx, &test->pend_q, entry, s)
ioat_test_transaction_destroy(tx);
TAILQ_INIT(&test->pend_q);
}
/*
 * Take one transaction off the free queue (sleeping until one is available),
 * move it to the pending queue and submit its whole chain of copies to the
 * DMA engine.  Only the final copy of the chain carries the completion
 * callback and requests an interrupt.
 */
static void
ioat_test_submit_1_tx(struct ioat_test *test, bus_dmaengine_t dma)
{
	struct test_transaction *tx;
	struct bus_dmadesc *desc;
	bus_dmaengine_callback_t cb;
	bus_addr_t from, to;
	uint32_t link, last, flags;
	bool final;

	/* Wait for the completion callback to hand a transaction back. */
	IT_LOCK();
	for (;;) {
		tx = TAILQ_FIRST(&test->free_q);
		if (tx != NULL)
			break;
		msleep(&test->free_q, &ioat_test_lk, 0, "test_submit", 0);
	}
	TAILQ_REMOVE(&test->free_q, tx, entry);
	TAILQ_INSERT_HEAD(&test->pend_q, tx, entry);
	IT_UNLOCK();

	ioat_acquire(dma);
	last = tx->depth - 1;
	for (link = 0; link < tx->depth; link++) {
		from = vtophys((vm_offset_t)tx->buf[2 * link]);
		to = vtophys((vm_offset_t)tx->buf[2 * link + 1]);

		/* Intermediate copies complete silently. */
		final = (link == last);
		cb = final ? ioat_dma_test_callback : NULL;
		flags = final ? DMA_INT_EN : 0;

		desc = ioat_copy(dma, from, to, tx->length, cb, tx, flags);
		if (desc == NULL)
			panic("Failed to allocate a ring slot "
			    "-- this shouldn't happen!");
	}
	ioat_release(dma);
}
static void
ioat_dma_test(void *arg)
{
struct test_transaction *tx;
struct ioat_test *test;
bus_dmaengine_t dmaengine;
uint32_t loops;
int index, i;
int index, rc, start, end;
test = arg;
loops = test->num_loops;
memset(__DEVOLATILE(void *, test->status), 0, sizeof(test->status));
test->status = IOAT_TEST_OK;
test->num_completions = 0;
index = g_thread_index++;
dmaengine = ioat_get_dmaengine(test->channel_index);
if (dmaengine == NULL) {
ioat_log_message(0, "Couldn't acquire dmaengine\n");
test->status = IOAT_TEST_NO_DMA_ENGINE;
if (test->buffer_size > 1024 * 1024) {
ioat_log_message(0, "Buffer size too large >1MB\n");
test->status[IOAT_TEST_NO_MEMORY]++;
return;
}
ioat_log_message(0, "Thread %d: num_loops remaining: 0x%07x\n", index,
test->num_loops);
for (loops = 0; loops < test->num_loops; loops++) {
bus_addr_t src, dest;
if (loops % 0x10000 == 0) {
ioat_log_message(0, "Thread %d: "
"num_loops remaining: 0x%07x\n", index,
test->num_loops - loops);
}
tx = ioat_test_transaction_create(2, IOAT_TEST_SIZE);
if (tx == NULL) {
ioat_log_message(0, "tx == NULL - memory exhausted\n");
atomic_add_32(&test->num_completions, 1);
test->status = IOAT_TEST_NO_MEMORY;
continue;
}
tx->test = test;
wmb();
/* fill in source buffer */
for (i = 0; i < (IOAT_TEST_SIZE / sizeof(uint32_t)); i++) {
uint32_t val = i + (loops << 16) + (index << 28);
((uint32_t *)tx->buf[0])[i] = ~val;
((uint32_t *)tx->buf[1])[i] = val;
}
src = pmap_kextract((vm_offset_t)tx->buf[0]);
dest = pmap_kextract((vm_offset_t)tx->buf[1]);
ioat_acquire(dmaengine);
ioat_copy(dmaengine, src, dest, IOAT_TEST_SIZE,
ioat_dma_test_callback, tx, DMA_INT_EN);
ioat_release(dmaengine);
if (test->chain_depth * 2 > IOAT_MAX_BUFS) {
ioat_log_message(0, "Depth too large (> %u)\n",
(unsigned)IOAT_MAX_BUFS / 2);
test->status[IOAT_TEST_NO_MEMORY]++;
return;
}
while (test->num_completions < test->num_loops)
tsleep(test, 0, "compl", 5 * hz);
if (btoc((uint64_t)test->buffer_size * test->chain_depth *
test->transactions) > (physmem / 4)) {
ioat_log_message(0, "Sanity check failed -- test would "
"use more than 1/4 of phys mem.\n");
test->status[IOAT_TEST_NO_MEMORY]++;
return;
}
if ((uint64_t)test->transactions * test->chain_depth > (1<<16)) {
ioat_log_message(0, "Sanity check failed -- test would "
"use more than available IOAT ring space.\n");
test->status[IOAT_TEST_NO_MEMORY]++;
return;
}
dmaengine = ioat_get_dmaengine(test->channel_index);
if (dmaengine == NULL) {
ioat_log_message(0, "Couldn't acquire dmaengine\n");
test->status[IOAT_TEST_NO_DMA_ENGINE]++;
return;
}
index = g_thread_index++;
TAILQ_INIT(&test->free_q);
TAILQ_INIT(&test->pend_q);
if (test->duration == 0)
ioat_log_message(1, "Thread %d: num_loops remaining: 0x%08x\n",
index, test->transactions);
else
ioat_log_message(1, "Thread %d: starting\n", index);
rc = ioat_test_prealloc_memory(test, index);
if (rc != 0) {
ioat_log_message(0, "prealloc_memory: %d\n", rc);
return;
}
wmb();
test->too_late = false;
start = ticks;
end = start + (((sbintime_t)test->duration * hz) / 1000);
for (loops = 0;; loops++) {
if (test->duration == 0 && loops >= test->transactions)
break;
else if (test->duration != 0 && time_after(ticks, end)) {
test->too_late = true;
break;
}
ioat_test_submit_1_tx(test, dmaengine);
}
ioat_log_message(1, "Test Elapsed: %d ticks (overrun %d), %d sec.\n",
ticks - start, ticks - end, (ticks - start) / hz);
IT_LOCK();
while (!TAILQ_EMPTY(&test->pend_q))
msleep(&test->free_q, &ioat_test_lk, 0, "ioattestcompl", hz);
IT_UNLOCK();
ioat_log_message(1, "Test Elapsed2: %d ticks (overrun %d), %d sec.\n",
ticks - start, ticks - end, (ticks - start) / hz);
ioat_test_release_memory(test);
}
static int

View File

@ -29,17 +29,39 @@ __FBSDID("$FreeBSD$");
#ifndef __IOAT_TEST_H__
#define __IOAT_TEST_H__
struct ioat_test {
uint32_t channel_index;
uint32_t num_loops;
volatile uint32_t num_completions;
uint32_t status;
/*
 * Result categories reported by a test run.  Each enumerator is an index
 * into the status[] counter array of struct ioat_test.
 */
enum ioat_res {
/*
 * Successful copies: raised by the chain depth for each completed
 * transaction that did not miscompare and finished before the duration
 * expired.
 */
IOAT_TEST_OK = 0,
/* The requested channel's DMA engine could not be acquired. */
IOAT_TEST_NO_DMA_ENGINE,
/* A request sanity check failed or transaction memory ran out. */
IOAT_TEST_NO_MEMORY,
/* Verification found differing source and destination buffers. */
IOAT_TEST_MISCOMPARE,
/* Number of categories; sizes the status[] array. */
IOAT_NUM_RES
};
#define IOAT_TEST_OK 0
#define IOAT_TEST_NO_DMA_ENGINE 1
#define IOAT_TEST_NO_MEMORY 2
#define IOAT_TEST_MISCOMPARE 3
struct test_transaction;
/*
 * Argument block of the IOAT_DMATEST ioctl: test parameters supplied by the
 * caller, and per-result counters filled in by the test.
 */
struct ioat_test {
/* Result counters indexed by enum ioat_res; zeroed when the test starts. */
volatile uint32_t status[IOAT_NUM_RES];
/* Channel whose DMA engine runs the test. */
uint32_t channel_index;
/* Size of each copy buffer in bytes.  HW max of 1MB */
uint32_t buffer_size;
/* Number of copies chained into one transaction. */
uint32_t chain_depth;
/* Number of transactions preallocated for the test. */
uint32_t transactions;
/*
* If non-zero, duration is time in ms;
* If zero, bounded by 'transactions' above.
*/
uint32_t duration;
/* If true, check for miscompares after a copy. */
bool verify;
/* Internal usage -- not test inputs */
/* Transactions ready to be submitted. */
TAILQ_HEAD(, test_transaction) free_q;
/* Transactions submitted and awaiting their completion callback. */
TAILQ_HEAD(, test_transaction) pend_q;
/* Set once the duration has expired; later completions are not counted. */
volatile bool too_late;
};
#define IOAT_DMATEST _IOWR('i', 0, struct ioat_test)

View File

@ -4,5 +4,6 @@ PROG= ioatcontrol
MAN= ioatcontrol.8
CFLAGS+= -I${.CURDIR:H:H:H}/sys/dev/ioat
WARNS?= 6
LIBADD= util
.include <bsd.prog.mk>

View File

@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd August 24, 2015
.Dd October 21, 2015
.Dt IOATCONTROL 8
.Os
.Sh NAME
@ -33,18 +33,64 @@
.Xr ioat 4
.Sh SYNOPSIS
.Nm
.Op Fl V
.Ar channel_number
.Ar num_loops
.Ar num_txns
.Op Ar bufsize Op Ar chain-len Op Ar duration
.Sh DESCRIPTION
.Nm
allows one to issue some number of test operations to the
.Xr ioat 4
driver on a specific hardware channel.
The arguments are as follows:
.Bl -tag -width Ds
.It Fl V
Verify copies for accuracy.
.El
.Pp
Each loop will allocate two chunks of memory, write data patterns to them,
submit a DMA request to copy one buffer to the other, and compare the contents
in the callback.
If the contents are not as expected, an error is reported.
.Nm
operates in one of two modes; if the
.Ar duration
argument is passed,
.Nm
tries to estimate the copy rate in bytes per second by running
.Ar num_txns
repeatedly in a loop.
If
.Ar duration
is not passed,
.Nm
only runs through
.Ar num_txns
once and prints the total bytes copied, as well as error information.
.Pp
The
.Ar bufsize
argument determines the size of buffers to use for each
.Fn ioat_copy
invocation.
The default is 256 KB.
.Pp
The
.Ar chain-len
argument determines the number of copies to chain together in a single DMA
transaction.
The default is 2, and the maximum is currently 128.
.Pp
The
.Ar duration
argument specifies an approximate time limit for the test, in milliseconds.
.Pp
The test will allocate two chunks of memory for each component of each
transaction's chain.
It will initialize them with specific data patterns.
During the test, it submits DMA requests to copy between pairs of buffers.
If the
.Fl V
flag was specified, it will compare the contents in the callback for a copy
error.
.Sh FILES
.Pa /dev/ioat_test
.Pp
@ -55,6 +101,10 @@ and
.Nm
exposes it with
.Cd hw.ioat.enable_ioat_test=1 .
.Sh DIAGNOSTICS
The wait channel
.Va test_submit
indicates that the test code is keeping the DMA engine full of work.
.Sh SEE ALSO
.Xr ioat 4
.Sh HISTORY

View File

@ -28,34 +28,88 @@
__FBSDID("$FreeBSD$");
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <sysexits.h>
#include <unistd.h>
#include <libutil.h>
#include "ioat_test.h"
static int prettyprint(struct ioat_test *);
/*
 * Print the command-line synopsis and terminate the program with EX_USAGE.
 * Never returns.
 */
static void
usage(void)
{
	const char *prog;

	prog = getprogname();
	printf("Usage: %s [-V] <channel #> <txns> [<bufsize> "
	    "[<chain-len> [duration]]]\n", prog);
	exit(EX_USAGE);
}
int
main(int argc, char **argv)
{
struct ioat_test t;
int fd;
int fd, ch;
if (argc < 3) {
printf("Usage: %s <channel #> <num_loops>\n", argv[0]);
return (EX_USAGE);
while ((ch = getopt(argc, argv, "V")) != -1) {
switch (ch) {
case 'V':
t.verify = true;
break;
default:
usage();
}
}
argc -= optind;
argv += optind;
t.channel_index = atoi(argv[1]);
if (argc < 2)
usage();
/* Defaults for optional args */
t.buffer_size = 256 * 1024;
t.chain_depth = 2;
t.duration = 0;
t.channel_index = atoi(argv[0]);
if (t.channel_index > 8) {
printf("Channel number must be between 0 and 7.\n");
return (EX_USAGE);
}
t.num_loops = atoi(argv[2]);
t.transactions = atoi(argv[1]);
if (argc >= 3) {
t.buffer_size = atoi(argv[2]);
if (t.buffer_size == 0) {
printf("Buffer size must be greater than zero\n");
return (EX_USAGE);
}
}
if (argc >= 4) {
t.chain_depth = atoi(argv[3]);
if (t.chain_depth < 1) {
printf("Chain length must be greater than zero\n");
return (EX_USAGE);
}
}
if (argc >= 5) {
t.duration = atoi(argv[4]);
if (t.duration < 1) {
printf("Duration must be greater than zero\n");
return (EX_USAGE);
}
}
fd = open("/dev/ioat_test", O_RDWR);
if (fd < 0) {
@ -66,5 +120,44 @@ main(int argc, char **argv)
(void)ioctl(fd, IOAT_DMATEST, &t);
close(fd);
return (t.status);
return (prettyprint(&t));
}
/*
 * Report the outcome of a finished test on stdout: any non-zero error
 * counters, the number of transactions processed, the total bytes copied
 * and, when a duration was requested, the resulting copy rate.
 *
 * Always returns EX_OK.
 */
static int
prettyprint(struct ioat_test *t)
{
	char rate[10], total[10];
	uintmax_t copied;
	unsigned no_engine, no_mem, miscompares;

	no_engine = (unsigned)t->status[IOAT_TEST_NO_DMA_ENGINE];
	no_mem = (unsigned)t->status[IOAT_TEST_NO_MEMORY];
	miscompares = (unsigned)t->status[IOAT_TEST_MISCOMPARE];

	if (no_engine != 0 || no_mem != 0 || miscompares != 0) {
		printf("Errors:\n");
		if (no_engine != 0)
			printf("\tNo DMA engine present: %u\n", no_engine);
		if (no_mem != 0)
			printf("\tOut of memory: %u\n", no_mem);
		if (miscompares != 0)
			printf("\tMiscompares: %u\n", miscompares);
	}

	/* The OK counter counts individual copies, not transactions. */
	printf("Processed %u txns\n", (unsigned)t->status[IOAT_TEST_OK] /
	    t->chain_depth);

	copied = (uintmax_t)t->buffer_size * t->status[IOAT_TEST_OK];
	humanize_number(total, sizeof(total), (int64_t)copied, "B",
	    HN_AUTOSCALE, HN_DECIMAL);

	if (t->duration == 0) {
		printf("%ju (%s) copied\n", copied, total);
		return (EX_OK);
	}

	/* duration is in milliseconds; scale to bytes per second. */
	humanize_number(rate, sizeof(rate),
	    (int64_t)1000 * copied / t->duration, "B/s", HN_AUTOSCALE,
	    HN_DECIMAL);
	printf("%ju (%s) copied in %u ms (%s)\n", copied, total,
	    (unsigned)t->duration, rate);
	return (EX_OK);
}