Report I/O stats from the CAM_IOSCHED_DYNAMIC extension

Report, on a periodic basis, the I/O latencies the CAM I/O scheduler
computes. These times are only for the hardware portion of the I/O as
measured from the time the operation is scheduled with the SIM using
xpt_action() until the SIM reports it has completed with xpt_dine(). Any
time the I/O operation spends in a software queue is no included.

The P50 (median), P90, P99 and P99.9 statistics about the latency of
each of the read, write and trim operations that completed during the
polling interval are reported. If there are fewer than 2, 10, 100 or
1000 operations during the polling interval, no statistic is reported
and a single dash '-' is displayed.

The read, write and trim commands (either on the command line or at run
time) toggle display of these operations. The color command toggles
color (it defaults to on, like gstat). When color is enabled, unknown
statistics are reported in blue, high latency for a statistics is
reported in red, medium in magenta and low in green (as with gstat). The
med= and hi= commands can set these latency thresholds.

Limitations: The entire sysctl space for all the devices is walked for
each polling period. This should be optimized to remember the OIDs and
only do such polling with the xpt generation changes. There is also no
way to filter devices displayed. This command only works on physical
devies that are connected to SCSI, ATA or NVME sims as those are the
only ones that are instrumented in the CAM I/O scheduler (the
CAM_IOSCHED_DYNAMIC option must be in the kernel, and the dynamic
scheduler can't be disabled).

MFC After:		1 month
Relnotes:		yes
Sponsored by:		Netflix
Reviewed by:		pauamma_gundo.com, chs
Differential Revision:	https://reviews.freebsd.org/D34259
This commit is contained in:
Warner Losh 2022-02-28 10:17:06 -07:00
parent dbd330fc1c
commit 22054f8891
6 changed files with 594 additions and 1 deletions

View File

@ -4,7 +4,7 @@
.include <src.opts.mk>
PROG= systat
SRCS= cmds.c cmdtab.c devs.c fetch.c iostat.c keyboard.c main.c sysput.c \
SRCS= cmds.c cmdtab.c devs.c fetch.c iolat.c iostat.c keyboard.c main.c sysput.c \
netcmds.c netstat.c pigs.c proc.c swap.c icmp.c \
mode.c ip.c sctp.c tcp.c zarc.c \
vmstat.c convtbl.c ifcmds.c ifstat.c

View File

@ -83,6 +83,9 @@ struct cmdtab cmdtab[] = {
{ "zarc", showzarc, fetchzarc, labelzarc,
initzarc, openzarc, closezarc, 0,
resetzarc, CF_ZFSARC },
{ "iolat", showiolat, fetchiolat, labeliolat,
initiolat, openiolat, closeiolat, cmdiolat,
0, CF_LOADAV },
{ NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0 }
};
struct cmdtab *curcmd = &cmdtab[0];

View File

@ -88,6 +88,7 @@ void closepigs(WINDOW *);
void closeswap(WINDOW *);
void closetcp(WINDOW *);
int cmdifstat(const char *, const char *);
int cmdiolat(const char *, const char *);
int cmdiostat(const char *, const char *);
int cmdkre(const char *, const char *);
int cmdnetstat(const char *, const char *);
@ -188,3 +189,4 @@ void sysputwuint64(WINDOW* , int, int, int, uint64_t, int);
SYSTAT_CMD( zarc );
SYSTAT_CMD( sctp );
SYSTAT_CMD( iolat );

520
usr.bin/systat/iolat.c Normal file
View File

@ -0,0 +1,520 @@
/*
* Copyright (c) 2021 Netflix, Inc
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/resource.h>
#include <devstat.h>
#include <err.h>
#include <errno.h>
#include <math.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include "systat.h"
#include "extern.h"
#include "devs.h"
#define CAM_BASE "kern.cam"
#define LATENCY ".latencies"
#define CAM_IOSCHED_BASE "kern.cam.iosched.bucket_base_us"
#define DEV_NAMSIZE 32
#define OP_NAMSIZE 16
#define MAX_LATS 32
static double high_thresh = 500;
static double med_thresh = 300;
static bool docolor = true;
static int ndevs;
static SLIST_HEAD(, iosched_stat) curlist;
struct iosched_op_stat {
int nlats;
uint64_t lats[MAX_LATS];
uint64_t prev_lats[MAX_LATS];
};
enum { OP_READ = 0, OP_WRITE, OP_TRIM, NUM_OPS };
static const char *ops[NUM_OPS] = { "read", "write", "trim" };
#define OP_READ_MASK (1 << OP_READ)
#define OP_WRITE_MASK (1 << OP_WRITE)
#define OP_TRIM_MASK (1 << OP_TRIM)
static uint32_t flags = OP_READ_MASK | OP_WRITE_MASK | OP_TRIM_MASK;
struct iosched_stat {
SLIST_ENTRY(iosched_stat) link;
char dev_name[DEV_NAMSIZE];
int unit;
struct iosched_op_stat op_stats[NUM_OPS];
};
static int name2oid(const char *, int *);
static int walk_sysctl(int *, size_t);
static int
name2oid(const char *name, int *oidp)
{
int oid[2];
int i;
size_t j;
oid[0] = CTL_SYSCTL;
oid[1] = CTL_SYSCTL_NAME2OID;
j = CTL_MAXNAME * sizeof(int);
i = sysctl(oid, 2, oidp, &j, name, strlen(name));
if (i < 0)
return (i);
j /= sizeof(int);
return (j);
}
static size_t /* Includes the trailing NUL */
oid2name(int *oid, size_t nlen, char *name, size_t namlen)
{
int qoid[CTL_MAXNAME + 2];
int i;
size_t j;
bzero(name, namlen);
qoid[0] = CTL_SYSCTL;
qoid[1] = CTL_SYSCTL_NAME;
memcpy(qoid + 2, oid, nlen * sizeof(int));
j = namlen;
i = sysctl(qoid, nlen + 2, name, &j, 0, 0);
if (i || !j)
err(1, "sysctl name %d %zu %d", i, j, errno);
return (j);
}
static int
oidfmt(int *oid, int len, u_int *kind)
{
int qoid[CTL_MAXNAME+2];
u_char buf[BUFSIZ];
int i;
size_t j;
qoid[0] = CTL_SYSCTL;
qoid[1] = CTL_SYSCTL_OIDFMT;
memcpy(qoid + 2, oid, len * sizeof(int));
j = sizeof(buf);
i = sysctl(qoid, len + 2, buf, &j, 0, 0);
if (i)
err(1, "sysctl fmt %d %zu %d", i, j, errno);
*kind = *(u_int *)buf;
return (0);
}
static int
split_u64(char *str, const char *delim, uint64_t *buckets, int *nbuckets)
{
int n = *nbuckets, i;
char *v;
memset(buckets, 0, n * sizeof(buckets[0]));
for (i = 0; (v = strsep(&str, delim)) != NULL && i < n; i++) {
buckets[i] = strtoull(v, NULL, 10);
}
if (i < n)
*nbuckets = i;
return (i < n);
}
static double baselat = 0.000020;
static float
pest(int permill, uint64_t *lats, int nlat)
{
uint64_t tot, samp;
int i;
float b1, b2;
for (tot = 0, i = 0; i < nlat; i++)
tot += lats[i];
if (tot == 0)
return -nanf("");
if (tot < (uint64_t)2000 / (1000 - permill))
return nanf("");
samp = tot * permill / 1000;
if (samp < lats[0])
return baselat * (float)samp / lats[0]; /* linear interpolation 0 and baselat */
for (tot = 0, i = 0; samp >= tot && i < nlat; i++)
tot += lats[i];
i--;
b1 = baselat * (1 << (i - 1));
b2 = baselat * (1 << i);
/* Should expoentially interpolate between buckets -- doing linear instead */
return b1 + (b2 - b1) * (float)(lats[i] - (tot - samp)) / lats[i];
}
static int
op2num(const char *op)
{
for (int i = 0; i < NUM_OPS; i++)
if (strcmp(op, ops[i]) == 0)
return i;
return -1;
}
static struct iosched_op_stat *
find_dev(const char *dev, int unit, int op)
{
struct iosched_stat *isp;
struct iosched_op_stat *iosp;
SLIST_FOREACH(isp, &curlist, link) {
if (strcmp(isp->dev_name, dev) != 0 || isp->unit != unit)
continue;
iosp = &isp->op_stats[op];
return iosp;
}
return NULL;
}
static struct iosched_op_stat *
alloc_dev(const char *dev, int unit, int op)
{
struct iosched_stat *isp;
struct iosched_op_stat *iosp;
isp = malloc(sizeof(*isp));
if (isp == NULL)
return NULL;
strlcpy(isp->dev_name, dev, sizeof(isp->dev_name));
isp->unit = unit;
SLIST_INSERT_HEAD(&curlist, isp, link);
ndevs++;
iosp = &isp->op_stats[op];
return iosp;
}
#define E3 1000.0
static void
update_dev(const char *dev, int unit, int op, uint64_t *lats, int nlat)
{
struct iosched_op_stat *iosp;
iosp = find_dev(dev, unit, op);
if (iosp == NULL)
iosp = alloc_dev(dev, unit, op);
if (iosp == NULL)
return;
iosp->nlats = nlat;
memcpy(iosp->prev_lats, iosp->lats, iosp->nlats * sizeof(uint64_t));
memcpy(iosp->lats, lats, iosp->nlats * sizeof(uint64_t));
// printf("%s%d: %-6s %.3f %.3f %.3f %.3f\r\n",
// dev, unit, operation, E3 * pest(500, lats, nlat), E3 * pest(900, lats, nlat),
// E3 * pest(990, lats, nlat), E3 * pest(999, lats, nlat));
}
static int
walk_sysctl(int *base_oid, size_t len)
{
int qoid[CTL_MAXNAME + 2], oid[CTL_MAXNAME];
size_t l1, l2;
char name[BUFSIZ];
if (len > CTL_MAXNAME)
err(1, "Length %zd too long", len);
qoid[0] = CTL_SYSCTL;
qoid[1] = CTL_SYSCTL_NEXT;
l1 = 2;
memcpy(qoid + 2, base_oid, len * sizeof(int));
l1 += len;
for (;;) {
/*
* Get the next one or return when we get to the end of the
* sysctls in the kernel.
*/
l2 = sizeof(oid);
if (sysctl(qoid, l1, oid, &l2, 0, 0) != 0) {
if (errno == ENOENT)
return (0);
err(1, "sysctl(getnext) %zu", l2);
}
l2 /= sizeof(int);
/*
* Bail if we're seeing OIDs that don't have the
* same prefix or can't have the same prefix.
*/
if (l2 < len ||
memcmp(oid, base_oid, len * sizeof(int)) != 0)
return (0);
/*
* Get the name, validate it's one we're looking for,
* parse the latency and add to list.
*/
do {
int nlat;
size_t l3;
char val[BUFSIZ];
char *walker, *dev, *opstr;
uint64_t latvals[MAX_LATS];
u_int kind;
int unit, op;
l1 = oid2name(oid, l2, name, sizeof(name));
if (strcmp(name + l1 - strlen(LATENCY) - 1, LATENCY) != 0)
break;
if (oidfmt(oid, l2, &kind) != 0)
err(1, "oidfmt");
if ((kind & CTLTYPE) != CTLTYPE_STRING)
errx(1, "string");
l3 = sizeof(val);
if (sysctl(oid, l2, val, &l3, 0, 0) != 0)
err(1, "sysctl");
val[l3] = '\0';
nlat = nitems(latvals);
if (split_u64(val, ",", latvals, &nlat) == 0)
break;
walker = name + strlen(CAM_BASE) + 1;
dev = strsep(&walker, ".");
unit = (int)strtol(strsep(&walker, "."), NULL, 10);
strsep(&walker, ".");
opstr = strsep(&walker, ".");
op = op2num(opstr);
if (op < 0)
break;
update_dev(dev, unit, op, latvals, nlat);
} while (false);
memcpy(qoid + 2, oid, l2 * sizeof(int));
l1 = 2 + l2;
}
}
void
closeiolat(WINDOW *w)
{
if (w == NULL)
return;
wclear(w);
wrefresh(w);
delwin(w);
}
static void
doublecmd(const char *cmd, double *v)
{
const char *p;
double tv;
p = strchr(cmd, '=');
if (p == NULL)
return; /* XXX Tell the user something? */
if (sscanf(p + 1, "%lf", &tv) != 1)
return; /* XXX Tell the user something? */
*v = tv;
}
int
cmdiolat(const char *cmd __unused, const char *args __unused)
{
fprintf(stderr, "CMD IS '%s'\n\n", cmd);
if (prefix(cmd, "trim"))
flags ^= OP_TRIM_MASK;
else if (prefix(cmd, "read"))
flags ^= OP_READ_MASK;
else if (prefix(cmd, "write"))
flags ^= OP_WRITE_MASK;
else if (prefix(cmd, "color"))
docolor = !docolor;
else if (prefix("high", cmd))
doublecmd(cmd, &high_thresh);
else if (prefix("med", cmd))
doublecmd(cmd, &med_thresh);
else
return (0);
wclear(wnd);
labeliolat();
refresh();
return (1);
}
int
initiolat(void)
{
int cam[CTL_MAXNAME];
uint64_t sbt_base;
size_t len = sizeof(sbt_base);
SLIST_INIT(&curlist);
baselat = 1e-3; /* old default */
if (sysctlbyname(CAM_IOSCHED_BASE, &sbt_base, &len, NULL, 0) == 0)
baselat = sbt_base * 1e-6; /* Convert to microseconds */
name2oid(CAM_BASE, cam);
walk_sysctl(cam, 2);
return (1);
}
void
fetchiolat(void)
{
int cam[CTL_MAXNAME];
name2oid(CAM_BASE, cam);
walk_sysctl(cam, 2);
}
#define INSET 10
void
labeliolat(void)
{
int _col, ndrives, lpr, row, j;
int regions __unused;
struct iosched_stat *isp;
char tmpstr[32];
#define COLWIDTH 29
#define DRIVESPERLINE ((getmaxx(wnd) - 1 - INSET) / COLWIDTH)
ndrives = ndevs; // XXX FILTER XXX
regions = howmany(ndrives, DRIVESPERLINE);
lpr = 2; /* for headers */
for (int i = 0; i < NUM_OPS; i++) {
if (flags & (1 << i))
lpr++;
}
row = 0;
_col = INSET;
j = 2;
if (flags & OP_READ_MASK)
mvwaddstr(wnd, row + j++, 1, "read");
if (flags & OP_WRITE_MASK)
mvwaddstr(wnd, row + j++, 1, "write");
if (flags & OP_TRIM_MASK)
mvwaddstr(wnd, row + j++, 1, "trim");
SLIST_FOREACH(isp, &curlist, link) {
if (_col + COLWIDTH >= getmaxx(wnd) - 1 - INSET) {
_col = INSET;
row += lpr + 1;
if (row > getmaxy(wnd) - 1 - (lpr + 1))
break;
j = 2;
if (flags & OP_READ_MASK)
mvwaddstr(wnd, row + j++, 1, "read");
if (flags & OP_WRITE_MASK)
mvwaddstr(wnd, row + j++, 1, "write");
if (flags & OP_TRIM_MASK)
mvwaddstr(wnd, row + j++, 1, "trim");
}
snprintf(tmpstr, sizeof(tmpstr), "%s%d", isp->dev_name, isp->unit);
mvwaddstr(wnd, row, _col + (COLWIDTH - strlen(tmpstr)) / 2, tmpstr);
mvwaddstr(wnd, row + 1, _col, " p50 p90 p99 p99.9");
_col += COLWIDTH;
}
}
WINDOW *
openiolat(void)
{
return (subwin(stdscr, LINES-3-1, 0, MAINWIN_ROW, 0));
}
static void
fmt(float f, char *buf, size_t len)
{
if (isnan(f))
strlcpy(buf, " - ", len);
else if (f >= 1000.0)
snprintf(buf, len, "%6d", (int)f);
else if (f >= 100.0)
snprintf(buf, len, "%6.1f", f);
else if (f >= 10.0)
snprintf(buf, len, "%6.2f", f);
else
snprintf(buf, len, "%6.3f", f);
}
static void
latout(double lat, int y, int x)
{
int i;
char tmpstr[32];
fmt(lat, tmpstr, sizeof(tmpstr));
if (isnan(lat))
i = 4;
else if (lat > high_thresh)
i = 3;
else if (lat > med_thresh)
i = 2;
else
i = 1;
if (docolor)
wattron(wnd, COLOR_PAIR(i));
mvwaddstr(wnd, y, x, tmpstr);
if (docolor)
wattroff(wnd, COLOR_PAIR(i));
}
void
showiolat(void)
{
int _col, ndrives, lpr, row, k;
int regions __unused;
struct iosched_stat *isp;
struct iosched_op_stat *iosp;
#define COLWIDTH 29
#define DRIVESPERLINE ((getmaxx(wnd) - 1 - INSET) / COLWIDTH)
ndrives = ndevs; // XXX FILTER XXX
regions = howmany(ndrives, DRIVESPERLINE);
lpr = 2; /* XXX */
for (int i = 0; i < NUM_OPS; i++) {
if (flags & (1 << i))
lpr++;
}
row = 0;
_col = INSET;
SLIST_FOREACH(isp, &curlist, link) {
if (_col + COLWIDTH >= getmaxx(wnd) - 1 - INSET) {
_col = INSET;
row += lpr + 1;
if (row > getmaxy(wnd) - 1 - (lpr + 1))
break;
}
k = 2;
for (int i = 0; i < NUM_OPS; i++) {
uint64_t lats[MAX_LATS];
int nlats;
float p50, p90, p99, p999;
if ((flags & (1 << i)) == 0)
continue;
iosp = &isp->op_stats[i];
nlats = iosp->nlats;
memset(lats, 0, sizeof(lats));
for (int j = 0; j < iosp->nlats; j++)
lats[j] = iosp->lats[j] - iosp->prev_lats[j];
p50 = pest(500, lats, nlats) * E3;
p90 = pest(900, lats, nlats) * E3;
p99 = pest(990, lats, nlats) * E3;
p999 = pest(999, lats, nlats) * E3;
latout(p50, row + k, _col);
latout(p90, row + k, _col + 7);
latout(p99, row + k, _col + 14);
latout(p999, row + k, _col + 21);
k++;
}
_col += COLWIDTH;
}
}

View File

@ -155,6 +155,7 @@ main(int argc, char **argv)
char errbuf[_POSIX2_LINE_MAX], dummy;
size_t size;
struct cmdentry *cmd = NULL;
short cf, cb;
(void) setlocale(LC_ALL, "");
@ -214,6 +215,13 @@ main(int argc, char **argv)
* routines to minimize update work by curses.
*/
initscr();
start_color();
use_default_colors();
pair_content(0, &cf, &cb);
init_pair(1, COLOR_GREEN, cb);
init_pair(2, COLOR_MAGENTA, cb);
init_pair(3, COLOR_RED, cb);
init_pair(4, COLOR_BLUE, cb);
CMDLINE = LINES - 1;
wnd = (*curcmd->c_open)();
if (wnd == NULL) {

View File

@ -88,6 +88,7 @@ to be one of:
.Ic icmp ,
.Ic icmp6 ,
.Ic ifstat ,
.Ic iolat ,
.Ic iostat ,
.Ic ip ,
.Ic ip6 ,
@ -244,6 +245,59 @@ but with SCTP statistics.
Like
.Ic icmp ,
but with TCP statistics.
.It Ic iolat
Display statistics describing the hardware latencies of I/O operations as
computed by the
.Va CAM_IOSCHED_DYNAMIC
option.
This option must be in the kernel config file of the running kernel for this
display to work.
All devices are displayed as there is currently no way to filter them.
The statistics displayed for the I/O latencies are the percentiles with
sufficient data during the polling interval to compute.
If a value cannot be estimated ``-'' is displayed.
The P50 (also known as the median), P90, P99 and P99.9 values are computed if
more than 2, 10, 100 or 1000 operations occurred during the polling interval.
The latency is the hardware latency values, and does not include any software
queueing time.
The latencies are estimated based on histogram data computed by the CAM I/O
scheduler and represent estimates of the actual value that are only good to
two or three significant digits.
The display of latency changes based on the scale of the latency to reflect
the precision of the estimates and to fit on the available screen space.
All latencies are reported in milliseconds.
When color is enabled
.Bl -bullet
.It Values below the medium latency threshold are displayed in green.
.It Values between the minimum latency and high latency thresholds are displayed
in magenta.
.It Values above the high latency thresholds are displayed in red.
.Pp
When color is disabled, the default foreground and background colors are always
used.
.Pp
The following commands are specific to the
.Ic iolat
display; the minimum unambiguous prefix may be supplied.
.Pp
.Bl -tag -width Fl -compact
.It Cm color
Toggle the use of color in the display.
The default is on.
.It Cm hi=XXX
Set the high latency threshold to XXX milliseconds.
.It Cm med=XXX
Set the medium latency threshold to XXX milliseconds.
.It Cm read
Toggle the display of statistics about read operations.
The default is on.
.It Cm write
Toggle the display of statistics about write operations.
The default is on.
.It Cm trim
Toggle the display of statistics about trim operations.
The default is on.
.El
.It Ic iostat
Display, in the lower window, statistics about processor use
and disk throughput.
@ -682,3 +736,9 @@ The
.Ic vmstat
display looks out of place because it is (it was added in as
a separate display rather than created as a new program).
The
.Ic iolat
command doesn't implement the common device commands including
filtering, as it doesn't use the
.Xr devstat 3
mechanism to obtain its statistics.