Implement moving SD.

From the paper "Incremental calculation of weighted mean and variance"
by Tony Finch Februrary 2009, retrieved from
http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
converted to use shifting.
This commit is contained in:
Warner Losh 2017-03-22 19:18:47 +00:00
parent eea7b35fb9
commit 79d80af216
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=315735

View File

@ -93,6 +93,7 @@ SYSCTL_INT(_kern_cam, OID_AUTO, do_dynamic_iosched, CTLFLAG_RD,
* For a brief intro: https://en.wikipedia.org/wiki/Moving_average
*
* [*] Steal from the load average code and many other places.
* Note: See computation of EMA and EMVAR for acceptable ranges of alpha.
*/
static int alpha_bits = 9;
TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
@ -226,7 +227,7 @@ struct iop_stats {
*/
/* Exp Moving Average, see alpha_bits for more details */
sbintime_t ema;
sbintime_t emss; /* Exp Moving sum of the squares */
sbintime_t emvar;
sbintime_t sd; /* Last computed sd */
uint32_t state_flags;
@ -755,8 +756,7 @@ cam_iosched_iop_stats_init(struct cam_iosched_softc *isc, struct iop_stats *ios)
ios->queued = 0;
ios->total = 0;
ios->ema = 0;
ios->emss = 0;
ios->sd = 0;
ios->emvar = 0;
ios->softc = isc;
}
@ -897,13 +897,9 @@ cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stat
&ios->ema,
"Fast Exponentially Weighted Moving Average");
SYSCTL_ADD_UQUAD(ctx, n,
OID_AUTO, "emss", CTLFLAG_RD,
&ios->emss,
"Fast Exponentially Weighted Moving Sum of Squares (maybe wrong)");
SYSCTL_ADD_UQUAD(ctx, n,
OID_AUTO, "sd", CTLFLAG_RD,
&ios->sd,
"Estimated SD for fast ema (may be wrong)");
OID_AUTO, "emvar", CTLFLAG_RD,
&ios->emvar,
"Fast Exponentially Weighted Moving Variance");
SYSCTL_ADD_INT(ctx, n,
OID_AUTO, "pending", CTLFLAG_RD,
@ -1523,35 +1519,6 @@ isqrt64(uint64_t val)
return res;
}
/*
* a and b are 32.32 fixed point stored in a 64-bit word.
* Let al and bl be the .32 part of a and b.
* Let ah and bh be the 32 part of a and b.
* R is the radix and is 1 << 32
*
* a * b
* (ah + al / R) * (bh + bl / R)
* ah * bh + (al * bh + ah * bl) / R + al * bl / R^2
*
* After multiplicaiton, we have to renormalize by multiply by
* R, so we wind up with
* ah * bh * R + al * bh + ah * bl + al * bl / R
* which turns out to be a very nice way to compute this value
* so long as ah and bh are < 65536 there's no loss of high bits
* and the low order bits are below the threshold of caring for
* this application.
*/
static uint64_t
mul(uint64_t a, uint64_t b)
{
uint64_t al, ah, bl, bh;
al = a & 0xffffffff;
ah = a >> 32;
bl = b & 0xffffffff;
bh = b >> 32;
return ((ah * bh) << 32) + al * bh + ah * bl + ((al * bl) >> 32);
}
static sbintime_t latencies[] = {
SBT_1MS << 0,
SBT_1MS << 1,
@ -1569,8 +1536,7 @@ static sbintime_t latencies[] = {
static void
cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
{
sbintime_t y, yy;
uint64_t var;
sbintime_t y, deltasq, delta;
int i;
/*
@ -1591,42 +1557,61 @@ cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
* (2 ^ -alpha_bits). For more info see the NIST statistical
* handbook.
*
* ema_t = y_t * alpha + ema_t-1 * (1 - alpha)
* ema_t = y_t * alpha + ema_t-1 * (1 - alpha) [nist]
* ema_t = y_t * alpha + ema_t-1 - alpha * ema_t-1
* ema_t = alpha * y_t - alpha * ema_t-1 + ema_t-1
* alpha = 1 / (1 << alpha_bits)
* sub e == ema_t-1, b == 1/alpha (== 1 << alpha_bits), d == y_t - ema_t-1
* = y_t/b - e/b + be/b
* = (y_t - e + be) / b
* = (e + d) / b
*
* Since alpha is a power of two, we can compute this w/o any mult or
* division.
*
* Variance can also be computed. Usually, it would be expressed as follows:
* diff_t = y_t - ema_t-1
* emvar_t = (1 - alpha) * (emavar_t-1 + diff_t^2 * alpha)
* = emavar_t-1 - alpha * emavar_t-1 + delta_t^2 * alpha - (delta_t * alpha)^2
* sub b == 1/alpha (== 1 << alpha_bits), e == emavar_t-1, d = delta_t^2
* = e - e/b + dd/b + dd/bb
* = (bbe - be + bdd + dd) / bb
* = (bbe + b(dd-e) + dd) / bb (which is expanded below bb = 1<<(2*alpha_bits))
*/
/*
* XXX possible numeric issues
* o We assume right shifted integers do the right thing, since that's
* implementation defined. You can change the right shifts to / (1LL << alpha).
* o alpha_bits = 9 gives ema ceiling of 23 bits of seconds for ema and 14 bits
* for emvar. This puts a ceiling of 13 bits on alpha since we need a
* few tens of seconds of representation.
* o We mitigate alpha issues by never setting it too high.
*/
y = sim_latency;
iop->ema = (y + (iop->ema << alpha_bits) - iop->ema) >> alpha_bits;
yy = mul(y, y);
iop->emss = (yy + (iop->emss << alpha_bits) - iop->emss) >> alpha_bits;
delta = (y - iop->ema); /* d */
iop->ema = ((iop->ema << alpha_bits) + delta) >> alpha_bits;
/*
* s_1 = sum of data
* s_2 = sum of data * data
* ema ~ mean (or s_1 / N)
* emss ~ s_2 / N
*
* sd = sqrt((N * s_2 - s_1 ^ 2) / (N * (N - 1)))
* sd = sqrt((N * s_2 / N * (N - 1)) - (s_1 ^ 2 / (N * (N - 1))))
*
* N ~ 2 / alpha - 1
* alpha < 1 / 16 (typically much less)
* N > 31 --> N large so N * (N - 1) is approx N * N
*
* substituting and rearranging:
* sd ~ sqrt(s_2 / N - (s_1 / N) ^ 2)
* ~ sqrt(emss - ema ^ 2);
* which is the formula used here to get a decent estimate of sd which
* we use to detect outliers. Note that when first starting up, it
* takes a while for emss sum of squares estimator to converge on a
* good value. during this time, it can be less than ema^2. We
* compute a sd of 0 in that case, and ignore outliers.
* Were we to naively plow ahead at this point, we wind up with many numerical
* issues making any SD > ~3ms unreliable. So, we shift right by 12. This leaves
* us with microsecond level precision in the input, so the same in the
* output. It means we can't overflow deltasq unless delta > 4k seconds. It
* also means that emvar can be up 46 bits 40 of which are fraction, which
* gives us a way to measure up to ~8s in the SD before the computation goes
* unstable. Even the worst hard disk rarely has > 1s service time in the
* drive. It does mean we have to shift left 12 bits after taking the
* square root to compute the actual standard deviation estimate. This loss of
* precision is preferable to needing int128 types to work. The above numbers
* assume alpha=9. 10 or 11 are ok, but we start to run into issues at 12,
* so 12 or 13 is OK for EMA, EMVAR and SD will be wrong in those cases.
*/
var = iop->emss - mul(iop->ema, iop->ema);
iop->sd = (int64_t)var < 0 ? 0 : isqrt64(var);
delta >>= 12;
deltasq = delta * delta; /* dd */
iop->emvar = ((iop->emvar << (2 * alpha_bits)) + /* bbe */
((deltasq - iop->emvar) << alpha_bits) + /* b(dd-e) */
deltasq) /* dd */
>> (2 * alpha_bits); /* div bb */
iop->sd = (sbintime_t)isqrt64((uint64_t)iop->emvar) << 12;
}
static void