Finish the fix for overflow in calcru1().

The previous fix was unnecessarily very slow up to 105 hours where the
simple formula used previously worked, and unnecessarily slow by a factor
of about 5/3 up to 388 days, and didn't work above 388 days.  388 days is
not a long time, since it is a reasonable uptime, and for processes the
times being calculated are aggregated over all threads, so with N CPUs
running the same thread a runtime of 388 days is reachable after only
388 / N physical days.

The PRs document overflow at 388 days, but don't try to fix it.

Use the simple formula up to 76 hours.  Then use a complicated general
method that reduces to the simple formula up to a bit less than 105
hours, then reduces to the previous method without its extra work up
to almost 388 days, then does more complicated reductions, usually
many bits at a time so that this is not slow.  This works up to half
of maximum representable time (292271 years), with accumulated rounding
errors of at most 32 usec.

amd64 can do all this with no avoidable rounding errors in an inline
asm with 2 instructions, but this is too special to use.  __uint128_t
can do the same with 100's of instructions on 64-bit arches.  Long
doubles with at least 64 bits of precision are the easiest method to
use on i386 userland, but are hard to use in the kernel.

PR:		76972 and duplicates
Reviewed by:	kib
This commit is contained in:
bde 2019-02-14 19:07:08 +00:00
parent e437b09b08
commit 24c8af8da8

View File

@ -863,13 +863,88 @@ rufetchtd(struct thread *td, struct rusage *ru)
calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime);
}
/* XXX: the MI version is too slow to use: */
#ifndef __HAVE_INLINE_FLSLL
#define flsll(x) (fls((x) >> 32) != 0 ? fls((x) >> 32) + 32 : fls(x))
#endif
static uint64_t
mul64_by_fraction(uint64_t a, uint64_t b, uint64_t c)
{
uint64_t acc, bh, bl;
int i, s, sa, sb;
/*
* Compute floor(a * (b / c)) without overflowing, (b / c) <= 1.0.
* Calculate (a * b) / c accurately enough without overflowing. c
* must be nonzero, and its top bit must be 0. a or b must be
* <= c, and the implementation is tuned for b <= c.
*
* The comments about times are for use in calcru1() with units of
* microseconds for 'a' and stathz ticks at 128 Hz for b and c.
*
* Let n be the number of top zero bits in c. Each iteration
* either returns, or reduces b by right shifting it by at least n.
* The number of iterations is at most 1 + 64 / n, and the error is
* at most the number of iterations.
*
* It is very unusual to need even 2 iterations. Previous
* implementations overflowed essentially by returning early in the
* first iteration, with n = 38 giving overflow at 105+ hours and
* n = 32 giving overlow at at 388+ days despite a more careful
* calculation. 388 days is a reasonable uptime, and the calculation
* needs to work for the uptime times the number of CPUs since 'a'
* is per-process.
*/
return ((a / c) * b + (a % c) * (b / c) + (a % c) * (b % c) / c);
if (a >= (uint64_t)1 << 63)
return (0); /* Unsupported arg -- can't happen. */
acc = 0;
for (i = 0; i < 128; i++) {
sa = flsll(a);
sb = flsll(b);
if (sa + sb <= 64)
/* Up to 105 hours on first iteration. */
return (acc + (a * b) / c);
if (a >= c) {
/*
* This reduction is based on a = q * c + r, with the
* remainder r < c. 'a' may be large to start, and
* moving bits from b into 'a' at the end of the loop
* sets the top bit of 'a', so the reduction makes
* significant progress.
*/
acc += (a / c) * b;
a %= c;
sa = flsll(a);
if (sa + sb <= 64)
/* Up to 388 days on first iteration. */
return (acc + (a * b) / c);
}
/*
* This step writes a * b as a * ((bh << s) + bl) =
* a * (bh << s) + a * bl = (a << s) * bh + a * bl. The 2
* additive terms are handled separately. Splitting in
* this way is linear except for rounding errors.
*
* s = 64 - sa is the maximum such that a << s fits in 64
* bits. Since a < c and c has at least 1 zero top bit,
* sa < 64 and s > 0. Thus this step makes progress by
* reducing b (it increases 'a', but taking remainders on
* the next iteration completes the reduction).
*
* Finally, the choice for s is just what is needed to keep
* a * bl from overflowing, so we don't need complications
* like a recursive call mul64_by_fraction(a, bl, c) to
* handle the second additive term.
*/
s = 64 - sa;
bh = b >> s;
bl = b - (bh << s);
acc += (a * bl) / c;
a <<= s;
b = bh;
}
return (0); /* Algorithm failure -- can't happen. */
}
static void
@ -896,15 +971,23 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
tu = ruxp->rux_tu;
}
/* Subdivide tu. Avoid overflow in the multiplications. */
if (__predict_true(tu <= ((uint64_t)1 << 38) && tt <= (1 << 26))) {
/* Up to 76 hours when stathz is 128. */
uu = (tu * ut) / tt;
su = (tu * st) / tt;
} else {
uu = mul64_by_fraction(tu, ut, tt);
su = mul64_by_fraction(tu, ut, st);
}
if (tu >= ruxp->rux_tu) {
/*
* The normal case, time increased.
* Enforce monotonicity of bucketed numbers.
*/
uu = mul64_by_fraction(tu, ut, tt);
if (uu < ruxp->rux_uu)
uu = ruxp->rux_uu;
su = mul64_by_fraction(tu, st, tt);
if (su < ruxp->rux_su)
su = ruxp->rux_su;
} else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
@ -933,8 +1016,6 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
"to %ju usec for pid %d (%s)\n",
(uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
p->p_pid, p->p_comm);
uu = mul64_by_fraction(tu, ut, tt);
su = mul64_by_fraction(tu, st, tt);
}
ruxp->rux_uu = uu;