Finish the fix for overflow in calcru1().
The previous fix was unnecessarily very slow up to 105 hours where the simple formula used previously worked, and unnecessarily slow by a factor of about 5/3 up to 388 days, and didn't work above 388 days. 388 days is not a long time, since it is a reasonable uptime, and for processes the times being calculated are aggregated over all threads, so with N CPUs running the same thread a runtime of 388 days is reachable after only 388 / N physical days. The PRs document overflow at 388 days, but don't try to fix it. Use the simple formula up to 76 hours. Then use a complicated general method that reduces to the simple formula up to a bit less than 105 hours, then reduces to the previous method without its extra work up to almost 388 days, then does more complicated reductions, usually many bits at a time so that this is not slow. This works up to half of maximum representable time (292271 years), with accumulated rounding errors of at most 32 usec. amd64 can do all this with no avoidable rounding errors in an inline asm with 2 instructions, but this is too special to use. __uint128_t can do the same with 100's of instructions on 64-bit arches. Long doubles with at least 64 bits of precision are the easiest method to use on i386 userland, but are hard to use in the kernel. PR: 76972 and duplicates Reviewed by: kib
This commit is contained in:
parent
af06fa2652
commit
23e5e43ccd
@ -863,13 +863,88 @@ rufetchtd(struct thread *td, struct rusage *ru)
|
||||
calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime);
|
||||
}
|
||||
|
||||
/* XXX: the MI version is too slow to use: */
|
||||
#ifndef __HAVE_INLINE_FLSLL
|
||||
#define flsll(x) (fls((x) >> 32) != 0 ? fls((x) >> 32) + 32 : fls(x))
|
||||
#endif
|
||||
|
||||
static uint64_t
|
||||
mul64_by_fraction(uint64_t a, uint64_t b, uint64_t c)
|
||||
{
|
||||
uint64_t acc, bh, bl;
|
||||
int i, s, sa, sb;
|
||||
|
||||
/*
|
||||
* Compute floor(a * (b / c)) without overflowing, (b / c) <= 1.0.
|
||||
* Calculate (a * b) / c accurately enough without overflowing. c
|
||||
* must be nonzero, and its top bit must be 0. a or b must be
|
||||
* <= c, and the implementation is tuned for b <= c.
|
||||
*
|
||||
* The comments about times are for use in calcru1() with units of
|
||||
* microseconds for 'a' and stathz ticks at 128 Hz for b and c.
|
||||
*
|
||||
* Let n be the number of top zero bits in c. Each iteration
|
||||
* either returns, or reduces b by right shifting it by at least n.
|
||||
* The number of iterations is at most 1 + 64 / n, and the error is
|
||||
* at most the number of iterations.
|
||||
*
|
||||
* It is very unusual to need even 2 iterations. Previous
|
||||
* implementations overflowed essentially by returning early in the
|
||||
* first iteration, with n = 38 giving overflow at 105+ hours and
|
||||
* n = 32 giving overlow at at 388+ days despite a more careful
|
||||
* calculation. 388 days is a reasonable uptime, and the calculation
|
||||
* needs to work for the uptime times the number of CPUs since 'a'
|
||||
* is per-process.
|
||||
*/
|
||||
return ((a / c) * b + (a % c) * (b / c) + (a % c) * (b % c) / c);
|
||||
if (a >= (uint64_t)1 << 63)
|
||||
return (0); /* Unsupported arg -- can't happen. */
|
||||
acc = 0;
|
||||
for (i = 0; i < 128; i++) {
|
||||
sa = flsll(a);
|
||||
sb = flsll(b);
|
||||
if (sa + sb <= 64)
|
||||
/* Up to 105 hours on first iteration. */
|
||||
return (acc + (a * b) / c);
|
||||
if (a >= c) {
|
||||
/*
|
||||
* This reduction is based on a = q * c + r, with the
|
||||
* remainder r < c. 'a' may be large to start, and
|
||||
* moving bits from b into 'a' at the end of the loop
|
||||
* sets the top bit of 'a', so the reduction makes
|
||||
* significant progress.
|
||||
*/
|
||||
acc += (a / c) * b;
|
||||
a %= c;
|
||||
sa = flsll(a);
|
||||
if (sa + sb <= 64)
|
||||
/* Up to 388 days on first iteration. */
|
||||
return (acc + (a * b) / c);
|
||||
}
|
||||
|
||||
/*
|
||||
* This step writes a * b as a * ((bh << s) + bl) =
|
||||
* a * (bh << s) + a * bl = (a << s) * bh + a * bl. The 2
|
||||
* additive terms are handled separately. Splitting in
|
||||
* this way is linear except for rounding errors.
|
||||
*
|
||||
* s = 64 - sa is the maximum such that a << s fits in 64
|
||||
* bits. Since a < c and c has at least 1 zero top bit,
|
||||
* sa < 64 and s > 0. Thus this step makes progress by
|
||||
* reducing b (it increases 'a', but taking remainders on
|
||||
* the next iteration completes the reduction).
|
||||
*
|
||||
* Finally, the choice for s is just what is needed to keep
|
||||
* a * bl from overflowing, so we don't need complications
|
||||
* like a recursive call mul64_by_fraction(a, bl, c) to
|
||||
* handle the second additive term.
|
||||
*/
|
||||
s = 64 - sa;
|
||||
bh = b >> s;
|
||||
bl = b - (bh << s);
|
||||
acc += (a * bl) / c;
|
||||
a <<= s;
|
||||
b = bh;
|
||||
}
|
||||
return (0); /* Algorithm failure -- can't happen. */
|
||||
}
|
||||
|
||||
static void
|
||||
@ -896,15 +971,23 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
|
||||
tu = ruxp->rux_tu;
|
||||
}
|
||||
|
||||
/* Subdivide tu. Avoid overflow in the multiplications. */
|
||||
if (__predict_true(tu <= ((uint64_t)1 << 38) && tt <= (1 << 26))) {
|
||||
/* Up to 76 hours when stathz is 128. */
|
||||
uu = (tu * ut) / tt;
|
||||
su = (tu * st) / tt;
|
||||
} else {
|
||||
uu = mul64_by_fraction(tu, ut, tt);
|
||||
su = mul64_by_fraction(tu, ut, st);
|
||||
}
|
||||
|
||||
if (tu >= ruxp->rux_tu) {
|
||||
/*
|
||||
* The normal case, time increased.
|
||||
* Enforce monotonicity of bucketed numbers.
|
||||
*/
|
||||
uu = mul64_by_fraction(tu, ut, tt);
|
||||
if (uu < ruxp->rux_uu)
|
||||
uu = ruxp->rux_uu;
|
||||
su = mul64_by_fraction(tu, st, tt);
|
||||
if (su < ruxp->rux_su)
|
||||
su = ruxp->rux_su;
|
||||
} else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
|
||||
@ -933,8 +1016,6 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
|
||||
"to %ju usec for pid %d (%s)\n",
|
||||
(uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
|
||||
p->p_pid, p->p_comm);
|
||||
uu = mul64_by_fraction(tu, ut, tt);
|
||||
su = mul64_by_fraction(tu, st, tt);
|
||||
}
|
||||
|
||||
ruxp->rux_uu = uu;
|
||||
|
Loading…
x
Reference in New Issue
Block a user