b0dcb77676
Add experimental feature to increase concurrency in Fortuna. As this diverges slightly from canonical Fortuna, and due to the security sensitivity of random(4), it is off by default. To enable it, set the tunable kern.random.fortuna.concurrent_read="1". The rest of this commit message describes the behavior when enabled. Readers continue to update shared Fortuna state under global mutex, as they do in the status quo implementation of the algorithm, but shift the actual PRF generation out from under the global lock. This massively reduces the CPU time readers spend holding the global lock, allowing for increased concurrency on SMP systems and less bullying of the harvestq kthread. It is somewhat of a deviation from FS&K. I think the primary difference is that the specific sequence of AES keys will differ if READ_RANDOM_UIO is accessed concurrently (as the 2nd thread to take the mutex will no longer receive a key derived from rekeying the first thread). However, I believe the goals of rekeying AES are maintained: trivially, we continue to rekey every 1MB for the statistical property; and each consumer gets a forward-secret, independent AES key for their PRF. Since Chacha doesn't need to rekey for sequences of any length, this change makes no difference to the sequence of Chacha keys and PRF generated when Chacha is used in place of AES. On a GENERIC 4-thread VM (so, INVARIANTS/WITNESS, numbers not necessarily representative), 3x concurrent AES performance jumped from ~55 MiB/s per thread to ~197 MB/s per thread. Concurrent Chacha20 at 3 threads went from roughly ~113 MB/s per thread to ~430 MB/s per thread. Prior to this change, the system was extremely unresponsive with 3-4 concurrent random readers; each thread had high variance in latency and throughput, depending on who got lucky and won the lock. "rand_harvestq" thread CPU use was high (double digits), seemingly due to spinning on the global lock. 
After the change, concurrent random readers and the system in general are much more responsive, and rand_harvestq CPU use dropped to basically zero. Tests are added to the devrandom suite to ensure that the uint128_add64 primitive utilized by the unlocked read functions operates to specification. Reviewed by: markm Approved by: secteam(delphij) Relnotes: yes Differential Revision: https://reviews.freebsd.org/D20313
130 lines
3.3 KiB
C
130 lines
3.3 KiB
C
/*-
|
|
* Copyright (c) 2015 Mark R V Murray
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer
|
|
* in this position and unchanged.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* $FreeBSD$
|
|
*/
|
|
|
|
#ifndef SYS_DEV_RANDOM_UINT128_H_INCLUDED
|
|
#define SYS_DEV_RANDOM_UINT128_H_INCLUDED
|
|
|
|
#include <sys/endian.h>
|
|
|
|
/* This whole thing is a crock :-(
|
|
*
|
|
* Everyone knows you always need the __uint128_t types!
|
|
*/
|
|
|
|
/*
 * Select the representation of uint128_t.
 *
 * When the compiler predefines __SIZEOF_INT128__ it has a native 128-bit
 * integer type, and USE_REAL_UINT128_T is defined so that the helpers in
 * this header use plain integer arithmetic.  Otherwise a pair of 64-bit
 * words is used.
 */
#ifdef __SIZEOF_INT128__
#define	USE_REAL_UINT128_T
#endif

#ifdef USE_REAL_UINT128_T
typedef __uint128_t uint128_t;
/*
 * The cast gives the constant the type uint128_t.  A bare 0ULL is only as
 * wide as unsigned long long, so sizeof(UINT128_ZERO) and the type of the
 * expression would not match the struct-based definition below.
 */
#define	UINT128_ZERO	((uint128_t)0)
#else
/*
 * Fallback representation.  The helpers in this header treat u128t_word0 as
 * the low-order 64 bits (carries propagate from it into u128t_word1) and
 * u128t_word1 as the high-order 64 bits.
 */
typedef struct {
	/* Ignore endianness */
	uint64_t u128t_word0;
	uint64_t u128t_word1;
} uint128_t;
/* All-zero value backing UINT128_ZERO in the struct representation. */
static const uint128_t very_long_zero = {0UL,0UL};
#define	UINT128_ZERO	very_long_zero
#endif
|
|
|
|
static __inline void
|
|
uint128_increment(uint128_t *big_uintp)
|
|
{
|
|
#ifdef USE_REAL_UINT128_T
|
|
(*big_uintp)++;
|
|
#else
|
|
big_uintp->u128t_word0++;
|
|
if (big_uintp->u128t_word0 == 0UL)
|
|
big_uintp->u128t_word1++;
|
|
#endif
|
|
}
|
|
|
|
static __inline void
|
|
uint128_add64(uint128_t *big_uintp, uint64_t add)
|
|
{
|
|
#ifdef USE_REAL_UINT128_T
|
|
(*big_uintp) += add;
|
|
#else
|
|
uint64_t word0p;
|
|
|
|
word0p = big_uintp->u128t_word0 + add;
|
|
if (word0p < big_uintp->u128t_word0)
|
|
big_uintp->u128t_word1++;
|
|
big_uintp->u128t_word0 = word0p;
|
|
#endif
|
|
}
|
|
|
|
static __inline bool
|
|
uint128_equals(uint128_t a, uint128_t b)
|
|
{
|
|
#ifdef USE_REAL_UINT128_T
|
|
return (a == b);
|
|
#else
|
|
return (a.u128t_word0 == b.u128t_word0 &&
|
|
a.u128t_word1 == b.u128t_word1);
|
|
#endif
|
|
}
|
|
|
|
static __inline int
|
|
uint128_is_zero(uint128_t big_uint)
|
|
{
|
|
return (uint128_equals(big_uint, UINT128_ZERO));
|
|
}
|
|
|
|
static __inline uint128_t
|
|
le128dec(const void *pp)
|
|
{
|
|
const uint8_t *p = pp;
|
|
|
|
#ifdef USE_REAL_UINT128_T
|
|
return (((uint128_t)le64dec(p + 8) << 64) | le64dec(p));
|
|
#else
|
|
return ((uint128_t){
|
|
.u128t_word0 = le64dec(p),
|
|
.u128t_word1 = le64dec(p + 8),
|
|
});
|
|
#endif
|
|
}
|
|
|
|
static __inline void
|
|
le128enc(void *pp, uint128_t u)
|
|
{
|
|
uint8_t *p = pp;
|
|
|
|
#ifdef USE_REAL_UINT128_T
|
|
le64enc(p, (uint64_t)(u & UINT64_MAX));
|
|
le64enc(p + 8, (uint64_t)(u >> 64));
|
|
#else
|
|
le64enc(p, u.u128t_word0);
|
|
le64enc(p + 8, u.u128t_word1);
|
|
#endif
|
|
}
|
|
|
|
#endif /* SYS_DEV_RANDOM_UINT128_H_INCLUDED */
|