Conrad Meyer 179f62805c random(4): Fortuna: allow increased concurrency
Add experimental feature to increase concurrency in Fortuna.  As this
diverges slightly from canonical Fortuna, and due to the security
sensitivity of random(4), it is off by default.  To enable it, set the
tunable kern.random.fortuna.concurrent_read="1".  The rest of this commit
message describes the behavior when enabled.

Readers continue to update shared Fortuna state under global mutex, as they
do in the status quo implementation of the algorithm, but shift the actual
PRF generation out from under the global lock.  This massively reduces the
CPU time readers spend holding the global lock, allowing for increased
concurrency on SMP systems and less bullying of the harvestq kthread.

It is somewhat of a deviation from FS&K.  I think the primary difference is
that the specific sequence of AES keys will differ if READ_RANDOM_UIO is
accessed concurrently (as the 2nd thread to take the mutex will no longer
receive a key derived from rekeying the first thread).  However, I believe
the goals of rekeying AES are maintained: trivially, we continue to rekey
every 1MB for the statistical property; and each consumer gets a
forward-secret, independent AES key for their PRF.

Since Chacha doesn't need to rekey for sequences of any length, this change
makes no difference to the sequence of Chacha keys and PRF generated when
Chacha is used in place of AES.

On a GENERIC 4-thread VM (so, INVARIANTS/WITNESS, numbers not necessarily
representative), 3x concurrent AES performance jumped from ~55 MiB/s per
thread to ~197 MB/s per thread.  Concurrent Chacha20 at 3 threads went from
~113 MB/s per thread to ~430 MB/s per thread.

Prior to this change, the system was extremely unresponsive with 3-4
concurrent random readers; each thread had high variance in latency and
throughput, depending on who got lucky and won the lock.  "rand_harvestq"
thread CPU use was high (double digits), seemingly due to spinning on the
global lock.

After the change, concurrent random readers and the system in general are
much more responsive, and rand_harvestq CPU use dropped to basically zero.

Tests are added to the devrandom suite to ensure that the uint128_add64
primitive utilized by the unlocked read functions conforms to its specification.

Reviewed by:	markm
Approved by:	secteam(delphij)
Relnotes:	yes
Differential Revision:	https://reviews.freebsd.org/D20313
2019-06-17 20:29:13 +00:00

248 lines
7.2 KiB
C

/*-
* Copyright (c) 2000-2015 Mark R V Murray
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/random.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#else /* !_KERNEL */
#include <sys/param.h>
#include <sys/types.h>
#include <assert.h>
#include <inttypes.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <threads.h>
#define KASSERT(x, y) assert(x)
#define CTASSERT(x) _Static_assert(x, "CTASSERT " #x)
#endif /* _KERNEL */
#define CHACHA_EMBED
#define KEYSTREAM_ONLY
#define CHACHA_NONCE0_CTR128
#include <crypto/chacha20/chacha.c>
#include <crypto/rijndael/rijndael-api-fst.h>
#include <crypto/sha2/sha256.h>
#include <dev/random/hash.h>
#ifdef _KERNEL
#include <dev/random/randomdev.h>
#endif
/* This code presumes that RANDOM_KEYSIZE is twice as large as RANDOM_BLOCKSIZE */
CTASSERT(RANDOM_KEYSIZE == 2*RANDOM_BLOCKSIZE);
/* Validate that full Chacha IV is as large as the 128-bit counter */
_Static_assert(CHACHA_STATELEN == RANDOM_BLOCKSIZE,
    "full Chacha IV must be as large as the 128-bit counter");
/*
 * Experimental Chacha20-based PRF for Fortuna keystream primitive. For now,
 * disabled by default. But we may enable it in the future.
 *
 * Benefits include somewhat faster keystream generation compared with
 * unaccelerated AES-ICM.
 *
 * When true, randomdev_encrypt_init(), randomdev_keystream() and
 * randomdev_getkey() in this file take their Chacha20 paths; when false
 * they take their AES (rijndael) paths.
 */
bool random_chachamode __read_frequently = false;
#ifdef _KERNEL
/*
 * Kernel builds expose the flag under the kern.random sysctl node as
 * "use_chacha20_cipher".
 * NOTE(review): CTLFLAG_RDTUN presumably means "read-only at runtime, may be
 * set as a boot-time tunable" -- confirm against sysctl(9).
 */
SYSCTL_BOOL(_kern_random, OID_AUTO, use_chacha20_cipher, CTLFLAG_RDTUN,
&random_chachamode, 0,
"If non-zero, use the ChaCha20 cipher for randomdev PRF. "
"If zero, use AES-ICM cipher for randomdev PRF (default).");
#endif
/*
 * Begin a new hash computation.
 *
 * Hands the SHA-256 state embedded in 'context' to SHA256_Init().
 */
void
randomdev_hash_init(struct randomdev_hash *context)
{

	SHA256_Init(&context->sha);
}
/*
 * Feed more input into an in-progress hash.
 *
 * 'size' bytes starting at 'data' are passed to SHA256_Update() together
 * with the SHA-256 state embedded in 'context'.
 */
void
randomdev_hash_iterate(struct randomdev_hash *context, const void *data,
    size_t size)
{

	SHA256_Update(&context->sha, data, size);
}
/*
 * Finish the hash computation and write the digest to 'buf'.
 *
 * The caller must supply a buffer of RANDOM_KEYSIZE bytes.  The digest is
 * produced by SHA256_Final() from the SHA-256 state embedded in 'context'.
 */
void
randomdev_hash_finish(struct randomdev_hash *context, void *buf)
{

	SHA256_Final(buf, &context->sha);
}
/*
 * Set up the key schedule in 'context' for later keystream generation.
 *
 * 'data' must point to RANDOM_KEYSIZE bytes of binary key material.  The
 * cipher that is keyed (Chacha20 or AES) is selected by random_chachamode.
 */
void
randomdev_encrypt_init(union randomdev_key *context, const void *data)
{

	if (!random_chachamode) {
		/* AES: ECB-mode cipher instance plus an encryption key. */
		rijndael_cipherInit(&context->cipher, MODE_ECB, NULL);
		rijndael_makeKey(&context->key, DIR_ENCRYPT, RANDOM_KEYSIZE*8,
		    data);
		return;
	}

	/* Chacha20: the key length is given in bits. */
	chacha_keysetup(&context->chacha, data, RANDOM_KEYSIZE * 8);
}
/*
 * Create a pseudorandom output stream of 'bytecount' bytes using a CTR-mode
 * cipher or similar.  The 128-bit counter is supplied in the in-out
 * parameter 'ctr' and is updated as output is produced.  The output stream
 * is written to 'd_out'.
 *
 * If AES is used, 'bytecount' must be a multiple of RANDOM_BLOCKSIZE; this
 * is asserted.
 */
void
randomdev_keystream(union randomdev_key *context, uint128_t *ctr,
    void *d_out, size_t bytecount)
{
	/*
	 * We are limited by the chacha_encrypt_bytes API to u32 bytes per
	 * chunk; use the largest whole number of Chacha blocks that fits.
	 */
	const size_t max_chunk = rounddown((size_t)UINT32_MAX, CHACHA_BLOCKLEN);
	uint8_t *out = d_out;
	uint128_t le_counter;
	size_t remaining, chunk;

	if (!random_chachamode) {
		KASSERT(bytecount % RANDOM_BLOCKSIZE == 0,
		    ("%s: AES mode invalid bytecount, not a multiple of native "
		    "block size", __func__));

		/* 'remaining' counts whole AES blocks still to be emitted. */
		for (remaining = bytecount / RANDOM_BLOCKSIZE; remaining > 0;
		    remaining--) {
			/*-
			 * FS&K - r = r|E(K,C)
			 *      - C = C + 1
			 */
			rijndael_blockEncrypt(&context->cipher, &context->key,
			    (void *)ctr, RANDOM_BLOCKSIZE * 8, out);
			out += RANDOM_BLOCKSIZE;
			uint128_increment(ctr);
		}
		return;
	}

	/*
	 * Chacha always encodes and increments the counter little endian,
	 * so hand it a little-endian encoding of the caller's counter.
	 */
	le128enc(&le_counter, *ctr);
	chacha_ivsetup(&context->chacha, NULL, (const void *)&le_counter);

	/* Here 'remaining' counts bytes still to be emitted. */
	for (remaining = bytecount; remaining > 0; remaining -= chunk) {
		chunk = MIN(remaining, max_chunk);
		chacha_encrypt_bytes(&context->chacha, NULL, out, chunk);
		out += chunk;
	}

	/*
	 * Decode the Chacha-updated LE counter to native endian, store it
	 * back in the caller's in-out parameter, and scrub the temporary.
	 */
	chacha_ctrsave(&context->chacha, (void *)&le_counter);
	*ctr = le128dec(&le_counter);
	explicit_bzero(&le_counter, sizeof(le_counter));
}
/*
 * Fetch a pointer to the relevant key material and its size.
 *
 * This API is expected to be used only for reseeding, where the
 * endianness does not matter; the goal is to simply incorporate the key
 * material into the hash iterator that will produce key'.
 *
 * Do not expect the buffer pointed to by this API to match the exact
 * endianness, etc, as the key material that was supplied to
 * randomdev_encrypt_init().
 *
 * On return, *keyp points into 'context' itself (nothing is copied) and
 * *szp holds the key length in bytes.  In Chacha20 mode an unrecognised
 * key-size constant is fatal: panic() in the kernel, raise(SIGKILL)
 * otherwise.
 */
void
randomdev_getkey(union randomdev_key *context, const void **keyp, size_t *szp)
{

	if (!random_chachamode) {
		*keyp = &context->key.keyMaterial;
		/*
		 * Divide by 8 to report bytes; randomdev_encrypt_init()
		 * supplies the key length to rijndael_makeKey() in bits.
		 */
		*szp = context->key.keyLen / 8;
		return;
	}

	/* Chacha20 mode */
	*keyp = (const void *)&context->chacha.input[4];

	/*
	 * Sanity check keysize: the first four state words must hold the
	 * 'sigma' constant, which is treated here as marking a 32-byte key.
	 */
	if (context->chacha.input[0] == U8TO32_LITTLE(sigma) &&
	    context->chacha.input[1] == U8TO32_LITTLE(&sigma[4]) &&
	    context->chacha.input[2] == U8TO32_LITTLE(&sigma[8]) &&
	    context->chacha.input[3] == U8TO32_LITTLE(&sigma[12])) {
		*szp = 32;
		return;
	}

#if 0
	/*
	 * Included for the sake of completeness; as-implemented, Fortuna
	 * doesn't need or use 128-bit Chacha20.
	 */
	if (context->chacha.input[0] == U8TO32_LITTLE(tau) &&
	    context->chacha.input[1] == U8TO32_LITTLE(&tau[4]) &&
	    context->chacha.input[2] == U8TO32_LITTLE(&tau[8]) &&
	    context->chacha.input[3] == U8TO32_LITTLE(&tau[12])) {
		*szp = 16;
		return;
	}
#endif

#ifdef _KERNEL
	panic("%s: Invalid chacha20 keysize: %16D\n", __func__,
	    (void *)context->chacha.input, " ");
#else
	raise(SIGKILL);
#endif
}