Lawrence Stewart 370efe5ac8 Add support for the experimental Internet-Draft "TCP Alternative Backoff with
ECN (ABE)" proposal to the New Reno congestion control algorithm module.
ABE reduces the amount of congestion window reduction in response to
ECN-signalled congestion relative to the loss-inferred congestion response.

More details about ABE can be found in the Internet-Draft:
https://tools.ietf.org/html/draft-ietf-tcpm-alternativebackoff-ecn

The implementation introduces four new sysctls:

- net.inet.tcp.cc.abe defaults to 0 (disabled) and can be set to non-zero to
  enable ABE for ECN-enabled TCP connections.

- net.inet.tcp.cc.newreno.beta and net.inet.tcp.cc.newreno.beta_ecn set the
  multiplicative window decrease factor, specified as a percentage, applied to
  the congestion window in response to a loss-based or ECN-based congestion
  signal respectively. They default to the values specified in the draft i.e.
  beta=50 and beta_ecn=80.

- net.inet.tcp.cc.abe_frlossreduce defaults to 0 (disabled) and can be set to
  non-zero to enable the use of standard beta (50% by default) when repairing
  loss during an ECN-signalled congestion recovery episode. It enables a more
  conservative congestion response and is provided for the purposes of
  experimentation as a result of some discussion at IETF 100 in Singapore.

The values of beta and beta_ecn can also be set per-connection by way of the
TCP_CCALGOOPT TCP-level socket option and the new CC_NEWRENO_BETA or
CC_NEWRENO_BETA_ECN CC algo sub-options.

Submitted by:	Tom Jones <tj@enoti.me>
Tested by:	Tom Jones <tj@enoti.me>, Grenville Armitage <garmitage@swin.edu.au>
Relnotes:	Yes
Differential Revision:	https://reviews.freebsd.org/D11616
2018-03-19 16:37:47 +00:00

341 lines
8.9 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2007-2008
* Swinburne University of Technology, Melbourne, Australia.
* Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
* Copyright (c) 2010 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed at the Centre for Advanced Internet
* Architectures, Swinburne University of Technology, by Lawrence Stewart and
* James Healy, made possible in part by a grant from the Cisco University
* Research Program Fund at Community Foundation Silicon Valley.
*
* Portions of this software were developed at the Centre for Advanced
* Internet Architectures, Swinburne University of Technology, Melbourne,
* Australia by David Hayes under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This software was first released in 2007 by James Healy and Lawrence Stewart
* whilst working on the NewTCP research project at Swinburne University of
* Technology's Centre for Advanced Internet Architectures, Melbourne,
* Australia, which was made possible in part by a grant from the Cisco
* University Research Program Fund at Community Foundation Silicon Valley.
* More details are available at:
* http://caia.swin.edu.au/urp/newtcp/
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>
/*
* List of available cc algorithms on the current system. First element
* is used as the system default CC algorithm.
*
* NOTE(review): in the code visible in this file the default is read through
* CC_DEFAULT() and written through V_default_cc_ptr rather than taken from
* the head of this list -- confirm whether the "first element" statement
* above is still accurate.
*/
struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
/* Protects the cc_list STAILQ. */
struct rwlock cc_list_lock;
/* Virtualised (VNET) pointer to the default CC algorithm; starts as NewReno. */
VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;
/*
 * Sysctl handler backing the "default CC algorithm" string OID.
 *
 * On every request the current default's name is copied out under the list
 * read lock and handed to sysctl_handle_string().  If the request carries a
 * new value, cc_list is searched for an algorithm with that name and, when
 * found, V_default_cc_ptr is switched to it.
 *
 * Returns 0 on success, the error from sysctl_handle_string() if that fails,
 * or ESRCH if no registered algorithm has the requested name.
 */
static int
cc_default_algo(SYSCTL_HANDLER_ARGS)
{
	char name[TCP_CA_NAME_MAX];
	struct cc_algo *cand;
	int rc;

	/* Snapshot the name of the current default. */
	CC_LIST_RLOCK();
	strlcpy(name, CC_DEFAULT()->name, sizeof(name));
	CC_LIST_RUNLOCK();

	rc = sysctl_handle_string(oidp, name, sizeof(name), req);

	/* Nothing more to do on error or for a pure read. */
	if (rc != 0 || req->newptr == NULL)
		return (rc);

	/* Assume "not found" until a registered algo matches the new name. */
	rc = ESRCH;
	CC_LIST_RLOCK();
	STAILQ_FOREACH(cand, &cc_list, entries) {
		if (strncmp(name, cand->name, sizeof(name)) == 0) {
			V_default_cc_ptr = cand;
			rc = 0;
			break;
		}
	}
	CC_LIST_RUNLOCK();

	return (rc);
}
/*
* Sysctl handler to display the list of available CC algorithms.
*/
static int
cc_list_available(SYSCTL_HANDLER_ARGS)
{
struct cc_algo *algo;
struct sbuf *s;
int err, first, nalgos;
err = nalgos = 0;
first = 1;
CC_LIST_RLOCK();
STAILQ_FOREACH(algo, &cc_list, entries) {
nalgos++;
}
CC_LIST_RUNLOCK();
s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
if (s == NULL)
return (ENOMEM);
/*
* It is theoretically possible for the CC list to have grown in size
* since the call to sbuf_new() and therefore for the sbuf to be too
* small. If this were to happen (incredibly unlikely), the sbuf will
* reach an overflow condition, sbuf_printf() will return an error and
* the sysctl will fail gracefully.
*/
CC_LIST_RLOCK();
STAILQ_FOREACH(algo, &cc_list, entries) {
err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
if (err) {
/* Sbuf overflow condition. */
err = EOVERFLOW;
break;
}
first = 0;
}
CC_LIST_RUNLOCK();
if (!err) {
sbuf_finish(s);
err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
}
sbuf_delete(s);
return (err);
}
/*
* Reset the default CC algo to NewReno for any netstack which is using the algo
* that is about to go away as its default.
*
* Iterates over the vnets and, for each one, compares the name of its current
* default (CC_DEFAULT()->name) with remove_cc->name over at most
* TCP_CA_NAME_MAX bytes. On a match V_default_cc_ptr is pointed at
* newreno_cc_algo.
*
* The cc_list lock state is checked with CC_LIST_LOCK_ASSERT(); the caller
* visible in this file, cc_deregister_algo(), invokes this function with the
* list write lock held.
*/
static void
cc_checkreset_default(struct cc_algo *remove_cc)
{
VNET_ITERATOR_DECL(vnet_iter);
CC_LIST_LOCK_ASSERT();
/* The vnet list read lock is held for the whole walk. */
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
/*
* Presumably switches the current vnet context so that CC_DEFAULT()
* and V_default_cc_ptr below refer to vnet_iter -- confirm against
* the VNET macro definitions.
*/
CURVNET_SET(vnet_iter);
/* The match is by name, not by pointer identity. */
if (strncmp(CC_DEFAULT()->name, remove_cc->name,
TCP_CA_NAME_MAX) == 0)
V_default_cc_ptr = &newreno_cc_algo;
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
}
/*
* Initialise CC subsystem on system boot.
*
* Initialises the lock that guards cc_list and (re)initialises the list head.
* Hooked up through the SYSINIT() declaration later in this file.
*/
static void
cc_init(void)
{
CC_LIST_LOCK_INIT();
/* cc_list also has a static STAILQ_HEAD_INITIALIZER; this resets it. */
STAILQ_INIT(&cc_list);
}
/*
 * Remove a CC algorithm from the list of available algorithms.
 *
 * Returns 0 on success, non-zero on failure:
 *   EPERM  - remove_cc is NewReno, which may never be deregistered.
 *   ENOENT - remove_cc is not on cc_list.
 */
int
cc_deregister_algo(struct cc_algo *remove_cc)
{
	struct cc_algo *funcs, *tmpfuncs;
	int err;

	err = ENOENT;

	/* Never allow newreno to be deregistered. */
	if (&newreno_cc_algo == remove_cc)
		return (EPERM);

	/* Remove algo from cc_list so that new connections can't use it. */
	CC_LIST_WLOCK();
	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
		if (funcs == remove_cc) {
			/*
			 * Point any default that still names this algo back
			 * at NewReno before the entry leaves the list.
			 */
			cc_checkreset_default(remove_cc);
			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
			err = 0;
			break;
		}
	}
	CC_LIST_WUNLOCK();

	if (!err) {
		/*
		 * XXXLAS:
		 * - We may need to handle non-zero return values in future.
		 * - If we add CC framework support for protocols other than
		 *   TCP, we may want a more generic way to handle this step.
		 */
		tcp_ccalgounload(remove_cc);
	}

	return (err);
}
/*
 * Append a CC algorithm to the list of available algorithms.
 *
 * Returns 0 on success, non-zero on failure: EEXIST if add_cc itself, or
 * another algorithm whose name matches add_cc->name over at most
 * TCP_CA_NAME_MAX bytes, is already on cc_list.
 */
int
cc_register_algo(struct cc_algo *add_cc)
{
	struct cc_algo *funcs;
	int err;

	err = 0;

	/*
	 * Iterate over list of registered CC algorithms and make sure
	 * we're not trying to add a duplicate.  The check and the insert are
	 * done under one hold of the write lock.
	 */
	CC_LIST_WLOCK();
	STAILQ_FOREACH(funcs, &cc_list, entries) {
		if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
		    TCP_CA_NAME_MAX) == 0) {
			err = EEXIST;
			/* One match is enough; stop scanning under the lock. */
			break;
		}
	}
	if (!err)
		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
	CC_LIST_WUNLOCK();

	return (err);
}
/*
 * Module event handler shared by CC algorithm modules.  "data" is the
 * module's struct cc_algo.
 *
 * MOD_LOAD runs the algorithm's optional mod_init() hook and, if that
 * succeeds, registers the algorithm.  MOD_QUIESCE, MOD_SHUTDOWN and
 * MOD_UNLOAD deregister it and, on success, run the optional mod_destroy()
 * hook; an algorithm that was not registered (ENOENT) is not treated as an
 * error.  Any other event yields EINVAL.
 *
 * Returns 0 on success, non-zero on failure.
 */
int
cc_modevent(module_t mod, int event_type, void *data)
{
	struct cc_algo *cc;
	int rc;

	cc = (struct cc_algo *)data;

	switch (event_type) {
	case MOD_LOAD:
		rc = 0;
		if (cc->mod_init != NULL)
			rc = cc->mod_init();
		if (rc != 0)
			return (rc);
		return (cc_register_algo(cc));

	case MOD_QUIESCE:
	case MOD_SHUTDOWN:
	case MOD_UNLOAD:
		rc = cc_deregister_algo(cc);
		/* Already gone from the list: nothing to tear down. */
		if (rc == ENOENT)
			return (0);
		if (rc == 0 && cc->mod_destroy != NULL)
			cc->mod_destroy();
		return (rc);

	default:
		return (EINVAL);
	}
}
/* Arrange for cc_init() to run at SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST. */
SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
/* Declare sysctl tree and populate it. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
"Congestion control related settings");
/* net.inet.tcp.cc.algorithm: read/write string served by cc_default_algo(). */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW,
NULL, 0, cc_default_algo, "A", "Default congestion control algorithm");
/* net.inet.tcp.cc.available: read-only string served by cc_list_available(). */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
NULL, 0, cc_list_available, "A",
"List available congestion control algorithms");
/* net.inet.tcp.cc.abe: read/write integer, initialised to 0. */
VNET_DEFINE(int, cc_do_abe) = 0;
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(cc_do_abe), 0,
"Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)");
/* net.inet.tcp.cc.abe_frlossreduce: read/write integer, initialised to 0. */
VNET_DEFINE(int, cc_abe_frlossreduce) = 0;
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(cc_abe_frlossreduce), 0,
"Apply standard beta instead of ABE-beta during ECN-signalled congestion "
"recovery episodes if loss also needs to be repaired");