Import the ERTT (Enhanced Round Trip Time) Khelp module. ERTT uses the
Khelp/Hhook KPIs to hook into the TCP stack and maintain a per-connection, low noise estimate of the instantaneous RTT. ERTT's implementation is robust even in the face of delayed acknowledgements and/or TSO being in use for a connection. A high quality, low noise RTT estimate is a requirement for applications such as delay-based congestion control, for which we will be importing some algorithm implementations shortly. In collaboration with: David Hayes <dahayes at swin edu au> and Grenville Armitage <garmitage at swin edu au> Sponsored by: FreeBSD Foundation Reviewed by: bz and others along the way MFC after: 3 months
This commit is contained in:
parent
5f3b301a43
commit
050570efa7
@ -1,5 +1,5 @@
|
|||||||
# $FreeBSD$
|
# $FreeBSD$
|
||||||
|
|
||||||
SUBDIR=
|
SUBDIR= h_ertt
|
||||||
|
|
||||||
.include <bsd.subdir.mk>
|
.include <bsd.subdir.mk>
|
||||||
|
9
sys/modules/khelp/h_ertt/Makefile
Normal file
9
sys/modules/khelp/h_ertt/Makefile
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# $FreeBSD$
|
||||||
|
|
||||||
|
.include <bsd.own.mk>
|
||||||
|
|
||||||
|
.PATH: ${.CURDIR}/../../../netinet/khelp
|
||||||
|
KMOD= h_ertt
|
||||||
|
SRCS= h_ertt.c
|
||||||
|
|
||||||
|
.include <bsd.kmod.mk>
|
545
sys/netinet/khelp/h_ertt.c
Normal file
545
sys/netinet/khelp/h_ertt.c
Normal file
@ -0,0 +1,545 @@
|
|||||||
|
/*-
|
||||||
|
* Copyright (c) 2009-2010
|
||||||
|
* Swinburne University of Technology, Melbourne, Australia
|
||||||
|
* Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
|
||||||
|
* Copyright (c) 2010-2011 The FreeBSD Foundation
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* This software was developed at the Centre for Advanced Internet
|
||||||
|
* Architectures, Swinburne University, by David Hayes, made possible in part by
|
||||||
|
* a grant from the Cisco University Research Program Fund at Community
|
||||||
|
* Foundation Silicon Valley.
|
||||||
|
*
|
||||||
|
* Portions of this software were developed at the Centre for Advanced
|
||||||
|
* Internet Architectures, Swinburne University of Technology, Melbourne,
|
||||||
|
* Australia by David Hayes under sponsorship from the FreeBSD Foundation.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||||
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sys/cdefs.h>
|
||||||
|
__FBSDID("$FreeBSD$");
|
||||||
|
|
||||||
|
#include <sys/param.h>
|
||||||
|
#include <sys/kernel.h>
|
||||||
|
#include <sys/mbuf.h>
|
||||||
|
#include <sys/module.h>
|
||||||
|
#include <sys/hhook.h>
|
||||||
|
#include <sys/khelp.h>
|
||||||
|
#include <sys/module_khelp.h>
|
||||||
|
#include <sys/socket.h>
|
||||||
|
#include <sys/sockopt.h>
|
||||||
|
|
||||||
|
#include <net/vnet.h>
|
||||||
|
|
||||||
|
#include <netinet/in.h>
|
||||||
|
#include <netinet/in_pcb.h>
|
||||||
|
#include <netinet/tcp_seq.h>
|
||||||
|
#include <netinet/tcp_var.h>
|
||||||
|
|
||||||
|
#include <netinet/khelp/h_ertt.h>
|
||||||
|
|
||||||
|
#include <vm/uma.h>
|
||||||
|
|
||||||
|
uma_zone_t txseginfo_zone;
|
||||||
|
|
||||||
|
/* Smoothing factor for delayed ack guess. */
|
||||||
|
#define DLYACK_SMOOTH 5
|
||||||
|
|
||||||
|
/* Max number of time stamp errors allowed in a session. */
|
||||||
|
#define MAX_TS_ERR 10
|
||||||
|
|
||||||
|
static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
|
||||||
|
void *udata, void *ctx_data, void *hdata, struct osd *hosd);
|
||||||
|
static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
|
||||||
|
void *udata, void *ctx_data, void *hdata, struct osd *hosd);
|
||||||
|
static int ertt_mod_init(void);
|
||||||
|
static int ertt_mod_destroy(void);
|
||||||
|
static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
|
||||||
|
static void ertt_uma_dtor(void *mem, int size, void *arg);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Contains information about the sent segment for comparison with the
|
||||||
|
* corresponding ack.
|
||||||
|
*/
|
||||||
|
struct txseginfo {
|
||||||
|
/* Segment length in bytes; reduced as partial acks arrive for TSO entries. */
|
||||||
|
long len;
|
||||||
|
/* Sequence number of the first byte of the segment (host byte order). */
|
||||||
|
tcp_seq seq;
|
||||||
|
/*
 * Time stamp indicating when the packet was sent: derived from the
 * outgoing TCP timestamp option when one is present, otherwise ticks.
 */
|
||||||
|
uint32_t tx_ts;
|
||||||
|
/* Last received receiver ts (if the TCP option is used), else 0. */
|
||||||
|
uint32_t rx_ts;
|
||||||
|
/* Bitwise OR of the TXSI_* flags describing this entry. */
uint32_t flags;
|
||||||
|
/* Linkage on the per-connection txsegi_q list in struct ertt. */
TAILQ_ENTRY (txseginfo) txsegi_lnk;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Flags for struct txseginfo. */
|
||||||
|
#define TXSI_TSO 0x01 /* TSO was used for this entry. */
|
||||||
|
#define TXSI_RTT_MEASURE_START 0x02 /* Start a per RTT measurement. */
|
||||||
|
#define TXSI_RX_MEASURE_END 0x04 /* Measure the rx rate until this txsi. */
|
||||||
|
|
||||||
|
/*
 * Khelp helper descriptor for ERTT. It names the module init/destroy
 * callbacks defined in this file and sets the helper flags and class.
 * NOTE(review): the exact meaning of HELPER_NEEDS_OSD and HELPER_CLASS_TCP is
 * defined by the Khelp framework, not by this file -- confirm against khelp(9).
 */
struct helper ertt_helper = {
|
||||||
|
.mod_init = ertt_mod_init,
|
||||||
|
.mod_destroy = ertt_mod_destroy,
|
||||||
|
.h_flags = HELPER_NEEDS_OSD,
|
||||||
|
.h_classes = HELPER_CLASS_TCP
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Define the helper hook info required by ERTT: one entry for the
 * HHOOK_TCP_EST_IN hook point, handled by ertt_packet_measurement_hook(), and
 * one for the HHOOK_TCP_EST_OUT hook point, handled by
 * ertt_add_tx_segment_info_hook(). Neither entry carries hook user data.
 */
|
||||||
|
struct hookinfo ertt_hooks[] = {
|
||||||
|
{
|
||||||
|
.hook_type = HHOOK_TYPE_TCP,
|
||||||
|
.hook_id = HHOOK_TCP_EST_IN,
|
||||||
|
.hook_udata = NULL,
|
||||||
|
.hook_func = &ertt_packet_measurement_hook
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.hook_type = HHOOK_TYPE_TCP,
|
||||||
|
.hook_id = HHOOK_TCP_EST_OUT,
|
||||||
|
.hook_udata = NULL,
|
||||||
|
.hook_func = &ertt_add_tx_segment_info_hook
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Flags to indicate how marked_packet_rtt should handle this txsi. */
|
||||||
|
#define MULTI_ACK 0x01 /* More than this txsi is acked. */
|
||||||
|
#define OLD_TXSI 0x02 /* TXSI is old according to timestamps. */
|
||||||
|
#define CORRECT_ACK 0X04 /* Acks this TXSI. */
|
||||||
|
#define FORCED_MEASUREMENT 0X08 /* Force an RTT measurement. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This fuction measures the RTT of a particular segment/ack pair, or the next
|
||||||
|
* closest if this will yield an inaccurate result due to delayed acking or
|
||||||
|
* other issues.
|
||||||
|
*/
|
||||||
|
static void inline
|
||||||
|
marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
|
||||||
|
uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
|
||||||
|
int mflag)
|
||||||
|
{
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we can't measure this one properly due to delayed acking adjust
|
||||||
|
* byte counters and flag to measure next txsi. Note that since the
|
||||||
|
* marked packet's transmitted bytes are measured we need to subtract the
|
||||||
|
* transmitted bytes. Then pretend the next txsi was marked.
|
||||||
|
*/
|
||||||
|
if (mflag & (MULTI_ACK|OLD_TXSI)) {
|
||||||
|
*pmeasurenext = txsi->tx_ts;
|
||||||
|
*pmeasurenext_len = txsi->len;
|
||||||
|
*prtt_bytes_adjust += *pmeasurenext_len;
|
||||||
|
} else {
|
||||||
|
if (mflag & FORCED_MEASUREMENT) {
|
||||||
|
e_t->markedpkt_rtt = ticks - *pmeasurenext + 1;
|
||||||
|
e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
|
||||||
|
*pmeasurenext_len - *prtt_bytes_adjust;
|
||||||
|
} else {
|
||||||
|
e_t->markedpkt_rtt = ticks - txsi->tx_ts + 1;
|
||||||
|
e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
|
||||||
|
*prtt_bytes_adjust;
|
||||||
|
}
|
||||||
|
e_t->marked_snd_cwnd = tp->snd_cwnd;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
|
||||||
|
* add_tx_segment_info that a new measurement should be started.
|
||||||
|
*/
|
||||||
|
e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
|
||||||
|
/*
|
||||||
|
* Set ERTT_NEW_MEASUREMENT to tell the congestion control
|
||||||
|
* algorithm that a new marked RTT measurement has has been made
|
||||||
|
* and is available for use.
|
||||||
|
*/
|
||||||
|
e_t->flags |= ERTT_NEW_MEASUREMENT;
|
||||||
|
|
||||||
|
if (tp->t_flags & TF_TSO) {
|
||||||
|
/* Temporarily disable TSO to aid a new measurment. */
|
||||||
|
tp->t_flags &= ~TF_TSO;
|
||||||
|
/* Keep track that we've disabled it. */
|
||||||
|
e_t->flags |= ERTT_TSO_DISABLED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Ertt_packet_measurements uses a small amount of state kept on each packet
|
||||||
|
* sent to match incoming acknowledgements. This enables more accurate and
|
||||||
|
* secure round trip time measurements. The resulting measurement is used for
|
||||||
|
* congestion control algorithms which require a more accurate time.
|
||||||
|
* Ertt_packet_measurements is called via the helper hook in tcp_input.c
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
|
||||||
|
void *ctx_data, void *hdata, struct osd *hosd)
|
||||||
|
{
|
||||||
|
struct ertt *e_t;
|
||||||
|
struct tcpcb *tp;
|
||||||
|
struct tcphdr *th;
|
||||||
|
struct tcpopt *to;
|
||||||
|
struct tcp_hhook_data *thdp;
|
||||||
|
struct txseginfo *txsi;
|
||||||
|
int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
|
||||||
|
uint32_t measurenext, rts;
|
||||||
|
tcp_seq ack;
|
||||||
|
|
||||||
|
KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
|
||||||
|
KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
|
||||||
|
|
||||||
|
e_t = (struct ertt *)hdata;
|
||||||
|
thdp = ctx_data;
|
||||||
|
tp = thdp->tp;
|
||||||
|
th = thdp->th;
|
||||||
|
to = thdp->to;
|
||||||
|
new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
|
||||||
|
measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
|
||||||
|
acked = th->th_ack - tp->snd_una;
|
||||||
|
|
||||||
|
INP_WLOCK_ASSERT(tp->t_inpcb);
|
||||||
|
|
||||||
|
/* Packet has provided new acknowledgements. */
|
||||||
|
if (acked > 0 || new_sacked_bytes) {
|
||||||
|
if (acked == 0 && new_sacked_bytes) {
|
||||||
|
/* Use last sacked data. */
|
||||||
|
ack = tp->sackhint.last_sack_ack;
|
||||||
|
} else
|
||||||
|
ack = th->th_ack;
|
||||||
|
|
||||||
|
txsi = TAILQ_FIRST(&e_t->txsegi_q);
|
||||||
|
while (txsi != NULL) {
|
||||||
|
rts = 0;
|
||||||
|
|
||||||
|
/* Acknowledgement is acking more than this txsi. */
|
||||||
|
if (SEQ_GT(ack, txsi->seq + txsi->len)) {
|
||||||
|
if (txsi->flags & TXSI_RTT_MEASURE_START ||
|
||||||
|
measurenext) {
|
||||||
|
marked_packet_rtt(txsi, e_t, tp,
|
||||||
|
&measurenext, &measurenext_len,
|
||||||
|
&rtt_bytes_adjust, MULTI_ACK);
|
||||||
|
}
|
||||||
|
TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
|
||||||
|
uma_zfree(txseginfo_zone, txsi);
|
||||||
|
txsi = TAILQ_FIRST(&e_t->txsegi_q);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Guess if delayed acks are being used by the receiver.
|
||||||
|
*
|
||||||
|
* XXXDH: A simple heuristic that could be improved
|
||||||
|
*/
|
||||||
|
if (!new_sacked_bytes) {
|
||||||
|
if (acked > tp->t_maxseg) {
|
||||||
|
e_t->dlyack_rx +=
|
||||||
|
(e_t->dlyack_rx < DLYACK_SMOOTH) ?
|
||||||
|
1 : 0;
|
||||||
|
multiack = 1;
|
||||||
|
} else if (acked > txsi->len) {
|
||||||
|
multiack = 1;
|
||||||
|
e_t->dlyack_rx +=
|
||||||
|
(e_t->dlyack_rx < DLYACK_SMOOTH) ?
|
||||||
|
1 : 0;
|
||||||
|
} else if (acked == tp->t_maxseg ||
|
||||||
|
acked == txsi->len) {
|
||||||
|
e_t->dlyack_rx -=
|
||||||
|
(e_t->dlyack_rx > 0) ? 1 : 0;
|
||||||
|
}
|
||||||
|
/* Otherwise leave dlyack_rx the way it was. */
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Time stamps are only to help match the txsi with the
|
||||||
|
* received acknowledgements.
|
||||||
|
*/
|
||||||
|
if (e_t->timestamp_errors < MAX_TS_ERR &&
|
||||||
|
(to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
|
||||||
|
/*
|
||||||
|
* Note: All packets sent with the offload will
|
||||||
|
* have the same time stamp. If we are sending
|
||||||
|
* on a fast interface and the t_maxseg is much
|
||||||
|
* smaller than one tick, this will be fine. The
|
||||||
|
* time stamp would be the same whether we were
|
||||||
|
* using tso or not. However, if the interface
|
||||||
|
* is slow, this will cause problems with the
|
||||||
|
* calculations. If the interface is slow, there
|
||||||
|
* is not reason to be using tso, and it should
|
||||||
|
* be turned off.
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
* If there are too many time stamp errors, time
|
||||||
|
* stamps won't be trusted
|
||||||
|
*/
|
||||||
|
rts = to->to_tsecr;
|
||||||
|
/* Before this packet. */
|
||||||
|
if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
|
||||||
|
/* When delayed acking is used, the
|
||||||
|
* reflected time stamp is of the first
|
||||||
|
* packet and thus may be before
|
||||||
|
* txsi->tx_ts.
|
||||||
|
*/
|
||||||
|
break;
|
||||||
|
if (TSTMP_GT(rts, txsi->tx_ts)) {
|
||||||
|
/*
|
||||||
|
* If reflected time stamp is later than
|
||||||
|
* tx_tsi, then this txsi is old.
|
||||||
|
*/
|
||||||
|
if (txsi->flags & TXSI_RTT_MEASURE_START
|
||||||
|
|| measurenext) {
|
||||||
|
marked_packet_rtt(txsi, e_t, tp,
|
||||||
|
&measurenext, &measurenext_len,
|
||||||
|
&rtt_bytes_adjust, OLD_TXSI);
|
||||||
|
}
|
||||||
|
TAILQ_REMOVE(&e_t->txsegi_q, txsi,
|
||||||
|
txsegi_lnk);
|
||||||
|
uma_zfree(txseginfo_zone, txsi);
|
||||||
|
txsi = TAILQ_FIRST(&e_t->txsegi_q);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (rts == txsi->tx_ts &&
|
||||||
|
TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
|
||||||
|
/*
|
||||||
|
* Segment received before sent!
|
||||||
|
* Something is wrong with the received
|
||||||
|
* timestamps so increment errors. If
|
||||||
|
* this keeps up we will ignore
|
||||||
|
* timestamps.
|
||||||
|
*/
|
||||||
|
e_t->timestamp_errors++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Acknowledging a sequence number before this txsi.
|
||||||
|
* If it is an old txsi that may have had the same seq
|
||||||
|
* numbers, it should have been removed if time stamps
|
||||||
|
* are being used.
|
||||||
|
*/
|
||||||
|
if (SEQ_LEQ(ack, txsi->seq))
|
||||||
|
break; /* Before first packet in txsi. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Only ack > txsi->seq and ack <= txsi->seq+txsi->len
|
||||||
|
* past this point.
|
||||||
|
*
|
||||||
|
* If delayed acks are being used, an acknowledgement
|
||||||
|
* for a single segment will have been delayed by the
|
||||||
|
* receiver and will yield an inaccurate measurement. In
|
||||||
|
* this case, we only make the measurement if more than
|
||||||
|
* one segment is being acknowledged or sack is
|
||||||
|
* currently being used.
|
||||||
|
*/
|
||||||
|
if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
|
||||||
|
/* Make an accurate new measurement. */
|
||||||
|
e_t->rtt = ticks - txsi->tx_ts + 1;
|
||||||
|
|
||||||
|
if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
|
||||||
|
e_t->minrtt = e_t->rtt;
|
||||||
|
|
||||||
|
if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
|
||||||
|
e_t->maxrtt = e_t->rtt;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
|
||||||
|
marked_packet_rtt(txsi, e_t, tp,
|
||||||
|
&measurenext, &measurenext_len,
|
||||||
|
&rtt_bytes_adjust, CORRECT_ACK);
|
||||||
|
|
||||||
|
if (txsi->flags & TXSI_TSO) {
|
||||||
|
txsi->len -= acked;
|
||||||
|
if (txsi->len > 0) {
|
||||||
|
/*
|
||||||
|
* This presumes ack for first bytes in
|
||||||
|
* txsi, this may not be true but it
|
||||||
|
* shouldn't cause problems for the
|
||||||
|
* timing.
|
||||||
|
*
|
||||||
|
* We remeasure RTT even though we only
|
||||||
|
* have a single txsi. The rationale
|
||||||
|
* behind this is that it is better to
|
||||||
|
* have a slightly inaccurate
|
||||||
|
* measurement than no additional
|
||||||
|
* measurement for the rest of the bulk
|
||||||
|
* transfer. Since TSO is only used on
|
||||||
|
* high speed interface cards, so the
|
||||||
|
* packets should be transmitted at line
|
||||||
|
* rate back to back with little
|
||||||
|
* difference in transmission times (in
|
||||||
|
* ticks).
|
||||||
|
*/
|
||||||
|
txsi->seq += acked;
|
||||||
|
/*
|
||||||
|
* Reset txsi measure flag so we don't
|
||||||
|
* use it for another RTT measurement.
|
||||||
|
*/
|
||||||
|
txsi->flags &= ~TXSI_RTT_MEASURE_START;
|
||||||
|
/*
|
||||||
|
* There is still more data to be acked
|
||||||
|
* from tso bulk transmission, so we
|
||||||
|
* won't remove it from the TAILQ yet.
|
||||||
|
*/
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
|
||||||
|
uma_zfree(txseginfo_zone, txsi);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (measurenext) {
|
||||||
|
/*
|
||||||
|
* We need to do a RTT measurement. It won't be the best
|
||||||
|
* if we do it here.
|
||||||
|
*/
|
||||||
|
marked_packet_rtt(txsi, e_t, tp,
|
||||||
|
&measurenext, &measurenext_len,
|
||||||
|
&rtt_bytes_adjust, FORCED_MEASUREMENT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add information about a transmitted segment to a list.
|
||||||
|
* This is called via the helper hook in tcp_output.c
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
|
||||||
|
void *ctx_data, void *hdata, struct osd *hosd)
|
||||||
|
{
|
||||||
|
struct ertt *e_t;
|
||||||
|
struct tcpcb *tp;
|
||||||
|
struct tcphdr *th;
|
||||||
|
struct tcpopt *to;
|
||||||
|
struct tcp_hhook_data *thdp;
|
||||||
|
struct txseginfo *txsi;
|
||||||
|
long len;
|
||||||
|
int tso;
|
||||||
|
|
||||||
|
KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
|
||||||
|
KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
|
||||||
|
|
||||||
|
e_t = (struct ertt *)hdata;
|
||||||
|
thdp = ctx_data;
|
||||||
|
tp = thdp->tp;
|
||||||
|
th = thdp->th;
|
||||||
|
to = thdp->to;
|
||||||
|
len = thdp->len;
|
||||||
|
tso = thdp->tso;
|
||||||
|
|
||||||
|
INP_WLOCK_ASSERT(tp->t_inpcb);
|
||||||
|
|
||||||
|
if (len > 0) {
|
||||||
|
txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
|
||||||
|
if (txsi != NULL) {
|
||||||
|
/* Construct txsi setting the necessary flags. */
|
||||||
|
txsi->flags = 0; /* Needs to be initialised. */
|
||||||
|
txsi->seq = ntohl(th->th_seq);
|
||||||
|
txsi->len = len;
|
||||||
|
if (tso)
|
||||||
|
txsi->flags |= TXSI_TSO;
|
||||||
|
else if (e_t->flags & ERTT_TSO_DISABLED) {
|
||||||
|
tp->t_flags |= TF_TSO;
|
||||||
|
e_t->flags &= ~ERTT_TSO_DISABLED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
|
||||||
|
e_t->bytes_tx_in_rtt += len;
|
||||||
|
} else {
|
||||||
|
txsi->flags |= TXSI_RTT_MEASURE_START;
|
||||||
|
e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
|
||||||
|
e_t->bytes_tx_in_rtt = len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (((tp->t_flags & TF_NOOPT) == 0) &&
|
||||||
|
(to->to_flags & TOF_TS)) {
|
||||||
|
txsi->tx_ts = ntohl(to->to_tsval) -
|
||||||
|
tp->ts_offset;
|
||||||
|
txsi->rx_ts = ntohl(to->to_tsecr);
|
||||||
|
} else {
|
||||||
|
txsi->tx_ts = ticks;
|
||||||
|
txsi->rx_ts = 0; /* No received time stamp. */
|
||||||
|
}
|
||||||
|
TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
ertt_mod_init(void)
|
||||||
|
{
|
||||||
|
|
||||||
|
txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
|
||||||
|
NULL, NULL, NULL, NULL, 0, 0);
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
ertt_mod_destroy(void)
|
||||||
|
{
|
||||||
|
|
||||||
|
uma_zdestroy(txseginfo_zone);
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
ertt_uma_ctor(void *mem, int size, void *arg, int flags)
|
||||||
|
{
|
||||||
|
struct ertt *e_t;
|
||||||
|
|
||||||
|
e_t = mem;
|
||||||
|
|
||||||
|
TAILQ_INIT(&e_t->txsegi_q);
|
||||||
|
e_t->timestamp_errors = 0;
|
||||||
|
e_t->minrtt = 0;
|
||||||
|
e_t->maxrtt = 0;
|
||||||
|
e_t->rtt = 0;
|
||||||
|
e_t->flags = 0;
|
||||||
|
e_t->dlyack_rx = 0;
|
||||||
|
e_t->bytes_tx_in_rtt = 0;
|
||||||
|
e_t->markedpkt_rtt = 0;
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
ertt_uma_dtor(void *mem, int size, void *arg)
|
||||||
|
{
|
||||||
|
struct ertt *e_t;
|
||||||
|
struct txseginfo *n_txsi, *txsi;
|
||||||
|
|
||||||
|
e_t = mem;
|
||||||
|
txsi = TAILQ_FIRST(&e_t->txsegi_q);
|
||||||
|
while (txsi != NULL) {
|
||||||
|
n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
|
||||||
|
uma_zfree(txseginfo_zone, txsi);
|
||||||
|
txsi = n_txsi;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Declare the "ertt" Khelp module: ties together the helper descriptor, the
 * hook table, the size of the per-connection data block (struct ertt) and the
 * UMA constructor/destructor defined above.
 * NOTE(review): the literal 1 is presumably the module version -- confirm
 * against the KHELP_DECLARE_MOD_UMA definition in sys/module_khelp.h.
 */
KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
|
||||||
|
ertt_uma_ctor, ertt_uma_dtor);
|
89
sys/netinet/khelp/h_ertt.h
Normal file
89
sys/netinet/khelp/h_ertt.h
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
/*-
|
||||||
|
* Copyright (c) 2009-2010
|
||||||
|
* Swinburne University of Technology, Melbourne, Australia
|
||||||
|
* Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* This software was developed at the Centre for Advanced Internet
|
||||||
|
* Architectures, Swinburne University, by David Hayes, made possible in part by
|
||||||
|
* a grant from the Cisco University Research Program Fund at Community
|
||||||
|
* Foundation Silicon Valley.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||||
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
* $FreeBSD$
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The ERTT (Enhanced Round Trip Time) Khelp module calculates an estimate of
|
||||||
|
* the instantaneous TCP RTT which, for example, is used by delay-based
|
||||||
|
* congestion control schemes. When the module is loaded, ERTT data is
|
||||||
|
* calculated for each active TCP connection and encapsulated within a
|
||||||
|
* "struct ertt".
|
||||||
|
*
|
||||||
|
* This software was first released in 2010 by David Hayes and Lawrence Stewart
|
||||||
|
* whilst working on the NewTCP research project at Swinburne University's
|
||||||
|
* Centre for Advanced Internet Architectures, Melbourne, Australia, which was
|
||||||
|
* made possible in part by a grant from the Cisco University Research Program
|
||||||
|
* Fund at Community Foundation Silicon Valley. Testing and development was
|
||||||
|
* further assisted by a grant from the FreeBSD Foundation. More details are
|
||||||
|
* available at:
|
||||||
|
* http://caia.swin.edu.au/urp/newtcp/
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _NETINET_KHELP_H_ERTT_
|
||||||
|
#define _NETINET_KHELP_H_ERTT_
|
||||||
|
|
||||||
|
struct txseginfo;
|
||||||
|
|
||||||
|
/* Structure used as the ertt data block. */
|
||||||
|
struct ertt {
|
||||||
|
/* Queue of transmitted-segment records used to match incoming acks. */
|
||||||
|
TAILQ_HEAD(txseginfo_head, txseginfo) txsegi_q;
|
||||||
|
/* Bytes transmitted so far in the marked RTT currently being measured. */
|
||||||
|
long bytes_tx_in_rtt;
|
||||||
|
/* Final version of above, stored when a marked RTT measurement completes. */
|
||||||
|
long bytes_tx_in_marked_rtt;
|
||||||
|
/* Value of snd_cwnd captured when the marked RTT measurement completed. */
|
||||||
|
unsigned long marked_snd_cwnd;
|
||||||
|
/* Most recent per-packet measured RTT (computed from ticks). */
|
||||||
|
int rtt;
|
||||||
|
/* Maximum RTT measured; 0 means no measurement has been made yet. */
|
||||||
|
int maxrtt;
|
||||||
|
/* Minimum RTT measured; 0 means no measurement has been made yet. */
|
||||||
|
int minrtt;
|
||||||
|
/* Guess if the receiver is using delayed ack; non-zero means "probably". */
|
||||||
|
int dlyack_rx;
|
||||||
|
/* Count of inconsistencies seen in packet timestamps. */
|
||||||
|
int timestamp_errors;
|
||||||
|
/* RTT for a marked packet; valid once ERTT_NEW_MEASUREMENT is set. */
|
||||||
|
int markedpkt_rtt;
|
||||||
|
/* ERTT_* flags used to signal conditions between hook function calls. */
|
||||||
|
uint32_t flags;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Flags for struct ertt. */
|
||||||
|
#define ERTT_NEW_MEASUREMENT 0x01
|
||||||
|
#define ERTT_MEASUREMENT_IN_PROGRESS 0x02
|
||||||
|
#define ERTT_TSO_DISABLED 0x04
|
||||||
|
|
||||||
|
#endif /* _NETINET_KHELP_H_ERTT_ */
|
Loading…
x
Reference in New Issue
Block a user