From 62dd1037a967deecdae7d0cf37e61aefb65e69ca Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Sun, 22 Sep 2019 08:34:23 +0000 Subject: [PATCH 001/106] print summary line for space estimate of zfs send from bookmark Although there is always a single stream and the total size in the summary is always equal to the size reported for the stream, it's nice to follow the usual output format. MFC after: 3 days --- .../opensolaris/lib/libzfs/common/libzfs_sendrecv.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c index 290b8926d3a3..4b2122c2a501 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c @@ -2054,6 +2054,15 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) if (err == 0) { send_print_verbose(fout, zhp->zfs_name, from, size, flags.parsable); + if (flags.parsable) { + (void) fprintf(fout, "size\t%llu\n", + (longlong_t)size); + } else { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "total estimated size is %s\n"), buf); + } } else { (void) fprintf(stderr, "Cannot estimate send size: " "%s\n", strerror(errno)); From 38a1def12f6c7ac7240360e2442858c076fe1c2f Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Sun, 22 Sep 2019 08:44:41 +0000 Subject: [PATCH 002/106] MFZoL: Retire send space estimation via ZFS_IOC_SEND Add a small wrapper around libzfs_core's lzc_send_space() to libzfs so that every legacy ZFS_IOC_SEND consumer, along with their userland counterpart estimate_ioctl(), can leverage ZFS_IOC_SEND_SPACE to request send space estimation. The legacy functionality in zfs_ioc_send() is left untouched for compatibility purposes. 
Obtained from: ZoL Obtained from: zfsonlinux/zfs@cf7684bc8d57 Author: loli10K MFC after: 2 weeks --- .../lib/libzfs/common/libzfs_sendrecv.c | 57 ++++++++++--------- .../opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 3 + 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c index 4b2122c2a501..c1e07fa33106 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c @@ -948,39 +948,32 @@ typedef struct send_dump_data { } send_dump_data_t; static int -estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, - boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep) +zfs_send_space(zfs_handle_t *zhp, const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t *spacep) { - zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; + int error; - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); - assert(fromsnap_obj == 0 || !fromorigin); + assert(snapname != NULL); + error = lzc_send_space(snapname, from, flags, spacep); - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - zc.zc_obj = fromorigin; - zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); - zc.zc_fromobj = fromsnap_obj; - zc.zc_guid = 1; /* estimate flag */ - zc.zc_flags = flags; - - if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { + if (error != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "warning: cannot estimate space for '%s'"), zhp->zfs_name); + "warning: cannot estimate space for '%s'"), snapname); - switch (errno) { + switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: - if (zfs_dataset_exists(hdl, zc.zc_name, + if (zfs_dataset_exists(hdl, snapname, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "incremental source (@%s) does not exist"), - zc.zc_value); + "incremental source (%s) does not exist"), + snapname); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); @@ -994,16 +987,15 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, case ERANGE: case EFAULT: case EROFS: - zfs_error_aux(hdl, strerror(errno)); + case EINVAL: + zfs_error_aux(hdl, strerror(error)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: - return (zfs_standard_error(hdl, errno, errbuf)); + return (zfs_standard_error(hdl, error, errbuf)); } } - *sizep = zc.zc_objset_type; - return (0); } @@ -1290,13 +1282,22 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) (sdd->fromorigin || sdd->replicate); if (sdd->verbose || sdd->progress) { - (void) estimate_ioctl(zhp, sdd->prevsnap_obj, - fromorigin, flags, &size); - sdd->size += size; + char fromds[ZFS_MAX_DATASET_NAME_LEN]; - send_print_verbose(fout, zhp->zfs_name, - sdd->prevsnap[0] ? sdd->prevsnap : NULL, - size, sdd->parsable); + if (sdd->prevsnap[0] != '\0') { + (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds)); + *(strchr(fromds, '@') + 1) = '\0'; + (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds)); + } + if (zfs_send_space(zhp, zhp->zfs_name, + sdd->prevsnap[0] ? fromds : NULL, flags, &size) != 0) { + size = 0; /* cannot estimate send space */ + } else { + send_print_verbose(fout, zhp->zfs_name, + sdd->prevsnap[0] ? 
sdd->prevsnap : NULL, + size, sdd->parsable); + } + sdd->size += size; } if (!sdd->dryrun) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index f19fa8720968..575dd6904917 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -4799,6 +4799,9 @@ zfs_ioc_recv(zfs_cmd_t *zc) * * outputs: * zc_objset_type estimated size, if zc_guid is set + * + * NOTE: This is no longer the preferred interface, any new functionality + * should be added to zfs_ioc_send_new() instead. */ static int zfs_ioc_send(zfs_cmd_t *zc) From 44f2a3272e65fb34c3d57edf5973ba95e079ac1a Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sun, 22 Sep 2019 10:40:15 +0000 Subject: [PATCH 003/106] Cleanup the RTO calculation and perform some consistency checks before computing the RTO. This should fix an overflow issue reported by Felix Weinrank in https://github.com/sctplab/usrsctp/issues/375 for the userland stack and found by running a fuzz tester. MFC after: 3 days --- sys/netinet/sctp_indata.c | 41 +++++++++++++++++---------------------- sys/netinet/sctp_input.c | 13 +++++-------- sys/netinet/sctputil.c | 34 +++++++++++++++++++------------- sys/netinet/sctputil.h | 2 +- 4 files changed, 45 insertions(+), 45 deletions(-) diff --git a/sys/netinet/sctp_indata.c b/sys/netinet/sctp_indata.c index 0e729c9ccb33..5db87ae86b54 100644 --- a/sys/netinet/sctp_indata.c +++ b/sys/netinet/sctp_indata.c @@ -3108,13 +3108,12 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1 * update RTO too ? */ if (tp1->do_rtt) { - if (*rto_ok) { - tp1->whoTo->RTO = - sctp_calculate_rto(stcb, - &stcb->asoc, - tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (*rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { *rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { @@ -4086,16 +4085,12 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, /* update RTO too? */ if (tp1->do_rtt) { - if (rto_ok) { - tp1->whoTo->RTO = - /* - * sa_ignore - * NO_NULL_CHK - */ - sctp_calculate_rto(stcb, - asoc, tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { @@ -4704,12 +4699,12 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, /* update RTO too? 
*/ if (tp1->do_rtt) { - if (rto_ok) { - tp1->whoTo->RTO = - sctp_calculate_rto(stcb, - asoc, tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c index e356253d4128..27db0fd17138 100644 --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -548,7 +548,7 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); /* calculate the RTO */ - net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, + sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, SCTP_RTT_FROM_NON_DATA); retval = sctp_send_cookie_echo(m, offset, initack_limit, stcb, net); return (retval); @@ -648,7 +648,7 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, tv.tv_sec = cp->heartbeat.hb_info.time_value_1; tv.tv_usec = cp->heartbeat.hb_info.time_value_2; /* Now lets do a RTO with this */ - r_net->RTO = sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, + sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, SCTP_RTT_FROM_NON_DATA); if (!(r_net->dest_state & SCTP_ADDR_REACHABLE)) { r_net->dest_state |= SCTP_ADDR_REACHABLE; @@ -1674,8 +1674,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, old.tv_sec = cookie->time_entered.tv_sec; old.tv_usec = cookie->time_entered.tv_usec; net->hb_responded = 1; - net->RTO = sctp_calculate_rto(stcb, asoc, net, - &old, + sctp_calculate_rto(stcb, asoc, net, &old, SCTP_RTT_FROM_NON_DATA); if (stcb->asoc.sctp_autoclose_ticks && @@ -2399,8 +2398,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, /* calculate the RTT and set the encaps port */ old.tv_sec = cookie->time_entered.tv_sec; old.tv_usec = cookie->time_entered.tv_usec; - (*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp, - &old, SCTP_RTT_FROM_NON_DATA); + sctp_calculate_rto(stcb, asoc, *netp, &old, SCTP_RTT_FROM_NON_DATA); } /* respond with a COOKIE-ACK */ sctp_send_cookie_ack(stcb); @@ -2976,8 +2974,7 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, SCTP_STAT_INCR_COUNTER32(sctps_activeestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); if (asoc->overall_error_count == 0) { - net->RTO = sctp_calculate_rto(stcb, asoc, net, - &asoc->time_entered, + sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, SCTP_RTT_FROM_NON_DATA); } (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); diff --git a/sys/netinet/sctputil.c b/sys/netinet/sctputil.c index 183b290c2475..b40e1a22aa94 100644 --- a/sys/netinet/sctputil.c +++ b/sys/netinet/sctputil.c @@ -2469,25 +2469,24 @@ sctp_mtu_size_reset(struct sctp_inpcb *inp, /* - * given an association and starting time of the current RTT period return - * RTO in number of msecs net should point to the current network + * Given an association and starting time of the current RTT period, update + * RTO in number of msecs. net should point to the current network. + * Return 1, if an RTO update was performed, return 0 if no update was + * performed due to invalid starting point. */ -uint32_t +int sctp_calculate_rto(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_nets *net, struct timeval *old, int rtt_from_sack) { - /*- - * given an association and the starting time of the current RTT - * period (in value1/value2) return RTO in number of msecs. 
- */ + struct timeval now; + uint64_t rtt_us; /* RTT in us */ int32_t rtt; /* RTT in ms */ uint32_t new_rto; int first_measure = 0; - struct timeval now; /************************/ /* 1. calculate new RTT */ @@ -2498,10 +2497,19 @@ sctp_calculate_rto(struct sctp_tcb *stcb, } else { (void)SCTP_GETTIME_TIMEVAL(&now); } + if ((old->tv_sec > now.tv_sec) || + ((old->tv_sec == now.tv_sec) && (old->tv_sec > now.tv_sec))) { + /* The starting point is in the future. */ + return (0); + } timevalsub(&now, old); + rtt_us = (uint64_t)1000000 * (uint64_t)now.tv_sec + (uint64_t)now.tv_usec; + if (rtt_us > SCTP_RTO_UPPER_BOUND * 1000) { + /* The RTT is larger than a sane value. */ + return (0); + } /* store the current RTT in us */ - net->rtt = (uint64_t)1000000 * (uint64_t)now.tv_sec + - (uint64_t)now.tv_usec; + net->rtt = rtt_us; /* compute rtt in ms */ rtt = (int32_t)(net->rtt / 1000); if ((asoc->cc_functions.sctp_rtt_calculated) && (rtt_from_sack == SCTP_RTT_FROM_DATA)) { @@ -2533,7 +2541,7 @@ sctp_calculate_rto(struct sctp_tcb *stcb, * Paper "Congestion Avoidance and Control", Annex A. * * (net->lastsa >> SCTP_RTT_SHIFT) is the srtt - * (net->lastsa >> SCTP_RTT_VAR_SHIFT) is the rttvar + * (net->lastsv >> SCTP_RTT_VAR_SHIFT) is the rttvar */ if (net->RTO_measured) { rtt -= (net->lastsa >> SCTP_RTT_SHIFT); @@ -2574,8 +2582,8 @@ sctp_calculate_rto(struct sctp_tcb *stcb, if (new_rto > stcb->asoc.maxrto) { new_rto = stcb->asoc.maxrto; } - /* we are now returning the RTO */ - return (new_rto); + net->RTO = new_rto; + return (1); } /* diff --git a/sys/netinet/sctputil.h b/sys/netinet/sctputil.h index 690e6125b7ec..c67c021fd30d 100644 --- a/sys/netinet/sctputil.h +++ b/sys/netinet/sctputil.h @@ -133,7 +133,7 @@ uint32_t sctp_get_next_mtu(uint32_t); void sctp_timeout_handler(void *); -uint32_t +int sctp_calculate_rto(struct sctp_tcb *, struct sctp_association *, struct sctp_nets *, struct timeval *, int); From 1325a0de13f45421f1c5060f57e148cca7cdfaab Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sun, 22 Sep 2019 11:11:01 +0000 Subject: [PATCH 004/106] Don't hold the info lock when calling sctp_select_a_tag(). This avoids a double lock bug in the NAT colliding state processing of SCTP. Thanks to Felix Weinrank for finding and reporting this issue in https://github.com/sctplab/usrsctp/issues/374 He found this bug using fuzz testing. MFC after: 3 days --- sys/netinet/sctp_input.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c index 27db0fd17138..8a20dfee5133 100644 --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -703,34 +703,37 @@ static int sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) { /* - * return 0 means we want you to proceed with the abort non-zero - * means no abort processing + * Return 0 means we want you to proceed with the abort non-zero + * means no abort processing. 
*/ + uint32_t new_vtag; struct sctpasochead *head; if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { + new_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); + } else { + return (0); } if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) { /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); - stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + stcb->asoc.my_vtag = new_vtag; head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); - sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); SCTP_INP_INFO_WUNLOCK(); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); - } - if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) { + } else { /* * treat like a case where the cookie expired i.e.: - dump * current cookie. - generate a new vtag. - resend init. @@ -740,15 +743,15 @@ sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, &stcb->asoc); - stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + stcb->asoc.my_vtag = new_vtag; head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); - sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); SCTP_INP_INFO_WUNLOCK(); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } return (0); From 789f4e26e60218bfce58004971bcdcca4e7b1bc4 Mon Sep 17 00:00:00 2001 From: Mike Karels Date: Sun, 22 Sep 2019 13:56:27 +0000 Subject: [PATCH 005/106] Add support for ps -H on corefiles in libkvm Add support for kernel threads in kvm_getprocs() and the underlying kvm_proclist() in libkvm when fetching from a kernel core file. This has been missing/needed for several releases, when kernel threads became normal threads. The loop over the processes now contains a sub-loop for threads, which iterates beyond the first thread only when threads are requested. Also set some fields such as tid that were previously uninitialized. Reviewed by: vangyzen jhb(earlier revision) MFC after: 4 days Sponsored by: Forcepoint LLC Differential Revision: https://reviews.freebsd.org/D21461 --- lib/libkvm/kvm_proc.c | 259 +++++++++++++++++++++++++----------------- 1 file changed, 152 insertions(+), 107 deletions(-) diff --git a/lib/libkvm/kvm_proc.c b/lib/libkvm/kvm_proc.c index 409be3d64885..c97a347decd7 100644 --- a/lib/libkvm/kvm_proc.c +++ b/lib/libkvm/kvm_proc.c @@ -67,6 +67,7 @@ __SCCSID("@(#)kvm_proc.c 8.3 (Berkeley) 9/23/93"); #include #include #include +#include #include #include #include @@ -130,13 +131,16 @@ kvm_proclist(kvm_t *kd, int what, int arg, struct proc *p, struct proc pproc; struct sysentvec sysent; char svname[KI_EMULNAMELEN]; + struct thread *td = NULL; + bool first_thread; kp = &kinfo_proc; kp->ki_structsize = sizeof(kinfo_proc); /* - * Loop on the processes. 
this is completely broken because we need to be - * able to loop on the threads and merge the ones that are the same process some how. + * Loop on the processes, then threads within the process if requested. */ + if (what == KERN_PROC_ALL) + what |= KERN_PROC_INC_THREAD; for (; cnt < maxcnt && p != NULL; p = LIST_NEXT(&proc, p_list)) { memset(kp, 0, sizeof *kp); if (KREAD(kd, (u_long)p, &proc)) { @@ -145,15 +149,6 @@ kvm_proclist(kvm_t *kd, int what, int arg, struct proc *p, } if (proc.p_state == PRS_NEW) continue; - if (proc.p_state != PRS_ZOMBIE) { - if (KREAD(kd, (u_long)TAILQ_FIRST(&proc.p_threads), - &mtd)) { - _kvm_err(kd, kd->program, - "can't read thread at %p", - TAILQ_FIRST(&proc.p_threads)); - return (-1); - } - } if (KREAD(kd, (u_long)proc.p_ucred, &ucred) == 0) { kp->ki_ruid = ucred.cr_ruid; kp->ki_svuid = ucred.cr_svuid; @@ -222,6 +217,7 @@ kvm_proclist(kvm_t *kd, int what, int arg, struct proc *p, kp->ki_addr = 0; /* XXX uarea */ /* kp->ki_kstack = proc.p_thread.td_kstack; XXXKSE */ kp->ki_args = proc.p_args; + kp->ki_numthreads = proc.p_numthreads; kp->ki_tracep = proc.p_tracevp; kp->ki_textvp = proc.p_textvp; kp->ki_fd = proc.p_fd; @@ -285,9 +281,6 @@ kvm_proclist(kvm_t *kd, int what, int arg, struct proc *p, kp->ki_sid = sess.s_sid; (void)memcpy(kp->ki_login, sess.s_login, sizeof(kp->ki_login)); - kp->ki_kiflag = sess.s_ttyvp ? KI_CTTY : 0; - if (sess.s_leader == p) - kp->ki_kiflag |= KI_SLEADER; if ((proc.p_flag & P_CONTROLT) && sess.s_ttyp != NULL) { if (KREAD(kd, (u_long)sess.s_ttyp, &tty)) { _kvm_err(kd, kd->program, @@ -330,9 +323,6 @@ kvm_proclist(kvm_t *kd, int what, int arg, struct proc *p, nopgrp: kp->ki_tdev = NODEV; } - if ((proc.p_state != PRS_ZOMBIE) && mtd.td_wmesg) - (void)kvm_read(kd, (u_long)mtd.td_wmesg, - kp->ki_wmesg, WMESGLEN); (void)kvm_read(kd, (u_long)proc.p_vmspace, (char *)&vmspace, sizeof(vmspace)); @@ -374,85 +364,127 @@ kvm_proclist(kvm_t *kd, int what, int arg, struct proc *p, sizeof(svname)); if (svname[0] != 0) strlcpy(kp->ki_emul, svname, KI_EMULNAMELEN); - if ((proc.p_state != PRS_ZOMBIE) && - (mtd.td_blocked != 0)) { - kp->ki_kiflag |= KI_LOCKBLOCK; - if (mtd.td_lockname) - (void)kvm_read(kd, - (u_long)mtd.td_lockname, - kp->ki_lockname, LOCKNAMELEN); - kp->ki_lockname[LOCKNAMELEN] = 0; - } kp->ki_runtime = cputick2usec(proc.p_rux.rux_runtime); kp->ki_pid = proc.p_pid; - kp->ki_siglist = proc.p_siglist; - SIGSETOR(kp->ki_siglist, mtd.td_siglist); - kp->ki_sigmask = mtd.td_sigmask; kp->ki_xstat = KW_EXITCODE(proc.p_xexit, proc.p_xsig); kp->ki_acflag = proc.p_acflag; kp->ki_lock = proc.p_lock; - if (proc.p_state != PRS_ZOMBIE) { - kp->ki_swtime = (ticks - proc.p_swtick) / hz; - kp->ki_flag = proc.p_flag; - kp->ki_sflag = 0; - kp->ki_nice = proc.p_nice; - kp->ki_traceflag = proc.p_traceflag; - if (proc.p_state == PRS_NORMAL) { - if (TD_ON_RUNQ(&mtd) || - TD_CAN_RUN(&mtd) || - TD_IS_RUNNING(&mtd)) { - kp->ki_stat = SRUN; - } else if (mtd.td_state == - TDS_INHIBITED) { - if (P_SHOULDSTOP(&proc)) { - kp->ki_stat = SSTOP; - } else if ( - TD_IS_SLEEPING(&mtd)) { - kp->ki_stat = SSLEEP; - } else if (TD_ON_LOCK(&mtd)) { - kp->ki_stat = SLOCK; - } else { - kp->ki_stat = SWAIT; - } - } - } else { - kp->ki_stat = SIDL; - } - /* Stuff from the thread */ - kp->ki_pri.pri_level = mtd.td_priority; - kp->ki_pri.pri_native = mtd.td_base_pri; - kp->ki_lastcpu = mtd.td_lastcpu; - kp->ki_wchan = mtd.td_wchan; - kp->ki_oncpu = mtd.td_oncpu; - if (mtd.td_name[0] != '\0') - strlcpy(kp->ki_tdname, mtd.td_name, sizeof(kp->ki_tdname)); - kp->ki_pctcpu = 0; - 
kp->ki_rqindex = 0; - - /* - * Note: legacy fields; wraps at NO_CPU_OLD or the - * old max CPU value as appropriate - */ - if (mtd.td_lastcpu == NOCPU) - kp->ki_lastcpu_old = NOCPU_OLD; - else if (mtd.td_lastcpu > MAXCPU_OLD) - kp->ki_lastcpu_old = MAXCPU_OLD; - else - kp->ki_lastcpu_old = mtd.td_lastcpu; - - if (mtd.td_oncpu == NOCPU) - kp->ki_oncpu_old = NOCPU_OLD; - else if (mtd.td_oncpu > MAXCPU_OLD) - kp->ki_oncpu_old = MAXCPU_OLD; - else - kp->ki_oncpu_old = mtd.td_oncpu; - } else { - kp->ki_stat = SZOMB; - } kp->ki_tdev_freebsd11 = kp->ki_tdev; /* truncate */ - bcopy(&kinfo_proc, bp, sizeof(kinfo_proc)); - ++bp; - ++cnt; + + /* Per-thread items; iterate as appropriate. */ + td = TAILQ_FIRST(&proc.p_threads); + for (first_thread = true; cnt < maxcnt && td != NULL && + (first_thread || (what & KERN_PROC_INC_THREAD)); + first_thread = false) { + if (proc.p_state != PRS_ZOMBIE) { + if (KREAD(kd, (u_long)td, &mtd)) { + _kvm_err(kd, kd->program, + "can't read thread at %p", td); + return (-1); + } + if (what & KERN_PROC_INC_THREAD) + td = TAILQ_NEXT(&mtd, td_plist); + } else + td = NULL; + if ((proc.p_state != PRS_ZOMBIE) && mtd.td_wmesg) + (void)kvm_read(kd, (u_long)mtd.td_wmesg, + kp->ki_wmesg, WMESGLEN); + else + memset(kp->ki_wmesg, 0, WMESGLEN); + if (proc.p_pgrp == NULL) { + kp->ki_kiflag = 0; + } else { + kp->ki_kiflag = sess.s_ttyvp ? KI_CTTY : 0; + if (sess.s_leader == p) + kp->ki_kiflag |= KI_SLEADER; + } + if ((proc.p_state != PRS_ZOMBIE) && + (mtd.td_blocked != 0)) { + kp->ki_kiflag |= KI_LOCKBLOCK; + if (mtd.td_lockname) + (void)kvm_read(kd, + (u_long)mtd.td_lockname, + kp->ki_lockname, LOCKNAMELEN); + else + memset(kp->ki_lockname, 0, + LOCKNAMELEN); + kp->ki_lockname[LOCKNAMELEN] = 0; + } else + kp->ki_kiflag &= ~KI_LOCKBLOCK; + kp->ki_siglist = proc.p_siglist; + if (proc.p_state != PRS_ZOMBIE) { + SIGSETOR(kp->ki_siglist, mtd.td_siglist); + kp->ki_sigmask = mtd.td_sigmask; + kp->ki_swtime = (ticks - proc.p_swtick) / hz; + kp->ki_flag = proc.p_flag; + kp->ki_sflag = 0; + kp->ki_nice = proc.p_nice; + kp->ki_traceflag = proc.p_traceflag; + if (proc.p_state == PRS_NORMAL) { + if (TD_ON_RUNQ(&mtd) || + TD_CAN_RUN(&mtd) || + TD_IS_RUNNING(&mtd)) { + kp->ki_stat = SRUN; + } else if (mtd.td_state == + TDS_INHIBITED) { + if (P_SHOULDSTOP(&proc)) { + kp->ki_stat = SSTOP; + } else if ( + TD_IS_SLEEPING(&mtd)) { + kp->ki_stat = SSLEEP; + } else if (TD_ON_LOCK(&mtd)) { + kp->ki_stat = SLOCK; + } else { + kp->ki_stat = SWAIT; + } + } + } else { + kp->ki_stat = SIDL; + } + /* Stuff from the thread */ + kp->ki_pri.pri_level = mtd.td_priority; + kp->ki_pri.pri_native = mtd.td_base_pri; + kp->ki_lastcpu = mtd.td_lastcpu; + kp->ki_wchan = mtd.td_wchan; + kp->ki_oncpu = mtd.td_oncpu; + if (mtd.td_name[0] != '\0') + strlcpy(kp->ki_tdname, mtd.td_name, + sizeof(kp->ki_tdname)); + else + memset(kp->ki_tdname, 0, + sizeof(kp->ki_tdname)); + kp->ki_pctcpu = 0; + kp->ki_rqindex = 0; + + /* + * Note: legacy fields; wraps at NO_CPU_OLD + * or the old max CPU value as appropriate + */ + if (mtd.td_lastcpu == NOCPU) + kp->ki_lastcpu_old = NOCPU_OLD; + else if (mtd.td_lastcpu > MAXCPU_OLD) + kp->ki_lastcpu_old = MAXCPU_OLD; + else + kp->ki_lastcpu_old = mtd.td_lastcpu; + + if (mtd.td_oncpu == NOCPU) + kp->ki_oncpu_old = NOCPU_OLD; + else if (mtd.td_oncpu > MAXCPU_OLD) + kp->ki_oncpu_old = MAXCPU_OLD; + else + kp->ki_oncpu_old = mtd.td_oncpu; + kp->ki_tid = mtd.td_tid; + } else { + memset(&kp->ki_sigmask, 0, + sizeof(kp->ki_sigmask)); + kp->ki_stat = SZOMB; + kp->ki_tid = 0; + } + + bcopy(&kinfo_proc, 
bp, sizeof(kinfo_proc)); + ++bp; + ++cnt; + } } return (cnt); } @@ -466,7 +498,7 @@ kvm_deadprocs(kvm_t *kd, int what, int arg, u_long a_allproc, u_long a_zombproc, int maxcnt) { struct kinfo_proc *bp = kd->procbase; - int acnt, zcnt; + int acnt, zcnt = 0; struct proc *p; if (KREAD(kd, a_allproc, &p)) { @@ -477,13 +509,15 @@ kvm_deadprocs(kvm_t *kd, int what, int arg, u_long a_allproc, if (acnt < 0) return (acnt); - if (KREAD(kd, a_zombproc, &p)) { - _kvm_err(kd, kd->program, "cannot read zombproc"); - return (-1); + if (a_zombproc != 0) { + if (KREAD(kd, a_zombproc, &p)) { + _kvm_err(kd, kd->program, "cannot read zombproc"); + return (-1); + } + zcnt = kvm_proclist(kd, what, arg, p, bp + acnt, maxcnt - acnt); + if (zcnt < 0) + zcnt = 0; } - zcnt = kvm_proclist(kd, what, arg, p, bp + acnt, maxcnt - acnt); - if (zcnt < 0) - zcnt = 0; return (acnt + zcnt); } @@ -568,15 +602,18 @@ kvm_getprocs(kvm_t *kd, int op, int arg, int *cnt) liveout: nprocs = size == 0 ? 0 : size / kd->procbase->ki_structsize; } else { - struct nlist nl[7], *p; + struct nlist nl[6], *p; + struct nlist nlz[2]; nl[0].n_name = "_nprocs"; nl[1].n_name = "_allproc"; - nl[2].n_name = "_zombproc"; - nl[3].n_name = "_ticks"; - nl[4].n_name = "_hz"; - nl[5].n_name = "_cpu_tick_frequency"; - nl[6].n_name = 0; + nl[2].n_name = "_ticks"; + nl[3].n_name = "_hz"; + nl[4].n_name = "_cpu_tick_frequency"; + nl[5].n_name = 0; + + nlz[0].n_name = "_zombproc"; + nlz[1].n_name = 0; if (!kd->arch->ka_native(kd)) { _kvm_err(kd, kd->program, @@ -591,19 +628,27 @@ kvm_getprocs(kvm_t *kd, int op, int arg, int *cnt) "%s: no such symbol", p->n_name); return (0); } + (void) kvm_nlist(kd, nlz); /* attempt to get zombproc */ if (KREAD(kd, nl[0].n_value, &nprocs)) { _kvm_err(kd, kd->program, "can't read nprocs"); return (0); } - if (KREAD(kd, nl[3].n_value, &ticks)) { + /* + * If returning all threads, we don't know how many that + * might be. Presume that there are, on average, no more + * than 10 threads per process. + */ + if (op == KERN_PROC_ALL || (op & KERN_PROC_INC_THREAD)) + nprocs *= 10; /* XXX */ + if (KREAD(kd, nl[2].n_value, &ticks)) { _kvm_err(kd, kd->program, "can't read ticks"); return (0); } - if (KREAD(kd, nl[4].n_value, &hz)) { + if (KREAD(kd, nl[3].n_value, &hz)) { _kvm_err(kd, kd->program, "can't read hz"); return (0); } - if (KREAD(kd, nl[5].n_value, &cpu_tick_frequency)) { + if (KREAD(kd, nl[4].n_value, &cpu_tick_frequency)) { _kvm_err(kd, kd->program, "can't read cpu_tick_frequency"); return (0); @@ -614,7 +659,7 @@ kvm_getprocs(kvm_t *kd, int op, int arg, int *cnt) return (0); nprocs = kvm_deadprocs(kd, op, arg, nl[1].n_value, - nl[2].n_value, nprocs); + nlz[0].n_value, nprocs); if (nprocs <= 0) { _kvm_freeprocs(kd); nprocs = 0; From 2de5a21edcc31f0de59073a8293426efabc39795 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sun, 22 Sep 2019 16:10:25 +0000 Subject: [PATCH 006/106] loader_4th: menu items need to reset color attribute, not switch to white Forth menu kernel and BE entries, instead of resetting the color attribute, are switching to white color. --- stand/forth/menu.4th | 4 ++-- stand/forth/menu.rc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/stand/forth/menu.4th b/stand/forth/menu.4th index e3fe0f7d776e..ca166c6d934b 100644 --- a/stand/forth/menu.4th +++ b/stand/forth/menu.4th @@ -397,7 +397,7 @@ also menu-infrastructure definitions setenv \ Assign third to ansi_caption[x][y] - kerncapbuf 0 s" @[1mK@[37mernel: " [char] @ escc! strcat + kerncapbuf 0 s" @[1mK@[mernel: " [char] @ escc! 
strcat kernmenuidx @ [char] 0 = if s" default/@[32m" else @@ -405,7 +405,7 @@ also menu-infrastructure definitions then [char] @ escc! strcat 2over strcat - s" @[37m" [char] @ escc! strcat + s" @[m" [char] @ escc! strcat kernidx @ kernmenuidx @ ansi_caption[x][y] setenv diff --git a/stand/forth/menu.rc b/stand/forth/menu.rc index 6fe3dfe14281..d640e803e61f 100644 --- a/stand/forth/menu.rc +++ b/stand/forth/menu.rc @@ -72,7 +72,7 @@ s" currdev" getenv dup 0> [if] drop 4 s" zfs:" compare 0= [if] set mainmenu_caption[7]="Select Boot [E]nvironment..." set mainmenu_command[7]="3 goto_menu" set mainmenu_keycode[7]=101 - set mainansi_caption[7]="Select Boot ^[1mE^[37mnvironment..." + set mainansi_caption[7]="Select Boot ^[1mE^[mnvironment..." s" chain_disk" getenv? [if] set mainmenu_caption[8]="Chain[L]oad ${chain_disk}" From 6dd078df1971b83e7676c614084f1c558817ef6e Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sun, 22 Sep 2019 17:39:20 +0000 Subject: [PATCH 007/106] loader_lua: lua color changes should end with reset The color change should have reset sequence, not switch to white. --- stand/lua/color.lua | 8 ++++---- stand/lua/logo-beastie.lua | 2 +- stand/lua/logo-orb.lua | 2 +- stand/lua/menu.lua | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/stand/lua/color.lua b/stand/lua/color.lua index af4e1f4090ad..c5a3e1215c7e 100644 --- a/stand/lua/color.lua +++ b/stand/lua/color.lua @@ -42,7 +42,7 @@ color.MAGENTA = 5 color.CYAN = 6 color.WHITE = 7 -color.DEFAULT = 0 +color.DEFAULT = 9 color.BRIGHT = 1 color.DIM = 2 @@ -67,7 +67,7 @@ function color.resetfg() if color.disabled then return '' end - return color.escapefg(color.WHITE) + return color.escapefg(color.DEFAULT) end function color.escapebg(color_value) @@ -81,7 +81,7 @@ function color.resetbg() if color.disabled then return '' end - return color.escapebg(color.BLACK) + return color.escapebg(color.DEFAULT) end function color.escape(fg_color, bg_color, attribute) @@ -101,7 +101,7 @@ function color.default() if color.disabled then return "" end - return color.escape(color.WHITE, color.BLACK, color.DEFAULT) + return color.escape(color.DEFAULT, color.DEFAULT) end function color.highlight(str) diff --git a/stand/lua/logo-beastie.lua b/stand/lua/logo-beastie.lua index d01dc14b2d85..758cec4e91de 100644 --- a/stand/lua/logo-beastie.lua +++ b/stand/lua/logo-beastie.lua @@ -48,7 +48,7 @@ local beastie_color = { " \\ / /\\", " \027[36m______\027[31m( (_ / \\______/", " \027[36m,' ,-----' |", -" `--{__________)\027[37m" +" `--{__________)\027[m" } drawer.addLogo("beastie", { diff --git a/stand/lua/logo-orb.lua b/stand/lua/logo-orb.lua index 04a9fc09c6e6..3fe265a31e43 100644 --- a/stand/lua/logo-orb.lua +++ b/stand/lua/logo-orb.lua @@ -44,7 +44,7 @@ local orb_color = { " -- \027[31;1m-.\027[31m", " `:` \027[31;1m`:`", " \027[31;1m.-- `--.", -" .---.....----.\027[37m" +" .---.....----.\027[m" } drawer.addLogo("orb", { diff --git a/stand/lua/menu.lua b/stand/lua/menu.lua index d811a240fca3..51098844e87e 100644 --- a/stand/lua/menu.lua +++ b/stand/lua/menu.lua @@ -47,10 +47,10 @@ local return_menu_entry = { local function OnOff(str, value) if value then return str .. color.escapefg(color.GREEN) .. "On" .. - color.escapefg(color.WHITE) + color.resetfg() else return str .. color.escapefg(color.RED) .. "off" .. 
- color.escapefg(color.WHITE) + color.resetfg() end end From b16a3c9d192f145d7fa4f899f500279392803e1c Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Sun, 22 Sep 2019 18:27:57 +0000 Subject: [PATCH 008/106] Honor CWARNFLAGS.clang/gcc in the kernel build Some kernel builds or users may want to disable warnings on a per-compiler basis, so do this now. --- sys/conf/kern.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/conf/kern.mk b/sys/conf/kern.mk index 941bbe93c463..949b0b2ecacc 100644 --- a/sys/conf/kern.mk +++ b/sys/conf/kern.mk @@ -253,6 +253,7 @@ CFLAGS+= -gdwarf-2 .endif CFLAGS+= ${CWARNFLAGS:M*} ${CWARNFLAGS.${.IMPSRC:T}} +CFLAGS+= ${CWARNFLAGS.${COMPILER_TYPE}} CFLAGS+= ${CFLAGS.${COMPILER_TYPE}} ${CFLAGS.${.IMPSRC:T}} # Tell bmake not to mistake standard targets for things to be searched for From 2946ed83c059724edb09648970ec69ab34b2dab0 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Sun, 22 Sep 2019 18:30:19 +0000 Subject: [PATCH 009/106] octeon1: suppress a couple of warnings under clang These appear in octeon-sdk -- there are new releases, but they don't seem to address the running issues in octeon-sdk. GCC4.2 is more than happy, but clang is much less-so and most of them are fairly innocuous and perhaps a by-product of their style guide, which may make some of the changes harder to upstream (if this is even possible anymore). --- sys/mips/cavium/std.octeon1 | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/mips/cavium/std.octeon1 b/sys/mips/cavium/std.octeon1 index fa6ae0746d98..d5ff64bf7044 100644 --- a/sys/mips/cavium/std.octeon1 +++ b/sys/mips/cavium/std.octeon1 @@ -7,3 +7,4 @@ cpu CPU_CNMIPS makeoptions CFLAGS_PARAM_INLINE_UNIT_GROWTH=10000 makeoptions CFLAGS_PARAM_LARGE_FUNCTION_GROWTH=100000 makeoptions CFLAGS_ARCH_PARAMS="--param max-inline-insns-single=10000" +makeoptions "CWARNFLAGS.clang"+="-Wno-parentheses-equality -Wno-pointer-sign" From da0a7834acc1b1b44dfc2031f326fc64b0b0d4f5 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Sun, 22 Sep 2019 18:32:05 +0000 Subject: [PATCH 010/106] octeon-sdk: suppress another set of warnings under clang Clang sees this construct and warns that adding an int to a string like this does not concatenate the two. Fortunately, this is not what octeon-sdk actually intended to do, so we take the path towards remediation that clang offers: use array indexing instead. 
--- sys/contrib/octeon-sdk/cvmx-app-init.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/contrib/octeon-sdk/cvmx-app-init.h b/sys/contrib/octeon-sdk/cvmx-app-init.h index ba62e0a1c403..11d83ae645f6 100644 --- a/sys/contrib/octeon-sdk/cvmx-app-init.h +++ b/sys/contrib/octeon-sdk/cvmx-app-init.h @@ -355,7 +355,7 @@ enum cvmx_chip_types_enum { #define CVMX_BOARD_TYPE_NAO38 CVMX_BOARD_TYPE_NAC38 /* Functions to return string based on type */ -#define ENUM_BRD_TYPE_CASE(x) case x: return(#x + 16); /* Skip CVMX_BOARD_TYPE_ */ +#define ENUM_BRD_TYPE_CASE(x) case x: return(&#x[16]); /* Skip CVMX_BOARD_TYPE_ */ static inline const char *cvmx_board_type_to_string(enum cvmx_board_types_enum type) { switch (type) @@ -491,7 +491,7 @@ static inline const char *cvmx_board_type_to_string(enum cvmx_board_types_enum t return "Unsupported Board"; } -#define ENUM_CHIP_TYPE_CASE(x) case x: return(#x + 15); /* Skip CVMX_CHIP_TYPE */ +#define ENUM_CHIP_TYPE_CASE(x) case x: return(&#x[15]); /* Skip CVMX_CHIP_TYPE */ static inline const char *cvmx_chip_type_to_string(enum cvmx_chip_types_enum type) { switch (type) From a5181a86a225bfd98f9e301270a350ea703bfc6b Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sun, 22 Sep 2019 19:20:37 +0000 Subject: [PATCH 011/106] amd64: minor tweaks to pat decoding in sysctl vm.pmap.kernel_maps. Decode PAT_UNCACHED. When unknown pat mode is encountered, print the pte bits combination instead of the index, which is always 8. Reviewed by: markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D21738 --- sys/amd64/amd64/pmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 5798c8253d1e..703b225592ca 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -10023,6 +10023,9 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, case PAT_UNCACHEABLE: mode = "UC"; break; + case PAT_UNCACHED: + mode = "U-"; + break; case PAT_WRITE_PROTECTED: mode = "WP"; break; @@ -10031,7 +10034,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, break; default: printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", - __func__, i, range->sva, eva); + __func__, pat_idx, range->sva, eva); mode = "??"; break; } From b223a69238ee8f8d73eff6b3315717f8aa341129 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sun, 22 Sep 2019 19:23:00 +0000 Subject: [PATCH 012/106] i386: implement sysctl vm.pmap.kernel_maps. Reviewed by: markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D21739 --- sys/i386/i386/pmap.c | 242 ++++++++++++++++++++++++++++++++++- sys/i386/i386/pmap_base.c | 11 ++ sys/i386/include/pmap_base.h | 1 + 3 files changed, 253 insertions(+), 1 deletion(-) diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index f07f500e8977..005af4e7cd52 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -121,6 +121,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -1130,6 +1131,38 @@ __CONCAT(PMTYPE, cache_bits)(pmap_t pmap, int mode, boolean_t is_pde) return (cache_bits); } +static int +pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) +{ + int pat_flag, pat_idx; + + if ((cpu_feature & CPUID_PAT) == 0) + return (0); + + pat_idx = 0; + /* The PAT bit is different for PTE's and PDE's. */ + pat_flag = is_pde ? 
PG_PDE_PAT : PG_PTE_PAT; + + if ((pte & pat_flag) != 0) + pat_idx |= 0x4; + if ((pte & PG_NC_PCD) != 0) + pat_idx |= 0x2; + if ((pte & PG_NC_PWT) != 0) + pat_idx |= 0x1; + + /* See pmap_init_pat(). */ + if (pat_works) { + if (pat_idx == 4) + pat_idx = 0; + if (pat_idx == 7) + pat_idx = 3; + } else { + /* XXXKIB */ + } + + return (pat_idx); +} + static bool __CONCAT(PMTYPE, ps_enabled)(pmap_t pmap __unused) { @@ -6083,7 +6116,213 @@ __CONCAT(PMTYPE, bios16_leave)(void *arg) free(h->pte, M_TEMP); /* ... and free it */ } -#define PMM(a) \ +struct pmap_kernel_map_range { + vm_offset_t sva; + pt_entry_t attrs; + int ptes; + int pdes; + int pdpes; +}; + +static void +sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, + vm_offset_t eva) +{ + const char *mode; + int i, pat_idx; + + if (eva <= range->sva) + return; + + pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); + for (i = 0; i < PAT_INDEX_SIZE; i++) + if (pat_index[i] == pat_idx) + break; + + switch (i) { + case PAT_WRITE_BACK: + mode = "WB"; + break; + case PAT_WRITE_THROUGH: + mode = "WT"; + break; + case PAT_UNCACHEABLE: + mode = "UC"; + break; + case PAT_UNCACHED: + mode = "U-"; + break; + case PAT_WRITE_PROTECTED: + mode = "WP"; + break; + case PAT_WRITE_COMBINING: + mode = "WC"; + break; + default: + printf("%s: unknown PAT mode %#x for range 0x%08x-0x%08x\n", + __func__, pat_idx, range->sva, eva); + mode = "??"; + break; + } + + sbuf_printf(sb, "0x%08x-0x%08x r%c%c%c%c %s %d %d %d\n", + range->sva, eva, + (range->attrs & PG_RW) != 0 ? 'w' : '-', +#ifdef PMAP_PAE_COMP + (range->attrs & pg_nx) != 0 ? '-' : 'x', +#else + '-', +#endif + (range->attrs & PG_U) != 0 ? 'u' : 's', + (range->attrs & PG_G) != 0 ? 'g' : '-', + mode, range->pdpes, range->pdes, range->ptes); + + /* Reset to sentinel value. */ + range->sva = 0xffffffff; +} + +/* + * Determine whether the attributes specified by a page table entry match those + * being tracked by the current range. This is not quite as simple as a direct + * flag comparison since some PAT modes have multiple representations. + */ +static bool +sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) +{ + pt_entry_t diff, mask; + + mask = PG_G | PG_RW | PG_U | PG_PDE_CACHE; +#ifdef PMAP_PAE_COMP + mask |= pg_nx; +#endif + diff = (range->attrs ^ attrs) & mask; + if (diff == 0) + return (true); + if ((diff & ~PG_PDE_PAT) == 0 && + pmap_pat_index(kernel_pmap, range->attrs, true) == + pmap_pat_index(kernel_pmap, attrs, true)) + return (true); + return (false); +} + +static void +sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, + pt_entry_t attrs) +{ + + memset(range, 0, sizeof(*range)); + range->sva = va; + range->attrs = attrs; +} + +/* + * Given a leaf PTE, derive the mapping's attributes. If they do not match + * those of the current run, dump the address range and its attributes, and + * begin a new run. + */ +static void +sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, + vm_offset_t va, pd_entry_t pde, pt_entry_t pte) +{ + pt_entry_t attrs, mask; + + attrs = pde & (PG_RW | PG_U); +#ifdef PMAP_PAE_COMP + attrs |= pde & pg_nx; +#endif + + if ((pde & PG_PS) != 0) { + attrs |= pde & (PG_G | PG_PDE_CACHE); + } else if (pte != 0) { + mask = pte & (PG_RW | PG_U); +#ifdef PMAP_PAE_COMP + mask |= pg_nx; +#endif + attrs &= mask; + attrs |= pte & (PG_G | PG_PTE_CACHE); + + /* Canonicalize by always using the PDE PAT bit. 
*/ + if ((attrs & PG_PTE_PAT) != 0) + attrs ^= PG_PDE_PAT | PG_PTE_PAT; + } + + if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { + sysctl_kmaps_dump(sb, range, va); + sysctl_kmaps_reinit(range, va, attrs); + } +} + +static int +__CONCAT(PMTYPE, sysctl_kmaps)(SYSCTL_HANDLER_ARGS) +{ + struct pmap_kernel_map_range range; + struct sbuf sbuf, *sb; + pd_entry_t pde; + pt_entry_t *pt, pte; + vm_offset_t sva; + vm_paddr_t pa; + int error; + u_int i, k; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + sb = &sbuf; + sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); + + /* Sentinel value. */ + range.sva = 0xffffffff; + + /* + * Iterate over the kernel page tables without holding the + * kernel pmap lock. Kernel page table pages are never freed, + * so at worst we will observe inconsistencies in the output. + */ + for (sva = 0, i = 0; i < NPTEPG * NPGPTD * NPDEPG ;) { + if (i == 0) + sbuf_printf(sb, "\nLow PDE:\n"); + else if (i == LOWPTDI * NPTEPG) + sbuf_printf(sb, "Low PDE dup:\n"); + else if (i == PTDPTDI * NPTEPG) + sbuf_printf(sb, "Recursive map:\n"); + else if (i == KERNPTDI * NPTEPG) + sbuf_printf(sb, "Kernel base:\n"); + else if (i == TRPTDI * NPTEPG) + sbuf_printf(sb, "Trampoline:\n"); + pde = IdlePTD[sva >> PDRSHIFT]; + if ((pde & PG_V) == 0) { + sva = rounddown2(sva, NBPDR); + sysctl_kmaps_dump(sb, &range, sva); + sva += NBPDR; + i += NPTEPG; + continue; + } + pa = pde & PG_FRAME; + if ((pde & PG_PS) != 0) { + sysctl_kmaps_check(sb, &range, sva, pde, 0); + range.pdes++; + sva += NBPDR; + i += NPTEPG; + continue; + } + for (pt = vtopte(sva), k = 0; k < NPTEPG; i++, k++, pt++, + sva += PAGE_SIZE) { + pte = *pt; + if ((pte & PG_V) == 0) { + sysctl_kmaps_dump(sb, &range, sva); + continue; + } + sysctl_kmaps_check(sb, &range, sva, pde, pte); + range.ptes++; + } + } + + error = sbuf_finish(sb); + sbuf_delete(sb); + return (error); +} + +#define PMM(a) \ .pm_##a = __CONCAT(PMTYPE, a), struct pmap_methods __CONCAT(PMTYPE, methods) = { @@ -6162,4 +6401,5 @@ struct pmap_methods __CONCAT(PMTYPE, methods) = { PMM(flush_page) PMM(kenter) PMM(kremove) + PMM(sysctl_kmaps) }; diff --git a/sys/i386/i386/pmap_base.c b/sys/i386/i386/pmap_base.c index 4ddc9bee7b87..9209319da01f 100644 --- a/sys/i386/i386/pmap_base.c +++ b/sys/i386/i386/pmap_base.c @@ -258,6 +258,17 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, struct pmap kernel_pmap_store; static struct pmap_methods *pmap_methods_ptr; +static int +sysctl_kmaps(SYSCTL_HANDLER_ARGS) +{ + return (pmap_methods_ptr->pm_sysctl_kmaps(oidp, arg1, arg2, req)); +} +SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, sysctl_kmaps, "A", + "Dump kernel address layout"); + + /* * Initialize a vm_page's machine-dependent fields. */ diff --git a/sys/i386/include/pmap_base.h b/sys/i386/include/pmap_base.h index 3f107eeb2aa8..76bc9f28262d 100644 --- a/sys/i386/include/pmap_base.h +++ b/sys/i386/include/pmap_base.h @@ -118,6 +118,7 @@ struct pmap_methods { void (*pm_flush_page)(vm_page_t); void (*pm_kenter)(vm_offset_t, vm_paddr_t); void (*pm_kremove)(vm_offset_t); + int (*pm_sysctl_kmaps)(SYSCTL_HANDLER_ARGS); }; void pmap_cold(void); From 66eb1d6347abb5b4b8fbea16d9243666fb9d769e Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sun, 22 Sep 2019 19:59:10 +0000 Subject: [PATCH 013/106] i386: reduce differences in source between PAE and non-PAE pmaps ... by defining pg_nx as zero for non-PAE and correspondingly simplifying some expressions. 
Suggested and reviewed by: markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D21757 --- sys/i386/i386/pmap.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 005af4e7cd52..949cfe6a121a 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -227,6 +227,8 @@ static int nkpt = NKPT; #ifdef PMAP_PAE_COMP pt_entry_t pg_nx; static uma_zone_t pdptzone; +#else +#define pg_nx 0 #endif _Static_assert(VM_MAXUSER_ADDRESS == VADDR(TRPTDI, 0), "VM_MAXUSER_ADDRESS"); @@ -1871,11 +1873,7 @@ __CONCAT(PMTYPE, qenter)(vm_offset_t sva, vm_page_t *ma, int count) m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; -#ifdef PMAP_PAE_COMP pte_store(pte, pa | pg_nx | PG_RW | PG_V); -#else - pte_store(pte, pa | PG_RW | PG_V); -#endif } pte++; } @@ -6168,11 +6166,7 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, sbuf_printf(sb, "0x%08x-0x%08x r%c%c%c%c %s %d %d %d\n", range->sva, eva, (range->attrs & PG_RW) != 0 ? 'w' : '-', -#ifdef PMAP_PAE_COMP (range->attrs & pg_nx) != 0 ? '-' : 'x', -#else - '-', -#endif (range->attrs & PG_U) != 0 ? 'u' : 's', (range->attrs & PG_G) != 0 ? 'g' : '-', mode, range->pdpes, range->pdes, range->ptes); @@ -6191,10 +6185,7 @@ sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { pt_entry_t diff, mask; - mask = PG_G | PG_RW | PG_U | PG_PDE_CACHE; -#ifdef PMAP_PAE_COMP - mask |= pg_nx; -#endif + mask = pg_nx | PG_G | PG_RW | PG_U | PG_PDE_CACHE; diff = (range->attrs ^ attrs) & mask; if (diff == 0) return (true); @@ -6224,21 +6215,15 @@ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pd_entry_t pde, pt_entry_t pte) { - pt_entry_t attrs, mask; + pt_entry_t attrs; - attrs = pde & (PG_RW | PG_U); -#ifdef PMAP_PAE_COMP - attrs |= pde & pg_nx; -#endif + attrs = pde & (PG_RW | PG_U | pg_nx); if ((pde & PG_PS) != 0) { attrs |= pde & (PG_G | PG_PDE_CACHE); } else if (pte != 0) { - mask = pte & (PG_RW | PG_U); -#ifdef PMAP_PAE_COMP - mask |= pg_nx; -#endif - attrs &= mask; + attrs |= pte & pg_nx; + attrs &= pg_nx | (pte & (PG_RW | PG_U)); attrs |= pte & (PG_G | PG_PTE_CACHE); /* Canonicalize by always using the PDE PAT bit. */ From d2be3ef05c1fbcd66e9554f513e9f6816f56de6e Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 22 Sep 2019 20:44:24 +0000 Subject: [PATCH 014/106] lockprof: move per-cpu data to dpcpu Reviewed by: kib Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D21747 --- sys/kern/subr_lock.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sys/kern/subr_lock.c b/sys/kern/subr_lock.c index 1efbcc807213..434954f519a4 100644 --- a/sys/kern/subr_lock.c +++ b/sys/kern/subr_lock.c @@ -241,7 +241,9 @@ struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. 
*/ }; -struct lock_prof_cpu *lp_cpu[MAXCPU]; +DPCPU_DEFINE_STATIC(struct lock_prof_cpu, lp); +#define LP_CPU_SELF (DPCPU_PTR(lp)) +#define LP_CPU(cpu) (DPCPU_ID_PTR((cpu), lp)) volatile int __read_mostly lock_prof_enable; static volatile int lock_prof_resetting; @@ -288,10 +290,8 @@ lock_prof_init(void *arg) int cpu; CPU_FOREACH(cpu) { - lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF, - M_WAITOK | M_ZERO); - lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]); - lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]); + lock_prof_init_type(&LP_CPU(cpu)->lpc_types[0]); + lock_prof_init_type(&LP_CPU(cpu)->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); @@ -331,14 +331,14 @@ lock_prof_reset(void) * into per-thread lists as well. */ CPU_FOREACH(cpu) { - lpc = lp_cpu[cpu]; + lpc = LP_CPU(cpu); for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } CPU_FOREACH(cpu) { - lpc = lp_cpu[cpu]; + lpc = LP_CPU(cpu); bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); @@ -379,7 +379,7 @@ lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, dst->name = match->name; CPU_FOREACH(cpu) { - type = &lp_cpu[cpu]->lpc_types[spin]; + type = &LP_CPU(cpu)->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; @@ -436,8 +436,8 @@ dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) quiesce_all_cpus("profstat", 0); t = ticks; CPU_FOREACH(cpu) { - lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t); - lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t); + lock_prof_type_stats(&LP_CPU(cpu)->lpc_types[0], sb, 0, t); + lock_prof_type_stats(&LP_CPU(cpu)->lpc_types[1], sb, 1, t); } lock_prof_enable = enabled; @@ -503,7 +503,7 @@ lock_profile_lookup(struct lock_object *lo, int spin, const char *file, p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && @@ -538,7 +538,7 @@ lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) { lock_prof_rejected++; @@ -674,7 +674,7 @@ lock_profile_release_lock(struct lock_object *lo) lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: critical_exit(); From cd2112c305f5aff1f30b6d6281fd75946592e8f5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 22 Sep 2019 20:49:17 +0000 Subject: [PATCH 015/106] cache: jump in negative success instead of positive Sponsored by: The FreeBSD Foundation --- sys/kern/vfs_cache.c | 55 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index ef0654800391..7b43d4e24d7b 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -1380,7 +1380,7 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, } /* We failed to find an entry */ - if (ncp == 
NULL) { + if (__predict_false(ncp == NULL)) { rw_runlock(blp); SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); @@ -1388,35 +1388,17 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, return (0); } + if (ncp->nc_flag & NCF_NEGATIVE) + goto negative_success; + /* We found a "positive" match, return the vnode */ - if (!(ncp->nc_flag & NCF_NEGATIVE)) { - counter_u64_add(numposhits, 1); - *vpp = ncp->nc_vp; - CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", - dvp, cnp->cn_nameptr, *vpp, ncp); - SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, - *vpp); - cache_out_ts(ncp, tsp, ticksp); - goto success; - } - -negative_success: - /* We found a negative match, and want to create it, so purge */ - if (cnp->cn_nameiop == CREATE) { - counter_u64_add(numnegzaps, 1); - goto zap_and_exit; - } - - counter_u64_add(numneghits, 1); - cache_negative_hit(ncp); - if (ncp->nc_flag & NCF_WHITE) - cnp->cn_flags |= ISWHITEOUT; - SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, - ncp->nc_name); + counter_u64_add(numposhits, 1); + *vpp = ncp->nc_vp; + CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", + dvp, cnp->cn_nameptr, *vpp, ncp); + SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, + *vpp); cache_out_ts(ncp, tsp, ticksp); - cache_lookup_unlock(blp, dvlp); - return (ENOENT); - success: /* * On success we return a locked and ref'd vnode as per the lookup @@ -1450,6 +1432,23 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, } return (-1); +negative_success: + /* We found a negative match, and want to create it, so purge */ + if (cnp->cn_nameiop == CREATE) { + counter_u64_add(numnegzaps, 1); + goto zap_and_exit; + } + + counter_u64_add(numneghits, 1); + cache_negative_hit(ncp); + if (ncp->nc_flag & NCF_WHITE) + cnp->cn_flags |= ISWHITEOUT; + SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, + ncp->nc_name); + cache_out_ts(ncp, tsp, ticksp); + cache_lookup_unlock(blp, dvlp); + return (ENOENT); + zap_and_exit: if (blp != NULL) error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); From 7505cffa56d7347436b32f8d3f192ad1c4383ad8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 22 Sep 2019 20:50:24 +0000 Subject: [PATCH 016/106] cache: try to avoid vhold if locks held Sponsored by: The FreeBSD Foundation --- sys/kern/vfs_cache.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 7b43d4e24d7b..ab55f672a544 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -1690,7 +1690,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, uint32_t hash; int flag; int len; - bool neg_locked; + bool neg_locked, held_dvp; u_long lnumcache; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); @@ -1769,6 +1769,13 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, } } + held_dvp = false; + if (LIST_EMPTY(&dvp->v_cache_src) && flag != NCF_ISDOTDOT) { + vhold(dvp); + atomic_add_long(&numcachehv, 1); + held_dvp = true; + } + /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. 
@@ -1858,8 +1865,21 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { - vhold(dvp); - atomic_add_rel_long(&numcachehv, 1); + if (!held_dvp) { + vhold(dvp); + atomic_add_long(&numcachehv, 1); + } + } else { + if (held_dvp) { + /* + * This will not take the interlock as someone + * else already holds the vnode on account of + * the namecache and we hold locks preventing + * this from changing. + */ + vdrop(dvp); + atomic_subtract_long(&numcachehv, 1); + } } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } @@ -1894,6 +1914,10 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, out_unlock_free: cache_enter_unlock(&cel); cache_free(ncp); + if (held_dvp) { + vdrop(dvp); + atomic_subtract_long(&numcachehv, 1); + } return; } From ba7a55d93465725fd860066d95d4e910c3c027c2 Mon Sep 17 00:00:00 2001 From: Sean Eric Fagan Date: Mon, 23 Sep 2019 04:28:07 +0000 Subject: [PATCH 017/106] Add two options to allow mount to avoid covering up existing mount points. The two options are * nocover/cover: Prevent/allow mounting over an existing root mountpoint. E.g., "mount -t ufs -o nocover /dev/sd1a /usr/local" will fail if /usr/local is already a mountpoint. * emptydir/noemptydir: Prevent/allow mounting on a non-empty directory. E.g., "mount -t ufs -o emptydir /dev/sd1a /usr" will fail. Neither of these options is intended to be a default, for historical and compatibility reasons. Reviewed by: allanjude, kib Differential Revision: https://reviews.freebsd.org/D21458 --- lib/libc/sys/mount.2 | 16 +++++++++- sbin/mount/mntopts.h | 7 +++-- sbin/mount/mount.8 | 7 ++++- sbin/mount/mount.c | 4 +++ sys/kern/vfs_mount.c | 39 ++++++++++++++++++++---- sys/kern/vfs_subr.c | 70 ++++++++++++++++++++++++++++++++++++++++++++ sys/sys/mount.h | 4 ++- sys/sys/vnode.h | 1 + 8 files changed, 138 insertions(+), 10 deletions(-) diff --git a/lib/libc/sys/mount.2 b/lib/libc/sys/mount.2 index 2d296e6f5b1b..6f5a199be9ab 100644 --- a/lib/libc/sys/mount.2 +++ b/lib/libc/sys/mount.2 @@ -28,7 +28,7 @@ .\" @(#)mount.2 8.3 (Berkeley) 5/24/95 .\" $FreeBSD$ .\" -.Dd December 1, 2017 +.Dd August 28, 2019 .Dt MOUNT 2 .Os .Sh NAME @@ -157,6 +157,10 @@ mount even if some files are open for writing. Disable read clustering. .It Dv MNT_NOCLUSTERW Disable write clustering. +.It Dv MNT_NOCOVER +Do not mount over the root of another mount point. +.It Dv MNT_EMPTYDIR +Require an empty directory for the mount point directory. .El .Pp The flag @@ -260,6 +264,11 @@ is not a directory. .It Bq Er EBUSY Another process currently holds a reference to .Fa dir . +.It Bq Er EBUSY +The +.Dv MNT_NOCOVER +option was given, and the requested mount point +is already the root of another mount point. .It Bq Er EFAULT The .Fa dir @@ -280,6 +289,11 @@ The .Fa fspec argument is not a block device. +.It Bq Er ENOTEMPTY +The +.Dv MNT_EMPTYDIR +option was specified, and the requested mount point +is not an empty directory. 
.It Bq Er ENXIO The major device number of .Fa fspec diff --git a/sbin/mount/mntopts.h b/sbin/mount/mntopts.h index 924ead253890..183d6d9e501d 100644 --- a/sbin/mount/mntopts.h +++ b/sbin/mount/mntopts.h @@ -65,7 +65,8 @@ struct mntopt { #define MOPT_UPDATE { "update", 0, MNT_UPDATE, 0 } #define MOPT_RO { "ro", 0, MNT_RDONLY, 0 } #define MOPT_RW { "rw", 1, MNT_RDONLY, 0 } - +#define MOPT_NOCOVER { "cover", 1, MNT_NOCOVER, 0 } +#define MOPT_EMPTYDIR { "emptydir", 0, MNT_EMPTYDIR, 0 } /* This is parsed by mount(8), but is ignored by specific mount_*(8)s. */ #define MOPT_AUTO { "auto", 0, 0, 0 } @@ -95,7 +96,9 @@ struct mntopt { MOPT_ACLS, \ MOPT_NFS4ACLS, \ MOPT_AUTOMOUNTED, \ - MOPT_UNTRUSTED + MOPT_UNTRUSTED, \ + MOPT_NOCOVER, \ + MOPT_EMPTYDIR void getmntopts(const char *, const struct mntopt *, int *, int *); void rmslashes(char *, char *); diff --git a/sbin/mount/mount.8 b/sbin/mount/mount.8 index 7ff94fb7c165..3aee1bb86151 100644 --- a/sbin/mount/mount.8 +++ b/sbin/mount/mount.8 @@ -28,7 +28,7 @@ .\" @(#)mount.8 8.8 (Berkeley) 6/16/94 .\" $FreeBSD$ .\" -.Dd March 22, 2017 +.Dd August 28, 2019 .Dt MOUNT 8 .Os .Sh NAME @@ -162,6 +162,8 @@ When used with the .Fl u flag, this is the same as specifying the options currently in effect for the mounted file system. +.It Cm emptydir +Require that the mount point directory be empty. .It Cm force The same as .Fl f ; @@ -237,6 +239,9 @@ flag. Disable read clustering. .It Cm noclusterw Disable write clustering. +.It Cm nocover +Do not mount if the requested mount point is already +the root of a mount point. .It Cm noexec Do not allow execution of any binaries on the mounted file system. This option is useful for a server that has file systems containing diff --git a/sbin/mount/mount.c b/sbin/mount/mount.c index 102e87009b0f..938a220b41c3 100644 --- a/sbin/mount/mount.c +++ b/sbin/mount/mount.c @@ -119,6 +119,8 @@ static struct opt { { MNT_AUTOMOUNTED, "automounted" }, { MNT_VERIFIED, "verified" }, { MNT_UNTRUSTED, "untrusted" }, + { MNT_NOCOVER, "nocover" }, + { MNT_EMPTYDIR, "emptydir" }, { 0, NULL } }; @@ -975,6 +977,8 @@ flags2opts(int flags) if (flags & MNT_ACLS) res = catopt(res, "acls"); if (flags & MNT_NFS4ACLS) res = catopt(res, "nfsv4acls"); if (flags & MNT_UNTRUSTED) res = catopt(res, "untrusted"); + if (flags & MNT_NOCOVER) res = catopt(res, "nocover"); + if (flags & MNT_EMPTYDIR) res = catopt(res, "emptydir"); return (res); } diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 76c483c5640e..633fc27a6c98 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -668,19 +668,21 @@ vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) * when we want to update the root filesystem. 
*/ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { + int do_freeopt = 0; + if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; - vfs_freeopt(optlist, opt); + do_freeopt = 1; } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; - vfs_freeopt(optlist, opt); + do_freeopt = 1; } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; - vfs_freeopt(optlist, opt); + do_freeopt = 1; } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; @@ -741,7 +743,7 @@ vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) autoro = false; } else if (strcmp(opt->name, "autoro") == 0) { - vfs_freeopt(optlist, opt); + do_freeopt = 1; autoro = true; } else if (strcmp(opt->name, "suiddir") == 0) @@ -752,8 +754,22 @@ vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; - vfs_freeopt(optlist, opt); + do_freeopt = 1; + } else if (strcmp(opt->name, "nocover") == 0) { + fsflags |= MNT_NOCOVER; + do_freeopt = 1; + } else if (strcmp(opt->name, "cover") == 0) { + fsflags &= ~MNT_NOCOVER; + do_freeopt = 1; + } else if (strcmp(opt->name, "emptydir") == 0) { + fsflags |= MNT_EMPTYDIR; + do_freeopt = 1; + } else if (strcmp(opt->name, "noemptydir") == 0) { + fsflags &= ~MNT_EMPTYDIR; + do_freeopt = 1; } + if (do_freeopt) + vfs_freeopt(optlist, opt); } /* @@ -889,6 +905,14 @@ vfs_domount_first( ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); + if ((fsflags & MNT_EMPTYDIR) != 0) { + error = vfs_emptydir(vp); + if (error != 0) { + vput(vp); + return (error); + } + } + /* * If the jail of the calling thread lacks permission for this type of * file system, deny immediately. @@ -1229,6 +1253,11 @@ vfs_domount( NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { + if ((vp->v_vflag & VV_ROOT) != 0 && + (fsflags & MNT_NOCOVER) != 0) { + vput(vp); + return (EBUSY); + } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 3bdc4d1e3f65..7d8ea63c7a6f 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -5535,6 +5535,76 @@ filt_vfsvnode(struct knote *kn, long hint) return (res); } +/* + * Returns whether the directory is empty or not. + * If it is empty, the return value is 0; otherwise + * the return value is an error value (which may + * be ENOTEMPTY). 
+ */ +int +vfs_emptydir(struct vnode *vp) +{ + struct uio uio; + struct iovec iov; + struct dirent *dirent, *dp, *endp; + int error, eof; + + error = 0; + eof = 0; + + ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); + + dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); + iov.iov_base = dirent; + iov.iov_len = sizeof(struct dirent); + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct dirent); + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_rw = UIO_READ; + uio.uio_td = curthread; + + while (eof == 0 && error == 0) { + error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, + NULL, NULL); + if (error != 0) + break; + endp = (void *)((uint8_t *)dirent + + sizeof(struct dirent) - uio.uio_resid); + for (dp = dirent; dp < endp; + dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { + if (dp->d_type == DT_WHT) + continue; + if (dp->d_namlen == 0) + continue; + if (dp->d_type != DT_DIR && + dp->d_type != DT_UNKNOWN) { + error = ENOTEMPTY; + break; + } + if (dp->d_namlen > 2) { + error = ENOTEMPTY; + break; + } + if (dp->d_namlen == 1 && + dp->d_name[0] != '.') { + error = ENOTEMPTY; + break; + } + if (dp->d_namlen == 2 && + dp->d_name[1] != '.') { + error = ENOTEMPTY; + break; + } + uio.uio_resid = sizeof(struct dirent); + } + } + free(dirent, M_TEMP); + return (error); +} + int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 562fa191fe24..4b60055ca6dd 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -373,9 +373,11 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNT_SNAPSHOT 0x0000000001000000ULL /* snapshot the filesystem */ #define MNT_NONBUSY 0x0000000004000000ULL /* check vnode use counts. */ #define MNT_BYFSID 0x0000000008000000ULL /* specify filesystem by ID. */ +#define MNT_NOCOVER 0x0000001000000000ULL /* Do not cover a mount point */ +#define MNT_EMPTYDIR 0x0000002000000000ULL /* Only mount on empty dir */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_NONBUSY | \ - MNT_BYFSID) + MNT_BYFSID | MNT_NOCOVER | MNT_EMPTYDIR) /* * Internal filesystem control flags stored in mnt_kern_flag. 
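Beyond mount(8), the new flags are reachable straight from nmount(2): "nocover" and "emptydir" arrive as boolean string options and vfs_donmount() translates them into MNT_NOCOVER and MNT_EMPTYDIR as shown above. A minimal sketch, assuming a UFS filesystem on a placeholder device and mount point (roughly equivalent to "mount -t ufs -o emptydir,nocover /dev/ada1p1 /mnt"):

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/uio.h>
#include <err.h>
#include <string.h>

/* Append one name/value option pair; a NULL value makes a boolean option. */
static void
opt(struct iovec *iov, int *n, const char *name, const char *val)
{
	iov[*n].iov_base = __DECONST(char *, name);
	iov[*n].iov_len = strlen(name) + 1;
	(*n)++;
	iov[*n].iov_base = __DECONST(char *, val);
	iov[*n].iov_len = (val != NULL) ? strlen(val) + 1 : 0;
	(*n)++;
}

int
main(void)
{
	struct iovec iov[10];
	int n = 0;

	opt(iov, &n, "fstype", "ufs");
	opt(iov, &n, "fspath", "/mnt");		/* placeholder mount point */
	opt(iov, &n, "from", "/dev/ada1p1");	/* placeholder device */
	opt(iov, &n, "emptydir", NULL);		/* fail unless /mnt is empty */
	opt(iov, &n, "nocover", NULL);		/* fail if /mnt is already a mount root */

	if (nmount(iov, n, 0) == -1)
		err(1, "nmount");
	return (0);
}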
* diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 18134ecca814..967b1c066c95 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -930,6 +930,7 @@ int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct ucred *cred); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); +int vfs_emptydir(struct vnode *vp); int vfs_unixify_accmode(accmode_t *accmode); From afe257e3ca2359d0f8a89a918a1942c7990075d3 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 23 Sep 2019 08:53:14 +0000 Subject: [PATCH 018/106] cache: count evictions of negative entries Sponsored by: The FreeBSD Foundation --- sys/kern/vfs_cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index ab55f672a544..925eba5f5d0b 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -378,6 +378,8 @@ STATNODE_COUNTER(numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); +static long numneg_evicted; STATNODE_ULONG(numneg_evicted, + "Number of negative entries evicted when adding a new entry"); STATNODE_COUNTER(zap_and_exit_bucket_relock_success, "Number of successful removals after relocking"); static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, @@ -844,6 +846,7 @@ cache_negative_zap_one(void) ncp->nc_name, ncp->nc_neghits); cache_zap_locked(ncp, true); + numneg_evicted++; out_unlock_all: mtx_unlock(&neglist->nl_lock); rw_wunlock(blp); From 55258ab0ff2a419a6a703a8a2f4277f171546131 Mon Sep 17 00:00:00 2001 From: Tijl Coosemans Date: Mon, 23 Sep 2019 12:27:55 +0000 Subject: [PATCH 019/106] Create a "drm" subdirectory for drm devices in linsysfs. Recent versions of linux libdrm check for the existence of this directory: https://cgit.freedesktop.org/mesa/drm/commit/?id=f8392583418aef5e27bfed9989aeb601e20cc96d MFC after: 2 weeks --- sys/compat/linsysfs/linsysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/compat/linsysfs/linsysfs.c b/sys/compat/linsysfs/linsysfs.c index b37ff2b86833..bc9f4e4e280f 100644 --- a/sys/compat/linsysfs/linsysfs.c +++ b/sys/compat/linsysfs/linsysfs.c @@ -520,6 +520,7 @@ linsysfs_run_bus(device_t dev, struct pfs_node *dir, struct pfs_node *scsi, device_get_unit(dev) >= 0) { dinfo = device_get_ivars(parent); if (dinfo != NULL && dinfo->cfg.baseclass == PCIC_DISPLAY) { + pfs_create_dir(dir, "drm", NULL, NULL, NULL, 0); sprintf(devname, "226:%d", device_get_unit(dev)); sub_dir = pfs_create_dir(chardev, From 2e6a21bbd8416c532a4c207caf01e7d2b47f7327 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Mon, 23 Sep 2019 12:43:08 +0000 Subject: [PATCH 020/106] mips: fix XLPN32 after r352434 SYSINIT usage was added, but the dependency was not added. This worked by coincidence, as most of the mips configs have DDB enabled and pmap.c gets the header it needs via ddb.h pollution. Reported by: dim --- sys/mips/mips/pmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index f498fa504d9e..10487ac18738 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include From f9bf9282c9803ebc6774f3de6a566d84aaa8f192 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Mon, 23 Sep 2019 13:24:31 +0000 Subject: [PATCH 021/106] Fix destruction of the robust mutexes.
If robust mutex' owner terminated, causing kernel-assisted state recovery, and then pthread_mutex_destroy() is executed as the next action, assert is triggered about mutex still being on the list. Ignore the mutex linkage in pthread_mutex_destroy() for shared robust mutexes with dead owner, same as for enqueue_mutex(). Reported by: avg Sponsored by: The FreeBSD Foundation MFC after: 1 week --- lib/libthr/thread/thr_mutex.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/libthr/thread/thr_mutex.c b/lib/libthr/thread/thr_mutex.c index dc09f539add5..57984ef6d0e6 100644 --- a/lib/libthr/thread/thr_mutex.c +++ b/lib/libthr/thread/thr_mutex.c @@ -474,7 +474,11 @@ _thr_mutex_destroy(pthread_mutex_t *mutex) if (m == THR_PSHARED_PTR) { m1 = __thr_pshared_offpage(mutex, 0); if (m1 != NULL) { - mutex_assert_not_owned(_get_curthread(), m1); + if ((uint32_t)m1->m_lock.m_owner != + UMUTEX_RB_OWNERDEAD) { + mutex_assert_not_owned( + _get_curthread(), m1); + } __thr_pshared_destroy(mutex); } *mutex = THR_MUTEX_DESTROYED; From 751727948a2924c8ae424e81d9cd7aff9d8dd4d3 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 23 Sep 2019 14:11:59 +0000 Subject: [PATCH 022/106] Set NX in mappings created by pmap_kenter() and pmap_kenter_attr(). There does not appear to be any existing need for such mappings to be executable. Reviewed by: alc, kib MFC after: 1 month Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D21754 --- sys/amd64/amd64/pmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 703b225592ca..e2f206c8dcaf 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -3134,7 +3134,7 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa) pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx); } static __inline void @@ -3145,7 +3145,7 @@ pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); - pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx | cache_bits); } /* From 38dae42c26d630cdbd2184ad13a45c021d87e1f7 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 23 Sep 2019 14:14:43 +0000 Subject: [PATCH 023/106] Use elf_relocaddr() when handling R_X86_64_RELATIVE relocations. This is required for DPCPU and VNET data variable definitions to work when KLDs are linked as DSOs. R_X86_64_RELATIVE relocations should not appear in object files, so assert this in elf_relocaddr(). 
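To make the DPCPU/VNET connection concrete: per-CPU data declared by a module lands in a dedicated linker set whose run-time storage is relocated away from the module image, which is exactly the translation elf_relocaddr() performs when a DSO-style KLD carries R_X86_64_RELATIVE relocations pointing into that set. A hypothetical module using such a variable might look like the sketch below; the module name and counter are illustrative, not taken from the tree.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/pcpu.h>

/* Lives in the DPCPU linker set; each CPU gets its own copy. */
DPCPU_DEFINE(long, example_hits);

static int
example_modevent(module_t mod, int type, void *arg)
{
	switch (type) {
	case MOD_LOAD:
		DPCPU_SET(example_hits, 0);	/* touch this CPU's copy */
		printf("dpcpu_example: loaded, hits=%ld\n",
		    DPCPU_GET(example_hits));
		return (0);
	case MOD_UNLOAD:
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t example_mod = {
	"dpcpu_example",
	example_modevent,
	NULL
};
DECLARE_MODULE(dpcpu_example, example_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);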
Reviewed by: kib MFC after: 1 month Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D21755 --- sys/amd64/amd64/elf_machdep.c | 3 +-- sys/kern/link_elf.c | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index 5b31d111ff4c..c1a4781b243d 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -267,7 +267,6 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, */ printf("kldload: unexpected R_COPY relocation\n"); return (-1); - break; case R_X86_64_GLOB_DAT: /* S */ case R_X86_64_JMP_SLOT: /* XXX need addend + offset */ @@ -279,7 +278,7 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, break; case R_X86_64_RELATIVE: /* B + A */ - addr = relocbase + addend; + addr = elf_relocaddr(lf, relocbase + addend); val = addr; if (*where != val) *where = val; diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c index 8d07dd5bbb7c..80b2656d8d27 100644 --- a/sys/kern/link_elf.c +++ b/sys/kern/link_elf.c @@ -1162,6 +1162,9 @@ elf_relocaddr(linker_file_t lf, Elf_Addr x) { elf_file_t ef; + KASSERT(lf->ops->cls == (kobj_class_t)&link_elf_class, + ("elf_relocaddr: unexpected linker file %p", lf)); + ef = (elf_file_t)lf; if (x >= ef->pcpu_start && x < ef->pcpu_stop) return ((x - ef->pcpu_start) + ef->pcpu_base); From 2ff730191daad455548fe0c0361a5b50bdc1aa3a Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 23 Sep 2019 14:19:41 +0000 Subject: [PATCH 024/106] Set NX on some non-leaf direct map page table entries. The direct map is never used for execution of code, so we might as well set NX in the direct map's PML4Es. Also clarify the intent of the code in create_pagetables() that restricts access protections on the region of the direct map mapping the kernel text. Reviewed by: alc, kib (previous version) MFC after: 1 week Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D21759 --- sys/amd64/amd64/pmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index e2f206c8dcaf..793702174e6a 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1539,13 +1539,13 @@ create_pagetables(vm_paddr_t *firstaddr) } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); - pdp_p[i] |= X86_PG_RW | X86_PG_V; + pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; } /* * Instead of using a 1G page for the memory containing the kernel, - * use 2M pages with appropriate permissions. (If using 1G pages, - * this will partially overwrite the PDPEs above.) + * use 2M pages with read-only and no-execute permissions. (If using 1G + * pages, this will partially overwrite the PDPEs above.) */ if (ndm1g) { pd_p = (pd_entry_t *)DMPDkernphys; @@ -1555,7 +1555,7 @@ create_pagetables(vm_paddr_t *firstaddr) bootaddr_rwx(i << PDRSHIFT); for (i = 0; i < nkdmpde; i++) pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | - X86_PG_V; + X86_PG_V | pg_nx; } /* And recursively map PML4 to itself in order to get PTmap */ @@ -1566,7 +1566,7 @@ create_pagetables(vm_paddr_t *firstaddr) /* Connect the Direct Map slot(s) up to the PML4. 
*/ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); - p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V; + p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; } /* Connect the KVA slots up to the PML4 */ From c7e224c66d9acb4874da5dbc2af38de5df96a812 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 23 Sep 2019 14:29:05 +0000 Subject: [PATCH 025/106] Revert r316820. Despite appearing correct, r316820 breaks packet rx/tx for jme(4) interfaces. With 12.1 approaching, let's just revert the commit for now. PR: 233952 Tested by: Armin Gruner MFC after: 3 days --- sys/dev/jme/if_jme.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/sys/dev/jme/if_jme.c b/sys/dev/jme/if_jme.c index 267804bc768c..9c8f725f95c9 100644 --- a/sys/dev/jme/if_jme.c +++ b/sys/dev/jme/if_jme.c @@ -559,7 +559,7 @@ jme_map_intr_vector(struct jme_softc *sc) bzero(map, sizeof(map)); /* Map Tx interrupts source to MSI/MSIX vector 2. */ - map[MSINUM_REG_INDEX(N_INTR_TXQ0_COMP)] |= + map[MSINUM_REG_INDEX(N_INTR_TXQ0_COMP)] = MSINUM_INTR_SOURCE(2, N_INTR_TXQ0_COMP); map[MSINUM_REG_INDEX(N_INTR_TXQ1_COMP)] |= MSINUM_INTR_SOURCE(2, N_INTR_TXQ1_COMP); @@ -581,37 +581,37 @@ jme_map_intr_vector(struct jme_softc *sc) MSINUM_INTR_SOURCE(2, N_INTR_TXQ_COAL_TO); /* Map Rx interrupts source to MSI/MSIX vector 1. */ - map[MSINUM_REG_INDEX(N_INTR_RXQ0_COMP)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ0_COMP)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ0_COMP); - map[MSINUM_REG_INDEX(N_INTR_RXQ1_COMP)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ1_COMP)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ1_COMP); - map[MSINUM_REG_INDEX(N_INTR_RXQ2_COMP)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ2_COMP)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ2_COMP); - map[MSINUM_REG_INDEX(N_INTR_RXQ3_COMP)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ3_COMP)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ3_COMP); - map[MSINUM_REG_INDEX(N_INTR_RXQ0_DESC_EMPTY)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ0_DESC_EMPTY)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ0_DESC_EMPTY); - map[MSINUM_REG_INDEX(N_INTR_RXQ1_DESC_EMPTY)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ1_DESC_EMPTY)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ1_DESC_EMPTY); - map[MSINUM_REG_INDEX(N_INTR_RXQ2_DESC_EMPTY)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ2_DESC_EMPTY)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ2_DESC_EMPTY); - map[MSINUM_REG_INDEX(N_INTR_RXQ3_DESC_EMPTY)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ3_DESC_EMPTY)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ3_DESC_EMPTY); - map[MSINUM_REG_INDEX(N_INTR_RXQ0_COAL)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ0_COAL)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ0_COAL); - map[MSINUM_REG_INDEX(N_INTR_RXQ1_COAL)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ1_COAL)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ1_COAL); - map[MSINUM_REG_INDEX(N_INTR_RXQ2_COAL)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ2_COAL)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ2_COAL); - map[MSINUM_REG_INDEX(N_INTR_RXQ3_COAL)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ3_COAL)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ3_COAL); - map[MSINUM_REG_INDEX(N_INTR_RXQ0_COAL_TO)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ0_COAL_TO)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ0_COAL_TO); - map[MSINUM_REG_INDEX(N_INTR_RXQ1_COAL_TO)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ1_COAL_TO)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ1_COAL_TO); - map[MSINUM_REG_INDEX(N_INTR_RXQ2_COAL_TO)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ2_COAL_TO)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ2_COAL_TO); - map[MSINUM_REG_INDEX(N_INTR_RXQ3_COAL_TO)] |= + map[MSINUM_REG_INDEX(N_INTR_RXQ3_COAL_TO)] = MSINUM_INTR_SOURCE(1, N_INTR_RXQ3_COAL_TO); /* 
Map all other interrupts source to MSI/MSIX vector 0. */ From 07bf14bb72609dd138d0f4cedd8443684cc9a1cd Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 23 Sep 2019 14:34:23 +0000 Subject: [PATCH 026/106] Fix a harmless typo. MFC after: 1 week --- sys/dev/jme/if_jme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/dev/jme/if_jme.c b/sys/dev/jme/if_jme.c index 9c8f725f95c9..f52f88e602ab 100644 --- a/sys/dev/jme/if_jme.c +++ b/sys/dev/jme/if_jme.c @@ -569,7 +569,7 @@ jme_map_intr_vector(struct jme_softc *sc) MSINUM_INTR_SOURCE(2, N_INTR_TXQ3_COMP); map[MSINUM_REG_INDEX(N_INTR_TXQ4_COMP)] |= MSINUM_INTR_SOURCE(2, N_INTR_TXQ4_COMP); - map[MSINUM_REG_INDEX(N_INTR_TXQ4_COMP)] |= + map[MSINUM_REG_INDEX(N_INTR_TXQ5_COMP)] |= MSINUM_INTR_SOURCE(2, N_INTR_TXQ5_COMP); map[MSINUM_REG_INDEX(N_INTR_TXQ6_COMP)] |= MSINUM_INTR_SOURCE(2, N_INTR_TXQ6_COMP); From 9093dd9a66113e7ab2abf2557319a4acc40651f0 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 23 Sep 2019 15:08:17 +0000 Subject: [PATCH 027/106] Implement x86 dtrace_invop_(un)init() in C. There is no reason for these routines to be written in assembly. In the ports of DTrace to other platforms, they are already written in C. No functional change intended. MFC after: 1 week Sponsored by: Netflix --- sys/cddl/dev/dtrace/amd64/dtrace_asm.S | 16 ---------------- sys/cddl/dev/dtrace/amd64/dtrace_subr.c | 20 +++++++++++++++++++- sys/cddl/dev/dtrace/i386/dtrace_asm.S | 16 ---------------- sys/cddl/dev/dtrace/i386/dtrace_subr.c | 20 +++++++++++++++++++- 4 files changed, 38 insertions(+), 34 deletions(-) diff --git a/sys/cddl/dev/dtrace/amd64/dtrace_asm.S b/sys/cddl/dev/dtrace/amd64/dtrace_asm.S index 22e1d6964fb8..13bd930d5f2f 100644 --- a/sys/cddl/dev/dtrace/amd64/dtrace_asm.S +++ b/sys/cddl/dev/dtrace/amd64/dtrace_asm.S @@ -149,22 +149,6 @@ bp_ret: END(dtrace_invop_start) -/* -void dtrace_invop_init(void) -*/ - ENTRY(dtrace_invop_init) - movq $dtrace_invop_start, dtrace_invop_jump_addr(%rip) - ret - END(dtrace_invop_init) - -/* -void dtrace_invop_uninit(void) -*/ - ENTRY(dtrace_invop_uninit) - movq $0, dtrace_invop_jump_addr(%rip) - ret - END(dtrace_invop_uninit) - /* greg_t dtrace_getfp(void) */ diff --git a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c index 82d20ebba936..cf24e6adae3f 100644 --- a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c +++ b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c @@ -48,8 +48,12 @@ #include extern void dtrace_getnanotime(struct timespec *tsp); +extern int (*dtrace_invop_jump_addr)(struct trapframe *); -int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); +int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); +int dtrace_invop_start(struct trapframe *frame); +void dtrace_invop_init(void); +void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); @@ -109,6 +113,20 @@ dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) kmem_free(hdlr, 0); } +void +dtrace_invop_init(void) +{ + + dtrace_invop_jump_addr = dtrace_invop_start; +} + +void +dtrace_invop_uninit(void) +{ + + dtrace_invop_jump_addr = NULL; +} + /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) diff --git a/sys/cddl/dev/dtrace/i386/dtrace_asm.S b/sys/cddl/dev/dtrace/i386/dtrace_asm.S index 116db1f84be1..e8767c74efc0 100644 --- a/sys/cddl/dev/dtrace/i386/dtrace_asm.S +++ b/sys/cddl/dev/dtrace/i386/dtrace_asm.S @@ -134,22 +134,6 @@ invop_nop: END(dtrace_invop_start) -/* -void 
dtrace_invop_init(void) -*/ - ENTRY(dtrace_invop_init) - movl $dtrace_invop_start, dtrace_invop_jump_addr - ret - END(dtrace_invop_init) - -/* -void dtrace_invop_uninit(void) -*/ - ENTRY(dtrace_invop_uninit) - movl $0, dtrace_invop_jump_addr - ret - END(dtrace_invop_uninit) - /* greg_t dtrace_getfp(void) */ diff --git a/sys/cddl/dev/dtrace/i386/dtrace_subr.c b/sys/cddl/dev/dtrace/i386/dtrace_subr.c index 4c0decaa3c87..37cc7601bef5 100644 --- a/sys/cddl/dev/dtrace/i386/dtrace_subr.c +++ b/sys/cddl/dev/dtrace/i386/dtrace_subr.c @@ -51,8 +51,12 @@ extern uintptr_t kernelbase; extern void dtrace_getnanotime(struct timespec *tsp); +extern int (*dtrace_invop_jump_addr)(struct trapframe *); -int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); +int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); +int dtrace_invop_start(struct trapframe *frame); +void dtrace_invop_init(void); +void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); @@ -112,6 +116,20 @@ dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) kmem_free(hdlr, 0); } +void +dtrace_invop_init(void) +{ + + dtrace_invop_jump_addr = dtrace_invop_start; +} + +void +dtrace_invop_uninit(void) +{ + + dtrace_invop_jump_addr = NULL; +} + void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { From 1eab19cbec44c09a5be358269695f9d993792557 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 23 Sep 2019 17:53:47 +0000 Subject: [PATCH 028/106] Make nvme(4) driver somewhat more NUMA aware. - For each queue pair, precalculate the CPU and domain it is bound to. If queue pairs are not per-CPU, then use the domain of the device. - Allocate most of queue pair memory from the domain it is bound to. - Bind callouts to the same CPUs as the queue pair to avoid migrations. - Do not assign queue pairs to each SMT thread. It just wasted resources and increased lock congestion. - Remove the fixed multiplier of CPUs per queue pair; spread them evenly. This allows more queue pairs to be used in some hardware configurations. - If a queue pair serves multiple CPUs, bind different NVMe devices to different CPUs. MFC after: 1 month Sponsored by: iXsystems, Inc.
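The queue-assignment arithmetic this change introduces (the QP() macro and the CPU-counting loop in nvme_ctrlr_construct_io_qpairs(), both in the diff that follows) is easier to see with small numbers. The program below mirrors it for a made-up configuration of 8 CPUs, 3 I/O queues and unit nvme0; none of these values come from the driver, they are illustration inputs only.

#include <stdio.h>

#define NCPU 8	/* stand-in for mp_ncpus */
#define NQ   3	/* stand-in for ctrlr->num_io_queues */
#define UNIT 0	/* stand-in for device_get_unit(ctrlr->dev), i.e. nvme0 */

/* Mirrors the driver's QP() macro: which I/O queue serves CPU c. */
static int
qp(int c)
{
	return (c * NQ / NCPU);
}

int
main(void)
{
	int c, i, n;

	for (c = 0; c < NCPU; c++)
		printf("CPU %d submits to I/O queue %d\n", c, qp(c));

	/* Per-queue CPU binding, mirroring nvme_ctrlr_construct_io_qpairs(). */
	for (i = c = n = 0; i < NQ; i++, c += n) {
		/* Count the CPUs served by queue i. */
		for (n = 1; c + n < NCPU && qp(c + n) == i; n++)
			;
		printf("queue %d: CPUs %d..%d, interrupt bound to CPU %d\n",
		    i, c, c + n - 1, c + (UNIT + n / 2) % n);
	}
	return (0);
}

With these inputs the queues serve CPUs 0-2, 3-5 and 6-7 and their interrupts land on CPUs 1, 4 and 7; the (unit + n / 2) % n term is what staggers the interrupt CPU for a second controller that would otherwise bind to the same CPUs.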
--- sys/dev/nvme/nvme_ahci.c | 1 - sys/dev/nvme/nvme_ctrlr.c | 61 +++++++++++++---------- sys/dev/nvme/nvme_ctrlr_cmd.c | 5 +- sys/dev/nvme/nvme_pci.c | 91 ++++++++++++++--------------------- sys/dev/nvme/nvme_private.h | 15 +++--- sys/dev/nvme/nvme_qpair.c | 35 ++++++++------ sys/dev/nvme/nvme_sysctl.c | 6 +-- 7 files changed, 101 insertions(+), 113 deletions(-) diff --git a/sys/dev/nvme/nvme_ahci.c b/sys/dev/nvme/nvme_ahci.c index eae607bcce89..1037fab66664 100644 --- a/sys/dev/nvme/nvme_ahci.c +++ b/sys/dev/nvme/nvme_ahci.c @@ -96,7 +96,6 @@ nvme_ahci_attach(device_t dev) ctrlr->msix_enabled = 0; ctrlr->num_io_queues = 1; - ctrlr->num_cpus_per_ioq = mp_ncpus; if (bus_setup_intr(dev, ctrlr->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler, ctrlr, &ctrlr->tag) != 0) { diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index a206b5efc6cf..9b5610f6ccce 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include "nvme_private.h" @@ -57,6 +58,9 @@ nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) int error; qpair = &ctrlr->adminq; + qpair->id = 0; + qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1; + qpair->domain = ctrlr->domain; num_entries = NVME_ADMIN_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries); @@ -75,22 +79,21 @@ nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) * The admin queue's max xfer size is treated differently than the * max I/O xfer size. 16KB is sufficient here - maybe even less? */ - error = nvme_qpair_construct(qpair, - 0, /* qpair ID */ - 0, /* vector */ - num_entries, - NVME_ADMIN_TRACKERS, - ctrlr); + error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS, + ctrlr); return (error); } +#define QP(ctrlr, c) ((c) * (ctrlr)->num_io_queues / mp_ncpus) + static int nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t cap_lo; uint16_t mqes; - int i, error, num_entries, num_trackers, max_entries; + int c, error, i, n; + int num_entries, num_trackers, max_entries; /* * NVMe spec sets a hard limit of 64K max entries, but devices may @@ -130,32 +133,35 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) */ ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4; - /* - * This was calculated previously when setting up interrupts, but - * a controller could theoretically support fewer I/O queues than - * MSI-X vectors. So calculate again here just to be safe. - */ - ctrlr->num_cpus_per_ioq = howmany(mp_ncpus, ctrlr->num_io_queues); - ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair), M_NVME, M_ZERO | M_WAITOK); - for (i = 0; i < ctrlr->num_io_queues; i++) { + for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) { qpair = &ctrlr->ioq[i]; /* * Admin queue has ID=0. IO queues start at ID=1 - * hence the 'i+1' here. - * + */ + qpair->id = i + 1; + if (ctrlr->num_io_queues > 1) { + /* Find number of CPUs served by this queue. */ + for (n = 1; QP(ctrlr, c + n) == i; n++) + ; + /* Shuffle multiple NVMe devices between CPUs. */ + qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n; + qpair->domain = pcpu_find(qpair->cpu)->pc_domain; + } else { + qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1; + qpair->domain = ctrlr->domain; + } + + /* * For I/O queues, use the controller-wide max_xfer_size * calculated in nvme_attach(). 
*/ - error = nvme_qpair_construct(qpair, - i+1, /* qpair ID */ - ctrlr->msix_enabled ? i+1 : 0, /* vector */ - num_entries, - num_trackers, - ctrlr); + error = nvme_qpair_construct(qpair, num_entries, num_trackers, + ctrlr); if (error) return (error); @@ -164,8 +170,7 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) * interrupt thread for this controller. */ if (ctrlr->num_io_queues > 1) - bus_bind_intr(ctrlr->dev, qpair->res, - i * ctrlr->num_cpus_per_ioq); + bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu); } return (0); @@ -458,6 +463,8 @@ nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) */ ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated); ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated); + if (ctrlr->num_io_queues > vm_ndomains) + ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains; return (0); } @@ -473,7 +480,7 @@ nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr) qpair = &ctrlr->ioq[i]; status.done = 0; - nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector, + nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { @@ -1132,6 +1139,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) ctrlr->dev = dev; mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF); + if (bus_get_domain(dev, &ctrlr->domain) != 0) + ctrlr->domain = 0; cap_hi = nvme_mmio_read_4(ctrlr, cap_hi); ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2; @@ -1296,7 +1305,7 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, { struct nvme_qpair *qpair; - qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq]; + qpair = &ctrlr->ioq[QP(ctrlr, curcpu)]; nvme_qpair_submit_request(qpair, req); } diff --git a/sys/dev/nvme/nvme_ctrlr_cmd.c b/sys/dev/nvme/nvme_ctrlr_cmd.c index 972b40993b98..fda5576cd12d 100644 --- a/sys/dev/nvme/nvme_ctrlr_cmd.c +++ b/sys/dev/nvme/nvme_ctrlr_cmd.c @@ -76,8 +76,7 @@ nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid, void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, - struct nvme_qpair *io_que, uint16_t vector, nvme_cb_fn_t cb_fn, - void *cb_arg) + struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; @@ -93,7 +92,7 @@ nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, */ cmd->cdw10 = htole32(((io_que->num_entries-1) << 16) | io_que->id); /* 0x3 = interrupts enabled | physically contiguous */ - cmd->cdw11 = htole32((vector << 16) | 0x3); + cmd->cdw11 = htole32((io_que->vector << 16) | 0x3); cmd->prp1 = htole64(io_que->cpl_bus_addr); nvme_ctrlr_submit_admin_request(ctrlr, req); diff --git a/sys/dev/nvme/nvme_pci.c b/sys/dev/nvme/nvme_pci.c index 779a81ba265b..448bfda6a718 100644 --- a/sys/dev/nvme/nvme_pci.c +++ b/sys/dev/nvme/nvme_pci.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -233,7 +234,6 @@ nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr) ctrlr->msix_enabled = 0; ctrlr->num_io_queues = 1; - ctrlr->num_cpus_per_ioq = mp_ncpus; ctrlr->rid = 0; ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE); @@ -259,82 +259,61 @@ static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr) { device_t dev; - int per_cpu_io_queues; + int force_intx, num_io_queues, per_cpu_io_queues; int min_cpus_per_ioq; int num_vectors_requested, num_vectors_allocated; - int num_vectors_available; dev = 
ctrlr->dev; - min_cpus_per_ioq = 1; - TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq); - if (min_cpus_per_ioq < 1) { - min_cpus_per_ioq = 1; - } else if (min_cpus_per_ioq > mp_ncpus) { - min_cpus_per_ioq = mp_ncpus; - } - - per_cpu_io_queues = 1; - TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); - - if (per_cpu_io_queues == 0) { - min_cpus_per_ioq = mp_ncpus; - } - - ctrlr->force_intx = 0; - TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx); - - /* - * FreeBSD currently cannot allocate more than about 190 vectors at - * boot, meaning that systems with high core count and many devices - * requesting per-CPU interrupt vectors will not get their full - * allotment. So first, try to allocate as many as we may need to - * understand what is available, then immediately release them. - * Then figure out how many of those we will actually use, based on - * assigning an equal number of cores to each I/O queue. - */ - - /* One vector for per core I/O queue, plus one vector for admin queue. */ - num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1); - if (pci_alloc_msix(dev, &num_vectors_available) != 0) { - num_vectors_available = 0; - } - pci_release_msi(dev); - - if (ctrlr->force_intx || num_vectors_available < 2) { + force_intx = 0; + TUNABLE_INT_FETCH("hw.nvme.force_intx", &force_intx); + if (force_intx || pci_msix_count(dev) < 2) { nvme_ctrlr_configure_intx(ctrlr); return; } - /* - * Do not use all vectors for I/O queues - one must be saved for the - * admin queue. - */ - ctrlr->num_cpus_per_ioq = max(min_cpus_per_ioq, - howmany(mp_ncpus, num_vectors_available - 1)); + num_io_queues = mp_ncpus; + TUNABLE_INT_FETCH("hw.nvme.num_io_queues", &num_io_queues); + if (num_io_queues < 1 || num_io_queues > mp_ncpus) + num_io_queues = mp_ncpus; - ctrlr->num_io_queues = howmany(mp_ncpus, ctrlr->num_cpus_per_ioq); - num_vectors_requested = ctrlr->num_io_queues + 1; + per_cpu_io_queues = 1; + TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); + if (per_cpu_io_queues == 0) + num_io_queues = 1; + + min_cpus_per_ioq = smp_threads_per_core; + TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq); + if (min_cpus_per_ioq > 1) { + num_io_queues = min(num_io_queues, + max(1, mp_ncpus / min_cpus_per_ioq)); + } + + num_io_queues = min(num_io_queues, pci_msix_count(dev) - 1); + +again: + if (num_io_queues > vm_ndomains) + num_io_queues -= num_io_queues % vm_ndomains; + /* One vector for per core I/O queue, plus one vector for admin queue. */ + num_vectors_requested = num_io_queues + 1; num_vectors_allocated = num_vectors_requested; - - /* - * Now just allocate the number of vectors we need. This should - * succeed, since we previously called pci_alloc_msix() - * successfully returning at least this many vectors, but just to - * be safe, if something goes wrong just revert to INTx. 
- */ if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { nvme_ctrlr_configure_intx(ctrlr); return; } - - if (num_vectors_allocated < num_vectors_requested) { + if (num_vectors_allocated < 2) { pci_release_msi(dev); nvme_ctrlr_configure_intx(ctrlr); return; } + if (num_vectors_allocated != num_vectors_requested) { + pci_release_msi(dev); + num_io_queues = num_vectors_allocated - 1; + goto again; + } ctrlr->msix_enabled = 1; + ctrlr->num_io_queues = num_io_queues; } static int diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index e62a78a222f1..4aaac9deff45 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -175,7 +175,8 @@ struct nvme_qpair { struct nvme_controller *ctrlr; uint32_t id; - uint32_t phase; + int domain; + int cpu; uint16_t vector; int rid; @@ -187,6 +188,7 @@ struct nvme_qpair { uint32_t sq_tdbl_off; uint32_t cq_hdbl_off; + uint32_t phase; uint32_t sq_head; uint32_t sq_tail; uint32_t cq_head; @@ -238,7 +240,7 @@ struct nvme_controller { device_t dev; struct mtx lock; - + int domain; uint32_t ready_timeout_in_ms; uint32_t quirks; #define QUIRK_DELAY_B4_CHK_RDY 1 /* Can't touch MMIO on disable */ @@ -258,11 +260,9 @@ struct nvme_controller { struct resource *bar4_resource; uint32_t msix_enabled; - uint32_t force_intx; uint32_t enable_aborts; uint32_t num_io_queues; - uint32_t num_cpus_per_ioq; uint32_t max_hw_pend_io; /* Fields for tracking progress during controller initialization. */ @@ -377,7 +377,7 @@ void nvme_ctrlr_cmd_get_firmware_page(struct nvme_controller *ctrlr, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, - struct nvme_qpair *io_que, uint16_t vector, + struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, @@ -413,9 +413,8 @@ void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, void nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr, struct nvme_request *req); -int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, - uint16_t vector, uint32_t num_entries, - uint32_t num_trackers, +int nvme_qpair_construct(struct nvme_qpair *qpair, + uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr); void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr); diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c index 744ca3492eda..35d38ee1cd0c 100644 --- a/sys/dev/nvme/nvme_qpair.c +++ b/sys/dev/nvme/nvme_qpair.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -637,8 +638,8 @@ nvme_qpair_msix_handler(void *arg) } int -nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, - uint16_t vector, uint32_t num_entries, uint32_t num_trackers, +nvme_qpair_construct(struct nvme_qpair *qpair, + uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr) { struct nvme_tracker *tr; @@ -647,8 +648,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, uint8_t *queuemem, *prpmem, *prp_list; int i, err; - qpair->id = id; - qpair->vector = vector; + qpair->vector = ctrlr->msix_enabled ? qpair->id : 0; qpair->num_entries = num_entries; qpair->num_trackers = num_trackers; qpair->ctrlr = ctrlr; @@ -659,19 +659,19 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, * MSI-X vector resource IDs start at 1, so we add one to * the queue's vector to get the corresponding rid to use. 
*/ - qpair->rid = vector + 1; + qpair->rid = qpair->vector + 1; qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &qpair->rid, RF_ACTIVE); bus_setup_intr(ctrlr->dev, qpair->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_qpair_msix_handler, qpair, &qpair->tag); - if (id == 0) { + if (qpair->id == 0) { bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, "admin"); } else { bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, - "io%d", id - 1); + "io%d", qpair->id - 1); } } @@ -707,6 +707,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, nvme_printf(ctrlr, "tag create failed %d\n", err); goto out; } + bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain); if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem, BUS_DMA_NOWAIT, &qpair->queuemem_map)) { @@ -737,9 +738,9 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, * it to various small values. */ qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) + - (id << (ctrlr->dstrd + 1)); + (qpair->id << (ctrlr->dstrd + 1)); qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) + - (id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd); + (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd); TAILQ_INIT(&qpair->free_tr); TAILQ_INIT(&qpair->outstanding_tr); @@ -765,7 +766,8 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, (uint8_t *)roundup2((uintptr_t)prp_list, PAGE_SIZE); } - tr = malloc(sizeof(*tr), M_NVME, M_ZERO | M_WAITOK); + tr = malloc_domainset(sizeof(*tr), M_NVME, + DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK); bus_dmamap_create(qpair->dma_tag_payload, 0, &tr->payload_dma_map); callout_init(&tr->timer, 1); @@ -783,8 +785,9 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, goto out; } - qpair->act_tr = malloc(sizeof(struct nvme_tracker *) * - qpair->num_entries, M_NVME, M_ZERO | M_WAITOK); + qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) * + qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain), + M_ZERO | M_WAITOK); return (0); out: @@ -814,14 +817,14 @@ nvme_qpair_destroy(struct nvme_qpair *qpair) } if (qpair->act_tr) - free(qpair->act_tr, M_NVME); + free_domain(qpair->act_tr, M_NVME); while (!TAILQ_EMPTY(&qpair->free_tr)) { tr = TAILQ_FIRST(&qpair->free_tr); TAILQ_REMOVE(&qpair->free_tr, tr, tailq); bus_dmamap_destroy(qpair->dma_tag_payload, tr->payload_dma_map); - free(tr, M_NVME); + free_domain(tr, M_NVME); } if (qpair->dma_tag) @@ -938,8 +941,8 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) ctrlr = qpair->ctrlr; if (req->timeout) - callout_reset_curcpu(&tr->timer, ctrlr->timeout_period * hz, - nvme_timeout, tr); + callout_reset_on(&tr->timer, ctrlr->timeout_period * hz, + nvme_timeout, tr, qpair->cpu); /* Copy the command from the tracker to the submission queue. 
*/ memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd)); diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c index 15da68edcf4c..b68a644206e7 100644 --- a/sys/dev/nvme/nvme_sysctl.c +++ b/sys/dev/nvme/nvme_sysctl.c @@ -306,9 +306,9 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr) ctrlr_tree = device_get_sysctl_tree(ctrlr->dev); ctrlr_list = SYSCTL_CHILDREN(ctrlr_tree); - SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "num_cpus_per_ioq", - CTLFLAG_RD, &ctrlr->num_cpus_per_ioq, 0, - "Number of CPUs assigned per I/O queue pair"); + SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "num_io_queues", + CTLFLAG_RD, &ctrlr->num_io_queues, 0, + "Number of I/O queue pairs"); SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO, "int_coal_time", CTLTYPE_UINT | CTLFLAG_RW, ctrlr, 0, From 93a85508ad520aebceee9b740884708cd7e21ffe Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 23 Sep 2019 20:50:04 +0000 Subject: [PATCH 029/106] cache: tidy up handling of negative entries - track the total count of hot entries - pre-read the lock when shrinking since it is typically already taken - place the lock in its own cacheline - shorten the hold time of hot lock list when zapping Sponsored by: The FreeBSD Foundation --- sys/kern/vfs_cache.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 925eba5f5d0b..47a745af09aa 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -226,7 +226,7 @@ SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0, struct nchstats nchstats; /* cache effectiveness statistics */ -static struct mtx ncneg_shrink_lock; +static struct mtx __exclusive_cache_line ncneg_shrink_lock; static int shrink_list_turn; struct neglist { @@ -236,6 +236,7 @@ struct neglist { static struct neglist __read_mostly *neglists; static struct neglist ncneg_hot; +static u_long numhotneg; #define numneglists (ncneghash + 1) static u_int __read_mostly ncneghash; @@ -389,6 +390,7 @@ static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, static long cache_lock_vnodes_cel_3_failures; STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); +STATNODE_ULONG(numhotneg, "Number of hot negative entries"); static void cache_zap_locked(struct namecache *ncp, bool neg_locked); static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, @@ -705,6 +707,7 @@ cache_negative_hit(struct namecache *ncp) neglist = NCP2NEGLIST(ncp); mtx_lock(&neglist->nl_lock); if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { + numhotneg++; TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); ncp->nc_flag |= NCF_HOTNEGATIVE; @@ -758,6 +761,7 @@ cache_negative_remove(struct namecache *ncp, bool neg_locked) if (ncp->nc_flag & NCF_HOTNEGATIVE) { mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + numhotneg--; } else { mtx_assert(&neglist->nl_lock, MA_OWNED); TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); @@ -803,7 +807,8 @@ cache_negative_zap_one(void) struct mtx *dvlp; struct rwlock *blp; - if (!mtx_trylock(&ncneg_shrink_lock)) + if (mtx_owner(&ncneg_shrink_lock) != NULL || + !mtx_trylock(&ncneg_shrink_lock)) return; mtx_lock(&ncneg_hot.nl_lock); @@ -814,8 +819,10 @@ cache_negative_zap_one(void) TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); ncp->nc_flag &= ~NCF_HOTNEGATIVE; + 
numhotneg--; mtx_unlock(&neglist->nl_lock); } + mtx_unlock(&ncneg_hot.nl_lock); cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); shrink_list_turn++; @@ -823,16 +830,13 @@ cache_negative_zap_one(void) shrink_list_turn = 0; if (ncp == NULL && shrink_list_turn == 0) cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); - if (ncp == NULL) { - mtx_unlock(&ncneg_hot.nl_lock); + if (ncp == NULL) goto out; - } MPASS(ncp->nc_flag & NCF_NEGATIVE); dvlp = VP2VNODELOCK(ncp->nc_dvp); blp = NCP2BUCKETLOCK(ncp); mtx_unlock(&neglist->nl_lock); - mtx_unlock(&ncneg_hot.nl_lock); mtx_lock(dvlp); rw_wlock(blp); mtx_lock(&neglist->nl_lock); @@ -1750,9 +1754,14 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); + if (ncp->nc_flag & NCF_HOTNEGATIVE) + numhotneg--; ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE); } else { - ncp->nc_flag &= ~(NCF_HOTNEGATIVE); + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + numhotneg--; + ncp->nc_flag &= ~(NCF_HOTNEGATIVE); + } ncp->nc_flag |= NCF_NEGATIVE; cache_negative_insert(ncp, true); } From d395e985b28bc1b58c29e8da38efa8eeac39bfeb Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 23 Sep 2019 22:20:11 +0000 Subject: [PATCH 030/106] ping6: Use caph_rights_limit(3) for STDIN_FILENO Update some error messages while here. Reported by: olivier MFC after: 3 days --- sbin/ping6/ping6.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sbin/ping6/ping6.c b/sbin/ping6/ping6.c index 649eadf3bf2e..5bd9de92545d 100644 --- a/sbin/ping6/ping6.c +++ b/sbin/ping6/ping6.c @@ -1028,8 +1028,8 @@ main(int argc, char *argv[]) err(1, "caph_enter_casper"); cap_rights_init(&rights_stdin); - if (cap_rights_limit(STDIN_FILENO, &rights_stdin) < 0) - err(1, "cap_rights_limit stdin"); + if (caph_rights_limit(STDIN_FILENO, &rights_stdin) < 0) + err(1, "caph_rights_limit stdin"); if (caph_limit_stdout() < 0) err(1, "caph_limit_stdout"); if (caph_limit_stderr() < 0) @@ -1037,10 +1037,10 @@ main(int argc, char *argv[]) cap_rights_init(&rights_srecv, CAP_RECV, CAP_EVENT, CAP_SETSOCKOPT); if (caph_rights_limit(srecv, &rights_srecv) < 0) - err(1, "cap_rights_limit srecv"); + err(1, "caph_rights_limit srecv"); cap_rights_init(&rights_ssend, CAP_SEND, CAP_SETSOCKOPT); if (caph_rights_limit(ssend, &rights_ssend) < 0) - err(1, "cap_rights_limit ssend"); + err(1, "caph_rights_limit ssend"); #if defined(SO_SNDBUF) && defined(SO_RCVBUF) if (sockbufsize) { @@ -1092,10 +1092,10 @@ main(int argc, char *argv[]) cap_rights_clear(&rights_srecv, CAP_SETSOCKOPT); if (caph_rights_limit(srecv, &rights_srecv) < 0) - err(1, "cap_rights_limit srecv setsockopt"); + err(1, "caph_rights_limit srecv setsockopt"); cap_rights_clear(&rights_ssend, CAP_SETSOCKOPT); if (caph_rights_limit(ssend, &rights_ssend) < 0) - err(1, "cap_rights_limit ssend setsockopt"); + err(1, "caph_rights_limit ssend setsockopt"); printf("PING6(%lu=40+8+%lu bytes) ", (unsigned long)(40 + pingerlen()), (unsigned long)(pingerlen() - 8)); From 05cba150d3f92f157a34b3be660869f8a40e905f Mon Sep 17 00:00:00 2001 From: Li-Wen Hsu Date: Tue, 24 Sep 2019 01:56:27 +0000 Subject: [PATCH 031/106] Clean LINT* kernel configurations for arm* MFC after: 3 days Sponsored by: The FreeBSD Foundation --- sys/conf/makeLINT.mk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/conf/makeLINT.mk b/sys/conf/makeLINT.mk index e4b0b7131e3b..0b4916a91161 100644 --- a/sys/conf/makeLINT.mk +++ b/sys/conf/makeLINT.mk @@ -11,6 +11,9 @@ 
clean: .if ${TARGET} == "amd64" || ${TARGET} == "i386" rm -f LINT-NOINET LINT-NOINET6 LINT-NOIP .endif +.if ${TARGET} == "arm" + rm -f LINT-V5 LINT-V7 +.endif NOTES+= ${.CURDIR}/../../conf/NOTES ${.CURDIR}/NOTES MAKELINT_SED= ${.CURDIR}/../../conf/makeLINT.sed From 5d85e12f44ccb0e5728344a002c108ddc105e038 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Tue, 24 Sep 2019 01:58:54 +0000 Subject: [PATCH 032/106] Replace all mtx_lock()/mtx_unlock() on n_mtx with the macros. For a long time, some places in the NFS code have locked/unlocked the NFS node lock with the macros NFSLOCKNODE()/NFSUNLOCKNODE() whereas others have simply used mtx_lock()/mtx_unlock(). Since the NFS node mutex needs to change to an sx lock so it can be held when vnode_pager_setsize() is called, replace all occurrences of mtx_lock/mtx_unlock with the macros to simply making the change to an sx lock in future commit. There is no semantic change as a result of this commit. I am not sure if the change to an sx lock will be MFC'd soon, so I put an MFC of 1 week on this commit so that it could be MFC'd with that commit. Suggested by: kib MFC after: 1 week --- sys/fs/nfs/nfsport.h | 4 +- sys/fs/nfsclient/nfs_clbio.c | 114 ++++++++-------- sys/fs/nfsclient/nfs_clnode.c | 16 +-- sys/fs/nfsclient/nfs_clport.c | 8 +- sys/fs/nfsclient/nfs_clrpcops.c | 16 +-- sys/fs/nfsclient/nfs_clsubs.c | 14 +- sys/fs/nfsclient/nfs_clvnops.c | 226 ++++++++++++++++---------------- 7 files changed, 199 insertions(+), 199 deletions(-) diff --git a/sys/fs/nfs/nfsport.h b/sys/fs/nfs/nfsport.h index 62f3f4b925d3..221ee1cc6f09 100644 --- a/sys/fs/nfs/nfsport.h +++ b/sys/fs/nfs/nfsport.h @@ -855,11 +855,11 @@ MALLOC_DECLARE(M_NEWNFSDSESSION); #define NFSWRITERPC_SETTIME(w, n, a, v4) \ do { \ if (w) { \ - mtx_lock(&((n)->n_mtx)); \ + NFSLOCKNODE(n); \ (n)->n_mtime = (a)->na_mtime; \ if (v4) \ (n)->n_change = (a)->na_filerev; \ - mtx_unlock(&((n)->n_mtx)); \ + NFSUNLOCKNODE(n); \ } \ } while (0) diff --git a/sys/fs/nfsclient/nfs_clbio.c b/sys/fs/nfsclient/nfs_clbio.c index cbd4725d844d..dddbfb963125 100644 --- a/sys/fs/nfsclient/nfs_clbio.c +++ b/sys/fs/nfsclient/nfs_clbio.c @@ -101,9 +101,9 @@ ncl_gbp_getblksz(struct vnode *vp, daddr_t lbn) int biosize, bcount; np = VTONFS(vp); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); nsize = np->n_size; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); biosize = vp->v_bufobj.bo_bsize; bcount = biosize; @@ -144,13 +144,13 @@ ncl_getpages(struct vop_getpages_args *ap) } if (newnfs_directio_enable && !newnfs_directio_allow_mmap) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); printf("ncl_getpages: called on non-cacheable vnode\n"); return (VM_PAGER_ERROR); } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } mtx_lock(&nmp->nm_mtx); @@ -301,12 +301,12 @@ ncl_putpages(struct vop_putpages_args *ap) } else mtx_unlock(&nmp->nm_mtx); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (newnfs_directio_enable && !newnfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); printf("ncl_putpages: called on noncache-able vnode\n"); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); } /* * When putting pages, do not extend file past EOF. 
@@ -316,7 +316,7 @@ ncl_putpages(struct vop_putpages_args *ap) if (count < 0) count = 0; } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_ERROR; @@ -374,9 +374,9 @@ nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) * whether the cache is consistent. */ old_lock = ncl_excl_start(vp); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & NMODIFIED) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (vp->v_type != VREG) { if (vp->v_type != VDIR) panic("nfs: bioread, not dir"); @@ -390,28 +390,28 @@ nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) error = VOP_GETATTR(vp, &vattr, cred); if (error) goto out; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_mtime = vattr.va_mtime; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } else { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = VOP_GETATTR(vp, &vattr, cred); if (error) goto out; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if ((np->n_flag & NSIZECHANGED) || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (vp->v_type == VDIR) ncl_invaldir(vp); error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1); if (error != 0) goto out; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_mtime = vattr.va_mtime; np->n_flag &= ~NSIZECHANGED; } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } out: ncl_excl_finish(vp, old_lock); @@ -474,9 +474,9 @@ ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) do { u_quad_t nsize; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); nsize = np->n_size; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); switch (vp->v_type) { case VREG: @@ -883,13 +883,13 @@ ncl_write(struct vop_write_args *ap) ("ncl_write proc")); if (vp->v_type != VREG) return (EIO); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); return (np->n_error); } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { @@ -906,9 +906,9 @@ ncl_write(struct vop_write_args *ap) * mode or if we are appending. */ if (ioflag & (IO_APPEND | IO_SYNC)) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & NMODIFIED) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); #ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */ /* * Require non-blocking, synchronous writes to @@ -925,13 +925,13 @@ ncl_write(struct vop_write_args *ap) if (error != 0) return (error); } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } orig_resid = uio->uio_resid; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); orig_size = np->n_size; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); /* * If IO_APPEND then load uio_offset. 
We restart here if we cannot @@ -943,9 +943,9 @@ ncl_write(struct vop_write_args *ap) error = VOP_GETATTR(vp, &vattr, cred); if (error) return (error); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); uio->uio_offset = np->n_size; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } if (uio->uio_offset < 0) @@ -979,9 +979,9 @@ ncl_write(struct vop_write_args *ap) if (!(ioflag & IO_SYNC)) { int nflag; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); nflag = np->n_flag; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (nflag & NMODIFIED) { BO_LOCK(&vp->v_bufobj); if (vp->v_bufobj.bo_dirty.bv_cnt != 0) { @@ -1018,7 +1018,7 @@ ncl_write(struct vop_write_args *ap) * Handle direct append and file extension cases, calculate * unaligned buffer size. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if ((np->n_flag & NHASBEENLOCKED) == 0 && (nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0) noncontig_write = 1; @@ -1028,7 +1028,7 @@ ncl_write(struct vop_write_args *ap) (noncontig_write != 0 && lbn == (np->n_size / biosize) && uio->uio_offset + n > np->n_size)) && n) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the @@ -1041,11 +1041,11 @@ ncl_write(struct vop_write_args *ap) if (bp != NULL) { long save; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); save = bp->b_flags & B_CACHE; bcount = on + n; @@ -1067,15 +1067,15 @@ ncl_write(struct vop_write_args *ap) else bcount = np->n_size - (off_t)lbn * biosize; } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); bp = nfs_getcacheblk(vp, lbn, bcount, td); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (uio->uio_offset + n > np->n_size) { np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } if (!bp) { @@ -1124,9 +1124,9 @@ ncl_write(struct vop_write_args *ap) } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NMODIFIED; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); /* * If dirtyend exceeds file size, chop it down. This should @@ -1369,13 +1369,13 @@ ncl_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg) * Invalidate the attribute cache, since writes to a DS * won't update the size attribute. 
*/ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_attrstamp = 0; } else - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_directio_asyncwr == 0) np->n_flag &= ~NMODIFIED; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); out: ncl_excl_finish(vp, old_lock); return error; @@ -1517,10 +1517,10 @@ ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thr TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); nmp->nm_bufqlen++; if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) { - mtx_lock(&(VTONFS(bp->b_vp))->n_mtx); + NFSLOCKNODE(VTONFS(bp->b_vp)); VTONFS(bp->b_vp)->n_flag |= NMODIFIED; VTONFS(bp->b_vp)->n_directio_asyncwr++; - mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx); + NFSUNLOCKNODE(VTONFS(bp->b_vp)); } mtx_unlock(&ncl_iod_mutex); return (0); @@ -1552,7 +1552,7 @@ ncl_doio_directwrite(struct buf *bp) free(uiop, M_NFSDIRECTIO); if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) { struct nfsnode *np = VTONFS(bp->b_vp); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (NFSHASPNFS(VFSTONFS(vnode_mount(bp->b_vp)))) { /* * Invalidate the attribute cache, since writes to a DS @@ -1568,7 +1568,7 @@ ncl_doio_directwrite(struct buf *bp) wakeup((caddr_t)&np->n_directio_asyncwr); } } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } bp->b_vp = NULL; uma_zfree(ncl_pbuf_zone, bp); @@ -1640,14 +1640,14 @@ ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td, } /* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */ if (p && vp->v_writecount <= -1) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); PROC_LOCK(p); killproc(p, "text file modification"); PROC_UNLOCK(p); } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } break; case VLNK: @@ -1706,10 +1706,10 @@ ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td, /* * Setup for actual write */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend @@ -1802,11 +1802,11 @@ ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td, bp->b_ioflags |= BIO_ERROR; bp->b_flags |= B_INVAL; bp->b_error = np->n_error = error; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NWRITEERR; np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } bp->b_dirtyoff = bp->b_dirtyend = 0; } @@ -1839,10 +1839,10 @@ ncl_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize) int biosize = vp->v_bufobj.bo_bsize; int error = 0; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); tsize = np->n_size; np->n_size = nsize; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (nsize < tsize) { struct buf *bp; diff --git a/sys/fs/nfsclient/nfs_clnode.c b/sys/fs/nfsclient/nfs_clnode.c index 0baf710d9cfa..e58797ac8be0 100644 --- a/sys/fs/nfsclient/nfs_clnode.c +++ b/sys/fs/nfsclient/nfs_clnode.c @@ -219,7 +219,7 @@ ncl_releasesillyrename(struct vnode *vp, struct thread *td) } else sp = NULL; if (sp != NULL) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); (void) ncl_vinvalbuf(vp, 0, td, 1); /* * Remove the silly file that was rename'd earlier @@ -228,7 +228,7 @@ ncl_releasesillyrename(struct vnode *vp, struct thread *td) crfree(sp->s_cred); TASK_INIT(&sp->s_task, 0, nfs_freesillyrename, sp); 
taskqueue_enqueue(taskqueue_thread, &sp->s_task); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); } } @@ -260,7 +260,7 @@ ncl_inactive(struct vop_inactive_args *ap) } np = VTONFS(vp); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); ncl_releasesillyrename(vp, ap->a_td); /* @@ -271,7 +271,7 @@ ncl_inactive(struct vop_inactive_args *ap) * None of the other flags are meaningful after the vnode is unused. */ np->n_flag &= (NMODIFIED | NDSCOMMIT); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); return (0); } @@ -292,9 +292,9 @@ ncl_reclaim(struct vop_reclaim_args *ap) if (nfs_reclaim_p != NULL) nfs_reclaim_p(ap); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); ncl_releasesillyrename(vp, ap->a_td); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (NFS_ISV4(vp) && vp->v_type == VREG) /* @@ -348,11 +348,11 @@ ncl_invalcaches(struct vnode *vp) struct nfsnode *np = VTONFS(vp); int i; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) np->n_accesscache[i].stamp = 0; KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c index a23b4ba4efae..2529830d2300 100644 --- a/sys/fs/nfsclient/nfs_clport.c +++ b/sys/fs/nfsclient/nfs_clport.c @@ -742,12 +742,12 @@ nfscl_wcc_data(struct nfsrv_descript *nd, struct vnode *vp, if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED); if (wccflagp != NULL) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); *wccflagp = (np->n_mtime.tv_sec == fxdr_unsigned(u_int32_t, *(tl + 2)) && np->n_mtime.tv_nsec == fxdr_unsigned(u_int32_t, *(tl + 3))); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } error = nfscl_postop_attr(nd, nap, flagp, stuff); @@ -768,12 +768,12 @@ nfscl_wcc_data(struct nfsrv_descript *nd, struct vnode *vp, nd->nd_flag |= ND_NOMOREDATA; if (wccflagp != NULL && nfsva.na_vattr.va_mtime.tv_sec != 0) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); *wccflagp = (np->n_mtime.tv_sec == nfsva.na_vattr.va_mtime.tv_sec && np->n_mtime.tv_nsec == nfsva.na_vattr.va_mtime.tv_sec); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } nfsmout: diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index 7a15719717ad..60a55a4d14aa 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -5952,9 +5952,9 @@ nfscl_doflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, error = EIO; } else { commit_thru_mds = 0; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NDSCOMMIT; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } if (docommit != 0) { if (error == 0) @@ -5968,9 +5968,9 @@ nfscl_doflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, *eofp = 1; uiop->uio_resid = 0; } else { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag &= ~NDSCOMMIT; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } else if (rwflag == NFSV4OPEN_ACCESSREAD) error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp, @@ -6036,9 +6036,9 @@ nfscl_dofflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, transfer = dp->nfsdi_rsize; else transfer = dp->nfsdi_wsize; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NDSCOMMIT; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (len > transfer && docommit == 0) xfer = transfer; else @@ -6075,9 +6075,9 @@ nfscl_dofflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, *eofp = 1; uiop->uio_resid = 0; } else { - mtx_lock(&np->n_mtx); + 
NFSLOCKNODE(np); np->n_flag &= ~NDSCOMMIT; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } else if (rwflag == NFSV4OPEN_ACCESSREAD) { error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp, diff --git a/sys/fs/nfsclient/nfs_clsubs.c b/sys/fs/nfsclient/nfs_clsubs.c index c569652ab6f8..7691d7b5c469 100644 --- a/sys/fs/nfsclient/nfs_clsubs.c +++ b/sys/fs/nfsclient/nfs_clsubs.c @@ -121,20 +121,20 @@ ncl_uninit(struct vfsconf *vfsp) void ncl_dircookie_lock(struct nfsnode *np) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); while (np->n_flag & NDIRCOOKIELK) (void) msleep(&np->n_flag, &np->n_mtx, PZERO, "nfsdirlk", 0); np->n_flag |= NDIRCOOKIELK; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } void ncl_dircookie_unlock(struct nfsnode *np) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag &= ~NDIRCOOKIELK; wakeup(&np->n_flag); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } bool @@ -190,7 +190,7 @@ ncl_getattrcache(struct vnode *vp, struct vattr *vaper) vap = &np->n_vattr.na_vattr; nmp = VFSTONFS(vp->v_mount); mustflush = nfscl_mustflush(vp); /* must be before mtx_lock() */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); /* XXX n_mtime doesn't seem to be updated on a miss-and-reload */ timeo = (time_second - np->n_mtime.tv_sec) / 10; @@ -225,7 +225,7 @@ ncl_getattrcache(struct vnode *vp, struct vattr *vaper) if ((time_second - np->n_attrstamp) >= timeo && (mustflush != 0 || np->n_attrstamp == 0)) { nfsstatsv1.attrcache_misses++; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_GET_MISS(vp); return( ENOENT); } @@ -252,7 +252,7 @@ ncl_getattrcache(struct vnode *vp, struct vattr *vaper) if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_GET_HIT(vp, vap); return (0); } diff --git a/sys/fs/nfsclient/nfs_clvnops.c b/sys/fs/nfsclient/nfs_clvnops.c index 45c49124b678..d89a0b23f940 100644 --- a/sys/fs/nfsclient/nfs_clvnops.c +++ b/sys/fs/nfsclient/nfs_clvnops.c @@ -310,7 +310,7 @@ nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td, (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (!error) { lrupos = 0; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) { if (np->n_accesscache[i].uid == cred->cr_uid) { np->n_accesscache[i].mode = rmode; @@ -326,7 +326,7 @@ nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td, np->n_accesscache[lrupos].mode = rmode; np->n_accesscache[lrupos].stamp = time_second; } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (retmode != NULL) *retmode = rmode; KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0); @@ -421,7 +421,7 @@ nfs_access(struct vop_access_args *ap) * this request? */ gotahit = 0; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) { if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) { if (time_second < (np->n_accesscache[i].stamp @@ -433,7 +433,7 @@ nfs_access(struct vop_access_args *ap) break; } } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); #ifdef KDTRACE_HOOKS if (gotahit != 0) KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp, @@ -465,14 +465,14 @@ nfs_access(struct vop_access_args *ap) * After calling nfsspec_access, we should have the correct * file size cached. 
*/ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD) && VTONFS(vp)->n_size > 0) { struct iovec aiov; struct uio auio; char buf[1]; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); aiov.iov_base = buf; aiov.iov_len = 1; auio.uio_iov = &aiov; @@ -498,7 +498,7 @@ nfs_access(struct vop_access_args *ap) else error = EACCES; } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); return (error); } } @@ -543,52 +543,52 @@ nfs_open(struct vop_open_args *ap) * Now, if this Open will be doing reading, re-validate/flush the * cache, so that Close/Open coherency is maintained. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & NMODIFIED) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error == EINTR || error == EIO) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); if (vp->v_type == VDIR) np->n_direofoffset = 0; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_mtime = vattr.va_mtime; if (NFS_ISV4(vp)) np->n_change = vattr.va_filerev; } else { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if ((NFS_ISV4(vp) && np->n_change != vattr.va_filerev) || NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { if (vp->v_type == VDIR) np->n_direofoffset = 0; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error == EINTR || error == EIO) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_mtime = vattr.va_mtime; if (NFS_ISV4(vp)) np->n_change = vattr.va_filerev; @@ -601,14 +601,14 @@ nfs_open(struct vop_open_args *ap) if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { if (np->n_directio_opens == 0) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); if (error) { if (NFS_ISV4(vp)) (void) nfsrpc_close(vp, 0, ap->a_td); return (error); } - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NNONCACHE; } np->n_directio_opens++; @@ -630,7 +630,7 @@ nfs_open(struct vop_open_args *ap) np->n_writecred = crhold(ap->a_cred); } else cred = NULL; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (cred != NULL) crfree(cred); @@ -657,9 +657,9 @@ nfs_open(struct vop_open_args *ap) /* And, finally, make sure that n_mtime is up to date. */ np = VTONFS(vp); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_mtime = np->n_vattr.na_mtime; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } return (0); } @@ -734,9 +734,9 @@ nfs_close(struct vop_close_args *ap) vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & NMODIFIED) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if (NFS_ISV3(vp)) { /* * Under NFSv3 we have dirty buffers to dispose of. We @@ -770,7 +770,7 @@ nfs_close(struct vop_close_args *ap) } else { error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1); } - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); } /* * Invalidate the attribute cache in all cases. 
@@ -795,7 +795,7 @@ nfs_close(struct vop_close_args *ap) np->n_flag &= ~NWRITEERR; error = np->n_error; } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } if (NFS_ISV4(vp)) { @@ -829,13 +829,13 @@ nfs_close(struct vop_close_args *ap) ("nfs_close: dirty unflushed (%d) directio buffers\n", np->n_directio_asyncwr)); if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); KASSERT((np->n_directio_opens > 0), ("nfs_close: unexpectedly value (0) of n_directio_opens\n")); np->n_directio_opens--; if (np->n_directio_opens == 0) np->n_flag &= ~NNONCACHE; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } if (localcred) NFSFREECRED(cred); @@ -859,10 +859,10 @@ nfs_getattr(struct vop_getattr_args *ap) /* * Update local times for special files. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & (NACC | NUPD)) np->n_flag |= NCHG; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); /* * First look in the cache. */ @@ -976,14 +976,14 @@ nfs_setattr(struct vop_setattr_args *ap) * V_SAVE races that might setsize a lower * value. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); tsize = np->n_size; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = ncl_meta_setsize(vp, td, vap->va_size); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & NMODIFIED) { tsize = np->n_size; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = ncl_vinvalbuf(vp, vap->va_size == 0 ? 0 : V_SAVE, td, 1); if (error != 0) { @@ -996,34 +996,34 @@ nfs_setattr(struct vop_setattr_args *ap) */ nfscl_delegmodtime(vp); } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); /* * np->n_size has already been set to vap->va_size * in ncl_meta_setsize(). We must set it again since * nfs_loadattrcache() could be called through * ncl_meta_setsize() and could modify np->n_size. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_vattr.na_size = np->n_size = vap->va_size; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } else { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && (np->n_flag & NMODIFIED) && vp->v_type == VREG) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = ncl_vinvalbuf(vp, V_SAVE, td, 1); if (error == EINTR || error == EIO) return (error); } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } error = nfs_setattrrpc(vp, vap, ap->a_cred, td); if (error && vap->va_size != VNOVAL) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_size = np->n_vattr.na_size = tsize; vnode_pager_setsize(vp, tsize); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } return (error); } @@ -1040,11 +1040,11 @@ nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct nfsvattr nfsva; if (NFS_ISV34(vp)) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); for (i = 0; i < NFS_ACCESSCACHESIZE; i++) np->n_accesscache[i].stamp = 0; np->n_flag |= NDELEGMOD; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp); } error = nfsrpc_setattr(vp, vap, NULL, cred, td, &nfsva, &attrflag, @@ -1092,12 +1092,12 @@ nfs_lookup(struct vop_lookup_args *ap) np = VTONFS(dvp); /* For NFSv4, wait until any remove is done. 
*/ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); while (NFSHASNFSV4(nmp) && (np->n_flag & NREMOVEINPROG)) { np->n_flag |= NREMOVEWANT; (void) msleep((caddr_t)np, &np->n_mtx, PZERO, "nfslkup", 0); } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) return (error); @@ -1139,10 +1139,10 @@ nfs_lookup(struct vop_lookup_args *ap) if (!(nmp->nm_flag & NFSMNT_NOCTO) && (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) && !(newnp->n_flag & NMODIFIED)) { - mtx_lock(&newnp->n_mtx); + NFSLOCKNODE(newnp); newnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp); - mtx_unlock(&newnp->n_mtx); + NFSUNLOCKNODE(newnp); } if (nfscl_nodeleg(newvp, 0) == 0 || ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) && @@ -1224,14 +1224,14 @@ nfs_lookup(struct vop_lookup_args *ap) * has changed. Don't bother adding the entry * if the directory has already changed. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (timespeccmp(&np->n_vattr.na_mtime, &dnfsva.na_mtime, ==)) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); cache_enter_time(dvp, NULL, cnp, &dnfsva.na_mtime, NULL); } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } return (ENOENT); } @@ -1319,10 +1319,10 @@ nfs_lookup(struct vop_lookup_args *ap) * are fetched in nfs_open() since we did not * fetch attributes from the LOOKUP reply. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) @@ -1516,13 +1516,13 @@ nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, vap->va_gid); } dnp = VTONFS(dvp); - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); return (error); } @@ -1593,12 +1593,12 @@ nfs_create(struct vop_create_args *ap) nmp = VFSTONFS(vnode_mount(dvp)); again: /* For NFSv4, wait until any remove is done. 
*/ - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); while (NFSHASNFSV4(nmp) && (dnp->n_flag & NREMOVEINPROG)) { dnp->n_flag |= NREMOVEWANT; (void) msleep((caddr_t)dnp, &dnp->n_mtx, PZERO, "nfscrt", 0); } - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); cverf = nfs_get_cverf(); error = nfsrpc_create(dvp, cnp->cn_nameptr, cnp->cn_namelen, @@ -1664,13 +1664,13 @@ nfs_create(struct vop_create_args *ap) error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid, vap->va_gid); } - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); return (error); } @@ -1729,9 +1729,9 @@ nfs_remove(struct vop_remove_args *ap) error = 0; } else if (!np->n_sillyrename) error = nfs_sillyrename(dvp, vp, cnp); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_attrstamp = 0; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); return (error); } @@ -1763,29 +1763,29 @@ nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name, struct nfsnode *dnp = VTONFS(dvp); int error = 0, dattrflag; - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); dnp->n_flag |= NREMOVEINPROG; - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); error = nfsrpc_remove(dvp, name, namelen, vp, cred, td, &dnfsva, &dattrflag, NULL); - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); if ((dnp->n_flag & NREMOVEWANT)) { dnp->n_flag &= ~(NREMOVEWANT | NREMOVEINPROG); - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); wakeup((caddr_t)dnp); } else { dnp->n_flag &= ~NREMOVEINPROG; - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); } if (dattrflag) (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (!dattrflag) { dnp->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); if (error && NFS_ISV4(dvp)) error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); return (error); @@ -1869,8 +1869,8 @@ nfs_rename(struct vop_rename_args *ap) sizeof (struct nfsv4node) + tdnp->n_fhp->nfh_len + tcnp->cn_namelen - 1, M_NFSV4NODE, M_WAITOK); - mtx_lock(&tdnp->n_mtx); - mtx_lock(&fnp->n_mtx); + NFSLOCKNODE(tdnp); + NFSLOCKNODE(fnp); if (fnp->n_v4 != NULL && fvp->v_type == VREG && (fnp->n_v4->n4_namelen != tcnp->cn_namelen || NFSBCMP(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4), @@ -1896,8 +1896,8 @@ printf("ren replace=%s\n",nnn); NFSBCOPY(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen); } - mtx_unlock(&tdnp->n_mtx); - mtx_unlock(&fnp->n_mtx); + NFSUNLOCKNODE(tdnp); + NFSUNLOCKNODE(fnp); if (newv4 != NULL) free(newv4, M_NFSV4NODE); } @@ -1954,24 +1954,24 @@ nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr, error = nfsrpc_rename(fdvp, fvp, fnameptr, fnamelen, tdvp, tvp, tnameptr, tnamelen, cred, td, &fnfsva, &tnfsva, &fattrflag, &tattrflag, NULL, NULL); - mtx_lock(&fdnp->n_mtx); + NFSLOCKNODE(fdnp); fdnp->n_flag |= NMODIFIED; if (fattrflag != 0) { - mtx_unlock(&fdnp->n_mtx); + NFSUNLOCKNODE(fdnp); (void) nfscl_loadattrcache(&fdvp, &fnfsva, NULL, NULL, 0, 1); } else { fdnp->n_attrstamp = 0; - mtx_unlock(&fdnp->n_mtx); + NFSUNLOCKNODE(fdnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp); } - mtx_lock(&tdnp->n_mtx); + NFSLOCKNODE(tdnp); tdnp->n_flag |= NMODIFIED; if (tattrflag != 0) { - mtx_unlock(&tdnp->n_mtx); + NFSUNLOCKNODE(tdnp); (void) nfscl_loadattrcache(&tdvp, &tnfsva, NULL, NULL, 0, 1); } else { tdnp->n_attrstamp = 0; - mtx_unlock(&tdnp->n_mtx); + NFSUNLOCKNODE(tdnp); 
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp); } if (error && NFS_ISV4(fdvp)) @@ -2003,23 +2003,23 @@ nfs_link(struct vop_link_args *ap) cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &attrflag, &dattrflag, NULL); tdnp = VTONFS(tdvp); - mtx_lock(&tdnp->n_mtx); + NFSLOCKNODE(tdnp); tdnp->n_flag |= NMODIFIED; if (dattrflag != 0) { - mtx_unlock(&tdnp->n_mtx); + NFSUNLOCKNODE(tdnp); (void) nfscl_loadattrcache(&tdvp, &dnfsva, NULL, NULL, 0, 1); } else { tdnp->n_attrstamp = 0; - mtx_unlock(&tdnp->n_mtx); + NFSUNLOCKNODE(tdnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp); } if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); else { np = VTONFS(vp); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_attrstamp = 0; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); } /* @@ -2091,14 +2091,14 @@ nfs_symlink(struct vop_symlink_args *ap) } dnp = VTONFS(dvp); - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); } else { dnp->n_attrstamp = 0; - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } /* @@ -2137,14 +2137,14 @@ nfs_mkdir(struct vop_mkdir_args *ap) vap, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag, NULL); dnp = VTONFS(dvp); - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); } else { dnp->n_attrstamp = 0; - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } if (nfhp) { @@ -2208,14 +2208,14 @@ nfs_rmdir(struct vop_rmdir_args *ap) error = nfsrpc_rmdir(dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &dnfsva, &dattrflag, NULL); dnp = VTONFS(dvp); - mtx_lock(&dnp->n_mtx); + NFSLOCKNODE(dnp); dnp->n_flag |= NMODIFIED; if (dattrflag != 0) { - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); (void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1); } else { dnp->n_attrstamp = 0; - mtx_unlock(&dnp->n_mtx); + NFSUNLOCKNODE(dnp); KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp); } @@ -2256,16 +2256,16 @@ nfs_readdir(struct vop_readdir_args *ap) if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset && (np->n_flag & NMODIFIED) == 0) { if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if ((NFS_ISV4(vp) && np->n_change == vattr.va_filerev) || !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); NFSINCRGLOBAL(nfsstatsv1.direofcache_hits); if (ap->a_eofflag != NULL) *ap->a_eofflag = 1; return (0); } else - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } @@ -2605,9 +2605,9 @@ ncl_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred, error = nfscl_doiods(vp, &uio, NULL, NULL, NFSV4OPEN_ACCESSWRITE, 1, cred, td); if (error != 0) { - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag &= ~NDSCOMMIT; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } if (error != 0) { @@ -2982,7 +2982,7 @@ ncl_flush(struct vnode *vp, int waitfor, struct thread *td, * Wait for all the async IO requests to drain */ BO_UNLOCK(bo); - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); while (np->n_directio_asyncwr > 0) { np->n_flag |= NFSYNCWAIT; error = newnfs_msleep(td, &np->n_directio_asyncwr, @@ -2990,13 +2990,13 @@ ncl_flush(struct vnode *vp, int waitfor, struct thread *td, 
"nfsfsync", 0); if (error) { if (newnfs_sigintr(nmp, td)) { - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = EINTR; goto done; } } } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } else BO_UNLOCK(bo); if (NFSHASPNFS(nmp)) { @@ -3005,10 +3005,10 @@ ncl_flush(struct vnode *vp, int waitfor, struct thread *td, * Invalidate the attribute cache, since writes to a DS * won't update the size attribute. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_attrstamp = 0; } else - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; @@ -3016,7 +3016,7 @@ ncl_flush(struct vnode *vp, int waitfor, struct thread *td, if (commit && bo->bo_dirty.bv_cnt == 0 && bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0) np->n_flag &= ~NMODIFIED; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); done: if (bvec != NULL && bvec != bvec_on_stack) free(bvec, M_TEMP); @@ -3136,9 +3136,9 @@ nfs_advlock(struct vop_advlock_args *ap) } } /* Mark that a file lock has been acquired. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NHASBEENLOCKED; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); } } else if (!NFS_ISV4(vp)) { if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) { @@ -3157,9 +3157,9 @@ nfs_advlock(struct vop_advlock_args *ap) error = NFSVOPLOCK(vp, LK_SHARED); if (error == 0) { /* Mark that a file lock has been acquired. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NHASBEENLOCKED; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); NFSVOPUNLOCK(vp, 0); } } @@ -3314,10 +3314,10 @@ nfsfifo_read(struct vop_read_args *ap) /* * Set access flag. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NACC; vfs_timestamp(&np->n_atim); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); error = fifo_specops.vop_read(ap); return error; } @@ -3333,10 +3333,10 @@ nfsfifo_write(struct vop_write_args *ap) /* * Set update flag. */ - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); np->n_flag |= NUPD; vfs_timestamp(&np->n_mtim); - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); return(fifo_specops.vop_write(ap)); } @@ -3353,7 +3353,7 @@ nfsfifo_close(struct vop_close_args *ap) struct vattr vattr; struct timespec ts; - mtx_lock(&np->n_mtx); + NFSLOCKNODE(np); if (np->n_flag & (NACC | NUPD)) { vfs_timestamp(&ts); if (np->n_flag & NACC) @@ -3368,12 +3368,12 @@ nfsfifo_close(struct vop_close_args *ap) vattr.va_atime = np->n_atim; if (np->n_flag & NUPD) vattr.va_mtime = np->n_mtim; - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); (void)VOP_SETATTR(vp, &vattr, ap->a_cred); goto out; } } - mtx_unlock(&np->n_mtx); + NFSUNLOCKNODE(np); out: return (fifo_specops.vop_close(ap)); } From 3c78771400e74f5bc54ee8e9a28fbf70190fd250 Mon Sep 17 00:00:00 2001 From: Yuri Pankov Date: Tue, 24 Sep 2019 12:21:01 +0000 Subject: [PATCH 033/106] lib/libc/regex: fix build with REDEBUG defined Reviewed by: kevans Differential Revision: https://reviews.freebsd.org/D21760 --- lib/libc/regex/engine.c | 2 +- lib/libc/regex/regcomp.c | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index a25bfa08ede7..e7da4cbc2a5d 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -1068,7 +1068,7 @@ print(struct match *m, fprintf(d, " %s", pchar(ch)); for (i = 0; i < g->nstates; i++) if (ISSET(st, i)) { - fprintf(d, "%s%d", (first) ? "\t" : ", ", i); + fprintf(d, "%s%lu", (first) ? 
"\t" : ", ", i); first = 0; } fprintf(d, "\n"); diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index 509b9e7fa4af..5e772c21d381 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -196,12 +196,6 @@ static char nuls[10]; /* place to point scanner in event of error */ #define THERETHERE() (p->slen - 2) #define DROP(n) (p->slen -= (n)) -#ifndef NDEBUG -static int never = 0; /* for use in asserts; shuts lint up */ -#else -#define never 0 /* some s have bugs too */ -#endif - /* Macro used by computejump()/computematchjump() */ #define MIN(a,b) ((a)<(b)?(a):(b)) From 2b861c1538575f63e8f23352c6796533b43d312a Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Tue, 24 Sep 2019 13:15:24 +0000 Subject: [PATCH 034/106] Plumb a memory leak. Thnanks to Felix Weinrank for finding this issue using fuzz testing and reporting it for the userland stack: https://github.com/sctplab/usrsctp/issues/378 MFC after: 3 days --- sys/netinet/sctp_indata.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sys/netinet/sctp_indata.c b/sys/netinet/sctp_indata.c index 5db87ae86b54..910557200f51 100644 --- a/sys/netinet/sctp_indata.c +++ b/sys/netinet/sctp_indata.c @@ -472,6 +472,11 @@ sctp_clean_up_control(struct sctp_tcb *stcb, struct sctp_queued_to_read *control chk->data = NULL; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } + sctp_free_remote_addr(control->whoFrom); + if (control->data) { + sctp_m_freem(control->data); + control->data = NULL; + } sctp_free_a_readq(stcb, control); } From 13d9bd2692f0e2f7036e1c391ea309d264c568ae Mon Sep 17 00:00:00 2001 From: Olivier Cochard Date: Tue, 24 Sep 2019 16:45:34 +0000 Subject: [PATCH 035/106] Fix coredump_phnum_test in case of kern.compress_user_cores=1 PR: 240783 Approved by: ngie, lwhsu MFC after: 1 month Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D21776 --- tests/sys/kern/coredump_phnum_test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/sys/kern/coredump_phnum_test.sh b/tests/sys/kern/coredump_phnum_test.sh index a39f0a5f9a8f..3c7cd262ccba 100644 --- a/tests/sys/kern/coredump_phnum_test.sh +++ b/tests/sys/kern/coredump_phnum_test.sh @@ -45,10 +45,12 @@ coredump_phnum_body() ulimit -c '$(ulimit -c)' sysctl kern.coredump=$(sysctl -n kern.coredump) sysctl kern.corefile='$(sysctl -n kern.corefile)' + sysctl kern.compress_user_cores='$(sysctl -n kern.compress_user_cores)' EOF ulimit -c unlimited sysctl kern.coredump=1 + sysctl kern.compress_user_cores=0 sysctl kern.corefile="$(pwd)/coredump_phnum_helper.core" atf_check -s signal:sigabrt "$(atf_get_srcdir)/coredump_phnum_helper" From 16f9d2f3b8212e4a025aebd5fd8e3ea387c13a14 Mon Sep 17 00:00:00 2001 From: Olivier Cochard Date: Tue, 24 Sep 2019 16:49:42 +0000 Subject: [PATCH 036/106] Fix a minor typo Approved by: lwhsu MFC after: 1 month Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D19970 --- sys/dev/pccard/pccard.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/dev/pccard/pccard.c b/sys/dev/pccard/pccard.c index 65147341a9bc..095179eecb90 100644 --- a/sys/dev/pccard/pccard.c +++ b/sys/dev/pccard/pccard.c @@ -266,7 +266,7 @@ pccard_probe_and_attach_child(device_t dev, device_t child, * In NetBSD, the drivers are responsible for activating each * function of a card and selecting the config to use. 
In * FreeBSD, all that's done automatically in the typical lazy - * way we do device resoruce allocation (except we pick the + * way we do device resource allocation (except we pick the * cfe up front). This is the biggest depature from the * inherited NetBSD model, apart from the FreeBSD resource code. * From 53b5b9b049be36ac1856a7b0964901cac54daefe Mon Sep 17 00:00:00 2001 From: Eric Joyner Date: Tue, 24 Sep 2019 17:03:31 +0000 Subject: [PATCH 037/106] iflib: Remove redundant VLAN events deregistration From Piotr: r351152 introduced an iflib_deregister() function that calls EVENTHANDLER_DEREGISTER() to unregister VLAN events. This patch removes the duplicate EVENTHANDLER_DEREGISTER() calls placed in iflib_device_deregister(), as that function now calls iflib_deregister(); this avoids deregistering the same event twice. This patch also adds a check in iflib_vlan_register() to prevent registering a VLAN while the driver is in detach. Patch co-authored by Krzysztof Galazka, erj and Jacob Keller. Signed-off-by: Piotr Pietruszewski Submitted by: Piotr Pietruszewski Reviewed by: gallatin@, erj@ MFC after: 3 days Sponsored by: Intel Corporation Differential Revision: https://reviews.freebsd.org/D21711 --- sys/net/iflib.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sys/net/iflib.c b/sys/net/iflib.c index 6c57b6aaeaef..148a8b3e6d90 100644 --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -4280,6 +4280,9 @@ iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag) if ((vtag == 0) || (vtag > 4095)) return; + if (iflib_in_detach(ctx)) + return; + CTX_LOCK(ctx); IFDI_VLAN_REGISTER(ctx, vtag); /* Re-init to load the changes */ @@ -5079,12 +5082,6 @@ iflib_device_deregister(if_ctx_t ctx) iflib_stop(ctx); CTX_UNLOCK(ctx); - /* Unregister VLAN events */ - if (ctx->ifc_vlan_attach_event != NULL) - EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event); - if (ctx->ifc_vlan_detach_event != NULL) - EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event); - iflib_netmap_detach(ifp); ether_ifdetach(ifp); iflib_rem_pfil(ctx); From 749597dc1d21dce46fb94bfbe34cdb20ec1d9ab3 Mon Sep 17 00:00:00 2001 From: Eric Joyner Date: Tue, 24 Sep 2019 17:06:32 +0000 Subject: [PATCH 038/106] ix, ixv: Read msix_bar from device configuration Instead of predicting the MSI-X bar index based on the device's MAC type, read it from the device's PCI configuration.
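A minimal sketch (illustrative only, not part of the diff below) of the idiom this commit adopts, assuming the stock pci(9) helper pci_msix_table_bar() and the iflib if_softc_ctx fields already used by these drivers:

#include <sys/param.h>
#include <sys/bus.h>
#include <dev/pci/pcivar.h>
#include <net/iflib.h>

static void
example_msix_bar(device_t dev, if_softc_ctx_t scctx)
{
	/*
	 * pci_msix_table_bar() returns the register id of the BAR that
	 * holds the device's MSI-X table, taken from the MSI-X capability
	 * in PCI configuration space (or -1 if the device lacks MSI-X),
	 * so no per-MAC table of BAR numbers is needed.
	 */
	scctx->isc_msix_bar = pci_msix_table_bar(dev);
}
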
PR: 239704 Submitted by: Piotr Pietruszewski Reviewed by: erj@ MFC after: 3 days Sponsored by: Intel Corporation Differential Revision: https://reviews.freebsd.org/D21547 --- sys/dev/ixgbe/if_ix.c | 5 +++-- sys/dev/ixgbe/if_ixv.c | 2 +- sys/dev/ixgbe/ixgbe.h | 2 -- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sys/dev/ixgbe/if_ix.c b/sys/dev/ixgbe/if_ix.c index 9c61c9a24073..de76ca7f4432 100644 --- a/sys/dev/ixgbe/if_ix.c +++ b/sys/dev/ixgbe/if_ix.c @@ -1012,12 +1012,13 @@ ixgbe_if_attach_pre(if_ctx_t ctx) CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_TSO; if (adapter->hw.mac.type == ixgbe_mac_82598EB) { scctx->isc_tx_nsegments = IXGBE_82598_SCATTER; - scctx->isc_msix_bar = PCIR_BAR(MSIX_82598_BAR); } else { scctx->isc_tx_csum_flags |= CSUM_SCTP |CSUM_IP6_SCTP; scctx->isc_tx_nsegments = IXGBE_82599_SCATTER; - scctx->isc_msix_bar = PCIR_BAR(MSIX_82599_BAR); } + + scctx->isc_msix_bar = pci_msix_table_bar(dev); + scctx->isc_tx_tso_segments_max = scctx->isc_tx_nsegments; scctx->isc_tx_tso_size_max = IXGBE_TSO_SIZE; scctx->isc_tx_tso_segsize_max = PAGE_SIZE; diff --git a/sys/dev/ixgbe/if_ixv.c b/sys/dev/ixgbe/if_ixv.c index cfc824dd7f85..5e98e8a445ba 100644 --- a/sys/dev/ixgbe/if_ixv.c +++ b/sys/dev/ixgbe/if_ixv.c @@ -494,7 +494,7 @@ ixv_if_attach_pre(if_ctx_t ctx) scctx->isc_tx_csum_flags = CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_TSO | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_TSO; scctx->isc_tx_nsegments = IXGBE_82599_SCATTER; - scctx->isc_msix_bar = PCIR_BAR(MSIX_82598_BAR); + scctx->isc_msix_bar = pci_msix_table_bar(dev); scctx->isc_tx_tso_segments_max = scctx->isc_tx_nsegments; scctx->isc_tx_tso_size_max = IXGBE_TSO_SIZE; scctx->isc_tx_tso_segsize_max = PAGE_SIZE; diff --git a/sys/dev/ixgbe/ixgbe.h b/sys/dev/ixgbe/ixgbe.h index 4261d1e36b88..1502c7fb866e 100644 --- a/sys/dev/ixgbe/ixgbe.h +++ b/sys/dev/ixgbe/ixgbe.h @@ -189,8 +189,6 @@ #define MAX_NUM_MULTICAST_ADDRESSES 128 #define IXGBE_82598_SCATTER 100 #define IXGBE_82599_SCATTER 32 -#define MSIX_82598_BAR 3 -#define MSIX_82599_BAR 4 #define IXGBE_TSO_SIZE 262140 #define IXGBE_RX_HDR 128 #define IXGBE_VFTA_SIZE 128 From 35c7bb340788f0ce9347b7066619d8afb31e2123 Mon Sep 17 00:00:00 2001 From: Randall Stewart Date: Tue, 24 Sep 2019 18:18:11 +0000 Subject: [PATCH 039/106] This commit adds BBR (Bottleneck Bandwidth and RTT) congestion control. This is a completely separate TCP stack (tcp_bbr.ko) that will be built only if you add the make option WITH_EXTRA_TCP_STACKS=1 and also include the option TCPHPTS. You can also include the RATELIMIT option if you have a NIC interface that supports hardware pacing; BBR understands how to use such a feature. Note that this commit also adds a general-purpose time-filter which allows you to have a min-filter or max-filter. A filter allows you to have a low (or high) value for some period of time and degrade slowly to another value as time passes (a brief usage sketch of this filter follows below). You can find out the details of BBR by looking at the original paper at https://queue.acm.org/detail.cfm?id=3022184 or by consulting the many other resources on the web referenced by "BBR congestion control". It should be noted that BBRv1 (which this is) does tend toward unfairness in cases of small buffered paths, and it will usually get less bandwidth in the case of large BDP paths (when competing with new-reno or cubic flows). BBR is still an active research area and we do plan on implementing V2 of BBR to see if it is an improvement over V1. Sponsored by: Netflix Inc.
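A minimal usage sketch of the general-purpose time-filter mentioned above (illustrative only, not part of the commit). It assumes the interfaces this patch adds in sys/sys/tim_filter.h and sys/kern/subr_filter.c — setup_time_filter(), apply_filter_min() and reset_time() — plus the get_filter_value() accessor described in the new file's comments; the unit of "now" is whatever the caller uses consistently, microseconds are assumed here:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/tim_filter.h>

/* Windowed minimum RTT: keep the smallest sample seen in roughly the last 5 seconds. */
static struct time_filter rtt_min;

static int
rtt_min_init(void)
{
	/* The window length is expressed in the same units as "now" (5,000,000 usecs here). */
	return (setup_time_filter(&rtt_min, FILTER_TYPE_MIN, 5 * 1000000));
}

static uint64_t
rtt_min_update(uint64_t rtt_usecs, uint32_t now_usecs)
{
	/*
	 * apply_filter_min() ages out entries that have fallen outside the
	 * window and folds in the new sample; the current windowed minimum
	 * is then read back with get_filter_value().
	 */
	apply_filter_min(&rtt_min, rtt_usecs, now_usecs);
	return (get_filter_value(&rtt_min));
}

If the window needs to change later, reset_time(&rtt_min, new_len) adjusts the time limit without discarding stored samples; a max-filter works the same way through FILTER_TYPE_MAX and apply_filter_max().
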
Differential Revision: https://reviews.freebsd.org/D21582 --- sys/conf/files | 1 + sys/kern/subr_filter.c | 482 + sys/modules/tcp/Makefile | 2 + sys/modules/tcp/bbr/Makefile | 23 + sys/netinet/ip_output.c | 9 +- sys/netinet/ip_var.h | 1 + sys/netinet/tcp.h | 1 + sys/netinet/tcp_stacks/bbr.c | 15189 +++++++++++++++++++++ sys/netinet/tcp_stacks/rack.c | 4176 ++++-- sys/netinet/tcp_stacks/rack_bbr_common.c | 48 +- sys/netinet/tcp_stacks/rack_bbr_common.h | 2 +- sys/netinet/tcp_stacks/sack_filter.c | 236 +- sys/netinet/tcp_stacks/sack_filter.h | 7 +- sys/netinet/tcp_stacks/tcp_bbr.h | 845 ++ sys/netinet/tcp_stacks/tcp_rack.h | 80 +- sys/sys/mbuf.h | 12 +- sys/sys/tim_filter.h | 134 + 17 files changed, 19688 insertions(+), 1560 deletions(-) create mode 100644 sys/kern/subr_filter.c create mode 100644 sys/modules/tcp/bbr/Makefile create mode 100644 sys/netinet/tcp_stacks/bbr.c create mode 100644 sys/netinet/tcp_stacks/tcp_bbr.h create mode 100644 sys/sys/tim_filter.h diff --git a/sys/conf/files b/sys/conf/files index 44c23e8cc01d..4f8272ca5908 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3808,6 +3808,7 @@ kern/subr_epoch.c standard kern/subr_eventhandler.c standard kern/subr_fattime.c standard kern/subr_firmware.c optional firmware +kern/subr_filter.c standard kern/subr_gtaskqueue.c standard kern/subr_hash.c standard kern/subr_hints.c standard diff --git a/sys/kern/subr_filter.c b/sys/kern/subr_filter.c new file mode 100644 index 000000000000..1089dee452dc --- /dev/null +++ b/sys/kern/subr_filter.c @@ -0,0 +1,482 @@ +/*- + * Copyright (c) 2016-2019 Netflix, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Author: Randall Stewart + */ +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include + +void +reset_time(struct time_filter *tf, uint32_t time_len) +{ + tf->cur_time_limit = time_len; +} + +void +reset_time_small(struct time_filter_small *tf, uint32_t time_len) +{ + tf->cur_time_limit = time_len; +} + +/* + * A time filter can be a filter for MIN or MAX. + * You call setup_time_filter() with the pointer to + * the filter structure, the type (FILTER_TYPE_MIN/MAX) and + * the time length. You can optionally reset the time length + * later with reset_time(). 
+ * + * You generally call apply_filter_xxx() to apply the new value + * to the filter. You also provide a time (now). The filter will + * age out entries based on the time now and your time limit + * so that you are always maintaining the min or max in that + * window of time. Time is a relative thing, it might be ticks + * in milliseconds, it might be round trip times, its really + * up to you to decide what it is. + * + * To access the current flitered value you can use the macro + * get_filter_value() which returns the correct entry that + * has the "current" value in the filter. + * + * One thing that used to be here is a single apply_filter(). But + * this meant that we then had to store the type of filter in + * the time_filter structure. In order to keep it at a cache + * line size I split it to two functions. + * + */ +int +setup_time_filter(struct time_filter *tf, int fil_type, uint32_t time_len) +{ + uint64_t set_val; + int i; + + /* + * You must specify either a MIN or MAX filter, + * though its up to the user to use the correct + * apply. + */ + if ((fil_type != FILTER_TYPE_MIN) && + (fil_type != FILTER_TYPE_MAX)) + return(EINVAL); + + if (time_len < NUM_FILTER_ENTRIES) + return(EINVAL); + + if (fil_type == FILTER_TYPE_MIN) + set_val = 0xffffffffffffffff; + else + set_val = 0; + + for(i=0; ientries[i].value = set_val; + tf->entries[i].time_up = 0; + } + tf->cur_time_limit = time_len; + return(0); +} + +int +setup_time_filter_small(struct time_filter_small *tf, int fil_type, uint32_t time_len) +{ + uint32_t set_val; + int i; + + /* + * You must specify either a MIN or MAX filter, + * though its up to the user to use the correct + * apply. + */ + if ((fil_type != FILTER_TYPE_MIN) && + (fil_type != FILTER_TYPE_MAX)) + return(EINVAL); + + if (time_len < NUM_FILTER_ENTRIES) + return(EINVAL); + + if (fil_type == FILTER_TYPE_MIN) + set_val = 0xffffffff; + else + set_val = 0; + + for(i=0; ientries[i].value = set_val; + tf->entries[i].time_up = 0; + } + tf->cur_time_limit = time_len; + return(0); +} + + +static void +check_update_times(struct time_filter *tf, uint64_t value, uint32_t now) +{ + int i, j, fnd; + uint32_t tim; + uint32_t time_limit; + for(i=0; i<(NUM_FILTER_ENTRIES-1); i++) { + tim = now - tf->entries[i].time_up; + time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES; + if (tim >= time_limit) { + fnd = 0; + for(j=(i+1); jentries[i].time_up < tf->entries[j].time_up) { + tf->entries[i].value = tf->entries[j].value; + tf->entries[i].time_up = tf->entries[j].time_up; + fnd = 1; + break; + } + } + if (fnd == 0) { + /* Nothing but the same old entry */ + tf->entries[i].value = value; + tf->entries[i].time_up = now; + } + } + } + i = NUM_FILTER_ENTRIES-1; + tim = now - tf->entries[i].time_up; + time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES; + if (tim >= time_limit) { + tf->entries[i].value = value; + tf->entries[i].time_up = now; + } +} + +static void +check_update_times_small(struct time_filter_small *tf, uint32_t value, uint32_t now) +{ + int i, j, fnd; + uint32_t tim; + uint32_t time_limit; + for(i=0; i<(NUM_FILTER_ENTRIES-1); i++) { + tim = now - tf->entries[i].time_up; + time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES; + if (tim >= time_limit) { + fnd = 0; + for(j=(i+1); jentries[i].time_up < tf->entries[j].time_up) { + tf->entries[i].value = tf->entries[j].value; + tf->entries[i].time_up = tf->entries[j].time_up; + fnd = 1; + break; + } + } + if (fnd == 0) { + /* Nothing but the same old 
entry */ + tf->entries[i].value = value; + tf->entries[i].time_up = now; + } + } + } + i = NUM_FILTER_ENTRIES-1; + tim = now - tf->entries[i].time_up; + time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES; + if (tim >= time_limit) { + tf->entries[i].value = value; + tf->entries[i].time_up = now; + } +} + + + +void +filter_reduce_by(struct time_filter *tf, uint64_t reduce_by, uint32_t now) +{ + int i; + /* + * Reduce our filter main by reduce_by and + * update its time. Then walk other's and + * make them the new value too. + */ + if (reduce_by < tf->entries[0].value) + tf->entries[0].value -= reduce_by; + else + tf->entries[0].value = 0; + tf->entries[0].time_up = now; + for(i=1; ientries[i].value = tf->entries[0].value; + tf->entries[i].time_up = now; + } +} + +void +filter_reduce_by_small(struct time_filter_small *tf, uint32_t reduce_by, uint32_t now) +{ + int i; + /* + * Reduce our filter main by reduce_by and + * update its time. Then walk other's and + * make them the new value too. + */ + if (reduce_by < tf->entries[0].value) + tf->entries[0].value -= reduce_by; + else + tf->entries[0].value = 0; + tf->entries[0].time_up = now; + for(i=1; ientries[i].value = tf->entries[0].value; + tf->entries[i].time_up = now; + } +} + +void +filter_increase_by(struct time_filter *tf, uint64_t incr_by, uint32_t now) +{ + int i; + /* + * Increase our filter main by incr_by and + * update its time. Then walk other's and + * make them the new value too. + */ + tf->entries[0].value += incr_by; + tf->entries[0].time_up = now; + for(i=1; ientries[i].value = tf->entries[0].value; + tf->entries[i].time_up = now; + } +} + +void +filter_increase_by_small(struct time_filter_small *tf, uint32_t incr_by, uint32_t now) +{ + int i; + /* + * Increase our filter main by incr_by and + * update its time. Then walk other's and + * make them the new value too. + */ + tf->entries[0].value += incr_by; + tf->entries[0].time_up = now; + for(i=1; ientries[i].value = tf->entries[0].value; + tf->entries[i].time_up = now; + } +} + +void +forward_filter_clock(struct time_filter *tf, uint32_t ticks_forward) +{ + /* + * Bring forward all time values by N ticks. This + * postpones expiring slots by that amount. + */ + int i; + + for(i=0; ientries[i].time_up += ticks_forward; + } +} + + +void +forward_filter_clock_small(struct time_filter_small *tf, uint32_t ticks_forward) +{ + /* + * Bring forward all time values by N ticks. This + * postpones expiring slots by that amount. + */ + int i; + + for(i=0; ientries[i].time_up += ticks_forward; + } +} + + +void +tick_filter_clock(struct time_filter *tf, uint32_t now) +{ + int i; + uint32_t tim, time_limit; + + /* + * We start at two positions back. This + * is because the oldest worst value is + * preserved always, i.e. it can't expire + * due to clock ticking with no updated value. + * + * The other choice would be to fill it in with + * zero, but I don't like that option since + * some measurement is better than none (even + * if its your oldest measurment). + */ + for(i=(NUM_FILTER_ENTRIES-2); i>=0 ; i--) { + tim = now - tf->entries[i].time_up; + time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES; + if (tim >= time_limit) { + /* + * This entry is expired, pull down + * the next one up. 
+ */ + tf->entries[i].value = tf->entries[(i+1)].value; + tf->entries[i].time_up = tf->entries[(i+1)].time_up; + } + + } +} + +void +tick_filter_clock_small(struct time_filter_small *tf, uint32_t now) +{ + int i; + uint32_t tim, time_limit; + + /* + * We start at two positions back. This + * is because the oldest worst value is + * preserved always, i.e. it can't expire + * due to clock ticking with no updated value. + * + * The other choice would be to fill it in with + * zero, but I don't like that option since + * some measurement is better than none (even + * if its your oldest measurment). + */ + for(i=(NUM_FILTER_ENTRIES-2); i>=0 ; i--) { + tim = now - tf->entries[i].time_up; + time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES; + if (tim >= time_limit) { + /* + * This entry is expired, pull down + * the next one up. + */ + tf->entries[i].value = tf->entries[(i+1)].value; + tf->entries[i].time_up = tf->entries[(i+1)].time_up; + } + + } +} + +uint32_t +apply_filter_min(struct time_filter *tf, uint64_t value, uint32_t now) +{ + int i, j; + + if (value <= tf->entries[0].value) { + /* Zap them all */ + for(i=0; ientries[i].value = value; + tf->entries[i].time_up = now; + } + return (tf->entries[0].value); + } + for (j=1; jentries[j].value) { + for(i=j; ientries[i].value = value; + tf->entries[i].time_up = now; + } + break; + } + } + check_update_times(tf, value, now); + return (tf->entries[0].value); +} + +uint32_t +apply_filter_min_small(struct time_filter_small *tf, + uint32_t value, uint32_t now) +{ + int i, j; + + if (value <= tf->entries[0].value) { + /* Zap them all */ + for(i=0; ientries[i].value = value; + tf->entries[i].time_up = now; + } + return (tf->entries[0].value); + } + for (j=1; jentries[j].value) { + for(i=j; ientries[i].value = value; + tf->entries[i].time_up = now; + } + break; + } + } + check_update_times_small(tf, value, now); + return (tf->entries[0].value); +} + +uint32_t +apply_filter_max(struct time_filter *tf, uint64_t value, uint32_t now) +{ + int i, j; + + if (value >= tf->entries[0].value) { + /* Zap them all */ + for(i=0; ientries[i].value = value; + tf->entries[i].time_up = now; + } + return (tf->entries[0].value); + } + for (j=1; j= tf->entries[j].value) { + for(i=j; ientries[i].value = value; + tf->entries[i].time_up = now; + } + break; + } + } + check_update_times(tf, value, now); + return (tf->entries[0].value); +} + + +uint32_t +apply_filter_max_small(struct time_filter_small *tf, + uint32_t value, uint32_t now) +{ + int i, j; + + if (value >= tf->entries[0].value) { + /* Zap them all */ + for(i=0; ientries[i].value = value; + tf->entries[i].time_up = now; + } + return (tf->entries[0].value); + } + for (j=1; j= tf->entries[j].value) { + for(i=j; ientries[i].value = value; + tf->entries[i].time_up = now; + } + break; + } + } + check_update_times_small(tf, value, now); + return (tf->entries[0].value); +} diff --git a/sys/modules/tcp/Makefile b/sys/modules/tcp/Makefile index f9e9aa53883a..45c4ef3df423 100644 --- a/sys/modules/tcp/Makefile +++ b/sys/modules/tcp/Makefile @@ -6,10 +6,12 @@ SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" SUBDIR= \ + ${_tcp_bbr} \ ${_tcp_rack} \ ${_tcpmd5} \ .if ${MK_EXTRA_TCP_STACKS} != "no" || defined(ALL_MODULES) +_tcp_bbr= bbr _tcp_rack= rack .endif diff --git a/sys/modules/tcp/bbr/Makefile b/sys/modules/tcp/bbr/Makefile new file mode 100644 index 000000000000..8c0c35d3cab8 --- /dev/null +++ b/sys/modules/tcp/bbr/Makefile @@ -0,0 +1,23 @@ +# +# $FreeBSD$ +# + +.PATH: 
${.CURDIR}/../../../netinet/tcp_stacks + +STACKNAME= bbr +KMOD= tcp_${STACKNAME} +SRCS= bbr.c sack_filter.c rack_bbr_common.c + +SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h +SRCS+= opt_tcpdebug.h +SRCS+= opt_kern_tls.h + +# +# Enable full debugging +# +#CFLAGS += -g + +CFLAGS+= -DMODNAME=${KMOD} +CFLAGS+= -DSTACKNAME=${STACKNAME} + +.include diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 085040f25e64..cbd2d72188fa 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -212,7 +212,7 @@ ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags, static int ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m, - const struct sockaddr_in *gw, struct route *ro) + const struct sockaddr_in *gw, struct route *ro, bool stamp_tag) { #ifdef KERN_TLS struct ktls_session *tls = NULL; @@ -256,7 +256,7 @@ ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m, mst = inp->inp_snd_tag; } #endif - if (mst != NULL) { + if (stamp_tag && mst != NULL) { KASSERT(m->m_pkthdr.rcvif == NULL, ("trying to add a send tag to a forwarded packet")); if (mst->ifp != ifp) { @@ -791,7 +791,8 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); - error = ip_output_send(inp, ifp, m, gw, ro); + error = ip_output_send(inp, ifp, m, gw, ro, + (flags & IP_NO_SND_TAG_RL) ? false : true); goto done; } @@ -827,7 +828,7 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp, mtod(m, struct ip *), NULL); - error = ip_output_send(inp, ifp, m, gw, ro); + error = ip_output_send(inp, ifp, m, gw, ro, true); } else m_freem(m); } diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index 7580a7b45212..b6693eb58200 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -166,6 +166,7 @@ void kmod_ipstat_dec(int statnum); #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ #define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ +#define IP_NO_SND_TAG_RL 0x80 /* Don't send down the ratelimit tag */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 508d4b5fbc17..37ba3bb55741 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -239,6 +239,7 @@ struct tcphdr { #define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ #define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */ #define TCP_BBR_EXTRA_GAIN 1097 +#define TCP_RACK_DO_DETECTION 1097 /* Recycle of extra gain for rack, attack detection */ #define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ #define TCP_BBR_RETRAN_WTSO 1099 #define TCP_DATA_AFTER_CLOSE 1100 diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c new file mode 100644 index 000000000000..dccb2894ea68 --- /dev/null +++ b/sys/netinet/tcp_stacks/bbr.c @@ -0,0 +1,15189 @@ +/*- + * Copyright (c) 2016-2019 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/** + * Author: Randall Stewart + * This work is based on the ACM Queue paper + * BBR - Congestion Based Congestion Control + * and also numerous discussions with Neal, Yuchung and Van. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_ratelimit.h" +#include "opt_kern_tls.h" +#include +#include +#include +#ifdef TCP_HHOOK +#include +#endif +#include +#include +#include +#include +#include +#ifdef KERN_TLS +#include +#endif +#include +#include +#include +#include +#ifdef NETFLIX_STATS +#include /* Must come after qmath.h and tree.h */ +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TCPSTATES /* for logging */ + +#include +#include +#include +#include +#include /* required for icmp_var.h */ +#include /* for ICMP_BANDLIM */ +#include +#include +#include +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include +#endif +#ifdef INET6 +#include +#endif +#include + +#include +#include +#include +#include + +#if defined(IPSEC) || defined(IPSEC_SUPPORT) +#include +#include +#endif /* IPSEC */ + +#include +#include +#include + +#ifdef MAC +#include +#endif + +#include "sack_filter.h" +#include "tcp_bbr.h" +#include "rack_bbr_common.h" +uma_zone_t bbr_zone; +uma_zone_t bbr_pcb_zone; + +struct sysctl_ctx_list bbr_sysctl_ctx; +struct sysctl_oid *bbr_sysctl_root; + +#define TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \ + (tv) = (value); \ + if ((u_long)(tv) < (u_long)(tvmin)) \ + (tv) = (tvmin); \ + if ((u_long)(tv) > (u_long)(tvmax)) \ + (tv) = (tvmax); \ +} while(0) + +/*#define BBR_INVARIANT 1*/ + +/* + * initial window + */ +static uint32_t bbr_def_init_win = 10; +static int32_t bbr_persist_min = 250000; /* 250ms */ +static int32_t bbr_persist_max = 1000000; /* 1 Second */ +static int32_t bbr_cwnd_may_shrink = 0; +static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP; +static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT; +static int32_t bbr_hardware_pacing_limit = 8000; +static int32_t bbr_quanta = 3; /* How much extra quanta do we get? 
*/ +static int32_t bbr_no_retran = 0; +static int32_t bbr_tcp_map_entries_limit = 1500; +static int32_t bbr_tcp_map_split_limit = 256; + +static int32_t bbr_error_base_paceout = 10000; /* usec to pace */ +static int32_t bbr_max_net_error_cnt = 10; +/* Should the following be dynamic too -- loss wise */ +static int32_t bbr_rtt_gain_thresh = 0; +/* Measurement controls */ +static int32_t bbr_use_google_algo = 1; +static int32_t bbr_ts_limiting = 1; +static int32_t bbr_ts_can_raise = 0; +static int32_t bbr_do_red = 600; +static int32_t bbr_red_scale = 20000; +static int32_t bbr_red_mul = 1; +static int32_t bbr_red_div = 2; +static int32_t bbr_red_growth_restrict = 1; +static int32_t bbr_target_is_bbunit = 0; +static int32_t bbr_drop_limit = 0; +/* + * How much gain do we need to see to + * stay in startup? + */ +static int32_t bbr_marks_rxt_sack_passed = 0; +static int32_t bbr_start_exit = 25; +static int32_t bbr_low_start_exit = 25; /* When we are in reduced gain */ +static int32_t bbr_startup_loss_thresh = 2000; /* 20.00% loss */ +static int32_t bbr_hptsi_max_mul = 1; /* These two mul/div assure a min pacing */ +static int32_t bbr_hptsi_max_div = 2; /* time, 0 means turned off. We need this + * if we go back ever to where the pacer + * has priority over timers. + */ +static int32_t bbr_policer_call_from_rack_to = 0; +static int32_t bbr_policer_detection_enabled = 1; +static int32_t bbr_min_measurements_req = 1; /* We need at least 2 + * measurments before we are + * "good" note that 2 == 1. + * This is because we use a > + * comparison. This means if + * min_measure was 0, it takes + * num-measures > min(0) and + * you get 1 measurement and + * you are good. Set to 1, you + * have to have two + * measurements (this is done + * to prevent it from being ok + * to have no measurements). */ +static int32_t bbr_no_pacing_until = 4; + +static int32_t bbr_min_usec_delta = 20000; /* 20,000 usecs */ +static int32_t bbr_min_peer_delta = 20; /* 20 units */ +static int32_t bbr_delta_percent = 150; /* 15.0 % */ + +static int32_t bbr_target_cwnd_mult_limit = 8; +/* + * bbr_cwnd_min_val is the number of + * segments we hold to in the RTT probe + * state typically 4. + */ +static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS; + + +static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS; + +static int32_t bbr_gain_to_target = 1; +static int32_t bbr_gain_gets_extra_too = 1; +/* + * bbr_high_gain is the 2/ln(2) value we need + * to double the sending rate in startup. This + * is used for both cwnd and hptsi gain's. 
+ */
+static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1;
+static int32_t bbr_use_lower_gain_in_startup = 1;
+
+/* thresholds for reduction on drain in sub-states/drain */
+static int32_t bbr_drain_rtt = BBR_SRTT;
+static int32_t bbr_drain_floor = 88;
+static int32_t google_allow_early_out = 1;
+static int32_t google_consider_lost = 1;
+static int32_t bbr_drain_drop_mul = 4;
+static int32_t bbr_drain_drop_div = 5;
+static int32_t bbr_rand_ot = 50;
+static int32_t bbr_can_force_probertt = 0;
+static int32_t bbr_can_adjust_probertt = 1;
+static int32_t bbr_probertt_sets_rtt = 0;
+static int32_t bbr_can_use_ts_for_rtt = 1;
+static int32_t bbr_is_ratio = 0;
+static int32_t bbr_sub_drain_app_limit = 1;
+static int32_t bbr_prtt_slam_cwnd = 1;
+static int32_t bbr_sub_drain_slam_cwnd = 1;
+static int32_t bbr_slam_cwnd_in_main_drain = 1;
+static int32_t bbr_filter_len_sec = 6;	/* How long does the rttProp filter
+					 * hold */
+static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4);
+/*
+ * bbr_drain_gain is the reverse of the high_gain,
+ * designed to drain back out the standing queue
+ * that is formed in startup by causing a larger
+ * hptsi gain and thus draining the packets
+ * in flight.
+ */
+static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+static int32_t bbr_rttprobe_gain = 192;
+
+/*
+ * The cwnd_gain is the default cwnd gain applied when
+ * calculating a target cwnd. Note that the cwnd is
+ * a secondary factor in the way BBR works (see the
+ * paper and think about it, it will take some time).
+ * Basically the hptsi_gain spreads the packets out
+ * so you never get more than BDP to the peer even
+ * if the cwnd is high. In our implementation that
+ * means in non-recovery/retransmission scenarios
+ * cwnd will never be reached by the flight-size.
+ */
+static int32_t bbr_cwnd_gain = BBR_UNIT * 2;
+static int32_t bbr_tlp_type_to_use = BBR_SRTT;
+static int32_t bbr_delack_time = 100000;	/* 100ms in useconds */
+static int32_t bbr_sack_not_required = 0;	/* set to one to allow non-sack to use bbr */
+static int32_t bbr_initial_bw_bps = 62500;	/* 500kbps in bytes ps */
+static int32_t bbr_ignore_data_after_close = 1;
+static int16_t bbr_hptsi_gain[] = {
+	(BBR_UNIT * 5 / 4),
+	(BBR_UNIT * 3 / 4),
+	BBR_UNIT,
+	BBR_UNIT,
+	BBR_UNIT,
+	BBR_UNIT,
+	BBR_UNIT,
+	BBR_UNIT
+};
+int32_t bbr_use_rack_resend_cheat = 1;
+int32_t bbr_sends_full_iwnd = 1;
+
+#define BBR_HPTSI_GAIN_MAX 8
+/*
+ * The BBR module incorporates a number of
+ * TCP ideas that have been put out into the IETF
+ * over the last few years:
+ * - Yuchung Cheng's RACK TCP (for which it is named) that
+ *    will stop us using the number of dup acks and instead
+ *    use time as the gauge of when we retransmit.
+ * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
+ *    of Dukkipati et al.
+ * - Van Jacobson et al.'s BBR.
+ *
+ * RACK depends on SACK, so if an endpoint arrives that
+ * cannot do SACK the state machine below will shuttle the
+ * connection back to using the "default" TCP stack that is
+ * in FreeBSD.
+ *
+ * To implement BBR and RACK the original TCP stack was first decomposed
+ * into a functional state machine with individual states
+ * for each of the possible TCP connection states. The do_segment
+ * function's role in life is to mandate the connection supports SACK
+ * initially and then assure that the RACK state matches the connection
+ * state before calling the state's do_segment function.
Data processing + * of inbound segments also now happens in the hpts_do_segment in general + * with only one exception. This is so we can keep the connection on + * a single CPU. + * + * Each state is simplified due to the fact that the original do_segment + * has been decomposed and we *know* what state we are in (no + * switches on the state) and all tests for SACK are gone. This + * greatly simplifies what each state does. + * + * TCP output is also over-written with a new version since it + * must maintain the new rack scoreboard and has had hptsi + * integrated as a requirment. Still todo is to eliminate the + * use of the callout_() system and use the hpts for all + * timers as well. + */ +static uint32_t bbr_rtt_probe_time = 200000; /* 200ms in micro seconds */ +static uint32_t bbr_rtt_probe_cwndtarg = 4; /* How many mss's outstanding */ +static const int32_t bbr_min_req_free = 2; /* The min we must have on the + * free list */ +static int32_t bbr_tlp_thresh = 1; +static int32_t bbr_reorder_thresh = 2; +static int32_t bbr_reorder_fade = 60000000; /* 0 - never fade, def + * 60,000,000 - 60 seconds */ +static int32_t bbr_pkt_delay = 1000; +static int32_t bbr_min_to = 1000; /* Number of usec's minimum timeout */ +static int32_t bbr_incr_timers = 1; + +static int32_t bbr_tlp_min = 10000; /* 10ms in usecs */ +static int32_t bbr_delayed_ack_time = 200000; /* 200ms in usecs */ +static int32_t bbr_exit_startup_at_loss = 1; + +/* + * bbr_lt_bw_ratio is 1/8th + * bbr_lt_bw_diff is < 4 Kbit/sec + */ +static uint64_t bbr_lt_bw_diff = 4000 / 8; /* In bytes per second */ +static uint64_t bbr_lt_bw_ratio = 8; /* For 1/8th */ +static uint32_t bbr_lt_bw_max_rtts = 48; /* How many rtt's do we use + * the lt_bw for */ +static uint32_t bbr_lt_intvl_min_rtts = 4; /* Min num of RTT's to measure + * lt_bw */ +static int32_t bbr_lt_intvl_fp = 0; /* False positive epoch diff */ +static int32_t bbr_lt_loss_thresh = 196; /* Lost vs delivered % */ +static int32_t bbr_lt_fd_thresh = 100; /* false detection % */ + +static int32_t bbr_verbose_logging = 0; +/* + * Currently regular tcp has a rto_min of 30ms + * the backoff goes 12 times so that ends up + * being a total of 122.850 seconds before a + * connection is killed. + */ +static int32_t bbr_rto_min_ms = 30; /* 30ms same as main freebsd */ +static int32_t bbr_rto_max_sec = 4; /* 4 seconds */ + +/****************************************************/ +/* DEFAULT TSO SIZING (cpu performance impacting) */ +/****************************************************/ +/* What amount is our formula using to get TSO size */ +static int32_t bbr_hptsi_per_second = 1000; + +/* + * For hptsi under bbr_cross_over connections what is delay + * target 7ms (in usec) combined with a seg_max of 2 + * gets us close to identical google behavior in + * TSO size selection (possibly more 1MSS sends). + */ +static int32_t bbr_hptsi_segments_delay_tar = 7000; + +/* Does pacing delay include overhead's in its time calculations? */ +static int32_t bbr_include_enet_oh = 0; +static int32_t bbr_include_ip_oh = 1; +static int32_t bbr_include_tcp_oh = 1; +static int32_t bbr_google_discount = 10; + +/* Do we use (nf mode) pkt-epoch to drive us or rttProp? */ +static int32_t bbr_state_is_pkt_epoch = 0; +static int32_t bbr_state_drain_2_tar = 1; +/* What is the max the 0 - bbr_cross_over MBPS TSO target + * can reach using our delay target. Note that this + * value becomes the floor for the cross over + * algorithm. 
+ */ +static int32_t bbr_hptsi_segments_max = 2; +static int32_t bbr_hptsi_segments_floor = 1; +static int32_t bbr_hptsi_utter_max = 0; + +/* What is the min the 0 - bbr_cross-over MBPS TSO target can be */ +static int32_t bbr_hptsi_bytes_min = 1460; +static int32_t bbr_all_get_min = 0; + +/* Cross over point from algo-a to algo-b */ +static uint32_t bbr_cross_over = TWENTY_THREE_MBPS; + +/* Do we deal with our restart state? */ +static int32_t bbr_uses_idle_restart = 0; +static int32_t bbr_idle_restart_threshold = 100000; /* 100ms in useconds */ + +/* Do we allow hardware pacing? */ +static int32_t bbr_allow_hdwr_pacing = 0; +static int32_t bbr_hdwr_pace_adjust = 2; /* multipler when we calc the tso size */ +static int32_t bbr_hdwr_pace_floor = 1; +static int32_t bbr_hdwr_pacing_delay_cnt = 10; + +/****************************************************/ +static int32_t bbr_resends_use_tso = 0; +static int32_t bbr_tlp_max_resend = 2; +static int32_t bbr_sack_block_limit = 128; + +#define BBR_MAX_STAT 19 +counter_u64_t bbr_state_time[BBR_MAX_STAT]; +counter_u64_t bbr_state_lost[BBR_MAX_STAT]; +counter_u64_t bbr_state_resend[BBR_MAX_STAT]; +counter_u64_t bbr_stat_arry[BBR_STAT_SIZE]; +counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE]; +counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE]; +counter_u64_t bbr_flows_whdwr_pacing; +counter_u64_t bbr_flows_nohdwr_pacing; + +counter_u64_t bbr_nohdwr_pacing_enobuf; +counter_u64_t bbr_hdwr_pacing_enobuf; + +static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr); + +/* + * Static defintions we need for forward declarations. + */ +static uint32_t +bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, + uint32_t useconds_time, uint64_t bw); +static uint32_t +bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain); +static void + bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win); +static void +bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses); +static void +bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line, + int dolog); +static uint32_t +bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain); +static void +bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, + int32_t pkt_epoch, uint32_t losses); +static uint32_t +bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm); +static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp); +static uint32_t +bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, + struct bbr_sendmap *rsm, uint32_t srtt, + uint32_t cts); +static void +bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, + int32_t line); +static void + bbr_set_state_target(struct tcp_bbr *bbr, int line); +static void + bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line); + +static void + bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line); + +static void + tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts); + +static void + bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts); + +static void + bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt, + uint32_t line, uint8_t is_start, uint16_t set); + +static struct bbr_sendmap * + bbr_find_lowest_rsm(struct tcp_bbr *bbr); +static __inline uint32_t +bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type); +static void + bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which); + +static void +bbr_log_timer_var(struct tcp_bbr 
*bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt, + uint32_t thresh, uint32_t to); +static void + bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag); + +static void +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, + uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay); + +static void +bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, + uint32_t cts, int32_t line); +static void + bbr_stop_all_timers(struct tcpcb *tp); +static void + bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts); +static void + bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts); +static void + bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts); + + +static void +bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, + uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod); + +static inline uint8_t +bbr_state_val(struct tcp_bbr *bbr) +{ + return(bbr->rc_bbr_substate); +} + +static inline uint32_t +get_min_cwnd(struct tcp_bbr *bbr) +{ + int mss; + + mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); + if (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED) + return (bbr_cwnd_min_val_hs * mss); + else + return (bbr_cwnd_min_val * mss); +} + +static uint32_t +bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr) +{ + uint64_t srtt, var; + uint64_t ret_val; + + bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; + if (tp->t_srtt == 0) { + srtt = (uint64_t)BBR_INITIAL_RTO; + var = 0; + } else { + srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); + var = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT); + } + TCPT_RANGESET_NOSLOP(ret_val, ((srtt + var) * tcp_backoff[tp->t_rxtshift]), + bbr_persist_min, bbr_persist_max); + return ((uint32_t)ret_val); +} + +static uint32_t +bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + /* + * Start the FR timer, we do this based on getting the first one in + * the rc_tmap. Note that if its NULL we must stop the timer. in all + * events we need to stop the running timer (if its running) before + * starting the new one. 
+ */ + uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; + int32_t idx; + int32_t is_tlp_timer = 0; + struct bbr_sendmap *rsm; + + if (bbr->rc_all_timers_stopped) { + /* All timers have been stopped none are to run */ + return (0); + } + if (bbr->rc_in_persist) { + /* We can't start any timer in persists */ + return (bbr_get_persists_timer_val(tp, bbr)); + } + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); + if ((rsm == NULL) || + ((tp->t_flags & TF_SACK_PERMIT) == 0) || + (tp->t_state < TCPS_ESTABLISHED)) { + /* Nothing on the send map */ +activate_rxt: + if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { + uint64_t tov; + + time_since_sent = 0; + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); + if (rsm) { + idx = rsm->r_rtr_cnt - 1; + if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time)) + tstmp_touse = rsm->r_tim_lastsent[idx]; + else + tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time; + if (TSTMP_GT(tstmp_touse, cts)) + time_since_sent = cts - tstmp_touse; + } + bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; + if (tp->t_srtt == 0) + tov = BBR_INITIAL_RTO; + else + tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) + + ((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT); + if (tp->t_rxtshift) + tov *= tcp_backoff[tp->t_rxtshift]; + if (tov > time_since_sent) + tov -= time_since_sent; + else + tov = bbr->r_ctl.rc_min_to; + TCPT_RANGESET_NOSLOP(to, tov, + (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC), + (bbr->rc_max_rto_sec * USECS_IN_SECOND)); + bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to); + return (to); + } + return (0); + } + if (rsm->r_flags & BBR_ACKED) { + rsm = bbr_find_lowest_rsm(bbr); + if (rsm == NULL) { + /* No lowest? */ + goto activate_rxt; + } + } + /* Convert from ms to usecs */ + if (rsm->r_flags & BBR_SACK_PASSED) { + if ((tp->t_flags & TF_SENTFIN) && + ((tp->snd_max - tp->snd_una) == 1) && + (rsm->r_flags & BBR_HAS_FIN)) { + /* + * We don't start a bbr rack timer if all we have is + * a FIN outstanding. + */ + goto activate_rxt; + } + srtt = bbr_get_rtt(bbr, BBR_RTT_RACK); + thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm); + idx = rsm->r_rtr_cnt - 1; + exp = rsm->r_tim_lastsent[idx] + thresh; + if (SEQ_GEQ(exp, cts)) { + to = exp - cts; + if (to < bbr->r_ctl.rc_min_to) { + to = bbr->r_ctl.rc_min_to; + } + } else { + to = bbr->r_ctl.rc_min_to; + } + } else { + /* Ok we need to do a TLP not RACK */ + if (bbr->rc_tlp_in_progress != 0) { + /* + * The previous send was a TLP. + */ + goto activate_rxt; + } + rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext); + if (rsm == NULL) { + /* We found no rsm to TLP with. */ + goto activate_rxt; + } + if (rsm->r_flags & BBR_HAS_FIN) { + /* If its a FIN we don't do TLP */ + rsm = NULL; + goto activate_rxt; + } + time_since_sent = 0; + idx = rsm->r_rtr_cnt - 1; + if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time)) + tstmp_touse = rsm->r_tim_lastsent[idx]; + else + tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time; + if (TSTMP_GT(tstmp_touse, cts)) + time_since_sent = cts - tstmp_touse; + is_tlp_timer = 1; + srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use); + thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts); + if (thresh > time_since_sent) + to = thresh - time_since_sent; + else + to = bbr->r_ctl.rc_min_to; + if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { + /* + * If the TLP time works out to larger than the max + * RTO lets not do TLP.. just RTO. 
+ */ + goto activate_rxt; + } + if ((bbr->rc_tlp_rtx_out == 1) && + (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) { + /* + * Second retransmit of the same TLP + * lets not. + */ + bbr->rc_tlp_rtx_out = 0; + goto activate_rxt; + } + if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) { + /* + * The tail is no longer the last one I did a probe + * on + */ + bbr->r_ctl.rc_tlp_seg_send_cnt = 0; + bbr->r_ctl.rc_last_tlp_seq = rsm->r_start; + } + } + if (is_tlp_timer == 0) { + BBR_STAT_INC(bbr_to_arm_rack); + bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; + } else { + bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to); + if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) { + /* + * We have exceeded how many times we can retran the + * current TLP timer, switch to the RTO timer. + */ + goto activate_rxt; + } else { + BBR_STAT_INC(bbr_to_arm_tlp); + bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; + } + } + return (to); +} + +static inline int32_t +bbr_minseg(struct tcp_bbr *bbr) +{ + return (bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options); +} + +static void +bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len) +{ + struct inpcb *inp; + struct hpts_diag diag; + uint32_t delayed_ack = 0; + uint32_t left = 0; + uint32_t hpts_timeout; + uint8_t stopped; + int32_t delay_calc = 0; + uint32_t prev_delay = 0; + + inp = tp->t_inpcb; + if (inp->inp_in_hpts) { + /* A previous call is already set up */ + return; + } + if ((tp->t_state == TCPS_CLOSED) || + (tp->t_state == TCPS_LISTEN)) { + return; + } + stopped = bbr->rc_tmr_stopped; + if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) { + left = bbr->r_ctl.rc_timer_exp - cts; + } + bbr->r_ctl.rc_hpts_flags = 0; + bbr->r_ctl.rc_timer_exp = 0; + prev_delay = bbr->r_ctl.rc_last_delay_val; + if (bbr->r_ctl.rc_last_delay_val && + (slot == 0)) { + /* + * If a previous pacer delay was in place we + * are not coming from the output side (where + * we calculate a delay, more likely a timer). + */ + slot = bbr->r_ctl.rc_last_delay_val; + if (TSTMP_GT(cts, bbr->rc_pacer_started)) { + /* Compensate for time passed */ + delay_calc = cts - bbr->rc_pacer_started; + if (delay_calc <= slot) + slot -= delay_calc; + } + } + /* Do we have early to make up for by pushing out the pacing time? */ + if (bbr->r_agg_early_set) { + bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2); + slot += bbr->r_ctl.rc_agg_early; + bbr->r_ctl.rc_agg_early = 0; + bbr->r_agg_early_set = 0; + } + /* Are we running a total debt that needs to be compensated for? */ + if (bbr->r_ctl.rc_hptsi_agg_delay) { + if (slot > bbr->r_ctl.rc_hptsi_agg_delay) { + /* We nuke the delay */ + slot -= bbr->r_ctl.rc_hptsi_agg_delay; + bbr->r_ctl.rc_hptsi_agg_delay = 0; + } else { + /* We nuke some of the delay, put in a minimal 100usecs */ + bbr->r_ctl.rc_hptsi_agg_delay -= slot; + bbr->r_ctl.rc_last_delay_val = slot = 100; + } + } + bbr->r_ctl.rc_last_delay_val = slot; + hpts_timeout = bbr_timer_start(tp, bbr, cts); + if (tp->t_flags & TF_DELACK) { + if (bbr->rc_in_persist == 0) { + delayed_ack = bbr_delack_time; + } else { + /* + * We are in persists and have + * gotten a new data element. + */ + if (hpts_timeout > bbr_delack_time) { + /* + * Lets make the persists timer (which acks) + * be the smaller of hpts_timeout and bbr_delack_time. 
+				 */
+				hpts_timeout = bbr_delack_time;
+			}
+		}
+	}
+	if (delayed_ack &&
+	    ((hpts_timeout == 0) ||
+	     (delayed_ack < hpts_timeout))) {
+		/* We need a Delayed ack timer */
+		bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
+		hpts_timeout = delayed_ack;
+	}
+	if (slot) {
+		/* Mark that we have a pacing timer up */
+		BBR_STAT_INC(bbr_paced_segments);
+		bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
+	}
+	/*
+	 * If no timers are going to run and we will fall off the hptsi
+	 * wheel, we resort to a keep-alive timer if it's configured.
+	 */
+	if ((hpts_timeout == 0) &&
+	    (slot == 0)) {
+		if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
+		    (tp->t_state <= TCPS_CLOSING)) {
+			/*
+			 * Ok we have no timer (persists, rack, tlp, rxt or
+			 * del-ack), we don't have segments being paced. So
+			 * all that is left is the keepalive timer.
+			 */
+			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+				hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
+			} else {
+				hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
+			}
+			bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
+		}
+	}
+	if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
+	    (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
+		/*
+		 * RACK, TLP, persists and RXT timers all are restartable
+		 * based on actions input, i.e., we received a packet (ack
+		 * or sack) and that changes things (rw, or snd_una etc).
+		 * Thus we can restart them with a new value. For
+		 * keep-alive, delayed_ack we keep track of what was left
+		 * and restart the timer with a smaller value.
+		 */
+		if (left < hpts_timeout)
+			hpts_timeout = left;
+	}
+	if (bbr->r_ctl.rc_incr_tmrs && slot &&
+	    (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
+		/*
+		 * If configured to do so, and the timer is either
+		 * the TLP or RXT timer, we need to increase the timeout
+		 * by the pacing time. Consider the bottleneck at my
+		 * machine as an example, we are sending something
+		 * to start a TLP on. The last packet won't be emitted
+		 * fully until the pacing time (the bottleneck will hold
+		 * the data in place). Once the packet is emitted that
+		 * is when we want to start waiting for the TLP. This
+		 * is most evident with hardware pacing (where the nic
+		 * is holding the packet(s) before emitting). But it
+		 * can also show up in the network so we do it for all
+		 * cases. Technically we would take off one packet from
+		 * this extra delay but this is easier and being more
+		 * conservative is probably better.
+		 */
+		hpts_timeout += slot;
+	}
+	if (hpts_timeout) {
+		/*
+		 * Hack alert, for now we can't time-out over 2147 seconds (a
+		 * bit more than 35min)
+		 */
+		if (hpts_timeout > 0x7ffffffe)
+			hpts_timeout = 0x7ffffffe;
+		bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
+	} else
+		bbr->r_ctl.rc_timer_exp = 0;
+	if ((slot) &&
+	    (bbr->rc_use_google ||
+	     bbr->output_error_seen ||
+	     (slot <= hpts_timeout))) {
+		/*
+		 * Tell LRO that it can queue packets while
+		 * we pace.
+		 */
+		bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
+		if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
+		    (bbr->rc_cwnd_limited == 0)) {
+			/*
+			 * If we are not cwnd limited and we
+			 * are running a rack timer we put on
+			 * the do not disturb even for sack.
+ */ + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + } else + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + bbr->rc_pacer_started = cts; + + (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), + __LINE__, &diag); + bbr->rc_timer_first = 0; + bbr->bbr_timer_src = frm; + bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1); + bbr_log_hpts_diag(bbr, cts, &diag); + } else if (hpts_timeout) { + (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), + __LINE__, &diag); + /* + * We add the flag here as well if the slot is set, + * since hpts will call in to clear the queue first before + * calling the output routine (which does our timers). + * We don't want to set the flag if its just a timer + * else the arrival of data might (that causes us + * to send more) might get delayed. Imagine being + * on a keep-alive timer and a request comes in for + * more data. + */ + if (slot) + bbr->rc_pacer_started = cts; + if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && + (bbr->rc_cwnd_limited == 0)) { + /* + * For a rack timer, don't wake us even + * if a sack arrives as long as we are + * not cwnd limited. + */ + bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + } else { + /* All other timers wake us up */ + bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + } + bbr->bbr_timer_src = frm; + bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0); + bbr_log_hpts_diag(bbr, cts, &diag); + bbr->rc_timer_first = 1; + } + bbr->rc_tmr_stopped = 0; + bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay); +} + +static void +bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb) +{ + /* + * We received an ack, and then did not call send or were bounced + * out due to the hpts was running. Now a timer is up as well, is it + * the right timer? + */ + struct inpcb *inp; + struct bbr_sendmap *rsm; + uint32_t hpts_timeout; + int tmr_up; + + tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK; + if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) + return; + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); + if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && + (tmr_up == PACE_TMR_RXT)) { + /* Should be an RXT */ + return; + } + inp = bbr->rc_inp; + if (rsm == NULL) { + /* Nothing outstanding? */ + if (tp->t_flags & TF_DELACK) { + if (tmr_up == PACE_TMR_DELACK) + /* + * We are supposed to have delayed ack up + * and we do + */ + return; + } else if (sbavail(&inp->inp_socket->so_snd) && + (tmr_up == PACE_TMR_RXT)) { + /* + * if we hit enobufs then we would expect the + * possiblity of nothing outstanding and the RXT up + * (and the hptsi timer). 
+ */ + return; + } else if (((tcp_always_keepalive || + inp->inp_socket->so_options & SO_KEEPALIVE) && + (tp->t_state <= TCPS_CLOSING)) && + (tmr_up == PACE_TMR_KEEP) && + (tp->snd_max == tp->snd_una)) { + /* We should have keep alive up and we do */ + return; + } + } + if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) { + if ((tp->t_flags & TF_SENTFIN) && + ((tp->snd_max - tp->snd_una) == 1) && + (rsm->r_flags & BBR_HAS_FIN)) { + /* needs to be a RXT */ + if (tmr_up == PACE_TMR_RXT) + return; + else + goto wrong_timer; + } else if (tmr_up == PACE_TMR_RACK) + return; + else + goto wrong_timer; + } else if (rsm && (tmr_up == PACE_TMR_RACK)) { + /* Rack timer has priority if we have data out */ + return; + } else if (SEQ_GT(tp->snd_max, tp->snd_una) && + ((tmr_up == PACE_TMR_TLP) || + (tmr_up == PACE_TMR_RXT))) { + /* + * Either a TLP or RXT is fine if no sack-passed is in place + * and data is outstanding. + */ + return; + } else if (tmr_up == PACE_TMR_DELACK) { + /* + * If the delayed ack was going to go off before the + * rtx/tlp/rack timer were going to expire, then that would + * be the timer in control. Note we don't check the time + * here trusting the code is correct. + */ + return; + } + if (SEQ_GT(tp->snd_max, tp->snd_una) && + ((tmr_up == PACE_TMR_RXT) || + (tmr_up == PACE_TMR_TLP) || + (tmr_up == PACE_TMR_RACK))) { + /* + * We have outstanding data and + * we *do* have a RACK, TLP or RXT + * timer running. We won't restart + * anything here since thats probably ok we + * will get called with some timer here shortly. + */ + return; + } + /* + * Ok the timer originally started is not what we want now. We will + * force the hpts to be stopped if any, and restart with the slot + * set to what was in the saved slot. + */ +wrong_timer: + if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) { + if (inp->inp_in_hpts) + tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); + bbr_timer_cancel(bbr, __LINE__, cts); + bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val, + 0); + } else { + /* + * Output is hptsi so we just need to switch the type of + * timer. We don't bother with keep-alive, since when we + * jump through the output, it will start the keep-alive if + * nothing is sent. + * + * We only need a delayed-ack added and or the hpts_timeout. + */ + hpts_timeout = bbr_timer_start(tp, bbr, cts); + if (tp->t_flags & TF_DELACK) { + if (hpts_timeout == 0) { + hpts_timeout = bbr_delack_time; + bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; + } + else if (hpts_timeout > bbr_delack_time) { + hpts_timeout = bbr_delack_time; + bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; + } + } + if (hpts_timeout) { + if (hpts_timeout > 0x7ffffffe) + hpts_timeout = 0x7ffffffe; + bbr->r_ctl.rc_timer_exp = cts + hpts_timeout; + } + } +} + +int32_t bbr_clear_lost = 0; + +/* + * Considers the two time values now (cts) and earlier. + * If cts is smaller than earlier, we could have + * had a sequence wrap (our counter wraps every + * 70 min or so) or it could be just clock skew + * getting us two differnt time values. Clock skew + * will show up within 10ms or so. So in such + * a case (where cts is behind earlier time by + * less than 10ms) we return 0. Otherwise we + * return the true difference between them. + */ +static inline uint32_t +bbr_calc_time(uint32_t cts, uint32_t earlier_time) { + /* + * Given two timestamps, the current time stamp cts, and some other + * time-stamp taken in theory earlier return the difference. The + * trick is here sometimes locking will get the other timestamp + * after the cts. 
If this occurs we need to return 0. + */ + if (TSTMP_GEQ(cts, earlier_time)) + return (cts - earlier_time); + /* + * cts is behind earlier_time if its less than 10ms consider it 0. + * If its more than 10ms difference then we had a time wrap. Else + * its just the normal locking foo. I wonder if we should not go to + * 64bit TS and get rid of this issue. + */ + if (TSTMP_GEQ((cts + 10000), earlier_time)) + return (0); + /* + * Ok the time must have wrapped. So we need to answer a large + * amount of time, which the normal subtraction should do. + */ + return (cts - earlier_time); +} + + + +static int +sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS) +{ + uint32_t stat; + int32_t error; + + error = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t)); + if (error || req->newptr == NULL) + return error; + + error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); + if (error) + return (error); + if (stat == 1) { +#ifdef BBR_INVARIANTS + printf("Clearing BBR lost counters\n"); +#endif + COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT); + COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT); + COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT); + } else if (stat == 2) { +#ifdef BBR_INVARIANTS + printf("Clearing BBR option counters\n"); +#endif + COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE); + } else if (stat == 3) { +#ifdef BBR_INVARIANTS + printf("Clearing BBR stats counters\n"); +#endif + COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE); + } else if (stat == 4) { +#ifdef BBR_INVARIANTS + printf("Clearing BBR out-size counters\n"); +#endif + COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE); + } + bbr_clear_lost = 0; + return (0); +} + +static void +bbr_init_sysctls() +{ + struct sysctl_oid *bbr_probertt; + struct sysctl_oid *bbr_hptsi; + struct sysctl_oid *bbr_measure; + struct sysctl_oid *bbr_cwnd; + struct sysctl_oid *bbr_timeout; + struct sysctl_oid *bbr_states; + struct sysctl_oid *bbr_startup; + struct sysctl_oid *bbr_policer; + + /* Probe rtt controls */ + bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, + "probertt", + CTLFLAG_RW, 0, + ""); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "gain", CTLFLAG_RW, + &bbr_rttprobe_gain, 192, + "What is the filter gain drop in probe_rtt (0=disable)?"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "cwnd", CTLFLAG_RW, + &bbr_rtt_probe_cwndtarg, 4, + "How many mss's are outstanding during probe-rtt"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "int", CTLFLAG_RW, + &bbr_rtt_probe_limit, 4000000, + "If RTT has not shrank in this many micro-seconds enter probe-rtt"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "mintime", CTLFLAG_RW, + &bbr_rtt_probe_time, 200000, + "How many microseconds in probe-rtt"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "filter_len_sec", CTLFLAG_RW, + &bbr_filter_len_sec, 6, + "How long in seconds does the rttProp filter run?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "drain_rtt", CTLFLAG_RW, + &bbr_drain_rtt, BBR_SRTT, + "What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "can_force", CTLFLAG_RW, + &bbr_can_force_probertt, 0, + "If we keep setting new low rtt's but delay going in probe-rtt can we force in??"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + 
SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "enter_sets_force", CTLFLAG_RW, + &bbr_probertt_sets_rtt, 0, + "In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "can_adjust", CTLFLAG_RW, + &bbr_can_adjust_probertt, 1, + "Can we dynamically adjust the probe-rtt limits and times?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "is_ratio", CTLFLAG_RW, + &bbr_is_ratio, 0, + "is the limit to filter a ratio?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "use_cwnd", CTLFLAG_RW, + &bbr_prtt_slam_cwnd, 0, + "Should we set/recover cwnd?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_probertt), + OID_AUTO, "can_use_ts", CTLFLAG_RW, + &bbr_can_use_ts_for_rtt, 1, + "Can we use the ms timestamp if available for retransmistted rtt calculations?"); + + /* Pacing controls */ + bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, + "pacing", + CTLFLAG_RW, 0, + ""); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "hw_pacing", CTLFLAG_RW, + &bbr_allow_hdwr_pacing, 1, + "Do we allow hardware pacing?"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "hw_pacing_limit", CTLFLAG_RW, + &bbr_hardware_pacing_limit, 4000, + "Do we have a limited number of connections for pacing chelsio (0=no limit)?"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "hw_pacing_adj", CTLFLAG_RW, + &bbr_hdwr_pace_adjust, 2, + "Multiplier to calculated tso size?"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "hw_pacing_floor", CTLFLAG_RW, + &bbr_hdwr_pace_floor, 1, + "Do we invoke the hardware pacing floor?"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW, + &bbr_hdwr_pacing_delay_cnt, 10, + "How many packets must be sent after hdwr pacing is enabled"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "bw_cross", CTLFLAG_RW, + &bbr_cross_over, 3000000, + "What is the point where we cross over to linux like TSO size set"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "seg_deltarg", CTLFLAG_RW, + &bbr_hptsi_segments_delay_tar, 7000, + "What is the worse case delay target for hptsi < 48Mbp connections"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "enet_oh", CTLFLAG_RW, + &bbr_include_enet_oh, 0, + "Do we include the ethernet overhead in calculating pacing delay?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "ip_oh", CTLFLAG_RW, + &bbr_include_ip_oh, 1, + "Do we include the IP overhead in calculating pacing delay?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "tcp_oh", CTLFLAG_RW, + &bbr_include_tcp_oh, 0, + "Do we include the TCP overhead in calculating pacing delay?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "google_discount", CTLFLAG_RW, + &bbr_google_discount, 10, + "What is the default google discount percentage wise for pacing (11 = 1.1%%)?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "all_get_min", CTLFLAG_RW, + &bbr_all_get_min, 0, + "If you are less than a MSS do you just get the min?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_hptsi), + OID_AUTO, "tso_min", CTLFLAG_RW, + 
	    &bbr_hptsi_bytes_min, 1460,
+	    "For 0 -> 24Mbps what is floor number of segments for TSO");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_hptsi),
+	    OID_AUTO, "seg_tso_max", CTLFLAG_RW,
+	    &bbr_hptsi_segments_max, 6,
+	    "For 0 -> 24Mbps what is top number of segments for TSO");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_hptsi),
+	    OID_AUTO, "seg_floor", CTLFLAG_RW,
+	    &bbr_hptsi_segments_floor, 1,
+	    "Minimum TSO size we will fall to in segments");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_hptsi),
+	    OID_AUTO, "utter_max", CTLFLAG_RW,
+	    &bbr_hptsi_utter_max, 0,
+	    "The absolute maximum that any pacing (outside of hardware) can be");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_hptsi),
+	    OID_AUTO, "seg_divisor", CTLFLAG_RW,
+	    &bbr_hptsi_per_second, 100,
+	    "What is the divisor in our hptsi TSO calculation 512Mbps < X > 24Mbps");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_hptsi),
+	    OID_AUTO, "srtt_mul", CTLFLAG_RW,
+	    &bbr_hptsi_max_mul, 1,
+	    "The multiplier for pace len max");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_hptsi),
+	    OID_AUTO, "srtt_div", CTLFLAG_RW,
+	    &bbr_hptsi_max_div, 2,
+	    "The divisor for pace len max");
+	/* Measurement controls */
+	bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_sysctl_root),
+	    OID_AUTO,
+	    "measure",
+	    CTLFLAG_RW, 0,
+	    "Measurement controls");
+	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "min_i_bw", CTLFLAG_RW,
+	    &bbr_initial_bw_bps, 62500,
+	    "Minimum initial b/w in bytes per second");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "no_sack_needed", CTLFLAG_RW,
+	    &bbr_sack_not_required, 0,
+	    "Do we allow bbr to run on connections not supporting SACK?");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "use_google", CTLFLAG_RW,
+	    &bbr_use_google_algo, 0,
+	    "Use as close to google V1.0 as possible?");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "ts_limiting", CTLFLAG_RW,
+	    &bbr_ts_limiting, 1,
+	    "Do we attempt to use the peer's timestamp to limit b/w calculations?");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "ts_can_raise", CTLFLAG_RW,
+	    &bbr_ts_can_raise, 0,
+	    "Can we raise the b/w via timestamp b/w calculation?");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "ts_delta", CTLFLAG_RW,
+	    &bbr_min_usec_delta, 20000,
+	    "How long in usec between ts of our sends in ts validation code?");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "ts_peer_delta", CTLFLAG_RW,
+	    &bbr_min_peer_delta, 20,
+	    "What min numerical value should be between the peer deltas?");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "ts_delta_percent", CTLFLAG_RW,
+	    &bbr_delta_percent, 150,
+	    "What percentage (150 = 15.0) do we allow variance for?");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "min_measure_good_bw", CTLFLAG_RW,
+	    &bbr_min_measurements_req, 1,
+	    "What is the minimum measurement count we need before we switch to our b/w estimate");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "min_measure_before_pace", CTLFLAG_RW,
+	    &bbr_no_pacing_until, 4,
+	    "How many pkt-epochs (0 is off) do we need before pacing is on?");
+	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
+	    SYSCTL_CHILDREN(bbr_measure),
+	    OID_AUTO, "quanta", CTLFLAG_RW,
+	    &bbr_quanta, 2,
+	    "Extra quanta to add when calculating the target (ID
section 4.2.3.2)."); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_measure), + OID_AUTO, "noretran", CTLFLAG_RW, + &bbr_no_retran, 0, + "Should google mode not use retransmission measurements for the b/w estimation?"); + /* State controls */ + bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, + "states", + CTLFLAG_RW, 0, + "State controls"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "idle_restart", CTLFLAG_RW, + &bbr_uses_idle_restart, 0, + "Do we use a new special idle_restart state to ramp back up quickly?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "idle_restart_threshold", CTLFLAG_RW, + &bbr_idle_restart_threshold, 100000, + "How long must we be idle before we restart??"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "use_pkt_epoch", CTLFLAG_RW, + &bbr_state_is_pkt_epoch, 0, + "Do we use a pkt-epoch for substate if 0 rttProp?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "startup_rtt_gain", CTLFLAG_RW, + &bbr_rtt_gain_thresh, 0, + "What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "drain_floor", CTLFLAG_RW, + &bbr_drain_floor, 88, + "What is the lowest we can drain (pg) too?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "drain_2_target", CTLFLAG_RW, + &bbr_state_drain_2_tar, 1, + "Do we drain to target in drain substate?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "gain_2_target", CTLFLAG_RW, + &bbr_gain_to_target, 1, + "Does probe bw gain to target??"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "gain_extra_time", CTLFLAG_RW, + &bbr_gain_gets_extra_too, 1, + "Does probe bw gain get the extra time too?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "ld_div", CTLFLAG_RW, + &bbr_drain_drop_div, 5, + "Long drain drop divider?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "ld_mul", CTLFLAG_RW, + &bbr_drain_drop_mul, 4, + "Long drain drop multiplier?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "rand_ot_disc", CTLFLAG_RW, + &bbr_rand_ot, 50, + "Random discount of the ot?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "dr_filter_life", CTLFLAG_RW, + &bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT, + "How many packet-epochs does the b/w delivery rate last?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "subdrain_applimited", CTLFLAG_RW, + &bbr_sub_drain_app_limit, 0, + "Does our sub-state drain invoke app limited if its long?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW, + &bbr_sub_drain_slam_cwnd, 0, + "Should we set/recover cwnd for sub-state drain?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW, + &bbr_slam_cwnd_in_main_drain, 0, + "Should we set/recover cwnd for main-state drain?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, "google_gets_earlyout", CTLFLAG_RW, + &google_allow_early_out, 1, + "Should we allow google probe-bw/drain to exit early at flight target?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_states), + OID_AUTO, 
"google_exit_loss", CTLFLAG_RW, + &google_consider_lost, 1, + "Should we have losses exit gain of probebw in google mode??"); + /* Startup controls */ + bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, + "startup", + CTLFLAG_RW, 0, + "Startup controls"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_startup), + OID_AUTO, "cheat_iwnd", CTLFLAG_RW, + &bbr_sends_full_iwnd, 1, + "Do we not pace but burst out initial windows has our TSO size?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_startup), + OID_AUTO, "loss_threshold", CTLFLAG_RW, + &bbr_startup_loss_thresh, 2000, + "In startup what is the loss threshold in a pe that will exit us from startup?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_startup), + OID_AUTO, "use_lowerpg", CTLFLAG_RW, + &bbr_use_lower_gain_in_startup, 1, + "Should we use a lower hptsi gain if we see loss in startup?"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_startup), + OID_AUTO, "gain", CTLFLAG_RW, + &bbr_start_exit, 25, + "What gain percent do we need to see to stay in startup??"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_startup), + OID_AUTO, "low_gain", CTLFLAG_RW, + &bbr_low_start_exit, 15, + "What gain percent do we need to see to stay in the lower gain startup??"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_startup), + OID_AUTO, "loss_exit", CTLFLAG_RW, + &bbr_exit_startup_at_loss, 1, + "Should we exit startup at loss in an epoch if we are not gaining?"); + /* CWND controls */ + bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, + "cwnd", + CTLFLAG_RW, 0, + "Cwnd controls"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "tar_rtt", CTLFLAG_RW, + &bbr_cwndtarget_rtt_touse, 0, + "Target cwnd rtt measurment to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "may_shrink", CTLFLAG_RW, + &bbr_cwnd_may_shrink, 0, + "Can the cwnd shrink if it would grow to more than the target?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "max_target_limit", CTLFLAG_RW, + &bbr_target_cwnd_mult_limit, 8, + "Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink 0=no?"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "highspeed_min", CTLFLAG_RW, + &bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS, + "What is the high-speed min cwnd (rttProp under 1ms)"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "lowspeed_min", CTLFLAG_RW, + &bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS, + "What is the min cwnd (rttProp > 1ms)"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "initwin", CTLFLAG_RW, + &bbr_def_init_win, 10, + "What is the BBR initial window, if 0 use tcp version"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "do_loss_red", CTLFLAG_RW, + &bbr_do_red, 600, + "Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "red_scale", CTLFLAG_RW, + &bbr_red_scale, 20000, + "What RTT do we scale with?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "red_growslow", CTLFLAG_RW, + &bbr_red_growth_restrict, 1, + "Do we restrict cwnd growth for whats in flight?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + 
OID_AUTO, "red_div", CTLFLAG_RW, + &bbr_red_div, 2, + "If we reduce whats the divisor?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "red_mul", CTLFLAG_RW, + &bbr_red_mul, 1, + "If we reduce whats the mulitiplier?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "target_is_unit", CTLFLAG_RW, + &bbr_target_is_bbunit, 0, + "Is the state target the pacing_gain or BBR_UNIT?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_cwnd), + OID_AUTO, "drop_limit", CTLFLAG_RW, + &bbr_drop_limit, 0, + "Number of segments limit for drop (0=use min_cwnd w/flight)?"); + + /* Timeout controls */ + bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, + "timeout", + CTLFLAG_RW, 0, + "Time out controls"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "delack", CTLFLAG_RW, + &bbr_delack_time, 100000, + "BBR's delayed ack time"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "tlp_uses", CTLFLAG_RW, + &bbr_tlp_type_to_use, 3, + "RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "persmin", CTLFLAG_RW, + &bbr_persist_min, 250000, + "What is the minimum time in microseconds between persists"); + SYSCTL_ADD_U32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "persmax", CTLFLAG_RW, + &bbr_persist_max, 1000000, + "What is the largest delay in microseconds between persists"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "tlp_minto", CTLFLAG_RW, + &bbr_tlp_min, 10000, + "TLP Min timeout in usecs"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "tlp_dack_time", CTLFLAG_RW, + &bbr_delayed_ack_time, 200000, + "TLP delayed ack compensation value"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "minrto", CTLFLAG_RW, + &bbr_rto_min_ms, 30, + "Minimum RTO in ms"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "maxrto", CTLFLAG_RW, + &bbr_rto_max_sec, 4, + "Maxiumum RTO in seconds -- should be at least as large as min_rto"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "tlp_retry", CTLFLAG_RW, + &bbr_tlp_max_resend, 2, + "How many times does TLP retry a single segment or multiple with no ACK"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "minto", CTLFLAG_RW, + &bbr_min_to, 1000, + "Minimum rack timeout in useconds"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "pktdelay", CTLFLAG_RW, + &bbr_pkt_delay, 1000, + "Extra RACK time (in useconds) besides reordering thresh"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "incr_tmrs", CTLFLAG_RW, + &bbr_incr_timers, 1, + "Increase the RXT/TLP timer by the pacing time used?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_timeout), + OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW, + &bbr_marks_rxt_sack_passed, 0, + "Mark sack passed on all those not ack'd when a RXT hits?"); + /* Policer controls */ + bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, + "policer", + CTLFLAG_RW, 0, + "Policer controls"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_policer), + OID_AUTO, "detect_enable", CTLFLAG_RW, + &bbr_policer_detection_enabled, 1, + "Is policer detection enabled??"); + 
SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_policer), + OID_AUTO, "min_pes", CTLFLAG_RW, + &bbr_lt_intvl_min_rtts, 4, + "Minimum number of PE's?"); + SYSCTL_ADD_U64(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_policer), + OID_AUTO, "bwdiff", CTLFLAG_RW, + &bbr_lt_bw_diff, (4000/8), + "Minimal bw diff?"); + SYSCTL_ADD_U64(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_policer), + OID_AUTO, "bwratio", CTLFLAG_RW, + &bbr_lt_bw_ratio, 8, + "Minimal bw diff?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_policer), + OID_AUTO, "from_rack_rxt", CTLFLAG_RW, + &bbr_policer_call_from_rack_to, 0, + "Do we call the policer detection code from a rack-timeout?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_policer), + OID_AUTO, "false_postive", CTLFLAG_RW, + &bbr_lt_intvl_fp, 0, + "What packet epoch do we do false-postive detection at (0=no)?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_policer), + OID_AUTO, "loss_thresh", CTLFLAG_RW, + &bbr_lt_loss_thresh, 196, + "Loss threshold 196 = 19.6%?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_policer), + OID_AUTO, "false_postive_thresh", CTLFLAG_RW, + &bbr_lt_fd_thresh, 100, + "What percentage is the false detection threshold (150=15.0)?"); + /* All the rest */ + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "cheat_rxt", CTLFLAG_RW, + &bbr_use_rack_resend_cheat, 0, + "Do we burst 1ms between sends on retransmissions (like rack)?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "error_paceout", CTLFLAG_RW, + &bbr_error_base_paceout, 10000, + "When we hit an error what is the min to pace out in usec's?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "kill_paceout", CTLFLAG_RW, + &bbr_max_net_error_cnt, 10, + "When we hit this many errors in a row, kill the session?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "data_after_close", CTLFLAG_RW, + &bbr_ignore_data_after_close, 1, + "Do we hold off sending a RST until all pending data is ack'd"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "resend_use_tso", CTLFLAG_RW, + &bbr_resends_use_tso, 0, + "Can resends use TSO?"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "sblklimit", CTLFLAG_RW, + &bbr_sack_block_limit, 128, + "When do we start ignoring small sack blocks"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "bb_verbose", CTLFLAG_RW, + &bbr_verbose_logging, 0, + "Should BBR black box logging be verbose"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "reorder_thresh", CTLFLAG_RW, + &bbr_reorder_thresh, 2, + "What factor for rack will be added when seeing reordering (shift right)"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "reorder_fade", CTLFLAG_RW, + &bbr_reorder_fade, 0, + "Does reorder detection fade, if so how many ms (0 means never)"); + SYSCTL_ADD_S32(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, + &bbr_tlp_thresh, 1, + "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); + /* Stats and counters */ + /* The pacing counters for hdwr/software can't be in the array */ + bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK); + bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + 
OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD, + &bbr_hdwr_pacing_enobuf, + "Total number of enobufs for hardware paced flows"); + SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD, + &bbr_nohdwr_pacing_enobuf, + "Total number of enobufs for non-hardware paced flows"); + + + bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "hdwr_pacing", CTLFLAG_RD, + &bbr_flows_whdwr_pacing, + "Total number of hardware paced flows"); + bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "software_pacing", CTLFLAG_RD, + &bbr_flows_nohdwr_pacing, + "Total number of software paced flows"); + COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK); + SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "stats", CTLFLAG_RD, + bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats"); + COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK); + SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "opts", CTLFLAG_RD, + bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats"); + COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK); + SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "lost", CTLFLAG_RD, + bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur"); + COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK); + SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "stateresend", CTLFLAG_RD, + bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend"); + COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK); + SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "statetime", CTLFLAG_RD, + bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states"); + COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); + SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "outsize", CTLFLAG_RD, + bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls"); + SYSCTL_ADD_PROC(&bbr_sysctl_ctx, + SYSCTL_CHILDREN(bbr_sysctl_root), + OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters"); +} + +static inline int32_t +bbr_progress_timeout_check(struct tcp_bbr *bbr) +{ + if (bbr->rc_tp->t_maxunacktime && bbr->rc_tp->t_acktime && + TSTMP_GT(ticks, bbr->rc_tp->t_acktime)) { + if ((((uint32_t)ticks - bbr->rc_tp->t_acktime)) >= bbr->rc_tp->t_maxunacktime) { + /* + * There is an assumption here that the caller will + * drop the connection, so we increment the + * statistics. 
+ */ + bbr_log_progress_event(bbr, bbr->rc_tp, ticks, PROGRESS_DROP, __LINE__); + BBR_STAT_INC(bbr_progress_drops); +#ifdef NETFLIX_STATS + TCPSTAT_INC(tcps_progdrops); +#endif + return (1); + } + } + return (0); +} + +static void +bbr_counter_destroy() +{ + COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE); + COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE); + COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE); + COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT); + COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT); + COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT); + counter_u64_free(bbr_flows_whdwr_pacing); + counter_u64_free(bbr_flows_nohdwr_pacing); + +} + +static __inline void +bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts) +{ + memset(l, 0, sizeof(union tcp_log_stackspecific)); + l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate; + l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate); + l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop); + l->bw_inuse = bbr_get_bw(bbr); + l->inflight = ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + l->applimited = bbr->r_ctl.r_app_limited_until; + l->delivered = bbr->r_ctl.rc_delivered; + l->timeStamp = cts; + l->lost = bbr->r_ctl.rc_lost; + l->bbr_state = bbr->rc_bbr_state; + l->bbr_substate = bbr_state_val(bbr); + l->epoch = bbr->r_ctl.rc_rtt_epoch; + l->lt_epoch = bbr->r_ctl.rc_lt_epoch; + l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain; + l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain; + l->inhpts = bbr->rc_inp->inp_in_hpts; + l->ininput = bbr->rc_inp->inp_in_input; + l->use_lt_bw = bbr->rc_lt_use_bw; + l->pkts_out = bbr->r_ctl.rc_flight_at_input; + l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch; +} + +static void +bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = 0; + log.u_bbr.flex2 = 0; + log.u_bbr.flex5 = 0; + log.u_bbr.flex3 = 0; + log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate; + log.u_bbr.flex7 = reason; + log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt; + log.u_bbr.flex8 = 0; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_BW_RED_EV, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = seq; + log.u_bbr.flex2 = count; + log.u_bbr.flex8 = mode; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_LOWGAIN, 0, + 0, &log, false, &bbr->rc_tv); + } +} + + + +static void +bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling, + uint8_t reason, uint32_t p_maxseg, int len) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = p_maxseg; + log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags; + log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp; + log.u_bbr.flex4 = reason; + log.u_bbr.flex5 = bbr->rc_in_persist; + log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val; + log.u_bbr.flex7 = p_maxseg; + log.u_bbr.flex8 = bbr->rc_in_persist; + log.u_bbr.pkts_out = 0; + 
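/* applimited is hijacked here to carry the len argument */ +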
log.u_bbr.applimited = len; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_JUSTRET, 0, + tlen, &log, false, &bbr->rc_tv); + } +} + + +static void +bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = seq; + log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent; + log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_ENTREC, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = tso; + log.u_bbr.flex2 = maxseg; + log.u_bbr.flex3 = mtu; + log.u_bbr.flex4 = csum_flags; + TCP_LOG_EVENTP(tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_MSGSIZE, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_flowend(struct tcp_bbr *bbr) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct sockbuf *r, *s; + struct timeval tv; + + if (bbr->rc_inp->inp_socket) { + r = &bbr->rc_inp->inp_socket->so_rcv; + s = &bbr->rc_inp->inp_socket->so_snd; + } else { + r = s = NULL; + } + bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv)); + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + r, s, + TCP_LOG_FLOWEND, 0, + 0, &log, false, &tv); + } +} + +static void +bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, + uint32_t lost, uint32_t del) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = lost; + log.u_bbr.flex2 = del; + log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw; + log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt; + log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch; + log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; + log.u_bbr.flex7 = line; + log.u_bbr.flex8 = 0; + log.u_bbr.inflight = bbr->r_ctl.r_measurement_count; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_PKT_EPOCH, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time) +{ + if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = bbr->r_ctl.rc_lost; + log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat; + log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat; + log.u_bbr.flex7 = line; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_TIME_EPOCH, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state; + log.u_bbr.flex2 = 
new_tar; + log.u_bbr.flex3 = line; + log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs; + log.u_bbr.flex5 = bbr_quanta; + log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs; + log.u_bbr.flex7 = bbr->rc_last_options; + log.u_bbr.flex8 = meth; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_STATE_TARGET, 0, + 0, &log, false, &bbr->rc_tv); + } + +} + +static void +bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; + log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int; + if (bbr_state_is_pkt_epoch) + log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PKTRTT); + else + log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PROP); + log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch; + log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; + log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000); + log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra; + log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_STATE, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, + uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; + log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt; + log.u_bbr.flex4 = applied; + log.u_bbr.flex5 = rtt; + log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state; + log.u_bbr.flex7 = cond; + log.u_bbr.flex8 = reas; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_RTT_SHRINKS, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_type_exit_rec(struct tcp_bbr *bbr) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start; + log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent; + log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_EXITREC, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg, + uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line) +{ + if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = prev_acked; + log.u_bbr.flex3 = bytes_this_ack; + log.u_bbr.flex4 = chg; + log.u_bbr.flex5 = th_ack; + log.u_bbr.flex6 = target; + log.u_bbr.flex8 = meth; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_CWND, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin) +{ + /* + * Log the rtt sample we are applying to the srtt algorithm in + * useconds. 
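+	 * flex1 carries the rtt sample itself and flex6 the associated
+	 * timestamp value (tsin).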
+ */ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = rtt; + log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time; + log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay; + log.u_bbr.flex4 = bbr->rc_tp->ts_offset; + log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; + log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv); + log.u_bbr.flex6 = tsin; + log.u_bbr.flex7 = 0; + log.u_bbr.flex8 = bbr->rc_ack_was_delayed; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + TCP_LOG_RTT, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit) +{ + if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = time_in; + log.u_bbr.flex2 = line; + log.u_bbr.flex8 = enter_exit; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_PERSIST, 0, + 0, &log, false, &bbr->rc_tv); + } +} +static void +bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts) +{ + if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age; + log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; + log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int; + log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time; + log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_ACKCLEAR, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen, + uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = nsegs; + log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes; + if (m) { + struct timespec ts; + + log.u_bbr.flex3 = m->m_flags; + if (m->m_flags & M_TSTMP) { + mbuf_tstmp2timespec(m, &ts); + tv.tv_sec = ts.tv_sec; + tv.tv_usec = ts.tv_nsec / 1000; + log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv); + } else { + log.u_bbr.lt_epoch = 0; + } + if (m->m_flags & M_TSTMP_LRO) { + tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; + tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000; + log.u_bbr.flex5 = tcp_tv_to_usectick(&tv); + } else { + /* No arrival timestamp */ + log.u_bbr.flex5 = 0; + } + + log.u_bbr.pkts_out = tcp_get_usecs(&tv); + } else { + log.u_bbr.flex3 = 0; + log.u_bbr.flex5 = 0; + log.u_bbr.flex6 = 0; + log.u_bbr.pkts_out = 0; + } + log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state; + log.u_bbr.flex7 = bbr->r_wanted_output; + log.u_bbr.flex8 = bbr->rc_in_persist; + TCP_LOG_EVENTP(bbr->rc_tp, th, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + TCP_LOG_IN, 0, + tlen, &log, true, &bbr->rc_tv); + } +} + +static void +bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = 
did_out; + log.u_bbr.flex2 = nxt_pkt; + log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val; + log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags; + log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp; + log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes; + log.u_bbr.flex7 = bbr->r_wanted_output; + log.u_bbr.flex8 = bbr->rc_in_persist; + log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_DOSEG_DONE, 0, + 0, &log, true, &bbr->rc_tv); + } +} + +static void +bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts, + int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = o_len; + log.u_bbr.flex3 = segcnt; + log.u_bbr.flex4 = segsiz; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_ENOBUF_JMP, ENOBUFS, + len, &log, true, &bbr->rc_tv); + } +} + +static void +bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = timers; + log.u_bbr.flex2 = ret; + log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp; + log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags; + log.u_bbr.flex5 = cts; + log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state; + log.u_bbr.flex8 = hpts_calling; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_TO_PROCESS, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + uint64_t ar; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = bbr->bbr_timer_src; + log.u_bbr.flex2 = 0; + log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; + ar = (uint64_t)(bbr->r_ctl.rc_resend); + ar >>= 32; + ar &= 0x00000000ffffffff; + log.u_bbr.flex4 = (uint32_t)ar; + ar = (uint64_t)bbr->r_ctl.rc_resend; + ar &= 0x00000000ffffffff; + log.u_bbr.flex5 = (uint32_t)ar; + log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); + log.u_bbr.flex8 = to_num; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_RTO, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = flex1; + log.u_bbr.flex2 = flex2; + log.u_bbr.flex3 = flex3; + log.u_bbr.flex4 = 0; + log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; + log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; + log.u_bbr.flex8 = reason; + log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_REDUCE, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) +{ + if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union 
tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = diag->p_nxt_slot; + log.u_bbr.flex2 = diag->p_cur_slot; + log.u_bbr.flex3 = diag->slot_req; + log.u_bbr.flex4 = diag->inp_hptsslot; + log.u_bbr.flex5 = diag->slot_remaining; + log.u_bbr.flex6 = diag->need_new_to; + log.u_bbr.flex7 = diag->p_hpts_active; + log.u_bbr.flex8 = diag->p_on_min_sleep; + /* Hijack other fields as needed */ + log.u_bbr.epoch = diag->have_slept; + log.u_bbr.lt_epoch = diag->yet_to_sleep; + log.u_bbr.pkts_out = diag->co_ret; + log.u_bbr.applimited = diag->hpts_sleep_time; + log.u_bbr.delivered = diag->p_prev_slot; + log.u_bbr.inflight = diag->p_runningtick; + log.u_bbr.bw_inuse = diag->wheel_tick; + log.u_bbr.rttProp = diag->wheel_cts; + log.u_bbr.delRate = diag->maxticks; + log.u_bbr.cur_del_rate = diag->p_curtick; + log.u_bbr.cur_del_rate <<= 32; + log.u_bbr.cur_del_rate |= diag->p_lasttick; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt, + uint32_t thresh, uint32_t to) +{ + if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = bbr->rc_tp->t_rttvar; + log.u_bbr.flex2 = time_since_sent; + log.u_bbr.flex3 = srtt; + log.u_bbr.flex4 = thresh; + log.u_bbr.flex5 = to; + log.u_bbr.flex6 = bbr->rc_tp->t_srtt; + log.u_bbr.flex8 = mode; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_TIMERPREP, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, + uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = usecs; + log.u_bbr.flex2 = len; + log.u_bbr.flex3 = (uint32_t)((bw >> 32) & 0x00000000ffffffff); + log.u_bbr.flex4 = (uint32_t)(bw & 0x00000000ffffffff); + if (override) + log.u_bbr.flex5 = (1 << 2); + else + log.u_bbr.flex5 = 0; + log.u_bbr.flex6 = override; + log.u_bbr.flex7 = gain; + log.u_bbr.flex8 = mod; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_HPTSI_CALC, 0, + len, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + + log.u_bbr.flex1 = bbr->bbr_timer_src; + log.u_bbr.flex2 = to; + log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; + log.u_bbr.flex4 = slot; + log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot; + log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); + log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2; + log.u_bbr.flex8 = which; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_TIMERSTAR, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm) +{ + if (bbr_verbose_logging && 
(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = thresh; + log.u_bbr.flex2 = lro; + log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts; + log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); + log.u_bbr.flex6 = srtt; + log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift; + log.u_bbr.flex8 = frm; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_THRESH_CALC, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_to_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts, uint8_t hpts_removed) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = bbr->bbr_timer_src; + log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; + log.u_bbr.flex4 = bbr->rc_in_persist; + log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; + log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); + log.u_bbr.flex8 = hpts_removed; + log.u_bbr.pkts_out = bbr->rc_pacer_started; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_TIMERCANC, 0, + 0, &log, false, &bbr->rc_tv); + } +} + + +static void +bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio; + log.u_bbr.flex2 = (peer_delta >> 32); + log.u_bbr.flex3 = (peer_delta & 0x00000000ffffffff); + log.u_bbr.flex4 = (delta >> 32); + log.u_bbr.flex5 = (delta & 0x00000000ffffffff); + log.u_bbr.flex7 = bbr->rc_ts_clock_set; + log.u_bbr.flex8 = bbr->rc_ts_cant_be_used; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_TSTMP_VAL, 0, + 0, &log, false, &bbr->rc_tv); + + } +} + +static void +bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = tsosz; + log.u_bbr.flex2 = tls; + log.u_bbr.flex3 = tcp_min_hptsi_time; + log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min; + log.u_bbr.flex5 = old_val; + log.u_bbr.flex6 = maxseg; + log.u_bbr.flex7 = bbr->rc_no_pacing; + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= bbr->rc_past_init_win; + if (hdwr) + log.u_bbr.flex8 = 0x80 | bbr->rc_use_google; + else + log.u_bbr.flex8 = bbr->rc_use_google; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_BBRTSO, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, + uint32_t flags, uint32_t line) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = rsm->r_start; + log.u_bbr.flex3 = rsm->r_end; + log.u_bbr.flex4 = rsm->r_delivered; + log.u_bbr.flex5 = rsm->r_rtr_cnt; + log.u_bbr.flex6 = rsm->r_dupack; + log.u_bbr.flex7 = rsm->r_tim_lastsent[0]; + log.u_bbr.flex8 = 
rsm->r_flags; + /* Hijack the pkts_out fids */ + log.u_bbr.applimited = flags; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_RSM_CLEARED, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts, + uint32_t flex3, uint32_t flex2, uint32_t flex5, + uint32_t flex6, uint32_t pkts_out, int flex7, + uint32_t flex4, uint32_t flex1) +{ + + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = flex1; + log.u_bbr.flex2 = flex2; + log.u_bbr.flex3 = flex3; + log.u_bbr.flex4 = flex4; + log.u_bbr.flex5 = flex5; + log.u_bbr.flex6 = flex6; + log.u_bbr.flex7 = flex7; + /* Hijack the pkts_out fids */ + log.u_bbr.pkts_out = pkts_out; + log.u_bbr.flex8 = flex8; + if (bbr->rc_ack_was_delayed) + log.u_bbr.epoch = bbr->r_ctl.rc_ack_hdwr_delay; + else + log.u_bbr.epoch = 0; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_BBRUPD, 0, + flex2, &log, false, &bbr->rc_tv); + } +} + + +static void +bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason, + uint32_t newbw, uint32_t obw, uint32_t diff, + uint32_t tim) +{ + if (/*bbr_verbose_logging && */(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = reason; + log.u_bbr.flex2 = newbw; + log.u_bbr.flex3 = obw; + log.u_bbr.flex4 = diff; + log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost; + log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del; + log.u_bbr.flex7 = bbr->rc_lt_is_sampling; + log.u_bbr.pkts_out = tim; + log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw; + if (bbr->rc_lt_use_bw == 0) + log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch; + else + log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_BWSAMP, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static inline void +bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line) +{ + if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = tick; + log.u_bbr.flex3 = tp->t_maxunacktime; + log.u_bbr.flex4 = tp->t_acktime; + log.u_bbr.flex8 = event; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_PROGRESS, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp, + uint64_t rate, uint64_t hw_rate, int line, uint32_t cts, + int error) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); + log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); + log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); + log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); + log.u_bbr.bw_inuse = rate; + log.u_bbr.flex5 = line; + log.u_bbr.flex6 = error; + log.u_bbr.flex8 = bbr->skip_gain; + log.u_bbr.flex8 <<= 1; + log.u_bbr.flex8 |= bbr->gain_is_limited; + log.u_bbr.flex8 <<= 1; + log.u_bbr.flex8 |= 
bbr->bbr_hdrw_pacing; + log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_HDWR_PACE, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = slot; + log.u_bbr.flex2 = del_by; + log.u_bbr.flex3 = prev_delay; + log.u_bbr.flex4 = line; + log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val; + log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay; + log.u_bbr.flex7 = (0x0000ffff & bbr->r_ctl.rc_hpts_flags); + log.u_bbr.flex8 = bbr->rc_in_persist; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_BBRSND, 0, + len, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = bbr->r_ctl.rc_delivered; + log.u_bbr.flex2 = 0; + log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt; + log.u_bbr.flex4 = end; + log.u_bbr.flex5 = seq; + log.u_bbr.flex6 = t; + log.u_bbr.flex7 = match; + log.u_bbr.flex8 = flags; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_BBRRTT, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method) +{ + if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state; + log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options); + log.u_bbr.flex3 = bbr->r_ctl.gain_epoch; + log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs; + log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs; + log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight; + log.u_bbr.flex7 = 0; + log.u_bbr.flex8 = entry_method; + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_EXIT_GAIN, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +static void +bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired) +{ + if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); + /* R-HU */ + log.u_bbr.flex1 = 0; + log.u_bbr.flex2 = 0; + log.u_bbr.flex3 = 0; + log.u_bbr.flex4 = 0; + log.u_bbr.flex7 = 0; + log.u_bbr.flex8 = settings_desired; + + TCP_LOG_EVENTP(bbr->rc_tp, NULL, + &bbr->rc_inp->inp_socket->so_rcv, + &bbr->rc_inp->inp_socket->so_snd, + BBR_LOG_SETTINGS_CHG, 0, + 0, &log, false, &bbr->rc_tv); + } +} + +/* + * Returns the bw from the our filter. 
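+ * ("Full" here means the raw delivery-rate filter value, before any of
+ * the long-term/policer or recovery overrides that bbr_get_bw() may
+ * apply.)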
+ */ +static inline uint64_t +bbr_get_full_bw(struct tcp_bbr *bbr) +{ + uint64_t bw; + + bw = get_filter_value(&bbr->r_ctl.rc_delrate); + + return (bw); +} + +static inline void +bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line) +{ + uint64_t calclr; + uint32_t lost, del; + + if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch) + lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch; + else + lost = 0; + del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del; + if (lost == 0) { + calclr = 0; + } else if (del) { + calclr = lost; + calclr *= (uint64_t)1000; + calclr /= (uint64_t)del; + } else { + /* Nothing delivered? 100.0% loss */ + calclr = 1000; + } + bbr->r_ctl.rc_pkt_epoch_loss_rate = (uint32_t)calclr; + if (IN_RECOVERY(bbr->rc_tp->t_flags)) + bbr->r_ctl.recovery_lr += (uint32_t)calclr; + bbr->r_ctl.rc_pkt_epoch++; + if (bbr->rc_no_pacing && + (bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) { + bbr->rc_no_pacing = 0; + tcp_bbr_tso_size_check(bbr, cts); + } + bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time); + bbr->r_ctl.rc_pkt_epoch_time = cts; + /* What was our loss rate */ + bbr_log_pkt_epoch(bbr, cts, line, lost, del); + bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered; + bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost; +} + +static inline void +bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line) +{ + uint32_t epoch_time; + + /* Tick the RTT clock */ + bbr->r_ctl.rc_rtt_epoch++; + epoch_time = cts - bbr->r_ctl.rc_rcv_epoch_start; + bbr_log_time_epoch(bbr, cts, line, epoch_time); + bbr->r_ctl.rc_rcv_epoch_start = cts; +} + + +static inline void +bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked) +{ + if (SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del)) { + bbr->rc_is_pkt_epoch_now = 1; + } +} + +/* + * Returns the bw from either the b/w filter + * or from the lt_bw (if the connection is being + * policed). + */ +static inline uint64_t +__bbr_get_bw(struct tcp_bbr *bbr) +{ + uint64_t bw, min_bw; + uint64_t rtt; + int gm_measure_cnt = 1; + + /* + * For startup we make, like google, a + * minimum b/w. This is generated from the + * IW and the rttProp. We do fall back to srtt + * if for some reason (initial handshake) we don't + * have a rttProp. We, in the worst case, fall back + * to the configured min_bw (rc_initial_hptsi_bw). + */ + if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { + /* Attempt first to use rttProp */ + rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop); + if (rtt && (rtt < 0xffffffff)) { +measure: + min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) * + ((uint64_t)1000000); + min_bw /= rtt; + if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) { + min_bw = bbr->r_ctl.rc_initial_hptsi_bw; + } + + } else if (bbr->rc_tp->t_srtt != 0) { + /* No rttProp, use srtt? 
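Yes; and if even srtt is zero we fall back to the configured rc_initial_hptsi_bw in the final else below.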
*/ + rtt = bbr_get_rtt(bbr, BBR_SRTT); + goto measure; + } else { + min_bw = bbr->r_ctl.rc_initial_hptsi_bw; + } + } else + min_bw = 0; + + if ((bbr->rc_past_init_win == 0) && + (bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp))) + bbr->rc_past_init_win = 1; + if ((bbr->rc_use_google) && (bbr->r_ctl.r_measurement_count >= 1)) + gm_measure_cnt = 0; + if (gm_measure_cnt && + ((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) || + (bbr->rc_past_init_win == 0))) { + /* For google we use our guess rate until we get 1 measurement */ + +use_initial_window: + rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop); + if (rtt && (rtt < 0xffffffff)) { + /* + * We have an RTT measurment. Use that in + * combination with our initial window to calculate + * a b/w. + */ + bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) * + ((uint64_t)1000000); + bw /= rtt; + if (bw < bbr->r_ctl.rc_initial_hptsi_bw) { + bw = bbr->r_ctl.rc_initial_hptsi_bw; + } + } else { + /* Drop back to the 40 and punt to a default */ + bw = bbr->r_ctl.rc_initial_hptsi_bw; + } + if (bw < 1) + /* Probably should panic */ + bw = 1; + if (bw > min_bw) + return (bw); + else + return (min_bw); + } + if (bbr->rc_lt_use_bw) + bw = bbr->r_ctl.rc_lt_bw; + else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0)) + bw = bbr->r_ctl.red_bw; + else + bw = get_filter_value(&bbr->r_ctl.rc_delrate); + if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) { + /* + * Enforce user set rate limit, keep in mind that + * t_peakrate_thr is in B/s already + */ + bw = uqmin((uint64_t)bbr->rc_tp->t_peakrate_thr, bw); + } + if (bw == 0) { + /* We should not be at 0, go to the initial window then */ + goto use_initial_window; + } + if (bw < 1) + /* Probably should panic */ + bw = 1; + if (bw < min_bw) + bw = min_bw; + return (bw); +} + +static inline uint64_t +bbr_get_bw(struct tcp_bbr *bbr) +{ + uint64_t bw; + + bw = __bbr_get_bw(bbr); + return (bw); +} + +static inline void +bbr_reset_lt_bw_interval(struct tcp_bbr *bbr, uint32_t cts) +{ + bbr->r_ctl.rc_lt_epoch = bbr->r_ctl.rc_pkt_epoch; + bbr->r_ctl.rc_lt_time = bbr->r_ctl.rc_del_time; + bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered; + bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; +} + +static inline void +bbr_reset_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts) +{ + bbr->rc_lt_is_sampling = 0; + bbr->rc_lt_use_bw = 0; + bbr->r_ctl.rc_lt_bw = 0; + bbr_reset_lt_bw_interval(bbr, cts); +} + +static inline void +bbr_lt_bw_samp_done(struct tcp_bbr *bbr, uint64_t bw, uint32_t cts, uint32_t timin) +{ + uint64_t diff; + + /* Do we have a previous sample? 
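If so, and the new sample is within bbr_lt_bw_diff bytes/sec of it (or within a 1/bbr_lt_bw_ratio fraction of it), the rate looks pinned: we average the two into rc_lt_bw and start treating the flow as policed.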
*/ + if (bbr->r_ctl.rc_lt_bw) { + /* Get the diff in bytes per second */ + if (bbr->r_ctl.rc_lt_bw > bw) + diff = bbr->r_ctl.rc_lt_bw - bw; + else + diff = bw - bbr->r_ctl.rc_lt_bw; + if ((diff <= bbr_lt_bw_diff) || + (diff <= (bbr->r_ctl.rc_lt_bw / bbr_lt_bw_ratio))) { + /* Consider us policed */ + uint32_t saved_bw; + + saved_bw = (uint32_t)bbr->r_ctl.rc_lt_bw; + bbr->r_ctl.rc_lt_bw = (bw + bbr->r_ctl.rc_lt_bw) / 2; /* average of two */ + bbr->rc_lt_use_bw = 1; + bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; + /* + * Use pkt based epoch for measuring length of + * policer up + */ + bbr->r_ctl.rc_lt_epoch_use = bbr->r_ctl.rc_pkt_epoch; + /* + * reason 4 is we need to start consider being + * policed + */ + bbr_log_type_ltbw(bbr, cts, 4, (uint32_t)bw, saved_bw, (uint32_t)diff, timin); + return; + } + } + bbr->r_ctl.rc_lt_bw = bw; + bbr_reset_lt_bw_interval(bbr, cts); + bbr_log_type_ltbw(bbr, cts, 5, 0, (uint32_t)bw, 0, timin); +} + +/* + * RRS: Copied from user space! + * Calculate a uniformly distributed random number less than upper_bound + * avoiding "modulo bias". + * + * Uniformity is achieved by generating new random numbers until the one + * returned is outside the range [0, 2**32 % upper_bound). This + * guarantees the selected random number will be inside + * [2**32 % upper_bound, 2**32) which maps back to [0, upper_bound) + * after reduction modulo upper_bound. + */ +static uint32_t +arc4random_uniform(uint32_t upper_bound) +{ + uint32_t r, min; + + if (upper_bound < 2) + return 0; + + /* 2**32 % x == (2**32 - x) % x */ + min = -upper_bound % upper_bound; + + /* + * This could theoretically loop forever but each retry has + * p > 0.5 (worst case, usually far better) of selecting a + * number inside the range we need, so it should rarely need + * to re-roll. + */ + for (;;) { + r = arc4random(); + if (r >= min) + break; + } + + return r % upper_bound; +} + +static void +bbr_randomize_extra_state_time(struct tcp_bbr *bbr) +{ + uint32_t ran, deduct; + + ran = arc4random_uniform(bbr_rand_ot); + if (ran) { + deduct = bbr->r_ctl.rc_level_state_extra / ran; + bbr->r_ctl.rc_level_state_extra -= deduct; + } +} +/* + * Return randomly the starting state + * to use in probebw. + */ +static uint8_t +bbr_pick_probebw_substate(struct tcp_bbr *bbr, uint32_t cts) +{ + uint32_t ran; + uint8_t ret_val; + + /* Initialize the offset to 0 */ + bbr->r_ctl.rc_exta_time_gd = 0; + bbr->rc_hit_state_1 = 0; + bbr->r_ctl.rc_level_state_extra = 0; + ran = arc4random_uniform((BBR_SUBSTATE_COUNT-1)); + /* + * The math works funny here :) the return value is used to set the + * substate and then the state change is called which increments by + * one. So if we return 1 (DRAIN) we will increment to 2 (LEVEL1) when + * we fully enter the state. Note that the (8 - 1 - ran) assures that + * we return 1 - 7, so we dont return 0 and end up starting in + * state 1 (DRAIN). + */ + ret_val = BBR_SUBSTATE_COUNT - 1 - ran; + /* Set an epoch */ + if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) + bbr_set_epoch(bbr, cts, __LINE__); + + bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; + return (ret_val); +} + +static void +bbr_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts, int32_t loss_detected) +{ + uint32_t diff, d_time; + uint64_t del_time, bw, lost, delivered; + + if (bbr->r_use_policer == 0) + return; + if (bbr->rc_lt_use_bw) { + /* We are using lt bw do we stop yet? 
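Once more than bbr_lt_bw_max_rtts packet epochs have passed since we started using it, everything is reset below.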
*/ + diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use; + if (diff > bbr_lt_bw_max_rtts) { + /* Reset it all */ +reset_all: + bbr_reset_lt_bw_sampling(bbr, cts); + if (bbr->rc_filled_pipe) { + bbr_set_epoch(bbr, cts, __LINE__); + bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); + bbr_substate_change(bbr, cts, __LINE__, 0); + bbr->rc_bbr_state = BBR_STATE_PROBE_BW; + bbr_log_type_statechange(bbr, cts, __LINE__); + } else { + /* + * This should not happen really + * unless we remove the startup/drain + * restrictions above. + */ + bbr->rc_bbr_state = BBR_STATE_STARTUP; + bbr_set_epoch(bbr, cts, __LINE__); + bbr->r_ctl.rc_bbr_state_time = cts; + bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; + bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; + bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; + bbr_set_state_target(bbr, __LINE__); + bbr_log_type_statechange(bbr, cts, __LINE__); + } + /* reason 0 is to stop using lt-bw */ + bbr_log_type_ltbw(bbr, cts, 0, 0, 0, 0, 0); + return; + } + if (bbr_lt_intvl_fp == 0) { + /* Not doing false-postive detection */ + return; + } + /* False positive detection */ + if (diff == bbr_lt_intvl_fp) { + /* At bbr_lt_intvl_fp we record the lost */ + bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered; + bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; + } else if (diff > (bbr_lt_intvl_min_rtts + bbr_lt_intvl_fp)) { + /* Now is our loss rate still high? */ + lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost; + delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del; + if ((delivered == 0) || + (((lost * 1000)/delivered) < bbr_lt_fd_thresh)) { + /* No still below our threshold */ + bbr_log_type_ltbw(bbr, cts, 7, lost, delivered, 0, 0); + } else { + /* Yikes its still high, it must be a false positive */ + bbr_log_type_ltbw(bbr, cts, 8, lost, delivered, 0, 0); + goto reset_all; + } + } + return; + } + /* + * Wait for the first loss before sampling, to let the policer + * exhaust its tokens and estimate the steady-state rate allowed by + * the policer. Starting samples earlier includes bursts that + * over-estimate the bw. + */ + if (bbr->rc_lt_is_sampling == 0) { + /* reason 1 is to begin doing the sampling */ + if (loss_detected == 0) + return; + bbr_reset_lt_bw_interval(bbr, cts); + bbr->rc_lt_is_sampling = 1; + bbr_log_type_ltbw(bbr, cts, 1, 0, 0, 0, 0); + return; + } + /* Now how long were we delivering long term last> */ + if (TSTMP_GEQ(bbr->r_ctl.rc_del_time, bbr->r_ctl.rc_lt_time)) + d_time = bbr->r_ctl.rc_del_time - bbr->r_ctl.rc_lt_time; + else + d_time = 0; + + /* To avoid underestimates, reset sampling if we run out of data. */ + if (bbr->r_ctl.r_app_limited_until) { + /* Can not measure in app-limited state */ + bbr_reset_lt_bw_sampling(bbr, cts); + /* reason 2 is to reset sampling due to app limits */ + bbr_log_type_ltbw(bbr, cts, 2, 0, 0, 0, d_time); + return; + } + diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch; + if (diff < bbr_lt_intvl_min_rtts) { + /* + * need more samples (we don't + * start on a round like linux so + * we need 1 more). + */ + /* 6 is not_enough time or no-loss */ + bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); + return; + } + if (diff > (4 * bbr_lt_intvl_min_rtts)) { + /* + * For now if we wait too long, reset all sampling. We need + * to do some research here, its possible that we should + * base this on how much loss as occurred.. something like + * if its under 10% (or some thresh) reset all otherwise + * don't. Thats for phase II I guess. 
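+	 * For now the interval is simply abandoned once it stretches past
+	 * 4 * bbr_lt_intvl_min_rtts packet epochs, however much loss was
+	 * seen.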
+ */ + bbr_reset_lt_bw_sampling(bbr, cts); + /* reason 3 is to reset sampling due too long of sampling */ + bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time); + return; + } + /* + * End sampling interval when a packet is lost, so we estimate the + * policer tokens were exhausted. Stopping the sampling before the + * tokens are exhausted under-estimates the policed rate. + */ + if (loss_detected == 0) { + /* 6 is not_enough time or no-loss */ + bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); + return; + } + /* Calculate packets lost and delivered in sampling interval. */ + lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost; + delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del; + if ((delivered == 0) || + (((lost * 1000)/delivered) < bbr_lt_loss_thresh)) { + bbr_log_type_ltbw(bbr, cts, 6, lost, delivered, 0, d_time); + return; + } + if (d_time < 1000) { + /* Not enough time. wait */ + /* 6 is not_enough time or no-loss */ + bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); + return; + } + if (d_time >= (0xffffffff / USECS_IN_MSEC)) { + /* Too long */ + bbr_reset_lt_bw_sampling(bbr, cts); + /* reason 3 is to reset sampling due too long of sampling */ + bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time); + return; + } + del_time = d_time; + bw = delivered; + bw *= (uint64_t)USECS_IN_SECOND; + bw /= del_time; + bbr_lt_bw_samp_done(bbr, bw, cts, d_time); +} + +/* + * Allocate a sendmap from our zone. + */ +static struct bbr_sendmap * +bbr_alloc(struct tcp_bbr *bbr) +{ + struct bbr_sendmap *rsm; + + BBR_STAT_INC(bbr_to_alloc); + rsm = uma_zalloc(bbr_zone, (M_NOWAIT | M_ZERO)); + if (rsm) { + bbr->r_ctl.rc_num_maps_alloced++; + return (rsm); + } + if (bbr->r_ctl.rc_free_cnt) { + BBR_STAT_INC(bbr_to_alloc_emerg); + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); + TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next); + bbr->r_ctl.rc_free_cnt--; + return (rsm); + } + BBR_STAT_INC(bbr_to_alloc_failed); + return (NULL); +} + +static struct bbr_sendmap * +bbr_alloc_full_limit(struct tcp_bbr *bbr) +{ + if ((bbr_tcp_map_entries_limit > 0) && + (bbr->r_ctl.rc_num_maps_alloced >= bbr_tcp_map_entries_limit)) { + BBR_STAT_INC(bbr_alloc_limited); + if (!bbr->alloc_limit_reported) { + bbr->alloc_limit_reported = 1; + BBR_STAT_INC(bbr_alloc_limited_conns); + } + return (NULL); + } + return (bbr_alloc(bbr)); +} + + +/* wrapper to allocate a sendmap entry, subject to a specific limit */ +static struct bbr_sendmap * +bbr_alloc_limit(struct tcp_bbr *bbr, uint8_t limit_type) +{ + struct bbr_sendmap *rsm; + + if (limit_type) { + /* currently there is only one limit type */ + if (bbr_tcp_map_split_limit > 0 && + bbr->r_ctl.rc_num_split_allocs >= bbr_tcp_map_split_limit) { + BBR_STAT_INC(bbr_split_limited); + if (!bbr->alloc_limit_reported) { + bbr->alloc_limit_reported = 1; + BBR_STAT_INC(bbr_alloc_limited_conns); + } + return (NULL); + } + } + + /* allocate and mark in the limit type, if set */ + rsm = bbr_alloc(bbr); + if (rsm != NULL && limit_type) { + rsm->r_limit_type = limit_type; + bbr->r_ctl.rc_num_split_allocs++; + } + return (rsm); +} + +static void +bbr_free(struct tcp_bbr *bbr, struct bbr_sendmap *rsm) +{ + if (rsm->r_limit_type) { + /* currently there is only one limit type */ + bbr->r_ctl.rc_num_split_allocs--; + } + if (rsm->r_is_smallmap) + bbr->r_ctl.rc_num_small_maps_alloced--; + if (bbr->r_ctl.rc_tlp_send == rsm) + bbr->r_ctl.rc_tlp_send = NULL; + if (bbr->r_ctl.rc_resend == rsm) { + bbr->r_ctl.rc_resend = NULL; + } + if (bbr->r_ctl.rc_next == rsm) + bbr->r_ctl.rc_next = NULL; + if (bbr->r_ctl.rc_sacklast == 
rsm) + bbr->r_ctl.rc_sacklast = NULL; + if (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) { + memset(rsm, 0, sizeof(struct bbr_sendmap)); + TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); + rsm->r_limit_type = 0; + bbr->r_ctl.rc_free_cnt++; + return; + } + bbr->r_ctl.rc_num_maps_alloced--; + uma_zfree(bbr_zone, rsm); +} + +/* + * Returns the BDP. + */ +static uint64_t +bbr_get_bw_delay_prod(uint64_t rtt, uint64_t bw) { + /* + * Calculate the bytes in flight needed given the bw (in bytes per + * second) and the specifyed rtt in useconds. We need to put out the + * returned value per RTT to match that rate. Gain will normaly + * raise it up from there. + * + * This should not overflow as long as the bandwidth is below 1 + * TByte per second (bw < 10**12 = 2**40) and the rtt is smaller + * than 1000 seconds (rtt < 10**3 * 10**6 = 10**9 = 2**30). + */ + uint64_t usec_per_sec; + + usec_per_sec = USECS_IN_SECOND; + return ((rtt * bw) / usec_per_sec); +} + +/* + * Return the initial cwnd. + */ +static uint32_t +bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp) +{ + uint32_t i_cwnd; + + if (bbr->rc_init_win) { + i_cwnd = bbr->rc_init_win * tp->t_maxseg; + } else if (V_tcp_initcwnd_segments) + i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), + max(2 * tp->t_maxseg, 14600)); + else if (V_tcp_do_rfc3390) + i_cwnd = min(4 * tp->t_maxseg, + max(2 * tp->t_maxseg, 4380)); + else { + /* Per RFC5681 Section 3.1 */ + if (tp->t_maxseg > 2190) + i_cwnd = 2 * tp->t_maxseg; + else if (tp->t_maxseg > 1095) + i_cwnd = 3 * tp->t_maxseg; + else + i_cwnd = 4 * tp->t_maxseg; + } + return (i_cwnd); +} + +/* + * Given a specified gain, return the target + * cwnd based on that gain. + */ +static uint32_t +bbr_get_raw_target_cwnd(struct tcp_bbr *bbr, uint32_t gain, uint64_t bw) +{ + uint64_t bdp, rtt; + uint32_t cwnd; + + if ((get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) || + (bbr_get_full_bw(bbr) == 0)) { + /* No measurements yet */ + return (bbr_initial_cwnd(bbr, bbr->rc_tp)); + } + /* + * Get bytes per RTT needed (rttProp is normally in + * bbr_cwndtarget_rtt_touse) + */ + rtt = bbr_get_rtt(bbr, bbr_cwndtarget_rtt_touse); + /* Get the bdp from the two values */ + bdp = bbr_get_bw_delay_prod(rtt, bw); + /* Now apply the gain */ + cwnd = (uint32_t)(((bdp * ((uint64_t)gain)) + (uint64_t)(BBR_UNIT - 1)) / ((uint64_t)BBR_UNIT)); + + return (cwnd); +} + +static uint32_t +bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain) +{ + uint32_t cwnd, mss; + + mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); + /* Get the base cwnd with gain rounded to a mss */ + cwnd = roundup(bbr_get_raw_target_cwnd(bbr, bw, gain), mss); + /* + * Add in N (2 default since we do not have a + * fq layer to trap packets in) quanta's per the I-D + * section 4.2.3.2 quanta adjust. + */ + cwnd += (bbr_quanta * bbr->r_ctl.rc_pace_max_segs); + if (bbr->rc_use_google) { + if((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && + (bbr_state_val(bbr) == BBR_SUB_GAIN)) { + /* + * The linux implementation adds + * an extra 2 x mss in gain cycle which + * is documented no-where except in the code. + * so we add more for Neal undocumented feature + */ + cwnd += 2 * mss; + } + if ((cwnd / mss) & 0x1) { + /* Round up for odd num mss */ + cwnd += mss; + } + } + /* Are we below the min cwnd? 
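The value built above is the gained BDP rounded up to an mss plus the quanta; as a rough illustration (numbers for illustration only, assuming the usual BBR_UNIT scaling of 256) bw = 12,500,000 bytes/sec and rttProp = 40,000 usec give a 500,000 byte BDP, and a 2x gain (512) turns that into a 1,000,000 byte base target. Here we just make sure the result never drops below the minimum cwnd.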
*/ + if (cwnd < get_min_cwnd(bbr)) + return (get_min_cwnd(bbr)); + return (cwnd); +} + +static uint16_t +bbr_gain_adjust(struct tcp_bbr *bbr, uint16_t gain) +{ + if (gain < 1) + gain = 1; + return (gain); +} + +static uint32_t +bbr_get_header_oh(struct tcp_bbr *bbr) +{ + int seg_oh; + + seg_oh = 0; + if (bbr->r_ctl.rc_inc_tcp_oh) { + /* Do we include TCP overhead? */ + seg_oh = (bbr->rc_last_options + sizeof(struct tcphdr)); + } + if (bbr->r_ctl.rc_inc_ip_oh) { + /* Do we include IP overhead? */ +#ifdef INET6 + if (bbr->r_is_v6) + seg_oh += sizeof(struct ip6_hdr); + else +#endif +#ifdef INET + seg_oh += sizeof(struct ip); +#endif + } + if (bbr->r_ctl.rc_inc_enet_oh) { + /* Do we include the ethernet overhead? */ + seg_oh += sizeof(struct ether_header); + } + return(seg_oh); +} + + +static uint32_t +bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw) +{ + uint64_t divor, res, tim; + + if (useconds_time == 0) + return (0); + gain = bbr_gain_adjust(bbr, gain); + divor = (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT; + tim = useconds_time; + res = (tim * bw * gain) / divor; + if (res == 0) + res = 1; + return ((uint32_t)res); +} + +/* + * Given a gain and a length return the delay in useconds that + * should be used to evenly space out packets + * on the connection (based on the gain factor). + */ +static uint32_t +bbr_get_pacing_delay(struct tcp_bbr *bbr, uint16_t gain, int32_t len, uint32_t cts, int nolog) +{ + uint64_t bw, lentim, res; + uint32_t usecs, srtt, over = 0; + uint32_t seg_oh, num_segs, maxseg; + + if (len == 0) + return (0); + + maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; + num_segs = (len + maxseg - 1) / maxseg; + if (bbr->rc_use_google == 0) { + seg_oh = bbr_get_header_oh(bbr); + len += (num_segs * seg_oh); + } + gain = bbr_gain_adjust(bbr, gain); + bw = bbr_get_bw(bbr); + if (bbr->rc_use_google) { + uint64_t cbw; + + /* + * Reduce the b/w by the google discount + * factor 10 = 1%. + */ + cbw = bw * (uint64_t)(1000 - bbr->r_ctl.bbr_google_discount); + cbw /= (uint64_t)1000; + /* We don't apply a discount if it results in 0 */ + if (cbw > 0) + bw = cbw; + } + lentim = ((uint64_t)len * + (uint64_t)USECS_IN_SECOND * + (uint64_t)BBR_UNIT); + res = lentim / ((uint64_t)gain * bw); + if (res == 0) + res = 1; + usecs = (uint32_t)res; + srtt = bbr_get_rtt(bbr, BBR_SRTT); + if (bbr_hptsi_max_mul && bbr_hptsi_max_div && + (bbr->rc_use_google == 0) && + (usecs > ((srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div))) { + /* + * We cannot let the delay be more than 1/2 the srtt time. + * Otherwise we cannot pace out or send properly. + */ + over = usecs = (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div; + BBR_STAT_INC(bbr_hpts_min_time); + } + if (!nolog) + bbr_log_pacing_delay_calc(bbr, gain, len, cts, usecs, bw, over, 1); + return (usecs); +} + +static void +bbr_ack_received(struct tcpcb *tp, struct tcp_bbr *bbr, struct tcphdr *th, uint32_t bytes_this_ack, + uint32_t sack_changed, uint32_t prev_acked, int32_t line, uint32_t losses) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + uint64_t bw; + uint32_t cwnd, target_cwnd, saved_bytes, maxseg; + int32_t meth; + +#ifdef NETFLIX_STATS + if ((tp->t_flags & TF_GPUTINPROG) && + SEQ_GEQ(th->th_ack, tp->gput_ack)) { + /* + * Strech acks and compressed acks will cause this to + * oscillate but we are doing it the same way as the main + * stack so it will be compariable (though possibly not + * ideal). 
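+	 * Assuming gput_ts is recorded on the same microsecond clock as
+	 * rc_rcvtime, the divide by 1000 below puts the elapsed time in
+	 * milliseconds, so cgput works out to bits per millisecond.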
+ */ + int32_t cgput; + int64_t gput, time_stamp; + + gput = (int64_t) (th->th_ack - tp->gput_seq) * 8; + time_stamp = max(1, ((bbr->r_ctl.rc_rcvtime - tp->gput_ts) / 1000)); + cgput = gput / time_stamp; + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, + cgput); + if (tp->t_stats_gput_prev > 0) + stats_voi_update_abs_s32(tp->t_stats, + VOI_TCP_GPUT_ND, + ((gput - tp->t_stats_gput_prev) * 100) / + tp->t_stats_gput_prev); + tp->t_flags &= ~TF_GPUTINPROG; + tp->t_stats_gput_prev = cgput; + } +#endif + if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && + ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) { + /* We don't change anything in probe-rtt */ + return; + } + maxseg = tp->t_maxseg - bbr->rc_last_options; + saved_bytes = bytes_this_ack; + bytes_this_ack += sack_changed; + if (bytes_this_ack > prev_acked) { + bytes_this_ack -= prev_acked; + /* + * A byte ack'd gives us a full mss + * to be like linux i.e. they count packets. + */ + if ((bytes_this_ack < maxseg) && bbr->rc_use_google) + bytes_this_ack = maxseg; + } else { + /* Unlikely */ + bytes_this_ack = 0; + } + cwnd = tp->snd_cwnd; + bw = get_filter_value(&bbr->r_ctl.rc_delrate); + if (bw) + target_cwnd = bbr_get_target_cwnd(bbr, + bw, + (uint32_t)bbr->r_ctl.rc_bbr_cwnd_gain); + else + target_cwnd = bbr_initial_cwnd(bbr, bbr->rc_tp); + if (IN_RECOVERY(tp->t_flags) && + (bbr->bbr_prev_in_rec == 0)) { + /* + * We are entering recovery and + * thus packet conservation. + */ + bbr->pkt_conservation = 1; + bbr->r_ctl.rc_recovery_start = bbr->r_ctl.rc_rcvtime; + cwnd = ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + + bytes_this_ack; + } + if (IN_RECOVERY(tp->t_flags)) { + uint32_t flight; + + bbr->bbr_prev_in_rec = 1; + if (cwnd > losses) { + cwnd -= losses; + if (cwnd < maxseg) + cwnd = maxseg; + } else + cwnd = maxseg; + flight = ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + bbr_log_type_cwndupd(bbr, flight, 0, + losses, 10, 0, 0, line); + if (bbr->pkt_conservation) { + uint32_t time_in; + + if (TSTMP_GEQ(bbr->r_ctl.rc_rcvtime, bbr->r_ctl.rc_recovery_start)) + time_in = bbr->r_ctl.rc_rcvtime - bbr->r_ctl.rc_recovery_start; + else + time_in = 0; + + if (time_in >= bbr_get_rtt(bbr, BBR_RTT_PROP)) { + /* Clear packet conservation after an rttProp */ + bbr->pkt_conservation = 0; + } else { + if ((flight + bytes_this_ack) > cwnd) + cwnd = flight + bytes_this_ack; + if (cwnd < get_min_cwnd(bbr)) + cwnd = get_min_cwnd(bbr); + tp->snd_cwnd = cwnd; + bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, + prev_acked, 1, target_cwnd, th->th_ack, line); + return; + } + } + } else + bbr->bbr_prev_in_rec = 0; + if ((bbr->rc_use_google == 0) && bbr->r_ctl.restrict_growth) { + bbr->r_ctl.restrict_growth--; + if (bytes_this_ack > maxseg) + bytes_this_ack = maxseg; + } + if (bbr->rc_filled_pipe) { + /* + * Here we have exited startup and filled the pipe. We will + * thus allow the cwnd to shrink to the target. We hit here + * mostly. + */ + uint32_t s_cwnd; + + meth = 2; + s_cwnd = min((cwnd + bytes_this_ack), target_cwnd); + if (s_cwnd > cwnd) + cwnd = s_cwnd; + else if (bbr_cwnd_may_shrink || bbr->rc_use_google || bbr->rc_no_pacing) + cwnd = s_cwnd; + } else { + /* + * Here we are still in startup, we increase cwnd by what + * has been acked. + */ + if ((cwnd < target_cwnd) || + (bbr->rc_past_init_win == 0)) { + meth = 3; + cwnd += bytes_this_ack; + } else { + /* + * Method 4 means we are at target so no gain in + * startup and past the initial window. 
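+			 * (The meth value only feeds the cwnd-update log: 2 is the
+			 * filled-pipe path where cwnd converges on the target, 3 is
+			 * startup growth by the newly acked bytes, 4 is this
+			 * no-change case.)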
+ */ + meth = 4; + } + } + tp->snd_cwnd = max(cwnd, get_min_cwnd(bbr)); + bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, meth, target_cwnd, th->th_ack, line); +} + +static void +tcp_bbr_partialack(struct tcpcb *tp) +{ + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + INP_WLOCK_ASSERT(tp->t_inpcb); + if (ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= + tp->snd_cwnd) { + bbr->r_wanted_output = 1; + } +} + +static void +bbr_post_recovery(struct tcpcb *tp) +{ + struct tcp_bbr *bbr; + uint32_t flight; + + INP_WLOCK_ASSERT(tp->t_inpcb); + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + /* + * Here we just exit recovery. + */ + EXIT_RECOVERY(tp->t_flags); + /* Lock in our b/w reduction for the specified number of pkt-epochs */ + bbr->r_recovery_bw = 0; + tp->snd_recover = tp->snd_una; + tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); + bbr->pkt_conservation = 0; + if (bbr->rc_use_google == 0) { + /* + * For non-google mode lets + * go ahead and make sure we clear + * the recovery state so if we + * bounce back in to recovery we + * will do PC. + */ + bbr->bbr_prev_in_rec = 0; + } + bbr_log_type_exit_rec(bbr); + if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { + tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent); + bbr_log_type_cwndupd(bbr, 0, 0, 0, 15, 0, 0, __LINE__); + } else { + /* For probe-rtt case lets fix up its saved_cwnd */ + if (bbr->r_ctl.rc_saved_cwnd < bbr->r_ctl.rc_cwnd_on_ent) { + bbr->r_ctl.rc_saved_cwnd = bbr->r_ctl.rc_cwnd_on_ent; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 16, 0, 0, __LINE__); + } + } + flight = ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + if ((bbr->rc_use_google == 0) && + bbr_do_red) { + uint64_t val, lr2use; + uint32_t maxseg, newcwnd, acks_inflight, ratio, cwnd; + uint32_t *cwnd_p; + + if (bbr_get_rtt(bbr, BBR_SRTT)) { + val = ((uint64_t)bbr_get_rtt(bbr, BBR_RTT_PROP) * (uint64_t)1000); + val /= bbr_get_rtt(bbr, BBR_SRTT); + ratio = (uint32_t)val; + } else + ratio = 1000; + + bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, + bbr->r_ctl.recovery_lr, 21, + ratio, + bbr->r_ctl.rc_red_cwnd_pe, + __LINE__); + if ((ratio < bbr_do_red) || (bbr_do_red == 0)) + goto done; + if (((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && + bbr_prtt_slam_cwnd) || + (bbr_sub_drain_slam_cwnd && + (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && + bbr->rc_hit_state_1 && + (bbr_state_val(bbr) == BBR_SUB_DRAIN)) || + ((bbr->rc_bbr_state == BBR_STATE_DRAIN) && + bbr_slam_cwnd_in_main_drain)) { + /* + * Here we must poke at the saved cwnd + * as well as the cwnd. 
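+			 * In the slammed probe-rtt/drain states the real cwnd
+			 * is parked at a small value and rc_saved_cwnd is what
+			 * gets restored on exit, so the reduction has to land
+			 * on the saved value as well.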
+ */ + cwnd = bbr->r_ctl.rc_saved_cwnd; + cwnd_p = &bbr->r_ctl.rc_saved_cwnd; + } else { + cwnd = tp->snd_cwnd; + cwnd_p = &tp->snd_cwnd; + } + maxseg = tp->t_maxseg - bbr->rc_last_options; + /* Add the overall lr with the recovery lr */ + if (bbr->r_ctl.rc_lost == 0) + lr2use = 0; + else if (bbr->r_ctl.rc_delivered == 0) + lr2use = 1000; + else { + lr2use = bbr->r_ctl.rc_lost * 1000; + lr2use /= bbr->r_ctl.rc_delivered; + } + lr2use += bbr->r_ctl.recovery_lr; + acks_inflight = (flight / (maxseg * 2)); + if (bbr_red_scale) { + lr2use *= bbr_get_rtt(bbr, BBR_SRTT); + lr2use /= bbr_red_scale; + if ((bbr_red_growth_restrict) && + ((bbr_get_rtt(bbr, BBR_SRTT)/bbr_red_scale) > 1)) + bbr->r_ctl.restrict_growth += acks_inflight; + } + if (lr2use) { + val = (uint64_t)cwnd * lr2use; + val /= 1000; + if (cwnd > val) + newcwnd = roundup((cwnd - val), maxseg); + else + newcwnd = maxseg; + } else { + val = (uint64_t)cwnd * (uint64_t)bbr_red_mul; + val /= (uint64_t)bbr_red_div; + newcwnd = roundup((uint32_t)val, maxseg); + } + /* with standard delayed acks how many acks can I expect? */ + if (bbr_drop_limit == 0) { + /* + * Anticpate how much we will + * raise the cwnd based on the acks. + */ + if ((newcwnd + (acks_inflight * maxseg)) < get_min_cwnd(bbr)) { + /* We do enforce the min (with the acks) */ + newcwnd = (get_min_cwnd(bbr) - acks_inflight); + } + } else { + /* + * A strict drop limit of N is is inplace + */ + if (newcwnd < (bbr_drop_limit * maxseg)) { + newcwnd = bbr_drop_limit * maxseg; + } + } + /* For the next N acks do we restrict the growth */ + *cwnd_p = newcwnd; + if (tp->snd_cwnd > newcwnd) + tp->snd_cwnd = newcwnd; + bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, val, 22, + (uint32_t)lr2use, + bbr_get_rtt(bbr, BBR_SRTT), __LINE__); + bbr->r_ctl.rc_red_cwnd_pe = bbr->r_ctl.rc_pkt_epoch; + } +done: + bbr->r_ctl.recovery_lr = 0; + if (flight <= tp->snd_cwnd) { + bbr->r_wanted_output = 1; + } + tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); +} + +static void +bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts) +{ + bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate); + /* Limit the drop in b/w to 1/2 our current filter. */ + if (bbr->r_ctl.red_bw > bbr->r_ctl.rc_bbr_cur_del_rate) + bbr->r_ctl.red_bw = bbr->r_ctl.rc_bbr_cur_del_rate; + if (bbr->r_ctl.red_bw < (get_filter_value(&bbr->r_ctl.rc_delrate) / 2)) + bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate) / 2; + tcp_bbr_tso_size_check(bbr, cts); +} + +static void +bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_sendmap *rsm) +{ + struct tcp_bbr *bbr; + + INP_WLOCK_ASSERT(tp->t_inpcb); + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + switch (type) { + case CC_NDUPACK: + if (!IN_RECOVERY(tp->t_flags)) { + tp->snd_recover = tp->snd_max; + /* Start a new epoch */ + bbr_set_pktepoch(bbr, bbr->r_ctl.rc_rcvtime, __LINE__); + if (bbr->rc_lt_is_sampling || bbr->rc_lt_use_bw) { + /* + * Move forward the lt epoch + * so it won't count the truncated + * epoch. + */ + bbr->r_ctl.rc_lt_epoch++; + } + if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { + /* + * Just like the policer detection code + * if we are in startup we must push + * forward the last startup epoch + * to hide the truncated PE. 
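+				 * Otherwise the shortened epoch could look like
+				 * a round with no b/w growth and make us declare
+				 * startup finished too early.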
+ */ + bbr->r_ctl.rc_bbr_last_startup_epoch++; + } + bbr->r_ctl.rc_cwnd_on_ent = tp->snd_cwnd; + ENTER_RECOVERY(tp->t_flags); + bbr->rc_tlp_rtx_out = 0; + bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate; + tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); + if (bbr->rc_inp->inp_in_hpts && + ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) { + /* + * When we enter recovery, we need to restart + * any timers. This may mean we gain an agg + * early, which will be made up for at the last + * rxt out. + */ + bbr->rc_timer_first = 1; + bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); + } + /* + * Calculate a new cwnd based on to the current + * delivery rate with no gain. We get the bdp + * without gaining it up like we normally would and + * we use the last cur_del_rate. + */ + if ((bbr->rc_use_google == 0) && + (bbr->r_ctl.bbr_rttprobe_gain_val || + (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT))) { + tp->snd_cwnd = ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + + (tp->t_maxseg - bbr->rc_last_options); + if (tp->snd_cwnd < get_min_cwnd(bbr)) { + /* We always gate to min cwnd */ + tp->snd_cwnd = get_min_cwnd(bbr); + } + bbr_log_type_cwndupd(bbr, 0, 0, 0, 14, 0, 0, __LINE__); + } + bbr_log_type_enter_rec(bbr, rsm->r_start); + } + break; + case CC_RTO_ERR: + TCPSTAT_INC(tcps_sndrexmitbad); + /* RTO was unnecessary, so reset everything. */ + bbr_reset_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime); + if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_recover = tp->snd_recover_prev; + tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent); + bbr_log_type_cwndupd(bbr, 0, 0, 0, 13, 0, 0, __LINE__); + } + tp->t_badrxtwin = 0; + break; + } +} + +/* + * Indicate whether this ack should be delayed. We can delay the ack if + * following conditions are met: + * - There is no delayed ack timer in progress. + * - Our last ack wasn't a 0-sized window. We never want to delay + * the ack that opens up a 0-sized window. + * - LRO wasn't used for this segment. We make sure by checking that the + * segment size is not larger than the MSS. + * - Delayed acks are enabled or this is a half-synchronized T/TCP + * connection. + * - The data being acked is less than a full segment (a stretch ack + * of more than a segment we should ack. + * - nsegs is 1 (if its more than that we received more than 1 ack). + */ +#define DELAY_ACK(tp, bbr, nsegs) \ + (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ + ((bbr->bbr_segs_rcvd + nsegs) < tp->t_delayed_ack) && \ + (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) + +/* + * Return the lowest RSM in the map of + * packets still in flight that is not acked. + * This should normally find on the first one + * since we remove packets from the send + * map after they are marked ACKED. + */ +static struct bbr_sendmap * +bbr_find_lowest_rsm(struct tcp_bbr *bbr) +{ + struct bbr_sendmap *rsm; + + /* + * Walk the time-order transmitted list looking for an rsm that is + * not acked. This will be the one that was sent the longest time + * ago that is still outstanding. + */ + TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_tmap, r_tnext) { + if (rsm->r_flags & BBR_ACKED) { + continue; + } + goto finish; + } +finish: + return (rsm); +} + +static struct bbr_sendmap * +bbr_find_high_nonack(struct tcp_bbr *bbr, struct bbr_sendmap *rsm) +{ + struct bbr_sendmap *prsm; + + /* + * Walk the sequence order list backward until we hit and arrive at + * the highest seq not acked. 
In theory when this is called it + * should be the last segment (which it was not). + */ + prsm = rsm; + TAILQ_FOREACH_REVERSE_FROM(prsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { + if (prsm->r_flags & (BBR_ACKED | BBR_HAS_FIN)) { + continue; + } + return (prsm); + } + return (NULL); +} + +/* + * Returns to the caller the number of microseconds that + * the packet can be outstanding before we think we + * should have had an ack returned. + */ +static uint32_t +bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm) +{ + /* + * lro is the flag we use to determine if we have seen reordering. + * If it gets set we have seen reordering. The reorder logic either + * works in one of two ways: + * + * If reorder-fade is configured, then we track the last time we saw + * re-ordering occur. If we reach the point where enough time as + * passed we no longer consider reordering has occuring. + * + * Or if reorder-face is 0, then once we see reordering we consider + * the connection to alway be subject to reordering and just set lro + * to 1. + * + * In the end if lro is non-zero we add the extra time for + * reordering in. + */ + int32_t lro; + uint32_t thresh, t_rxtcur; + + if (srtt == 0) + srtt = 1; + if (bbr->r_ctl.rc_reorder_ts) { + if (bbr->r_ctl.rc_reorder_fade) { + if (SEQ_GEQ(cts, bbr->r_ctl.rc_reorder_ts)) { + lro = cts - bbr->r_ctl.rc_reorder_ts; + if (lro == 0) { + /* + * No time as passed since the last + * reorder, mark it as reordering. + */ + lro = 1; + } + } else { + /* Negative time? */ + lro = 0; + } + if (lro > bbr->r_ctl.rc_reorder_fade) { + /* Turn off reordering seen too */ + bbr->r_ctl.rc_reorder_ts = 0; + lro = 0; + } + } else { + /* Reodering does not fade */ + lro = 1; + } + } else { + lro = 0; + } + thresh = srtt + bbr->r_ctl.rc_pkt_delay; + if (lro) { + /* It must be set, if not you get 1/4 rtt */ + if (bbr->r_ctl.rc_reorder_shift) + thresh += (srtt >> bbr->r_ctl.rc_reorder_shift); + else + thresh += (srtt >> 2); + } else { + thresh += 1000; + } + /* We don't let the rack timeout be above a RTO */ + if ((bbr->rc_tp)->t_srtt == 0) + t_rxtcur = BBR_INITIAL_RTO; + else + t_rxtcur = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); + if (thresh > t_rxtcur) { + thresh = t_rxtcur; + } + /* And we don't want it above the RTO max either */ + if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { + thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND); + } + bbr_log_thresh_choice(bbr, cts, thresh, lro, srtt, rsm, BBR_TO_FRM_RACK); + return (thresh); +} + +/* + * Return to the caller the amount of time in mico-seconds + * that should be used for the TLP timer from the last + * send time of this packet. + */ +static uint32_t +bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, + struct bbr_sendmap *rsm, uint32_t srtt, + uint32_t cts) +{ + uint32_t thresh, len, maxseg, t_rxtcur; + struct bbr_sendmap *prsm; + + if (srtt == 0) + srtt = 1; + if (bbr->rc_tlp_threshold) + thresh = srtt + (srtt / bbr->rc_tlp_threshold); + else + thresh = (srtt * 2); + maxseg = tp->t_maxseg - bbr->rc_last_options; + /* Get the previous sent packet, if any */ + len = rsm->r_end - rsm->r_start; + + /* 2.1 behavior */ + prsm = TAILQ_PREV(rsm, bbr_head, r_tnext); + if (prsm && (len <= maxseg)) { + /* + * Two packets outstanding, thresh should be (2*srtt) + + * possible inter-packet delay (if any). 
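+		 * For example, if the last send went out 40000 usecs after
+		 * the previous one, the probe is pushed out by that same
+		 * 40000 usecs so we do not fire before the later send has
+		 * had a fair chance to be acked.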
+ */ + uint32_t inter_gap = 0; + int idx, nidx; + + idx = rsm->r_rtr_cnt - 1; + nidx = prsm->r_rtr_cnt - 1; + if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { + /* Yes it was sent later (or at the same time) */ + inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; + } + thresh += inter_gap; + } else if (len <= maxseg) { + /* + * Possibly compensate for delayed-ack. + */ + uint32_t alt_thresh; + + alt_thresh = srtt + (srtt / 2) + bbr_delayed_ack_time; + if (alt_thresh > thresh) + thresh = alt_thresh; + } + /* Not above the current RTO */ + if (tp->t_srtt == 0) + t_rxtcur = BBR_INITIAL_RTO; + else + t_rxtcur = TICKS_2_USEC(tp->t_rxtcur); + + bbr_log_thresh_choice(bbr, cts, thresh, t_rxtcur, srtt, rsm, BBR_TO_FRM_TLP); + /* Not above an RTO */ + if (thresh > t_rxtcur) { + thresh = t_rxtcur; + } + /* Not above a RTO max */ + if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { + thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND); + } + /* And now apply the user TLP min */ + if (thresh < bbr_tlp_min) { + thresh = bbr_tlp_min; + } + return (thresh); +} + +/* + * Return one of three RTTs to use (in microseconds). + */ +static __inline uint32_t +bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type) +{ + uint32_t f_rtt; + uint32_t srtt; + + f_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop); + if (get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) { + /* We have no rtt at all */ + if (bbr->rc_tp->t_srtt == 0) + f_rtt = BBR_INITIAL_RTO; + else + f_rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); + /* + * Since we don't know how good the rtt is apply a + * delayed-ack min + */ + if (f_rtt < bbr_delayed_ack_time) { + f_rtt = bbr_delayed_ack_time; + } + } + /* Take the filter version or last measured pkt-rtt */ + if (rtt_type == BBR_RTT_PROP) { + srtt = f_rtt; + } else if (rtt_type == BBR_RTT_PKTRTT) { + if (bbr->r_ctl.rc_pkt_epoch_rtt) { + srtt = bbr->r_ctl.rc_pkt_epoch_rtt; + } else { + /* No pkt rtt yet */ + srtt = f_rtt; + } + } else if (rtt_type == BBR_RTT_RACK) { + srtt = bbr->r_ctl.rc_last_rtt; + /* We need to add in any internal delay for our timer */ + if (bbr->rc_ack_was_delayed) + srtt += bbr->r_ctl.rc_ack_hdwr_delay; + } else if (rtt_type == BBR_SRTT) { + srtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); + } else { + /* TSNH */ + srtt = f_rtt; +#ifdef BBR_INVARIANTS + panic("Unknown rtt request type %d", rtt_type); +#endif + } + return (srtt); +} + +static int +bbr_is_lost(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts) +{ + uint32_t thresh; + + + thresh = bbr_calc_thresh_rack(bbr, bbr_get_rtt(bbr, BBR_RTT_RACK), + cts, rsm); + if ((cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) >= thresh) { + /* It is lost (past time) */ + return (1); + } + return (0); +} + +/* + * Return a sendmap if we need to retransmit something. + */ +static struct bbr_sendmap * +bbr_check_recovery_mode(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + /* + * Check to see that we don't need to fall into recovery. We will + * need to do so if our oldest transmit is past the time we should + * have had an ack. 
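+	 * The oldest outstanding rsm is only declared lost once it is past
+	 * the rack threshold and has either seen DUP_ACK_THRESHOLD dup-acks
+	 * or has been passed over by a SACK.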
+ */ + + struct bbr_sendmap *rsm; + int32_t idx; + + if (TAILQ_EMPTY(&bbr->r_ctl.rc_map)) { + /* Nothing outstanding that we know of */ + return (NULL); + } + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); + if (rsm == NULL) { + /* Nothing in the transmit map */ + return (NULL); + } + if (tp->t_flags & TF_SENTFIN) { + /* Fin restricted, don't find anything once a fin is sent */ + return (NULL); + } + if (rsm->r_flags & BBR_ACKED) { + /* + * Ok the first one is acked (this really should not happen + * since we remove the from the tmap once they are acked) + */ + rsm = bbr_find_lowest_rsm(bbr); + if (rsm == NULL) + return (NULL); + } + idx = rsm->r_rtr_cnt - 1; + if (SEQ_LEQ(cts, rsm->r_tim_lastsent[idx])) { + /* Send timestamp is the same or less? can't be ready */ + return (NULL); + } + /* Get our RTT time */ + if (bbr_is_lost(bbr, rsm, cts) && + ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || + (rsm->r_flags & BBR_SACK_PASSED))) { + if ((rsm->r_flags & BBR_MARKED_LOST) == 0) { + rsm->r_flags |= BBR_MARKED_LOST; + bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; + bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; + } + bbr_cong_signal(tp, NULL, CC_NDUPACK, rsm); +#ifdef BBR_INVARIANTS + if ((rsm->r_end - rsm->r_start) == 0) + panic("tp:%p bbr:%p rsm:%p length is 0?", tp, bbr, rsm); +#endif + return (rsm); + } + return (NULL); +} + +/* + * RACK Timer, here we simply do logging and house keeping. + * the normal bbr_output_wtime() function will call the + * appropriate thing to check if we need to do a RACK retransmit. + * We return 1, saying don't proceed with bbr_output_wtime only + * when all timers have been stopped (destroyed PCB?). + */ +static int +bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + /* + * This timer simply provides an internal trigger to send out data. + * The check_recovery_mode call will see if there are needed + * retransmissions, if so we will enter fast-recovery. The output + * call may or may not do the same thing depending on sysctl + * settings. 
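+	 * We snapshot rc_lost on entry so that any losses newly marked here
+	 * can be fed into the long-term b/w sampling below.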
+ */ + uint32_t lost; + + if (bbr->rc_all_timers_stopped) { + return (1); + } + if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { + /* Its not time yet */ + return (0); + } + BBR_STAT_INC(bbr_to_tot); + lost = bbr->r_ctl.rc_lost; + if (bbr->r_state && (bbr->r_state != tp->t_state)) + bbr_set_state(tp, bbr, 0); + bbr_log_to_event(bbr, cts, BBR_TO_FRM_RACK); + if (bbr->r_ctl.rc_resend == NULL) { + /* Lets do the check here */ + bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); + } + if (bbr_policer_call_from_rack_to) + bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost)); + bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; + return (0); +} + +static __inline void +bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start) +{ + int idx; + + nrsm->r_start = start; + nrsm->r_end = rsm->r_end; + nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_flags = rsm->r_flags; + /* We don't transfer forward the SYN flag */ + nrsm->r_flags &= ~BBR_HAS_SYN; + /* We move forward the FIN flag, not that this should happen */ + rsm->r_flags &= ~BBR_HAS_FIN; + nrsm->r_dupack = rsm->r_dupack; + nrsm->r_rtr_bytes = 0; + nrsm->r_is_gain = rsm->r_is_gain; + nrsm->r_is_drain = rsm->r_is_drain; + nrsm->r_delivered = rsm->r_delivered; + nrsm->r_ts_valid = rsm->r_ts_valid; + nrsm->r_del_ack_ts = rsm->r_del_ack_ts; + nrsm->r_del_time = rsm->r_del_time; + nrsm->r_app_limited = rsm->r_app_limited; + nrsm->r_first_sent_time = rsm->r_first_sent_time; + nrsm->r_flight_at_send = rsm->r_flight_at_send; + /* We split a piece the lower section looses any just_ret flag. */ + nrsm->r_bbr_state = rsm->r_bbr_state; + for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { + nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + } + rsm->r_end = nrsm->r_start; + idx = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); + idx /= 8; + /* Check if we got too small */ + if ((rsm->r_is_smallmap == 0) && + ((rsm->r_end - rsm->r_start) <= idx)) { + bbr->r_ctl.rc_num_small_maps_alloced++; + rsm->r_is_smallmap = 1; + } + /* Check the new one as well */ + if ((nrsm->r_end - nrsm->r_start) <= idx) { + bbr->r_ctl.rc_num_small_maps_alloced++; + nrsm->r_is_smallmap = 1; + } +} + +static int +bbr_sack_mergable(struct bbr_sendmap *at, + uint32_t start, uint32_t end) +{ + /* + * Given a sack block defined by + * start and end, and a current postion + * at. Return 1 if either side of at + * would show that the block is mergable + * to that side. A block to be mergable + * must have overlap with the start/end + * and be in the SACK'd state. 
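+	 * For example, a left neighbor that is already BBR_ACKED and ends
+	 * exactly at start (or overlaps it) makes the block mergable to
+	 * the left; the mirror case applies on the right.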
+ */ + struct bbr_sendmap *l_rsm; + struct bbr_sendmap *r_rsm; + + /* first get the either side blocks */ + l_rsm = TAILQ_PREV(at, bbr_head, r_next); + r_rsm = TAILQ_NEXT(at, r_next); + if (l_rsm && (l_rsm->r_flags & BBR_ACKED)) { + /* Potentially mergeable */ + if ((l_rsm->r_end == start) || + (SEQ_LT(start, l_rsm->r_end) && + SEQ_GT(end, l_rsm->r_end))) { + /* + * map blk |------| + * sack blk |------| + * + * map blk |------| + * sack blk |------| + */ + return (1); + } + } + if (r_rsm && (r_rsm->r_flags & BBR_ACKED)) { + /* Potentially mergeable */ + if ((r_rsm->r_start == end) || + (SEQ_LT(start, r_rsm->r_start) && + SEQ_GT(end, r_rsm->r_start))) { + /* + * map blk |---------| + * sack blk |----| + * + * map blk |---------| + * sack blk |-------| + */ + return (1); + } + } + return (0); +} + +static struct bbr_sendmap * +bbr_merge_rsm(struct tcp_bbr *bbr, + struct bbr_sendmap *l_rsm, + struct bbr_sendmap *r_rsm) +{ + /* + * We are merging two ack'd RSM's, + * the l_rsm is on the left (lower seq + * values) and the r_rsm is on the right + * (higher seq value). The simplest way + * to merge these is to move the right + * one into the left. I don't think there + * is any reason we need to try to find + * the oldest (or last oldest retransmitted). + */ + l_rsm->r_end = r_rsm->r_end; + if (l_rsm->r_dupack < r_rsm->r_dupack) + l_rsm->r_dupack = r_rsm->r_dupack; + if (r_rsm->r_rtr_bytes) + l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; + if (r_rsm->r_in_tmap) { + /* This really should not happen */ + TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, r_rsm, r_tnext); + } + if (r_rsm->r_app_limited) + l_rsm->r_app_limited = r_rsm->r_app_limited; + /* Now the flags */ + if (r_rsm->r_flags & BBR_HAS_FIN) + l_rsm->r_flags |= BBR_HAS_FIN; + if (r_rsm->r_flags & BBR_TLP) + l_rsm->r_flags |= BBR_TLP; + if (r_rsm->r_flags & BBR_RWND_COLLAPSED) + l_rsm->r_flags |= BBR_RWND_COLLAPSED; + if (r_rsm->r_flags & BBR_MARKED_LOST) { + /* This really should not happen */ + bbr->r_ctl.rc_lost_bytes -= r_rsm->r_end - r_rsm->r_start; + } + TAILQ_REMOVE(&bbr->r_ctl.rc_map, r_rsm, r_next); + if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { + /* Transfer the split limit to the map we free */ + r_rsm->r_limit_type = l_rsm->r_limit_type; + l_rsm->r_limit_type = 0; + } + bbr_free(bbr, r_rsm); + return(l_rsm); +} + +/* + * TLP Timer, here we simply setup what segment we want to + * have the TLP expire on, the normal bbr_output_wtime() will then + * send it out. + * + * We return 1, saying don't proceed with bbr_output_wtime only + * when all timers have been stopped (destroyed PCB?). + */ +static int +bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + /* + * Tail Loss Probe. + */ + struct bbr_sendmap *rsm = NULL; + struct socket *so; + uint32_t amm; + uint32_t out, avail; + uint32_t maxseg; + int collapsed_win = 0; + + if (bbr->rc_all_timers_stopped) { + return (1); + } + if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { + /* Its not time yet */ + return (0); + } + if (bbr_progress_timeout_check(bbr)) { + tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); + return (1); + } + /* Did we somehow get into persists? */ + if (bbr->rc_in_persist) { + return (0); + } + if (bbr->r_state && (bbr->r_state != tp->t_state)) + bbr_set_state(tp, bbr, 0); + BBR_STAT_INC(bbr_tlp_tot); + maxseg = tp->t_maxseg - bbr->rc_last_options; +#ifdef KERN_TLS + if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { + /* + * For hardware TLS we do *not* want to send + * new data. 
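+		 * Instead we force the probe to retransmit something that
+		 * is already on the send map.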
+ */ + goto need_retran; + } +#endif + /* + * A TLP timer has expired. We have been idle for 2 rtts. So we now + * need to figure out how to force a full MSS segment out. + */ + so = tp->t_inpcb->inp_socket; + avail = sbavail(&so->so_snd); + out = ctf_outstanding(tp); + if (out > tp->snd_wnd) { + /* special case, we need a retransmission */ + collapsed_win = 1; + goto need_retran; + } + if (avail > out) { + /* New data is available */ + amm = avail - out; + if (amm > maxseg) { + amm = maxseg; + } else if ((amm < maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { + /* not enough to fill a MTU and no-delay is off */ + goto need_retran; + } + /* Set the send-new override */ + if ((out + amm) <= tp->snd_wnd) { + bbr->rc_tlp_new_data = 1; + } else { + goto need_retran; + } + bbr->r_ctl.rc_tlp_seg_send_cnt = 0; + bbr->r_ctl.rc_last_tlp_seq = tp->snd_max; + bbr->r_ctl.rc_tlp_send = NULL; + /* cap any slots */ + BBR_STAT_INC(bbr_tlp_newdata); + goto send; + } +need_retran: + /* + * Ok we need to arrange the last un-acked segment to be re-sent, or + * optionally the first un-acked segment. + */ + if (collapsed_win == 0) { + rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); + if (rsm && (BBR_ACKED | BBR_HAS_FIN)) { + rsm = bbr_find_high_nonack(bbr, rsm); + } + if (rsm == NULL) { + goto restore; + } + } else { + /* + * We must find the last segment + * that was acceptable by the client. + */ + TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { + if ((rsm->r_flags & BBR_RWND_COLLAPSED) == 0) { + /* Found one */ + break; + } + } + if (rsm == NULL) { + /* None? if so send the first */ + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + if (rsm == NULL) + goto restore; + } + } + if ((rsm->r_end - rsm->r_start) > maxseg) { + /* + * We need to split this the last segment in two. + */ + struct bbr_sendmap *nrsm; + + nrsm = bbr_alloc_full_limit(bbr); + if (nrsm == NULL) { + /* + * We can't get memory to split, we can either just + * not split it. Or retransmit the whole piece, lets + * do the large send (BTLP :-) ). + */ + goto go_for_it; + } + bbr_clone_rsm(bbr, nrsm, rsm, (rsm->r_end - maxseg)); + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~BBR_HAS_FIN); + rsm = nrsm; + } +go_for_it: + bbr->r_ctl.rc_tlp_send = rsm; + bbr->rc_tlp_rtx_out = 1; + if (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) { + bbr->r_ctl.rc_tlp_seg_send_cnt++; + tp->t_rxtshift++; + } else { + bbr->r_ctl.rc_last_tlp_seq = rsm->r_start; + bbr->r_ctl.rc_tlp_seg_send_cnt = 1; + } +send: + if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) { + /* + * Can't [re]/transmit a segment we have retranmitted the + * max times. We need the retransmit timer to take over. + */ +restore: + bbr->rc_tlp_new_data = 0; + bbr->r_ctl.rc_tlp_send = NULL; + if (rsm) + rsm->r_flags &= ~BBR_TLP; + BBR_STAT_INC(bbr_tlp_retran_fail); + return (0); + } else if (rsm) { + rsm->r_flags |= BBR_TLP; + } + if (rsm && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) && + (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend)) { + /* + * We have retransmitted to many times for TLP. Switch to + * the regular RTO timer + */ + goto restore; + } + bbr_log_to_event(bbr, cts, BBR_TO_FRM_TLP); + bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; + return (0); +} + +/* + * Delayed ack Timer, here we simply need to setup the + * ACK_NOW flag and remove the DELACK flag. From there + * the output routine will send the ack out. 
+ * + * We only return 1, saying don't proceed, if all timers + * are stopped (destroyed PCB?). + */ +static int +bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + if (bbr->rc_all_timers_stopped) { + return (1); + } + bbr_log_to_event(bbr, cts, BBR_TO_FRM_DELACK); + tp->t_flags &= ~TF_DELACK; + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_delack); + bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; + return (0); +} + +/* + * Persists timer, here we simply need to setup the + * FORCE-DATA flag the output routine will send + * the one byte send. + * + * We only return 1, saying don't proceed, if all timers + * are stopped (destroyed PCB?). + */ +static int +bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + struct tcptemp *t_template; + int32_t retval = 1; + + if (bbr->rc_all_timers_stopped) { + return (1); + } + if (bbr->rc_in_persist == 0) + return (0); + KASSERT(tp->t_inpcb != NULL, + ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); + /* + * Persistence timer into zero window. Force a byte to be output, if + * possible. + */ + bbr_log_to_event(bbr, cts, BBR_TO_FRM_PERSIST); + bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; + TCPSTAT_INC(tcps_persisttimeo); + /* + * Have we exceeded the user specified progress time? + */ + if (bbr_progress_timeout_check(bbr)) { + tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); + goto out; + } + /* + * Hack: if the peer is dead/unreachable, we do not time out if the + * window is closed. After a full backoff, drop the connection if + * the idle time (no responses to probes) reaches the maximum + * backoff that we would use if retransmitting. + */ + if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + (ticks - tp->t_rcvtime >= tcp_maxpersistidle || + ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + TCPSTAT_INC(tcps_persistdrop); + tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); + goto out; + } + if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) && + tp->snd_una == tp->snd_max) { + bbr_exit_persist(tp, bbr, cts, __LINE__); + retval = 0; + goto out; + } + /* + * If the user has closed the socket then drop a persisting + * connection after a much reduced timeout. + */ + if (tp->t_state > TCPS_CLOSE_WAIT && + (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { + TCPSTAT_INC(tcps_persistdrop); + tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); + goto out; + } + t_template = tcpip_maketemplate(bbr->rc_inp); + if (t_template) { + tcp_respond(tp, t_template->tt_ipgen, + &t_template->tt_t, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); + /* This sends an ack */ + if (tp->t_flags & TF_DELACK) + tp->t_flags &= ~TF_DELACK; + free(t_template, M_TEMP); + } + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; + bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0); +out: + return (retval); +} + +/* + * If a keepalive goes off, we had no other timers + * happening. We always return 1 here since this + * routine either drops the connection or sends + * out a segment with respond. + */ +static int +bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + struct tcptemp *t_template; + struct inpcb *inp; + + if (bbr->rc_all_timers_stopped) { + return (1); + } + bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; + inp = tp->t_inpcb; + bbr_log_to_event(bbr, cts, BBR_TO_FRM_KEEP); + /* + * Keep-alive timer went off; send something or drop connection if + * idle for too long. 
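+	 * "Too long" here is TP_KEEPIDLE + TP_MAXIDLE with nothing received
+	 * from the peer.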
+ */ + TCPSTAT_INC(tcps_keeptimeo); + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && + tp->t_state <= TCPS_CLOSING) { + if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) + goto dropit; + /* + * Send a packet designed to force a response if the peer is + * up and reachable: either an ACK if the connection is + * still alive, or an RST if the peer has closed the + * connection due to timeout or reboot. Using sequence + * number tp->snd_una-1 causes the transmitted zero-length + * segment to lie outside the receive window; by the + * protocol spec, this requires the correspondent TCP to + * respond. + */ + TCPSTAT_INC(tcps_keepprobe); + t_template = tcpip_maketemplate(inp); + if (t_template) { + tcp_respond(tp, t_template->tt_ipgen, + &t_template->tt_t, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); + free(t_template, M_TEMP); + } + } + bbr_start_hpts_timer(bbr, tp, cts, 4, 0, 0); + return (1); +dropit: + TCPSTAT_INC(tcps_keepdrops); + tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); + return (1); +} + +/* + * Retransmit helper function, clear up all the ack + * flags and take care of important book keeping. + */ +static void +bbr_remxt_tmr(struct tcpcb *tp) +{ + /* + * The retransmit timer went off, all sack'd blocks must be + * un-acked. + */ + struct bbr_sendmap *rsm, *trsm = NULL; + struct tcp_bbr *bbr; + uint32_t cts, lost; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + cts = tcp_get_usecs(&bbr->rc_tv); + lost = bbr->r_ctl.rc_lost; + if (bbr->r_state && (bbr->r_state != tp->t_state)) + bbr_set_state(tp, bbr, 0); + + TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { + if (rsm->r_flags & BBR_ACKED) { + uint32_t old_flags; + + rsm->r_dupack = 0; + if (rsm->r_in_tmap == 0) { + /* We must re-add it back to the tlist */ + if (trsm == NULL) { + TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + } else { + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, trsm, rsm, r_tnext); + } + rsm->r_in_tmap = 1; + } + old_flags = rsm->r_flags; + rsm->r_flags |= BBR_RXT_CLEARED; + rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS); + bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__); + } else { + if ((rsm->r_flags & BBR_MARKED_LOST) == 0) { + bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; + bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; + } + if (bbr_marks_rxt_sack_passed) { + /* + * With this option, we will rack out + * in 1ms increments the rest of the packets. + */ + rsm->r_flags |= BBR_SACK_PASSED | BBR_MARKED_LOST; + rsm->r_flags &= ~BBR_WAS_SACKPASS; + } else { + /* + * With this option we only mark them lost + * and remove all sack'd markings. We will run + * another RXT or a TLP. This will cause + * us to eventually send more based on what + * ack's come in. + */ + rsm->r_flags |= BBR_MARKED_LOST; + rsm->r_flags &= ~BBR_WAS_SACKPASS; + rsm->r_flags &= ~BBR_SACK_PASSED; + } + } + trsm = rsm; + } + bbr->r_ctl.rc_resend = TAILQ_FIRST(&bbr->r_ctl.rc_map); + /* Clear the count (we just un-acked them) */ + bbr_log_to_event(bbr, cts, BBR_TO_FRM_TMR); + bbr->rc_tlp_new_data = 0; + bbr->r_ctl.rc_tlp_seg_send_cnt = 0; + /* zap the behindness on a rxt */ + bbr->r_ctl.rc_hptsi_agg_delay = 0; + bbr->r_agg_early_set = 0; + bbr->r_ctl.rc_agg_early = 0; + bbr->rc_tlp_rtx_out = 0; + bbr->r_ctl.rc_sacked = 0; + bbr->r_ctl.rc_sacklast = NULL; + bbr->r_timer_override = 1; + bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost)); +} + +/* + * Re-transmit timeout! 
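+ * All SACK state is cleared first via bbr_remxt_tmr() above.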
If we drop the PCB we will return 1, otherwise + * we will setup to retransmit the lowest seq number outstanding. + */ +static int +bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + int32_t rexmt; + int32_t retval = 0; + + bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; + if (bbr->rc_all_timers_stopped) { + return (1); + } + if (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_una == tp->snd_max)) { + /* Nothing outstanding .. nothing to do */ + return (0); + } + /* + * Retransmission timer went off. Message has not been acked within + * retransmit interval. Back off to a longer retransmit interval + * and retransmit one segment. + */ + if (bbr_progress_timeout_check(bbr)) { + retval = 1; + tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); + goto out; + } + bbr_remxt_tmr(tp); + if ((bbr->r_ctl.rc_resend == NULL) || + ((bbr->r_ctl.rc_resend->r_flags & BBR_RWND_COLLAPSED) == 0)) { + /* + * If the rwnd collapsed on + * the one we are retransmitting + * it does not count against the + * rxt count. + */ + tp->t_rxtshift++; + } + if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tp->t_rxtshift = TCP_MAXRXTSHIFT; + TCPSTAT_INC(tcps_timeoutdrop); + retval = 1; + tcp_set_inp_to_drop(bbr->rc_inp, + (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); + goto out; + } + if (tp->t_state == TCPS_SYN_SENT) { + /* + * If the SYN was retransmitted, indicate CWND to be limited + * to 1 segment in cc_conn_init(). + */ + tp->snd_cwnd = 1; + } else if (tp->t_rxtshift == 1) { + /* + * first retransmit; record ssthresh and cwnd so they can be + * recovered if this turns out to be a "bad" retransmit. A + * retransmit is considered "bad" if an ACK for this segment + * is received within RTT/2 interval; the assumption here is + * that the ACK was already in flight. See "On Estimating + * End-to-End Network Path Properties" by Allman and Paxson + * for more details. + */ + tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; + if (!IN_RECOVERY(tp->t_flags)) { + tp->snd_cwnd_prev = tp->snd_cwnd; + tp->snd_ssthresh_prev = tp->snd_ssthresh; + tp->snd_recover_prev = tp->snd_recover; + tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + tp->t_flags |= TF_PREVVALID; + } else { + tp->t_flags &= ~TF_PREVVALID; + } + tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; + } else { + tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; + tp->t_flags &= ~TF_PREVVALID; + } + TCPSTAT_INC(tcps_rexmttimeo); + if ((tp->t_state == TCPS_SYN_SENT) || + (tp->t_state == TCPS_SYN_RECEIVED)) + rexmt = USEC_2_TICKS(BBR_INITIAL_RTO) * tcp_backoff[tp->t_rxtshift]; + else + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), + MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000)); + /* + * We enter the path for PLMTUD if connection is established or, if + * connection is FIN_WAIT_1 status, reason for the last is that if + * amount of data we send is very small, we could send it in couple + * of packets and process straight to FIN. In that case we won't + * catch ESTABLISHED state. + */ + if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) + || (tp->t_state == TCPS_FIN_WAIT_1))) { +#ifdef INET6 + int32_t isipv6; +#endif + + /* + * Idea here is that at each stage of mtu probe (usually, + * 1448 -> 1188 -> 524) should be given 2 chances to recover + * before further clamping down. 'tp->t_rxtshift % 2 == 0' + * should take care of that. 
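+		 * In other words, with the 2..5 rxtshift window checked
+		 * below we only consider a further clamp on the 2nd and
+		 * 4th retransmits.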
+ */ + if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == + (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && + (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && + tp->t_rxtshift % 2 == 0)) { + /* + * Enter Path MTU Black-hole Detection mechanism: - + * Disable Path MTU Discovery (IP "DF" bit). - + * Reduce MTU to lower value than what we negotiated + * with peer. + */ + if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { + /* + * Record that we may have found a black + * hole. + */ + tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; + /* Keep track of previous MSS. */ + tp->t_pmtud_saved_maxseg = tp->t_maxseg; + } + /* + * Reduce the MSS to blackhole value or to the + * default in an attempt to retransmit. + */ +#ifdef INET6 + isipv6 = bbr->r_is_v6; + if (isipv6 && + tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { + /* Use the sysctl tuneable blackhole MSS. */ + tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; + TCPSTAT_INC(tcps_pmtud_blackhole_activated); + } else if (isipv6) { + /* Use the default MSS. */ + tp->t_maxseg = V_tcp_v6mssdflt; + /* + * Disable Path MTU Discovery when we switch + * to minmss. + */ + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { + /* Use the sysctl tuneable blackhole MSS. */ + tp->t_maxseg = V_tcp_pmtud_blackhole_mss; + TCPSTAT_INC(tcps_pmtud_blackhole_activated); + } else { + /* Use the default MSS. */ + tp->t_maxseg = V_tcp_mssdflt; + /* + * Disable Path MTU Discovery when we switch + * to minmss. + */ + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); + } +#endif + } else { + /* + * If further retransmissions are still unsuccessful + * with a lowered MTU, maybe this isn't a blackhole + * and we restore the previous MSS and blackhole + * detection flags. The limit '6' is determined by + * giving each probe stage (1448, 1188, 524) 2 + * chances to recover. + */ + if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && + (tp->t_rxtshift >= 6)) { + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; + tp->t_maxseg = tp->t_pmtud_saved_maxseg; + TCPSTAT_INC(tcps_pmtud_blackhole_failed); + } + } + } + /* + * Disable RFC1323 and SACK if we haven't got any response to our + * third SYN to work-around some broken terminal servers (most of + * which have hopefully been retired) that have bad VJ header + * compression code which trashes TCP segments containing + * unknown-to-them TCP options. + */ + if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && + (tp->t_rxtshift == 3)) + tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); + /* + * If we backed off this far, our srtt estimate is probably bogus. + * Clobber it so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current retransmit + * times until then. 
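+	 * "This far" is more than TCP_MAXRXTSHIFT / 4 backoffs.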
+ */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { +#ifdef INET6 + if (bbr->r_is_v6) + in6_losing(tp->t_inpcb); + else +#endif + in_losing(tp->t_inpcb); + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); + tp->snd_recover = tp->snd_max; + tp->t_flags |= TF_ACKNOW; + tp->t_rtttime = 0; +out: + return (retval); +} + +static int +bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t hpts_calling) +{ + int32_t ret = 0; + int32_t timers = (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK); + + if (timers == 0) { + return (0); + } + if (tp->t_state == TCPS_LISTEN) { + /* no timers on listen sockets */ + if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) + return (0); + return (1); + } + if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { + uint32_t left; + + if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + ret = -1; + bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling); + return (0); + } + if (hpts_calling == 0) { + ret = -2; + bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling); + return (0); + } + /* + * Ok our timer went off early and we are not paced false + * alarm, go back to sleep. + */ + left = bbr->r_ctl.rc_timer_exp - cts; + ret = -3; + bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); + tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(left)); + return (1); + } + bbr->rc_tmr_stopped = 0; + bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; + if (timers & PACE_TMR_DELACK) { + ret = bbr_timeout_delack(tp, bbr, cts); + } else if (timers & PACE_TMR_PERSIT) { + ret = bbr_timeout_persist(tp, bbr, cts); + } else if (timers & PACE_TMR_RACK) { + bbr->r_ctl.rc_tlp_rxt_last_time = cts; + ret = bbr_timeout_rack(tp, bbr, cts); + } else if (timers & PACE_TMR_TLP) { + bbr->r_ctl.rc_tlp_rxt_last_time = cts; + ret = bbr_timeout_tlp(tp, bbr, cts); + } else if (timers & PACE_TMR_RXT) { + bbr->r_ctl.rc_tlp_rxt_last_time = cts; + ret = bbr_timeout_rxt(tp, bbr, cts); + } else if (timers & PACE_TMR_KEEP) { + ret = bbr_timeout_keepalive(tp, bbr, cts); + } + bbr_log_to_processing(bbr, cts, ret, timers, hpts_calling); + return (ret); +} + +static void +bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) +{ + if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { + uint8_t hpts_removed = 0; + + if (bbr->rc_inp->inp_in_hpts && + (bbr->rc_timer_first == 1)) { + /* + * If we are canceling timer's when we have the + * timer ahead of the output being paced. We also + * must remove ourselves from the hpts. + */ + hpts_removed = 1; + tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); + if (bbr->r_ctl.rc_last_delay_val) { + /* Update the last hptsi delay too */ + uint32_t time_since_send; + + if (TSTMP_GT(cts, bbr->rc_pacer_started)) + time_since_send = cts - bbr->rc_pacer_started; + else + time_since_send = 0; + if (bbr->r_ctl.rc_last_delay_val > time_since_send) { + /* Cut down our slot time */ + bbr->r_ctl.rc_last_delay_val -= time_since_send; + } else { + bbr->r_ctl.rc_last_delay_val = 0; + } + bbr->rc_pacer_started = cts; + } + } + bbr->rc_timer_first = 0; + bbr_log_to_cancel(bbr, line, cts, hpts_removed); + bbr->rc_tmr_stopped = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK; + bbr->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); + } +} + +static void +bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type) +{ + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + bbr->rc_all_timers_stopped = 1; + return; +} + +/* + * stop all timers always returning 0. 
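+ * The hook itself is a no-op; the all-timers-stopped flag is set in
+ * bbr_timer_stop() above.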
+ */ +static int +bbr_stopall(struct tcpcb *tp) +{ + return (0); +} + +static void +bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) +{ + return; +} + +/* + * return true if a bbr timer (rack or tlp) is active. + */ +static int +bbr_timer_active(struct tcpcb *tp, uint32_t timer_type) +{ + return (0); +} + +static uint32_t +bbr_get_earliest_send_outstanding(struct tcp_bbr *bbr, struct bbr_sendmap *u_rsm, uint32_t cts) +{ + struct bbr_sendmap *rsm; + + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); + if ((rsm == NULL) || (u_rsm == rsm)) + return (cts); + return(rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); +} + +static void +bbr_update_rsm(struct tcpcb *tp, struct tcp_bbr *bbr, + struct bbr_sendmap *rsm, uint32_t cts, uint32_t pacing_time) +{ + int32_t idx; + + rsm->r_rtr_cnt++; + rsm->r_dupack = 0; + if (rsm->r_rtr_cnt > BBR_NUM_OF_RETRANS) { + rsm->r_rtr_cnt = BBR_NUM_OF_RETRANS; + rsm->r_flags |= BBR_OVERMAX; + } + if (rsm->r_flags & BBR_RWND_COLLAPSED) { + /* Take off the collapsed flag at rxt */ + rsm->r_flags &= ~BBR_RWND_COLLAPSED; + } + if (rsm->r_flags & BBR_MARKED_LOST) { + /* We have retransmitted, its no longer lost */ + rsm->r_flags &= ~BBR_MARKED_LOST; + bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; + } + if (rsm->r_flags & BBR_RXT_CLEARED) { + /* + * We hit a RXT timer on it and + * we cleared the "acked" flag. + * We now have it going back into + * flight, we can remove the cleared + * flag and possibly do accounting on + * this piece. + */ + rsm->r_flags &= ~BBR_RXT_CLEARED; + } + if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & BBR_TLP) == 0)) { + bbr->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); + rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); + } + idx = rsm->r_rtr_cnt - 1; + rsm->r_tim_lastsent[idx] = cts; + rsm->r_pacing_delay = pacing_time; + rsm->r_delivered = bbr->r_ctl.rc_delivered; + rsm->r_ts_valid = bbr->rc_ts_valid; + if (bbr->rc_ts_valid) + rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts; + if (bbr->r_ctl.r_app_limited_until) + rsm->r_app_limited = 1; + else + rsm->r_app_limited = 0; + if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) + rsm->r_bbr_state = bbr_state_val(bbr); + else + rsm->r_bbr_state = 8; + if (rsm->r_flags & BBR_ACKED) { + /* Problably MTU discovery messing with us */ + uint32_t old_flags; + + old_flags = rsm->r_flags; + rsm->r_flags &= ~BBR_ACKED; + bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__); + bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); + if (bbr->r_ctl.rc_sacked == 0) + bbr->r_ctl.rc_sacklast = NULL; + } + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + } + TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 1; + if (rsm->r_flags & BBR_SACK_PASSED) { + /* We have retransmitted due to the SACK pass */ + rsm->r_flags &= ~BBR_SACK_PASSED; + rsm->r_flags |= BBR_WAS_SACKPASS; + } + rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts); + rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); + if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) { + rsm->r_is_gain = 1; + rsm->r_is_drain = 0; + } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) { + rsm->r_is_drain = 1; + rsm->r_is_gain = 0; + } else { + rsm->r_is_drain = 0; + rsm->r_is_gain = 0; + } + rsm->r_del_time = bbr->r_ctl.rc_del_time; /* TEMP GOOGLE CODE */ +} + +/* + * Returns 0, or the sequence where we stopped + * updating. We also update the lenp to be the amount + * of data left. 
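+ * For example, if the rsm covers 1..11 and we retransmitted 15 bytes
+ * starting at 1, we return 11 and *lenp becomes the 5 bytes that hang
+ * over the end.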
+ */ + +static uint32_t +bbr_update_entry(struct tcpcb *tp, struct tcp_bbr *bbr, + struct bbr_sendmap *rsm, uint32_t cts, int32_t *lenp, uint32_t pacing_time) +{ + /* + * We (re-)transmitted starting at rsm->r_start for some length + * (possibly less than r_end. + */ + struct bbr_sendmap *nrsm; + uint32_t c_end; + int32_t len; + + len = *lenp; + c_end = rsm->r_start + len; + if (SEQ_GEQ(c_end, rsm->r_end)) { + /* + * We retransmitted the whole piece or more than the whole + * slopping into the next rsm. + */ + bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); + if (c_end == rsm->r_end) { + *lenp = 0; + return (0); + } else { + int32_t act_len; + + /* Hangs over the end return whats left */ + act_len = rsm->r_end - rsm->r_start; + *lenp = (len - act_len); + return (rsm->r_end); + } + /* We don't get out of this block. */ + } + /* + * Here we retransmitted less than the whole thing which means we + * have to split this into what was transmitted and what was not. + */ + nrsm = bbr_alloc_full_limit(bbr); + if (nrsm == NULL) { + *lenp = 0; + return (0); + } + /* + * So here we are going to take the original rsm and make it what we + * retransmitted. nrsm will be the tail portion we did not + * retransmit. For example say the chunk was 1, 11 (10 bytes). And + * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to + * 1, 6 and the new piece will be 6, 11. + */ + bbr_clone_rsm(bbr, nrsm, rsm, c_end); + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); + nrsm->r_dupack = 0; + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~BBR_HAS_FIN); + bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); + *lenp = 0; + return (0); +} + +static uint64_t +bbr_get_hardware_rate(struct tcp_bbr *bbr) +{ + uint64_t bw; + + bw = bbr_get_bw(bbr); + bw *= (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]; + bw /= (uint64_t)BBR_UNIT; + return(bw); +} + +static void +bbr_setup_less_of_rate(struct tcp_bbr *bbr, uint32_t cts, + uint64_t act_rate, uint64_t rate_wanted) +{ + /* + * We could not get a full gains worth + * of rate. + */ + if (get_filter_value(&bbr->r_ctl.rc_delrate) >= act_rate) { + /* we can't even get the real rate */ + uint64_t red; + + bbr->skip_gain = 1; + bbr->gain_is_limited = 0; + red = get_filter_value(&bbr->r_ctl.rc_delrate) - act_rate; + if (red) + filter_reduce_by(&bbr->r_ctl.rc_delrate, red, cts); + } else { + /* We can use a lower gain */ + bbr->skip_gain = 0; + bbr->gain_is_limited = 1; + } +} + +static void +bbr_update_hardware_pacing_rate(struct tcp_bbr *bbr, uint32_t cts) +{ + const struct tcp_hwrate_limit_table *nrte; + int error, rate = -1; + + if (bbr->r_ctl.crte == NULL) + return; + if ((bbr->rc_inp->inp_route.ro_rt == NULL) || + (bbr->rc_inp->inp_route.ro_rt->rt_ifp == NULL)) { + /* Lost our routes? 
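+		 * If so, give up on the hardware rate and fall back to
+		 * software pacing.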
*/ + /* Clear the way for a re-attempt */ + bbr->bbr_attempt_hdwr_pace = 0; +lost_rate: + bbr->gain_is_limited = 0; + bbr->skip_gain = 0; + bbr->bbr_hdrw_pacing = 0; + counter_u64_add(bbr_flows_whdwr_pacing, -1); + counter_u64_add(bbr_flows_nohdwr_pacing, 1); + tcp_bbr_tso_size_check(bbr, cts); + return; + } + rate = bbr_get_hardware_rate(bbr); + nrte = tcp_chg_pacing_rate(bbr->r_ctl.crte, + bbr->rc_tp, + bbr->rc_inp->inp_route.ro_rt->rt_ifp, + rate, + (RS_PACING_GEQ|RS_PACING_SUB_OK), + &error); + if (nrte == NULL) { + goto lost_rate; + } + if (nrte != bbr->r_ctl.crte) { + bbr->r_ctl.crte = nrte; + if (error == 0) { + BBR_STAT_INC(bbr_hdwr_rl_mod_ok); + if (bbr->r_ctl.crte->rate < rate) { + /* We have a problem */ + bbr_setup_less_of_rate(bbr, cts, + bbr->r_ctl.crte->rate, rate); + } else { + /* We are good */ + bbr->gain_is_limited = 0; + bbr->skip_gain = 0; + } + } else { + /* A failure should release the tag */ + BBR_STAT_INC(bbr_hdwr_rl_mod_fail); + bbr->gain_is_limited = 0; + bbr->skip_gain = 0; + bbr->bbr_hdrw_pacing = 0; + } + bbr_type_log_hdwr_pacing(bbr, + bbr->r_ctl.crte->ptbl->rs_ifp, + rate, + ((bbr->r_ctl.crte == NULL) ? 0 : bbr->r_ctl.crte->rate), + __LINE__, + cts, + error); + } +} + +static void +bbr_adjust_for_hw_pacing(struct tcp_bbr *bbr, uint32_t cts) +{ + /* + * If we have hardware pacing support + * we need to factor that in for our + * TSO size. + */ + const struct tcp_hwrate_limit_table *rlp; + uint32_t cur_delay, seg_sz, maxseg, new_tso, delta, hdwr_delay; + + if ((bbr->bbr_hdrw_pacing == 0) || + (IN_RECOVERY(bbr->rc_tp->t_flags)) || + (bbr->r_ctl.crte == NULL)) + return; + if (bbr->hw_pacing_set == 0) { + /* Not yet by the hdwr pacing count delay */ + return; + } + if (bbr_hdwr_pace_adjust == 0) { + /* No adjustment */ + return; + } + rlp = bbr->r_ctl.crte; + if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) + maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; + else + maxseg = BBR_MIN_SEG - bbr->rc_last_options; + /* + * So lets first get the + * time we will take between + * TSO sized sends currently without + * hardware help. + */ + cur_delay = bbr_get_pacing_delay(bbr, BBR_UNIT, + bbr->r_ctl.rc_pace_max_segs, cts, 1); + hdwr_delay = bbr->r_ctl.rc_pace_max_segs / maxseg; + hdwr_delay *= rlp->time_between; + if (cur_delay > hdwr_delay) + delta = cur_delay - hdwr_delay; + else + delta = 0; + bbr_log_type_tsosize(bbr, cts, delta, cur_delay, hdwr_delay, + (bbr->r_ctl.rc_pace_max_segs / maxseg), + 1); + if (delta && + (delta < (max(rlp->time_between, + bbr->r_ctl.bbr_hptsi_segments_delay_tar)))) { + /* + * Now lets divide by the pacing + * time between each segment the + * hardware sends rounding up and + * derive a bytes from that. We multiply + * that by bbr_hdwr_pace_adjust to get + * more bang for our buck. + * + * The goal is to have the software pacer + * waiting no more than an additional + * pacing delay if we can (without the + * compensation i.e. x bbr_hdwr_pace_adjust). + */ + seg_sz = max(((cur_delay + rlp->time_between)/rlp->time_between), + (bbr->r_ctl.rc_pace_max_segs/maxseg)); + seg_sz *= bbr_hdwr_pace_adjust; + if (bbr_hdwr_pace_floor && + (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) { + /* Currently hardware paces + * out rs_min_seg segments at a time. + * We need to make sure we always send at least + * a full burst of bbr_hdwr_pace_floor down. + */ + seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg; + } + seg_sz *= maxseg; + } else if (delta == 0) { + /* + * The highest pacing rate is + * above our b/w gained. 
This means + * we probably are going quite fast at + * the hardware highest rate. Lets just multiply + * the calculated TSO size by the + * multiplier factor (its probably + * 4 segments in the default config for + * mlx). + */ + seg_sz = bbr->r_ctl.rc_pace_max_segs * bbr_hdwr_pace_adjust; + if (bbr_hdwr_pace_floor && + (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) { + /* Currently hardware paces + * out rs_min_seg segments at a time. + * We need to make sure we always send at least + * a full burst of bbr_hdwr_pace_floor down. + */ + seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg; + } + } else { + /* + * The pacing time difference is so + * big that the hardware will + * pace out more rapidly then we + * really want and then we + * will have a long delay. Lets just keep + * the same TSO size so its as if + * we were not using hdwr pacing (we + * just gain a bit of spacing from the + * hardware if seg_sz > 1). + */ + seg_sz = bbr->r_ctl.rc_pace_max_segs; + } + if (seg_sz > bbr->r_ctl.rc_pace_max_segs) + new_tso = seg_sz; + else + new_tso = bbr->r_ctl.rc_pace_max_segs; + if (new_tso >= (PACE_MAX_IP_BYTES-maxseg)) + new_tso = PACE_MAX_IP_BYTES - maxseg; + + if (new_tso != bbr->r_ctl.rc_pace_max_segs) { + bbr_log_type_tsosize(bbr, cts, new_tso, 0, bbr->r_ctl.rc_pace_max_segs, maxseg, 0); + bbr->r_ctl.rc_pace_max_segs = new_tso; + } +} + +static void +tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts) +{ + uint64_t bw; + uint32_t old_tso = 0, new_tso; + uint32_t maxseg, bytes; + uint32_t tls_seg=0; + /* + * Google/linux uses the following algorithm to determine + * the TSO size based on the b/w of the link (from Neal Cardwell email 9/27/18): + * + * bytes = bw_in_bytes_per_second / 1000 + * bytes = min(bytes, 64k) + * tso_segs = bytes / MSS + * if (bw < 1.2Mbs) + * min_tso_segs = 1 + * else + * min_tso_segs = 2 + * tso_segs = max(tso_segs, min_tso_segs) + * + * * Note apply a device specific limit (we apply this in the + * tcp_m_copym). + * Note that before the initial measurement is made google bursts out + * a full iwnd just like new-reno/cubic. + * + * We do not use this algorithm. Instead we + * use a two phased approach: + * + * if ( bw <= per-tcb-cross-over) + * goal_tso = calculate how much with this bw we + * can send in goal-time seconds. + * if (goal_tso > mss) + * seg = goal_tso / mss + * tso = seg * mss + * else + * tso = mss + * if (tso > per-tcb-max) + * tso = per-tcb-max + * else if ( bw > 512Mbps) + * tso = max-tso (64k/mss) + * else + * goal_tso = bw / per-tcb-divsor + * seg = (goal_tso + mss-1)/mss + * tso = seg * mss + * + * if (tso < per-tcb-floor) + * tso = per-tcb-floor + * if (tso > per-tcb-utter_max) + * tso = per-tcb-utter_max + * + * Note the default per-tcb-divisor is 1000 (same as google). + * the goal cross over is 30Mbps however. To recreate googles + * algorithm you need to set: + * + * cross-over = 23,168,000 bps + * goal-time = 18000 + * per-tcb-max = 2 + * per-tcb-divisor = 1000 + * per-tcb-floor = 1 + * + * This will get you "google bbr" behavior with respect to tso size. + * + * Note we do set anything TSO size until we are past the initial + * window. Before that we gnerally use either a single MSS + * or we use the full IW size (so we burst a IW at a time) + * Also note that Hardware-TLS is special and does alternate + * things to minimize PCI Bus Bandwidth use. 
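+	 * As a rough example of the below-cross-over branch: a b/w of
+	 * 1.25MB/s with an 18000 usec goal-time allows about 22500 bytes
+	 * per goal period, which is then rounded down to a whole number
+	 * of 1448 byte segments (and capped at the per-tcb-max).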
+ */ + + if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) { + maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; + } else { + maxseg = BBR_MIN_SEG - bbr->rc_last_options; + } +#ifdef KERN_TLS + if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { + tls_seg = ctf_get_opt_tls_size(bbr->rc_inp->inp_socket, bbr->rc_tp->snd_wnd); + bbr->r_ctl.rc_pace_min_segs = (tls_seg + bbr->rc_last_options); + } +#endif + old_tso = bbr->r_ctl.rc_pace_max_segs; + if (bbr->rc_past_init_win == 0) { + /* + * Not enough data has been acknowledged to make a + * judgement unless we are hardware TLS. Set up + * the inital TSO based on if we are sending a + * full IW at once or not. + */ + if (bbr->rc_use_google) + bbr->r_ctl.rc_pace_max_segs = ((bbr->rc_tp->t_maxseg - bbr->rc_last_options) * 2); + else if (bbr->bbr_init_win_cheat) + bbr->r_ctl.rc_pace_max_segs = bbr_initial_cwnd(bbr, bbr->rc_tp); + else + bbr->r_ctl.rc_pace_max_segs = bbr->rc_tp->t_maxseg - bbr->rc_last_options; + if (bbr->r_ctl.rc_pace_min_segs != bbr->rc_tp->t_maxseg) + bbr->r_ctl.rc_pace_min_segs = bbr->rc_tp->t_maxseg; +#ifdef KERN_TLS + if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) && tls_seg) { + /* + * For hardware TLS we set our min to the tls_seg size. + */ + bbr->r_ctl.rc_pace_max_segs = tls_seg; + bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options; + } +#endif + if (bbr->r_ctl.rc_pace_max_segs == 0) { + bbr->r_ctl.rc_pace_max_segs = maxseg; + } + bbr_log_type_tsosize(bbr, cts, bbr->r_ctl.rc_pace_max_segs, tls_seg, old_tso, maxseg, 0); +#ifdef KERN_TLS + if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0) +#endif + bbr_adjust_for_hw_pacing(bbr, cts); + return; + } + /** + * Now lets set the TSO goal based on our delivery rate in + * bytes per second. Note we only do this if + * we have acked at least the initial cwnd worth of data. + */ + bw = bbr_get_bw(bbr); + if (IN_RECOVERY(bbr->rc_tp->t_flags) && + (bbr->rc_use_google == 0)) { + /* We clamp to one MSS in recovery */ + new_tso = maxseg; + } else if (bbr->rc_use_google) { + int min_tso_segs; + + /* Google considers the gain too */ + if (bbr->r_ctl.rc_bbr_hptsi_gain != BBR_UNIT) { + bw *= bbr->r_ctl.rc_bbr_hptsi_gain; + bw /= BBR_UNIT; + } + bytes = bw / 1024; + if (bytes > (64 * 1024)) + bytes = 64 * 1024; + new_tso = bytes / maxseg; + if (bw < ONE_POINT_TWO_MEG) + min_tso_segs = 1; + else + min_tso_segs = 2; + if (new_tso < min_tso_segs) + new_tso = min_tso_segs; + new_tso *= maxseg; + } else if (bbr->rc_no_pacing) { + new_tso = (PACE_MAX_IP_BYTES / maxseg) * maxseg; + } else if (bw <= bbr->r_ctl.bbr_cross_over) { + /* + * Calculate the worse case b/w TSO if we are inserting no + * more than a delay_target number of TSO's. + */ + uint32_t tso_len, min_tso; + + tso_len = bbr_get_pacing_length(bbr, BBR_UNIT, bbr->r_ctl.bbr_hptsi_segments_delay_tar, bw); + if (tso_len > maxseg) { + new_tso = tso_len / maxseg; + if (new_tso > bbr->r_ctl.bbr_hptsi_segments_max) + new_tso = bbr->r_ctl.bbr_hptsi_segments_max; + new_tso *= maxseg; + } else { + /* + * less than a full sized frame yikes.. long rtt or + * low bw? + */ + min_tso = bbr_minseg(bbr); + if ((tso_len > min_tso) && (bbr_all_get_min == 0)) + new_tso = rounddown(tso_len, min_tso); + else + new_tso = min_tso; + } + } else if (bw > FIVETWELVE_MBPS) { + /* + * This guy is so fast b/w wise that we can TSO as large as + * possible of segments that the NIC will allow. 
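+ * (Assuming PACE_MAX_IP_BYTES is 64KB and a 1448 byte MSS, that is
+ * rounddown(65536, 1448) = 45 segments, or 65160 bytes.)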
+ */ + new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg); + } else { + /* + * This formula is based on attempting to send a segment or + * more every bbr_hptsi_per_second. The default is 1000 + * which means you are targeting what you can send every 1ms + * based on the peers bw. + * + * If the number drops to say 500, then you are looking more + * at 2ms and you will raise how much we send in a single + * TSO thus saving CPU (less bbr_output_wtime() calls). The + * trade off of course is you will send more at once and + * thus tend to clump up the sends into larger "bursts" + * building a queue. + */ + bw /= bbr->r_ctl.bbr_hptsi_per_second; + new_tso = roundup(bw, (uint64_t)maxseg); + /* + * Gate the floor to match what our lower than 48Mbps + * algorithm does. The ceiling (bbr_hptsi_segments_max) thus + * becomes the floor for this calculation. + */ + if (new_tso < (bbr->r_ctl.bbr_hptsi_segments_max * maxseg)) + new_tso = (bbr->r_ctl.bbr_hptsi_segments_max * maxseg); + } + if (bbr->r_ctl.bbr_hptsi_segments_floor && (new_tso < (maxseg * bbr->r_ctl.bbr_hptsi_segments_floor))) + new_tso = maxseg * bbr->r_ctl.bbr_hptsi_segments_floor; + if (new_tso > PACE_MAX_IP_BYTES) + new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg); + /* Enforce an utter maximum if we are not HW-TLS */ +#ifdef KERN_TLS + if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0) +#endif + if (bbr->r_ctl.bbr_utter_max && (new_tso > (bbr->r_ctl.bbr_utter_max * maxseg))) { + new_tso = bbr->r_ctl.bbr_utter_max * maxseg; + } +#ifdef KERN_TLS + if (tls_seg) { + /* + * Lets move the output size + * up to 1 or more TLS record sizes. + */ + uint32_t temp; + + temp = roundup(new_tso, tls_seg); + new_tso = temp; + /* Back down if needed to under a full frame */ + while (new_tso > PACE_MAX_IP_BYTES) + new_tso -= tls_seg; + } +#endif + if (old_tso != new_tso) { + /* Only log changes */ + bbr_log_type_tsosize(bbr, cts, new_tso, tls_seg, old_tso, maxseg, 0); + bbr->r_ctl.rc_pace_max_segs = new_tso; + } +#ifdef KERN_TLS + if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) && + tls_seg) { + bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options; + } else +#endif + /* We have hardware pacing and not hardware TLS! */ + bbr_adjust_for_hw_pacing(bbr, cts); +} + +static void +bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t len, + uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t cts, + struct mbuf *mb, int32_t * abandon, struct bbr_sendmap *hintrsm, uint32_t delay_calc, + struct sockbuf *sb) +{ + + struct bbr_sendmap *rsm, *nrsm; + register uint32_t snd_max, snd_una; + uint32_t pacing_time; + /* + * Add to the RACK log of packets in flight or retransmitted. If + * there is a TS option we will use the TS echoed, if not we will + * grab a TS. + * + * Retransmissions will increment the count and move the ts to its + * proper place. Note that if options do not include TS's then we + * won't be able to effectively use the ACK for an RTT on a retran. + * + * Notes about r_start and r_end. Lets consider a send starting at + * sequence 1 for 10 bytes. In such an example the r_start would be + * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. + * This means that r_end is actually the first sequence for the next + * slot (11). + * + */ + INP_WLOCK_ASSERT(tp->t_inpcb); + if (err) { + /* + * We don't log errors -- we could but snd_max does not + * advance in this case either. 
+ */ + return; + } + if (th_flags & TH_RST) { + /* + * We don't log resets and we return immediately from + * sending + */ + *abandon = 1; + return; + } + snd_una = tp->snd_una; + if (th_flags & (TH_SYN | TH_FIN) && (hintrsm == NULL)) { + /* + * The call to bbr_log_output is made before bumping + * snd_max. This means we can record one extra byte on a SYN + * or FIN if seq_out is adding more on and a FIN is present + * (and we are not resending). + */ + if (th_flags & TH_SYN) + len++; + if (th_flags & TH_FIN) + len++; + } + if (SEQ_LEQ((seq_out + len), snd_una)) { + /* Are sending an old segment to induce an ack (keep-alive)? */ + return; + } + if (SEQ_LT(seq_out, snd_una)) { + /* huh? should we panic? */ + uint32_t end; + + end = seq_out + len; + seq_out = snd_una; + len = end - seq_out; + } + snd_max = tp->snd_max; + if (len == 0) { + /* We don't log zero window probes */ + return; + } + pacing_time = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, len, cts, 1); + /* First question is it a retransmission? */ + if (seq_out == snd_max) { +again: + rsm = bbr_alloc(bbr); + if (rsm == NULL) { + return; + } + rsm->r_flags = 0; + if (th_flags & TH_SYN) + rsm->r_flags |= BBR_HAS_SYN; + if (th_flags & TH_FIN) + rsm->r_flags |= BBR_HAS_FIN; + rsm->r_tim_lastsent[0] = cts; + rsm->r_rtr_cnt = 1; + rsm->r_rtr_bytes = 0; + rsm->r_start = seq_out; + rsm->r_end = rsm->r_start + len; + rsm->r_dupack = 0; + rsm->r_delivered = bbr->r_ctl.rc_delivered; + rsm->r_pacing_delay = pacing_time; + rsm->r_ts_valid = bbr->rc_ts_valid; + if (bbr->rc_ts_valid) + rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts; + rsm->r_del_time = bbr->r_ctl.rc_del_time; + if (bbr->r_ctl.r_app_limited_until) + rsm->r_app_limited = 1; + else + rsm->r_app_limited = 0; + rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts); + rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + /* + * Here we must also add in this rsm since snd_max + * is updated after we return from a new send. + */ + rsm->r_flight_at_send += len; + TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next); + TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 1; + if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) + rsm->r_bbr_state = bbr_state_val(bbr); + else + rsm->r_bbr_state = 8; + if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) { + rsm->r_is_gain = 1; + rsm->r_is_drain = 0; + } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) { + rsm->r_is_drain = 1; + rsm->r_is_gain = 0; + } else { + rsm->r_is_drain = 0; + rsm->r_is_gain = 0; + } + return; + } + /* + * If we reach here its a retransmission and we need to find it. + */ +more: + if (hintrsm && (hintrsm->r_start == seq_out)) { + rsm = hintrsm; + hintrsm = NULL; + } else if (bbr->r_ctl.rc_next) { + /* We have a hint from a previous run */ + rsm = bbr->r_ctl.rc_next; + } else { + /* No hints sorry */ + rsm = NULL; + } + if ((rsm) && (rsm->r_start == seq_out)) { + /* + * We used rc_next or hintrsm to retransmit, hopefully the + * likely case. + */ + seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time); + if (len == 0) { + return; + } else { + goto more; + } + } + /* Ok it was not the last pointer go through it the hard way. 
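+ * That is, walk the whole rc_map for the entry containing seq_out;
+ * if the retransmit began part way into an entry, split it at
+ * seq_out (bbr_clone_rsm) and let bbr_update_entry handle the part
+ * that was actually resent.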
*/ + TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { + if (rsm->r_start == seq_out) { + seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time); + bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); + if (len == 0) { + return; + } else { + continue; + } + } + if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { + /* Transmitted within this piece */ + /* + * Ok we must split off the front and then let the + * update do the rest + */ + nrsm = bbr_alloc_full_limit(bbr); + if (nrsm == NULL) { + bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); + return; + } + /* + * copy rsm to nrsm and then trim the front of rsm + * to not include this part. + */ + bbr_clone_rsm(bbr, nrsm, rsm, seq_out); + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~BBR_HAS_FIN); + seq_out = bbr_update_entry(tp, bbr, nrsm, cts, &len, pacing_time); + if (len == 0) { + return; + } + } + } + /* + * Hmm not found in map did they retransmit both old and on into the + * new? + */ + if (seq_out == tp->snd_max) { + goto again; + } else if (SEQ_LT(seq_out, tp->snd_max)) { +#ifdef BBR_INVARIANTS + printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", + seq_out, len, tp->snd_una, tp->snd_max); + printf("Starting Dump of all rack entries\n"); + TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { + printf("rsm:%p start:%u end:%u\n", + rsm, rsm->r_start, rsm->r_end); + } + printf("Dump complete\n"); + panic("seq_out not found rack:%p tp:%p", + bbr, tp); +#endif + } else { +#ifdef BBR_INVARIANTS + /* + * Hmm beyond sndmax? (only if we are using the new rtt-pack + * flag) + */ + panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", + seq_out, len, tp->snd_max, tp); +#endif + } +} + +static void +bbr_collapse_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, int32_t rtt) +{ + /* + * Collapse timeout back the cum-ack moved. + */ + tp->t_rxtshift = 0; + tp->t_softerror = 0; +} + + +static void +tcp_bbr_xmit_timer(struct tcp_bbr *bbr, uint32_t rtt_usecs, uint32_t rsm_send_time, uint32_t r_start, uint32_t tsin) +{ + bbr->rtt_valid = 1; + bbr->r_ctl.cur_rtt = rtt_usecs; + bbr->r_ctl.ts_in = tsin; + if (rsm_send_time) + bbr->r_ctl.cur_rtt_send_time = rsm_send_time; +} + +static void +bbr_make_timestamp_determination(struct tcp_bbr *bbr) +{ + /** + * We have in our bbr control: + * 1) The timestamp we started observing cum-acks (bbr->r_ctl.bbr_ts_check_tstmp). + * 2) Our timestamp indicating when we sent that packet (bbr->r_ctl.rsm->bbr_ts_check_our_cts). + * 3) The current timestamp that just came in (bbr->r_ctl.last_inbound_ts) + * 4) The time that the packet that generated that ack was sent (bbr->r_ctl.cur_rtt_send_time) + * + * Now we can calculate the time between the sends by doing: + * + * delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts + * + * And the peer's time between receiving them by doing: + * + * peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp + * + * We want to figure out if the timestamp values are in msec, 10msec or usec. + * We also may find that we can't use the timestamps if say we see + * that the peer_delta indicates that though we may have taken 10ms to + * pace out the data, it only saw 1ms between the two packets. This would + * indicate that somewhere on the path is a batching entity that is giving + * out time-slices of the actual b/w. This would mean we could not use + * reliably the peers timestamps. 
+ * + * We expect delta > peer_delta initially. Until we figure out the + * timestamp difference which we will store in bbr->r_ctl.bbr_peer_tsratio. + * If we place 1000 there then its a ms vs our usec. If we place 10000 there + * then its 10ms vs our usec. If the peer is running a usec clock we would + * put a 1 there. If the value is faster then ours, we will disable the + * use of timestamps (though we could revist this later if we find it to be not + * just an isolated one or two flows)). + * + * To detect the batching middle boxes we will come up with our compensation and + * if with it in place, we find the peer is drastically off (by some margin) in + * the smaller direction, then we will assume the worst case and disable use of timestamps. + * + */ + uint64_t delta, peer_delta, delta_up; + + delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts; + if (delta < bbr_min_usec_delta) { + /* + * Have not seen a min amount of time + * between our send times so we can + * make a determination of the timestamp + * yet. + */ + return; + } + peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp; + if (peer_delta < bbr_min_peer_delta) { + /* + * We may have enough in the form of + * our delta but the peers number + * has not changed that much. It could + * be its clock ratio is such that + * we need more data (10ms tick) or + * there may be other compression scenarios + * going on. In any event we need the + * spread to be larger. + */ + return; + } + /* Ok lets first see which way our delta is going */ + if (peer_delta > delta) { + /* Very unlikely, the peer without + * compensation shows that it saw + * the two sends arrive further apart + * then we saw then in micro-seconds. + */ + if (peer_delta < (delta + ((delta * (uint64_t)1000)/ (uint64_t)bbr_delta_percent))) { + /* well it looks like the peer is a micro-second clock. */ + bbr->rc_ts_clock_set = 1; + bbr->r_ctl.bbr_peer_tsratio = 1; + } else { + bbr->rc_ts_cant_be_used = 1; + bbr->rc_ts_clock_set = 1; + } + return; + } + /* Ok we know that the peer_delta is smaller than our send distance */ + bbr->rc_ts_clock_set = 1; + /* First question is it within the percentage that they are using usec time? */ + delta_up = (peer_delta * 1000) / (uint64_t)bbr_delta_percent; + if ((peer_delta + delta_up) >= delta) { + /* Its a usec clock */ + bbr->r_ctl.bbr_peer_tsratio = 1; + bbr_log_tstmp_validation(bbr, peer_delta, delta); + return; + } + /* Ok if not usec, what about 10usec (though unlikely)? */ + delta_up = (peer_delta * 1000 * 10) / (uint64_t)bbr_delta_percent; + if (((peer_delta * 10) + delta_up) >= delta) { + bbr->r_ctl.bbr_peer_tsratio = 10; + bbr_log_tstmp_validation(bbr, peer_delta, delta); + return; + } + /* And what about 100usec (though again unlikely)? */ + delta_up = (peer_delta * 1000 * 100) / (uint64_t)bbr_delta_percent; + if (((peer_delta * 100) + delta_up) >= delta) { + bbr->r_ctl.bbr_peer_tsratio = 100; + bbr_log_tstmp_validation(bbr, peer_delta, delta); + return; + } + /* And how about 1 msec (the most likely one)? */ + delta_up = (peer_delta * 1000 * 1000) / (uint64_t)bbr_delta_percent; + if (((peer_delta * 1000) + delta_up) >= delta) { + bbr->r_ctl.bbr_peer_tsratio = 1000; + bbr_log_tstmp_validation(bbr, peer_delta, delta); + return; + } + /* Ok if not msec could it be 10 msec? 
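+ * (Illustration only: a peer on a 10 ms tick that saw our two sends
+ * about 50000 usec apart would echo timestamps only ~5 apart, and
+ * only this coarsest ratio scales that peer_delta back up to the
+ * delta we measured.)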
*/ + delta_up = (peer_delta * 1000 * 10000) / (uint64_t)bbr_delta_percent; + if (((peer_delta * 10000) + delta_up) >= delta) { + bbr->r_ctl.bbr_peer_tsratio = 10000; + return; + } + /* If we fall down here the clock tick so slowly we can't use it */ + bbr->rc_ts_cant_be_used = 1; + bbr->r_ctl.bbr_peer_tsratio = 0; + bbr_log_tstmp_validation(bbr, peer_delta, delta); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts) +{ + int32_t delta; + uint32_t rtt, tsin; + int32_t rtt_ticks; + + + if (bbr->rtt_valid == 0) + /* No valid sample */ + return; + + rtt = bbr->r_ctl.cur_rtt; + tsin = bbr->r_ctl.ts_in; + if (bbr->rc_prtt_set_ts) { + /* + * We are to force feed the rttProp filter due + * to an entry into PROBE_RTT. This assures + * that the times are sync'd between when we + * go into PROBE_RTT and the filter expiration. + * + * Google does not use a true filter, so they do + * this implicitly since they only keep one value + * and when they enter probe-rtt they update the + * value to the newest rtt. + */ + uint32_t rtt_prop; + + bbr->rc_prtt_set_ts = 0; + rtt_prop = get_filter_value_small(&bbr->r_ctl.rc_rttprop); + if (rtt > rtt_prop) + filter_increase_by_small(&bbr->r_ctl.rc_rttprop, (rtt - rtt_prop), cts); + else + apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); + } + if (bbr->rc_ack_was_delayed) + rtt += bbr->r_ctl.rc_ack_hdwr_delay; + + if (rtt < bbr->r_ctl.rc_lowest_rtt) + bbr->r_ctl.rc_lowest_rtt = rtt; + bbr_log_rtt_sample(bbr, rtt, tsin); + if (bbr->r_init_rtt) { + /* + * The initial rtt is not-trusted, nuke it and lets get + * our first valid measurement in. + */ + bbr->r_init_rtt = 0; + tp->t_srtt = 0; + } + if ((bbr->rc_ts_clock_set == 0) && bbr->rc_ts_valid) { + /* + * So we have not yet figured out + * what the peers TSTMP value is + * in (most likely ms). We need a + * series of cum-ack's to determine + * this reliably. + */ + if (bbr->rc_ack_is_cumack) { + if (bbr->rc_ts_data_set) { + /* Lets attempt to determine the timestamp granularity. */ + bbr_make_timestamp_determination(bbr); + } else { + bbr->rc_ts_data_set = 1; + bbr->r_ctl.bbr_ts_check_tstmp = bbr->r_ctl.last_inbound_ts; + bbr->r_ctl.bbr_ts_check_our_cts = bbr->r_ctl.cur_rtt_send_time; + } + } else { + /* + * We have to have consecutive acks + * reset any "filled" state to none. + */ + bbr->rc_ts_data_set = 0; + } + } + /* Round it up */ + rtt_ticks = USEC_2_TICKS((rtt + (USECS_IN_MSEC - 1))); + if (rtt_ticks == 0) + rtt_ticks = 1; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic is + * equivalent to the smoothing algorithm in rfc793 with an + * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). + * Adjust rtt to origin 0. + */ + + delta = ((rtt_ticks - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + tp->t_srtt += delta; + if (tp->t_srtt <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit timer + * to smoothed rtt + 4 times the smoothed variance. rttvar + * is stored as fixed point with 4 bits after the binary + * point (scaled by 16). The following is equivalent to + * rfc793 smoothing with an alpha of .75 (rttvar = + * rttvar*3/4 + |delta| / 4). This replaces rfc793's + * wired-in beta. 
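+ *
+ * For example (illustrative numbers): with a smoothed rtt of 100 ms
+ * and a new 140 ms sample, srtt moves to 7/8*100 + 1/8*140 = 105 ms
+ * and rttvar moves one quarter of the way from its old value toward
+ * the 40 ms error.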
+ */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + tp->t_rttvar += delta; + if (tp->t_rttvar <= 0) + tp->t_rttvar = 1; + if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) + tp->t_rttbest = tp->t_srtt + tp->t_rttvar; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. Set the + * variance to half the rtt (so our first retransmit happens + * at 3*rtt). + */ + tp->t_srtt = rtt_ticks << TCP_RTT_SHIFT; + tp->t_rttvar = rtt_ticks << (TCP_RTTVAR_SHIFT - 1); + tp->t_rttbest = tp->t_srtt + tp->t_rttvar; + } + TCPSTAT_INC(tcps_rttupdated); + tp->t_rttupdated++; +#ifdef NETFLIX_STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt_ticks)); +#endif + /* + * the retransmit should happen at rtt + 4 * rttvar. Because of the + * way we do the smoothing, srtt and rttvar will each average +1/2 + * tick of bias. When we compute the retransmit timer, we want 1/2 + * tick of rounding and 1 extra tick because of +-1/2 tick + * uncertainty in the firing of the timer. The bias will give us + * exactly the 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below the minimum + * feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), rtt_ticks + 2), + MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000)); + + /* + * We received an ack for a packet that wasn't retransmitted; it is + * probably safe to discard any error indications we've received + * recently. This isn't quite right, but close enough for now (a + * route might have failed after we sent a segment, and the return + * path might not be symmetrical). + */ + tp->t_softerror = 0; + rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); + if (bbr->r_ctl.bbr_smallest_srtt_this_state > rtt) + bbr->r_ctl.bbr_smallest_srtt_this_state = rtt; +} + +static void +bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, + uint32_t t, uint32_t cts, int ack_type) +{ + /* + * For this RSM, we acknowledged the data from a previous + * transmission, not the last one we made. This means we did a false + * retransmit. + */ + if (rsm->r_flags & BBR_HAS_FIN) { + /* + * The sending of the FIN often is multiple sent when we + * have everything outstanding ack'd. We ignore this case + * since its over now. + */ + return; + } + if (rsm->r_flags & BBR_TLP) { + /* + * We expect TLP's to have this occur often + */ + bbr->rc_tlp_rtx_out = 0; + return; + } + if (ack_type != BBR_CUM_ACKED) { + /* + * If it was not a cum-ack we + * don't really know for sure since + * the timestamp could be from some + * other transmission. + */ + return; + } + + if (rsm->r_flags & BBR_WAS_SACKPASS) { + /* + * We retransmitted based on a sack and the earlier + * retransmission ack'd it - re-ordering is occuring. + */ + BBR_STAT_INC(bbr_reorder_seen); + bbr->r_ctl.rc_reorder_ts = cts; + } + /* Back down the loss count */ + if (rsm->r_flags & BBR_MARKED_LOST) { + bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; + bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; + rsm->r_flags &= ~BBR_MARKED_LOST; + if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) + /* LT sampling also needs adjustment */ + bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; + } + /***** RRS HERE ************************/ + /* Do we need to do this??? 
*/ + /* bbr_reset_lt_bw_sampling(bbr, cts); */ + /***** RRS HERE ************************/ + BBR_STAT_INC(bbr_badfr); + BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start)); +} + + +static void +bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line) +{ + bbr->r_ctl.rc_rtt_shrinks = cts; + if (bbr_can_force_probertt && + (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) && + ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) { + /* + * We should enter probe-rtt its been too long + * since we have been there. + */ + bbr_enter_probe_rtt(bbr, cts, __LINE__); + } else + bbr_check_probe_rtt_limits(bbr, cts); +} + +static void +tcp_bbr_commit_bw(struct tcp_bbr *bbr, uint32_t cts) +{ + uint64_t orig_bw; + + if (bbr->r_ctl.rc_bbr_cur_del_rate == 0) { + /* We never apply a zero measurment */ + bbr_log_type_bbrupd(bbr, 20, cts, 0, 0, + 0, 0, 0, 0, 0, 0); + return; + } + if (bbr->r_ctl.r_measurement_count < 0xffffffff) + bbr->r_ctl.r_measurement_count++; + orig_bw = get_filter_value(&bbr->r_ctl.rc_delrate); + apply_filter_max(&bbr->r_ctl.rc_delrate, bbr->r_ctl.rc_bbr_cur_del_rate, bbr->r_ctl.rc_pkt_epoch); + bbr_log_type_bbrupd(bbr, 21, cts, (uint32_t)orig_bw, + (uint32_t)get_filter_value(&bbr->r_ctl.rc_delrate), + 0, 0, 0, 0, 0, 0); + if (orig_bw && + (orig_bw != get_filter_value(&bbr->r_ctl.rc_delrate))) { + if (bbr->bbr_hdrw_pacing) { + /* + * Apply a new rate to the hardware + * possibly. + */ + bbr_update_hardware_pacing_rate(bbr, cts); + } + bbr_set_state_target(bbr, __LINE__); + tcp_bbr_tso_size_check(bbr, cts); + if (bbr->r_recovery_bw) { + bbr_setup_red_bw(bbr, cts); + bbr_log_type_bw_reduce(bbr, BBR_RED_BW_USELRBW); + } + } else if ((orig_bw == 0) && get_filter_value(&bbr->r_ctl.rc_delrate)) + tcp_bbr_tso_size_check(bbr, cts); +} + +static void +bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts) +{ + if (bbr->rc_in_persist == 0) { + /* We log only when not in persist */ + /* Translate to a Bytes Per Second */ + uint64_t tim, bw, ts_diff, ts_bw; + uint32_t upper, lower, delivered; + + if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time)) + tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time); + else + tim = 1; + /* + * Now that we have processed the tim (skipping the sample + * or possibly updating the time, go ahead and + * calculate the cdr. + */ + delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered); + bw = (uint64_t)delivered; + bw *= (uint64_t)USECS_IN_SECOND; + bw /= tim; + if (bw == 0) { + /* We must have a calculatable amount */ + return; + } + upper = (bw >> 32) & 0x00000000ffffffff; + lower = bw & 0x00000000ffffffff; + /* + * If we are using this b/w shove it in now so we + * can see in the trace viewer if it gets over-ridden. 
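+ * The timestamp-derived rate (ts_bw) computed below may replace
+ * this sample: it is always allowed to lower it, but may raise it
+ * only when ts_can_raise is set.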
+ */ + if (rsm->r_ts_valid && + bbr->rc_ts_valid && + bbr->rc_ts_clock_set && + (bbr->rc_ts_cant_be_used == 0) && + bbr->rc_use_ts_limit) { + ts_diff = max((bbr->r_ctl.last_inbound_ts - rsm->r_del_ack_ts), 1); + ts_diff *= bbr->r_ctl.bbr_peer_tsratio; + if ((delivered == 0) || + (rtt < 1000)) { + /* Can't use the ts */ + bbr_log_type_bbrupd(bbr, 61, cts, + ts_diff, + bbr->r_ctl.last_inbound_ts, + rsm->r_del_ack_ts, 0, + 0, 0, 0, delivered); + } else { + ts_bw = (uint64_t)delivered; + ts_bw *= (uint64_t)USECS_IN_SECOND; + ts_bw /= ts_diff; + bbr_log_type_bbrupd(bbr, 62, cts, + (ts_bw >> 32), + (ts_bw & 0xffffffff), 0, 0, + 0, 0, ts_diff, delivered); + if ((bbr->ts_can_raise) && + (ts_bw > bw)) { + bbr_log_type_bbrupd(bbr, 8, cts, + delivered, + ts_diff, + (bw >> 32), + (bw & 0x00000000ffffffff), + 0, 0, 0, 0); + bw = ts_bw; + } else if (ts_bw && (ts_bw < bw)) { + bbr_log_type_bbrupd(bbr, 7, cts, + delivered, + ts_diff, + (bw >> 32), + (bw & 0x00000000ffffffff), + 0, 0, 0, 0); + bw = ts_bw; + } + } + } + if (rsm->r_first_sent_time && + TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) { + uint64_t sbw, sti; + /* + * We use what was in flight at the time of our + * send and the size of this send to figure + * out what we have been sending at (amount). + * For the time we take from the time of + * the send of the first send outstanding + * until this send plus this sends pacing + * time. This gives us a good calculation + * as to the rate we have been sending at. + */ + + sbw = (uint64_t)(rsm->r_flight_at_send); + sbw *= (uint64_t)USECS_IN_SECOND; + sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time; + sti += rsm->r_pacing_delay; + sbw /= sti; + if (sbw < bw) { + bbr_log_type_bbrupd(bbr, 6, cts, + delivered, + (uint32_t)sti, + (bw >> 32), + (uint32_t)bw, + rsm->r_first_sent_time, 0, (sbw >> 32), + (uint32_t)sbw); + bw = sbw; + } + } + /* Use the google algorithm for b/w measurements */ + bbr->r_ctl.rc_bbr_cur_del_rate = bw; + if ((rsm->r_app_limited == 0) || + (bw > get_filter_value(&bbr->r_ctl.rc_delrate))) { + tcp_bbr_commit_bw(bbr, cts); + bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered, + 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time); + } + } +} + +static void +bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts) +{ + if (bbr->rc_in_persist == 0) { + /* We log only when not in persist */ + /* Translate to a Bytes Per Second */ + uint64_t tim, bw; + uint32_t upper, lower, delivered; + int no_apply = 0; + + if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time)) + tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time); + else + tim = 1; + /* + * Now that we have processed the tim (skipping the sample + * or possibly updating the time, go ahead and + * calculate the cdr. + */ + delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered); + bw = (uint64_t)delivered; + bw *= (uint64_t)USECS_IN_SECOND; + bw /= tim; + if (tim < bbr->r_ctl.rc_lowest_rtt) { + bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered, + tim, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0); + + no_apply = 1; + } + upper = (bw >> 32) & 0x00000000ffffffff; + lower = bw & 0x00000000ffffffff; + /* + * If we are using this b/w shove it in now so we + * can see in the trace viewer if it gets over-ridden. 
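+ * The send-rate gate below (sbw) can only lower the measurement; it
+ * keeps us from crediting a delivery rate faster than we could
+ * actually have been sending at.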
+ */ + bbr->r_ctl.rc_bbr_cur_del_rate = bw; + /* Gate by the sending rate */ + if (rsm->r_first_sent_time && + TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) { + uint64_t sbw, sti; + /* + * We use what was in flight at the time of our + * send and the size of this send to figure + * out what we have been sending at (amount). + * For the time we take from the time of + * the send of the first send outstanding + * until this send plus this sends pacing + * time. This gives us a good calculation + * as to the rate we have been sending at. + */ + + sbw = (uint64_t)(rsm->r_flight_at_send); + sbw *= (uint64_t)USECS_IN_SECOND; + sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time; + sti += rsm->r_pacing_delay; + sbw /= sti; + if (sbw < bw) { + bbr_log_type_bbrupd(bbr, 6, cts, + delivered, + (uint32_t)sti, + (bw >> 32), + (uint32_t)bw, + rsm->r_first_sent_time, 0, (sbw >> 32), + (uint32_t)sbw); + bw = sbw; + } + if ((sti > tim) && + (sti < bbr->r_ctl.rc_lowest_rtt)) { + bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered, + (uint32_t)sti, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0); + no_apply = 1; + } else + no_apply = 0; + } + bbr->r_ctl.rc_bbr_cur_del_rate = bw; + if ((no_apply == 0) && + ((rsm->r_app_limited == 0) || + (bw > get_filter_value(&bbr->r_ctl.rc_delrate)))) { + tcp_bbr_commit_bw(bbr, cts); + bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered, + 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time); + } + } +} + + +static void +bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts, uint32_t tsin, + uint32_t uts, int32_t match, uint32_t rsm_send_time, int32_t ack_type, struct tcpopt *to) +{ + uint64_t old_rttprop; + + /* Update our delivery time and amount */ + bbr->r_ctl.rc_delivered += (rsm->r_end - rsm->r_start); + bbr->r_ctl.rc_del_time = cts; + if (rtt == 0) { + /* + * 0 means its a retransmit, for now we don't use these for + * the rest of BBR. + */ + return; + } + if ((bbr->rc_use_google == 0) && + (match != BBR_RTT_BY_EXACTMATCH) && + (match != BBR_RTT_BY_TIMESTAMP)){ + /* + * We get a lot of rtt updates, lets not pay attention to + * any that are not an exact match. That way we don't have + * to worry about timestamps and the whole nonsense of + * unsure if its a retransmission etc (if we ever had the + * timestamp fixed to always have the last thing sent this + * would not be a issue). + */ + return; + } + if ((bbr_no_retran && bbr->rc_use_google) && + (match != BBR_RTT_BY_EXACTMATCH) && + (match != BBR_RTT_BY_TIMESTAMP)){ + /* + * We only do measurements in google mode + * with bbr_no_retran on for sure things. + */ + return; + } + /* Only update srtt if we know by exact match */ + tcp_bbr_xmit_timer(bbr, rtt, rsm_send_time, rsm->r_start, tsin); + if (ack_type == BBR_CUM_ACKED) + bbr->rc_ack_is_cumack = 1; + else + bbr->rc_ack_is_cumack = 0; + old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP); + /* + * Note the following code differs to the original + * BBR spec. It calls for <= not <. However after a + * long discussion in email with Neal, he acknowledged + * that it should be < than so that we will have flows + * going into probe-rtt (we were seeing cases where that + * did not happen and caused ugly things to occur). We + * have added this agreed upon fix to our code base. 
+ */ + if (rtt < old_rttprop) { + /* Update when we last saw a rtt drop */ + bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0); + bbr_set_reduced_rtt(bbr, cts, __LINE__); + } + bbr_log_type_bbrrttprop(bbr, rtt, (rsm ? rsm->r_end : 0), uts, cts, + match, rsm->r_start, rsm->r_flags); + apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); + if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) { + /* + * The RTT-prop moved, reset the target (may be a + * nop for some states). + */ + bbr_set_state_target(bbr, __LINE__); + if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) + bbr_log_rtt_shrinks(bbr, cts, 0, 0, + __LINE__, BBR_RTTS_NEW_TARGET, 0); + else if (old_rttprop < bbr_get_rtt(bbr, BBR_RTT_PROP)) + /* It went up */ + bbr_check_probe_rtt_limits(bbr, cts); + } + if ((bbr->rc_use_google == 0) && + (match == BBR_RTT_BY_TIMESTAMP)) { + /* + * We don't do b/w update with + * these since they are not really + * reliable. + */ + return; + } + if (bbr->r_ctl.r_app_limited_until && + (bbr->r_ctl.rc_delivered >= bbr->r_ctl.r_app_limited_until)) { + /* We are no longer app-limited */ + bbr->r_ctl.r_app_limited_until = 0; + } + if (bbr->rc_use_google) { + bbr_google_measurement(bbr, rsm, rtt, cts); + } else { + bbr_nf_measurement(bbr, rsm, rtt, cts); + } +} + +/* + * Convert a timestamp that the main stack + * uses (milliseconds) into one that bbr uses + * (microseconds). Return that converted timestamp. + */ +static uint32_t +bbr_ts_convert(uint32_t cts) { + uint32_t sec, msec; + + sec = cts / MS_IN_USEC; + msec = cts - (MS_IN_USEC * sec); + return ((sec * USECS_IN_SECOND) + (msec * MS_IN_USEC)); +} + +/* + * Return 0 if we did not update the RTT time, return + * 1 if we did. + */ +static int +bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, + struct bbr_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, uint32_t th_ack) +{ + int32_t i; + uint32_t t, uts = 0; + + if ((rsm->r_flags & BBR_ACKED) || + (rsm->r_flags & BBR_WAS_RENEGED) || + (rsm->r_flags & BBR_RXT_CLEARED)) { + /* Already done */ + return (0); + } + if (rsm->r_rtr_cnt == 1) { + /* + * Only one transmit. Hopefully the normal case. + */ + if (TSTMP_GT(cts, rsm->r_tim_lastsent[0])) + t = cts - rsm->r_tim_lastsent[0]; + else + t = 1; + if ((int)t <= 0) + t = 1; + bbr->r_ctl.rc_last_rtt = t; + bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, + BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to); + return (1); + } + /* Convert to usecs */ + if ((bbr_can_use_ts_for_rtt == 1) && + (bbr->rc_use_google == 1) && + (ack_type == BBR_CUM_ACKED) && + (to->to_flags & TOF_TS) && + (to->to_tsecr != 0)) { + + t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr; + if (t < 1) + t = 1; + t *= MS_IN_USEC; + bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, + BBR_RTT_BY_TIMESTAMP, + rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)], + ack_type, to); + return (1); + } + uts = bbr_ts_convert(to->to_tsecr); + if ((to->to_flags & TOF_TS) && + (to->to_tsecr != 0) && + (ack_type == BBR_CUM_ACKED) && + ((rsm->r_flags & BBR_OVERMAX) == 0)) { + /* + * Now which timestamp does it match? In this block the ACK + * may be coming from a previous transmission. 
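+ * We accept a match within BBR_TIMER_FUDGE of any recorded transmit
+ * time; matching anything other than the most recent transmit means
+ * the retransmission was likely spurious, which bbr_earlier_retran()
+ * accounts for below.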
+ */ + uint32_t fudge; + + fudge = BBR_TIMER_FUDGE; + for (i = 0; i < rsm->r_rtr_cnt; i++) { + if ((SEQ_GEQ(uts, (rsm->r_tim_lastsent[i] - fudge))) && + (SEQ_LEQ(uts, (rsm->r_tim_lastsent[i] + fudge)))) { + if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) + t = cts - rsm->r_tim_lastsent[i]; + else + t = 1; + if ((int)t <= 0) + t = 1; + bbr->r_ctl.rc_last_rtt = t; + bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING, + rsm->r_tim_lastsent[i], ack_type, to); + if ((i + 1) < rsm->r_rtr_cnt) { + /* Likely */ + bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type); + } else if (rsm->r_flags & BBR_TLP) { + bbr->rc_tlp_rtx_out = 0; + } + return (1); + } + } + /* Fall through if we can't find a matching timestamp */ + } + /* + * Ok its a SACK block that we retransmitted. or a windows + * machine without timestamps. We can tell nothing from the + * time-stamp since its not there or the time the peer last + * recieved a segment that moved forward its cum-ack point. + * + * Lets look at the last retransmit and see what we can tell + * (with BBR for space we only keep 2 note we have to keep + * at least 2 so the map can not be condensed more). + */ + i = rsm->r_rtr_cnt - 1; + if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) + t = cts - rsm->r_tim_lastsent[i]; + else + goto not_sure; + if (t < bbr->r_ctl.rc_lowest_rtt) { + /* + * We retransmitted and the ack came back in less + * than the smallest rtt we have observed in the + * windowed rtt. We most likey did an improper + * retransmit as outlined in 4.2 Step 3 point 2 in + * the rack-draft. + * + * Use the prior transmission to update all the + * information as long as there is only one prior + * transmission. + */ + if ((rsm->r_flags & BBR_OVERMAX) == 0) { +#ifdef BBR_INVARIANTS + if (rsm->r_rtr_cnt == 1) + panic("rsm:%p bbr:%p rsm has overmax and only 1 retranmit flags:%x?", rsm, bbr, rsm->r_flags); +#endif + i = rsm->r_rtr_cnt - 2; + if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) + t = cts - rsm->r_tim_lastsent[i]; + else + t = 1; + bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET, + rsm->r_tim_lastsent[i], ack_type, to); + bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type); + } else { + /* + * Too many prior transmissions, just + * updated BBR delivered + */ +not_sure: + bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts, + BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to); + } + } else { + /* + * We retransmitted it and the retransmit did the + * job. + */ + if (rsm->r_flags & BBR_TLP) + bbr->rc_tlp_rtx_out = 0; + if ((rsm->r_flags & BBR_OVERMAX) == 0) + bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, + BBR_RTT_BY_THIS_RETRAN, 0, ack_type, to); + else + bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts, + BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to); + return (1); + } + return (0); +} + +/* + * Mark the SACK_PASSED flag on all entries prior to rsm send wise. + */ +static void +bbr_log_sack_passed(struct tcpcb *tp, + struct tcp_bbr *bbr, struct bbr_sendmap *rsm) +{ + struct bbr_sendmap *nrsm; + + nrsm = rsm; + TAILQ_FOREACH_REVERSE_FROM(nrsm, &bbr->r_ctl.rc_tmap, + bbr_head, r_tnext) { + if (nrsm == rsm) { + /* Skip orginal segment he is acked */ + continue; + } + if (nrsm->r_flags & BBR_ACKED) { + /* Skip ack'd segments */ + continue; + } + if (nrsm->r_flags & BBR_SACK_PASSED) { + /* + * We found one that is already marked + * passed, we have been here before and + * so all others below this are marked. 
+ */ + break; + } + BBR_STAT_INC(bbr_sack_passed); + nrsm->r_flags |= BBR_SACK_PASSED; + if (((nrsm->r_flags & BBR_MARKED_LOST) == 0) && + bbr_is_lost(bbr, nrsm, bbr->r_ctl.rc_rcvtime)) { + bbr->r_ctl.rc_lost += nrsm->r_end - nrsm->r_start; + bbr->r_ctl.rc_lost_bytes += nrsm->r_end - nrsm->r_start; + nrsm->r_flags |= BBR_MARKED_LOST; + } + nrsm->r_flags &= ~BBR_WAS_SACKPASS; + } +} + +/* + * Returns the number of bytes that were + * newly ack'd by sack blocks. + */ +static uint32_t +bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack, + struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts) +{ + int32_t times = 0; + uint32_t start, end, maxseg, changed = 0; + struct bbr_sendmap *rsm, *nrsm; + int32_t used_ref = 1; + uint8_t went_back = 0, went_fwd = 0; + + maxseg = tp->t_maxseg - bbr->rc_last_options; + start = sack->start; + end = sack->end; + rsm = *prsm; + if (rsm == NULL) + used_ref = 0; + + /* Do we locate the block behind where we last were? */ + if (rsm && SEQ_LT(start, rsm->r_start)) { + went_back = 1; + TAILQ_FOREACH_REVERSE_FROM(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { + if (SEQ_GEQ(start, rsm->r_start) && + SEQ_LT(start, rsm->r_end)) { + goto do_rest_ofb; + } + } + } +start_at_beginning: + went_fwd = 1; + /* + * Ok lets locate the block where this guy is fwd from rsm (if its + * set) + */ + TAILQ_FOREACH_FROM(rsm, &bbr->r_ctl.rc_map, r_next) { + if (SEQ_GEQ(start, rsm->r_start) && + SEQ_LT(start, rsm->r_end)) { + break; + } + } +do_rest_ofb: + if (rsm == NULL) { + /* + * This happens when we get duplicate sack blocks with the + * same end. For example SACK 4: 100 SACK 3: 100 The sort + * will not change there location so we would just start at + * the end of the first one and get lost. + */ + if (tp->t_flags & TF_SENTFIN) { + /* + * Check to see if we have not logged the FIN that + * went out. + */ + nrsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); + if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { + /* + * Ok we did not get the FIN logged. + */ + nrsm->r_end++; + rsm = nrsm; + goto do_rest_ofb; + } + } + if (times == 1) { +#ifdef BBR_INVARIANTS + panic("tp:%p bbr:%p sack:%p to:%p prsm:%p", + tp, bbr, sack, to, prsm); +#else + goto out; +#endif + } + times++; + BBR_STAT_INC(bbr_sack_proc_restart); + rsm = NULL; + goto start_at_beginning; + } + /* Ok we have an ACK for some piece of rsm */ + if (rsm->r_start != start) { + /* + * Need to split this in two pieces the before and after. + */ + if (bbr_sack_mergable(rsm, start, end)) + nrsm = bbr_alloc_full_limit(bbr); + else + nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); + if (nrsm == NULL) { + /* We could not allocate ignore the sack */ + struct sackblk blk; + + blk.start = start; + blk.end = end; + sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk); + goto out; + } + bbr_clone_rsm(bbr, nrsm, rsm, start); + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~BBR_HAS_FIN); + rsm = nrsm; + } + if (SEQ_GEQ(end, rsm->r_end)) { + /* + * The end of this block is either beyond this guy or right + * at this guy. 
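+ * (For instance, an entry covering 1000-2448 hit by a SACK block of
+ * 1000-5000 is wholly newly sacked; we account for it here and then
+ * continue the walk at 2448 for the remainder of the block.)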
+ */ + if ((rsm->r_flags & BBR_ACKED) == 0) { + bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0); + changed += (rsm->r_end - rsm->r_start); + bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); + bbr_log_sack_passed(tp, bbr, rsm); + if (rsm->r_flags & BBR_MARKED_LOST) { + bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; + } + /* Is Reordering occuring? */ + if (rsm->r_flags & BBR_SACK_PASSED) { + BBR_STAT_INC(bbr_reorder_seen); + bbr->r_ctl.rc_reorder_ts = cts; + if (rsm->r_flags & BBR_MARKED_LOST) { + bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; + if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) + /* LT sampling also needs adjustment */ + bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; + } + } + rsm->r_flags |= BBR_ACKED; + rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST); + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } + } + bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED); + if (end == rsm->r_end) { + /* This block only - done */ + goto out; + } + /* There is more not coverend by this rsm move on */ + start = rsm->r_end; + nrsm = TAILQ_NEXT(rsm, r_next); + rsm = nrsm; + times = 0; + goto do_rest_ofb; + } + if (rsm->r_flags & BBR_ACKED) { + /* Been here done that */ + goto out; + } + /* Ok we need to split off this one at the tail */ + if (bbr_sack_mergable(rsm, start, end)) + nrsm = bbr_alloc_full_limit(bbr); + else + nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); + if (nrsm == NULL) { + /* failed XXXrrs what can we do but loose the sack info? */ + struct sackblk blk; + + blk.start = start; + blk.end = end; + sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk); + goto out; + } + /* Clone it */ + bbr_clone_rsm(bbr, nrsm, rsm, end); + /* The sack block does not cover this guy fully */ + rsm->r_flags &= (~BBR_HAS_FIN); + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + nrsm->r_dupack = 0; + bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0); + bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED); + changed += (rsm->r_end - rsm->r_start); + bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); + bbr_log_sack_passed(tp, bbr, rsm); + /* Is Reordering occuring? */ + if (rsm->r_flags & BBR_MARKED_LOST) { + bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; + } + if (rsm->r_flags & BBR_SACK_PASSED) { + BBR_STAT_INC(bbr_reorder_seen); + bbr->r_ctl.rc_reorder_ts = cts; + if (rsm->r_flags & BBR_MARKED_LOST) { + bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; + if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) + /* LT sampling also needs adjustment */ + bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; + } + } + rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST); + rsm->r_flags |= BBR_ACKED; + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } +out: + if (rsm && (rsm->r_flags & BBR_ACKED)) { + /* + * Now can we merge this newly acked + * block with either the previous or + * next block? + */ + nrsm = TAILQ_NEXT(rsm, r_next); + if (nrsm && + (nrsm->r_flags & BBR_ACKED)) { + /* yep this and next can be merged */ + rsm = bbr_merge_rsm(bbr, rsm, nrsm); + } + /* Now what about the previous? 
*/ + nrsm = TAILQ_PREV(rsm, bbr_head, r_next); + if (nrsm && + (nrsm->r_flags & BBR_ACKED)) { + /* yep the previous and this can be merged */ + rsm = bbr_merge_rsm(bbr, nrsm, rsm); + } + } + if (used_ref == 0) { + BBR_STAT_INC(bbr_sack_proc_all); + } else { + BBR_STAT_INC(bbr_sack_proc_short); + } + if (went_fwd && went_back) { + BBR_STAT_INC(bbr_sack_search_both); + } else if (went_fwd) { + BBR_STAT_INC(bbr_sack_search_fwd); + } else if (went_back) { + BBR_STAT_INC(bbr_sack_search_back); + } + /* Save off where the next seq is */ + if (rsm) + bbr->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); + else + bbr->r_ctl.rc_sacklast = NULL; + *prsm = rsm; + return (changed); +} + + +static void inline +bbr_peer_reneges(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, tcp_seq th_ack) +{ + struct bbr_sendmap *tmap; + + BBR_STAT_INC(bbr_reneges_seen); + tmap = NULL; + while (rsm && (rsm->r_flags & BBR_ACKED)) { + /* Its no longer sacked, mark it so */ + uint32_t oflags; + bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); +#ifdef BBR_INVARIANTS + if (rsm->r_in_tmap) { + panic("bbr:%p rsm:%p flags:0x%x in tmap?", + bbr, rsm, rsm->r_flags); + } +#endif + oflags = rsm->r_flags; + if (rsm->r_flags & BBR_MARKED_LOST) { + bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; + bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; + if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) + /* LT sampling also needs adjustment */ + bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; + } + rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS | BBR_MARKED_LOST); + rsm->r_flags |= BBR_WAS_RENEGED; + rsm->r_flags |= BBR_RXT_CLEARED; + bbr_log_type_rsmclear(bbr, bbr->r_ctl.rc_rcvtime, rsm, oflags, __LINE__); + /* Rebuild it into our tmap */ + if (tmap == NULL) { + TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + tmap = rsm; + } else { + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, tmap, rsm, r_tnext); + tmap = rsm; + } + tmap->r_in_tmap = 1; + /* + * XXXrrs Delivered? Should we do anything here? + * + * Of course we don't on a rxt timeout so maybe its ok that + * we don't? + * + * For now lets not. + */ + rsm = TAILQ_NEXT(rsm, r_next); + } + /* + * Now lets possibly clear the sack filter so we start recognizing + * sacks that cover this area. + */ + sack_filter_clear(&bbr->r_ctl.bbr_sf, th_ack); +} + +static void +bbr_log_syn(struct tcpcb *tp, struct tcpopt *to) +{ + struct tcp_bbr *bbr; + struct bbr_sendmap *rsm; + uint32_t cts; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + cts = bbr->r_ctl.rc_rcvtime; + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + if (rsm && (rsm->r_flags & BBR_HAS_SYN)) { + if ((rsm->r_end - rsm->r_start) <= 1) { + /* Log out the SYN completely */ + bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; + rsm->r_rtr_bytes = 0; + TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } + if (bbr->r_ctl.rc_next == rsm) { + /* scoot along the marker */ + bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map); + } + if (to != NULL) + bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, 0); + bbr_free(bbr, rsm); + } else { + /* There is more (Fast open)? strip out SYN. */ + rsm->r_flags &= ~BBR_HAS_SYN; + rsm->r_start++; + } + } +} + +/* + * Returns the number of bytes that were + * acknowledged by SACK blocks. 
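+ * (That is, the count of newly SACK'd bytes; bytes that were
+ * already SACK'd and are now covered by the cum-ack are reported
+ * separately through *prev_acked.)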
+ */ + +static uint32_t +bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, + uint32_t *prev_acked) +{ + uint32_t changed, last_seq, entered_recovery = 0; + struct tcp_bbr *bbr; + struct bbr_sendmap *rsm; + struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; + register uint32_t th_ack; + int32_t i, j, k, new_sb, num_sack_blks = 0; + uint32_t cts, acked, ack_point, sack_changed = 0; + uint32_t p_maxseg, maxseg, p_acked = 0; + + INP_WLOCK_ASSERT(tp->t_inpcb); + if (th->th_flags & TH_RST) { + /* We don't log resets */ + return (0); + } + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + cts = bbr->r_ctl.rc_rcvtime; + + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + changed = 0; + maxseg = tp->t_maxseg - bbr->rc_last_options; + p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg); + th_ack = th->th_ack; + if (SEQ_GT(th_ack, tp->snd_una)) { + acked = th_ack - tp->snd_una; + bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__); + bbr->rc_tp->t_acktime = ticks; + } else + acked = 0; + if (SEQ_LEQ(th_ack, tp->snd_una)) { + /* Only sent here for sack processing */ + goto proc_sack; + } + if (rsm && SEQ_GT(th_ack, rsm->r_start)) { + changed = th_ack - rsm->r_start; + } else if ((rsm == NULL) && ((th_ack - 1) == tp->iss)) { + /* + * For the SYN incoming case we will not have called + * tcp_output for the sending of the SYN, so there will be + * no map. All other cases should probably be a panic. + */ + if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0)) { + /* + * We have a timestamp that can be used to generate + * an initial RTT. + */ + uint32_t ts, now, rtt; + + ts = bbr_ts_convert(to->to_tsecr); + now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv)); + rtt = now - ts; + if (rtt < 1) + rtt = 1; + bbr_log_type_bbrrttprop(bbr, rtt, + tp->iss, 0, cts, + BBR_RTT_BY_TIMESTAMP, tp->iss, 0); + apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); + changed = 1; + bbr->r_wanted_output = 1; + goto out; + } + goto proc_sack; + } else if (rsm == NULL) { + goto out; + } + if (changed) { + /* + * The ACK point is advancing to th_ack, we must drop off + * the packets in the rack log and calculate any eligble + * RTT's. + */ + bbr->r_wanted_output = 1; +more: + if (rsm == NULL) { + + if (tp->t_flags & TF_SENTFIN) { + /* if we send a FIN we will not hav a map */ + goto proc_sack; + } +#ifdef BBR_INVARIANTS + panic("No rack map tp:%p for th:%p state:%d bbr:%p snd_una:%u snd_max:%u chg:%d\n", + tp, + th, tp->t_state, bbr, + tp->snd_una, tp->snd_max, changed); +#endif + goto proc_sack; + } + } + if (SEQ_LT(th_ack, rsm->r_start)) { + /* Huh map is missing this */ +#ifdef BBR_INVARIANTS + printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d bbr:%p\n", + rsm->r_start, + th_ack, tp->t_state, + bbr->r_state, bbr); + panic("th-ack is bad bbr:%p tp:%p", bbr, tp); +#endif + goto proc_sack; + } else if (th_ack == rsm->r_start) { + /* None here to ack */ + goto proc_sack; + } + /* + * Clear the dup ack counter, it will + * either be freed or if there is some + * remaining we need to start it at zero. + */ + rsm->r_dupack = 0; + /* Now do we consume the whole thing? */ + if (SEQ_GEQ(th_ack, rsm->r_end)) { + /* Its all consumed. 
*/ + uint32_t left; + + if (rsm->r_flags & BBR_ACKED) { + /* + * It was acked on the scoreboard -- remove it from + * total + */ + p_acked += (rsm->r_end - rsm->r_start); + bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); + if (bbr->r_ctl.rc_sacked == 0) + bbr->r_ctl.rc_sacklast = NULL; + } else { + bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, th_ack); + if (rsm->r_flags & BBR_MARKED_LOST) { + bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; + } + if (rsm->r_flags & BBR_SACK_PASSED) { + /* + * There are acked segments ACKED on the + * scoreboard further up. We are seeing + * reordering. + */ + BBR_STAT_INC(bbr_reorder_seen); + bbr->r_ctl.rc_reorder_ts = cts; + if (rsm->r_flags & BBR_MARKED_LOST) { + bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; + if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) + /* LT sampling also needs adjustment */ + bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; + } + } + rsm->r_flags &= ~BBR_MARKED_LOST; + } + bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; + rsm->r_rtr_bytes = 0; + TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } + if (bbr->r_ctl.rc_next == rsm) { + /* scoot along the marker */ + bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map); + } + bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED); + /* Adjust the packet counts */ + left = th_ack - rsm->r_end; + /* Free back to zone */ + bbr_free(bbr, rsm); + if (left) { + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + goto more; + } + goto proc_sack; + } + if (rsm->r_flags & BBR_ACKED) { + /* + * It was acked on the scoreboard -- remove it from total + * for the part being cum-acked. + */ + p_acked += (rsm->r_end - rsm->r_start); + bbr->r_ctl.rc_sacked -= (th_ack - rsm->r_start); + if (bbr->r_ctl.rc_sacked == 0) + bbr->r_ctl.rc_sacklast = NULL; + } else { + /* + * It was acked up to th_ack point for the first time + */ + struct bbr_sendmap lrsm; + + memcpy(&lrsm, rsm, sizeof(struct bbr_sendmap)); + lrsm.r_end = th_ack; + bbr_update_rtt(tp, bbr, &lrsm, to, cts, BBR_CUM_ACKED, th_ack); + } + if ((rsm->r_flags & BBR_MARKED_LOST) && + ((rsm->r_flags & BBR_ACKED) == 0)) { + /* + * It was marked lost and partly ack'd now + * for the first time. We lower the rc_lost_bytes + * and still leave it MARKED. + */ + bbr->r_ctl.rc_lost_bytes -= th_ack - rsm->r_start; + } + bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED); + bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; + rsm->r_rtr_bytes = 0; + /* adjust packet count */ + rsm->r_start = th_ack; +proc_sack: + /* Check for reneging */ + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + if (rsm && (rsm->r_flags & BBR_ACKED) && (th_ack == rsm->r_start)) { + /* + * The peer has moved snd_una up to the edge of this send, + * i.e. one that it had previously acked. The only way that + * can be true if the peer threw away data (space issues) + * that it had previously sacked (else it would have given + * us snd_una up to (rsm->r_end). We need to undo the acked + * markings here. + * + * Note we have to look to make sure th_ack is our + * rsm->r_start in case we get an old ack where th_ack is + * behind snd_una. 
+ */ + bbr_peer_reneges(bbr, rsm, th->th_ack); + } + if ((to->to_flags & TOF_SACK) == 0) { + /* We are done nothing left to log */ + goto out; + } + rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); + if (rsm) { + last_seq = rsm->r_end; + } else { + last_seq = tp->snd_max; + } + /* Sack block processing */ + if (SEQ_GT(th_ack, tp->snd_una)) + ack_point = th_ack; + else + ack_point = tp->snd_una; + for (i = 0; i < to->to_nsacks; i++) { + bcopy((to->to_sacks + i * TCPOLEN_SACK), + &sack, sizeof(sack)); + sack.start = ntohl(sack.start); + sack.end = ntohl(sack.end); + if (SEQ_GT(sack.end, sack.start) && + SEQ_GT(sack.start, ack_point) && + SEQ_LT(sack.start, tp->snd_max) && + SEQ_GT(sack.end, ack_point) && + SEQ_LEQ(sack.end, tp->snd_max)) { + if ((bbr->r_ctl.rc_num_small_maps_alloced > bbr_sack_block_limit) && + (SEQ_LT(sack.end, last_seq)) && + ((sack.end - sack.start) < (p_maxseg / 8))) { + /* + * Not the last piece and its smaller than + * 1/8th of a p_maxseg. We ignore this. + */ + BBR_STAT_INC(bbr_runt_sacks); + continue; + } + sack_blocks[num_sack_blks] = sack; + num_sack_blks++; +#ifdef NETFLIX_STATS + } else if (SEQ_LEQ(sack.start, th_ack) && + SEQ_LEQ(sack.end, th_ack)) { + /* + * Its a D-SACK block. + */ + tcp_record_dsack(sack.start, sack.end); +#endif + } + } + if (num_sack_blks == 0) + goto out; + /* + * Sort the SACK blocks so we can update the rack scoreboard with + * just one pass. + */ + new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks, + num_sack_blks, th->th_ack); + ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks); + BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks); + BBR_STAT_ADD(bbr_sack_blocks_skip, (num_sack_blks - new_sb)); + num_sack_blks = new_sb; + if (num_sack_blks < 2) { + goto do_sack_work; + } + /* Sort the sacks */ + for (i = 0; i < num_sack_blks; i++) { + for (j = i + 1; j < num_sack_blks; j++) { + if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { + sack = sack_blocks[i]; + sack_blocks[i] = sack_blocks[j]; + sack_blocks[j] = sack; + } + } + } + /* + * Now are any of the sack block ends the same (yes some + * implememtations send these)? + */ +again: + if (num_sack_blks > 1) { + for (i = 0; i < num_sack_blks; i++) { + for (j = i + 1; j < num_sack_blks; j++) { + if (sack_blocks[i].end == sack_blocks[j].end) { + /* + * Ok these two have the same end we + * want the smallest end and then + * throw away the larger and start + * again. 
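+ * (Example: blocks [300,400] and [100,400] share the end 400; we
+ * widen the one we keep to [100,400] and collapse out the other.)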
+ */ + if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { + /* + * The second block covers + * more area use that + */ + sack_blocks[i].start = sack_blocks[j].start; + } + /* + * Now collapse out the dup-sack and + * lower the count + */ + for (k = (j + 1); k < num_sack_blks; k++) { + sack_blocks[j].start = sack_blocks[k].start; + sack_blocks[j].end = sack_blocks[k].end; + j++; + } + num_sack_blks--; + goto again; + } + } + } + } +do_sack_work: + rsm = bbr->r_ctl.rc_sacklast; + for (i = 0; i < num_sack_blks; i++) { + acked = bbr_proc_sack_blk(tp, bbr, &sack_blocks[i], to, &rsm, cts); + if (acked) { + bbr->r_wanted_output = 1; + changed += acked; + sack_changed += acked; + } + } +out: + *prev_acked = p_acked; + if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { + /* + * Ok we have a high probability that we need to go in to + * recovery since we have data sack'd + */ + struct bbr_sendmap *rsm; + + rsm = bbr_check_recovery_mode(tp, bbr, cts); + if (rsm) { + /* Enter recovery */ + entered_recovery = 1; + bbr->r_wanted_output = 1; + /* + * When we enter recovery we need to assure we send + * one packet. + */ + if (bbr->r_ctl.rc_resend == NULL) { + bbr->r_ctl.rc_resend = rsm; + } + } + } + if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { + /* + * See if we need to rack-retransmit anything if so set it + * up as the thing to resend assuming something else is not + * already in that position. + */ + if (bbr->r_ctl.rc_resend == NULL) { + bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); + } + } + /* + * We return the amount that changed via sack, this is used by the + * ack-received code to augment what was changed between th_ack <-> + * snd_una. + */ + return (sack_changed); +} + +static void +bbr_strike_dupack(struct tcp_bbr *bbr) +{ + struct bbr_sendmap *rsm; + + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); + if (rsm && (rsm->r_dupack < 0xff)) { + rsm->r_dupack++; + if (rsm->r_dupack >= DUP_ACK_THRESHOLD) + bbr->r_wanted_output = 1; + } +} + +/* + * Return value of 1, we do not need to call bbr_process_data(). + * return value of 0, bbr_process_data can be called. + * For ret_val if its 0 the TCB is locked and valid, if its non-zero + * its unlocked and probably unsafe to touch the TCB. + */ +static int +bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, + uint32_t tiwin, int32_t tlen, + int32_t * ofia, int32_t thflags, int32_t * ret_val) +{ + int32_t ourfinisacked = 0; + int32_t acked_amount; + uint16_t nsegs; + int32_t acked; + uint32_t lost, sack_changed = 0; + struct mbuf *mfree; + struct tcp_bbr *bbr; + uint32_t prev_acked = 0; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + lost = bbr->r_ctl.rc_lost; + nsegs = max(1, m->m_pkthdr.lro_nsegs); + if (SEQ_GT(th->th_ack, tp->snd_max)) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + bbr->r_wanted_output = 1; + return (1); + } + if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { + /* Process the ack */ + if (bbr->rc_in_persist) + tp->t_rxtshift = 0; + if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) + bbr_strike_dupack(bbr); + sack_changed = bbr_log_ack(tp, to, th, &prev_acked); + } + bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, (bbr->r_ctl.rc_lost > lost)); + if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { + /* + * Old ack, behind the last one rcv'd or a duplicate ack + * with SACK info. 
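+ *
+ * Nothing new is cum-acked on this path, but sack_changed
+ * from bbr_log_ack() above still reflects any fresh SACK
+ * blocks the duplicate carried and is handed to
+ * bbr_ack_received() below.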
+ */ + if (th->th_ack == tp->snd_una) { + bbr_ack_received(tp, bbr, th, 0, sack_changed, prev_acked, __LINE__, 0); + if (bbr->r_state == TCPS_SYN_SENT) { + /* + * Special case on where we sent SYN. When + * the SYN-ACK is processed in syn_sent + * state it bumps the snd_una. This causes + * us to hit here even though we did ack 1 + * byte. + * + * Go through the nothing left case so we + * send data. + */ + goto nothing_left; + } + } + return (0); + } + /* + * If we reach this point, ACK is not a duplicate, i.e., it ACKs + * something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our SYN has + * been ACK'd (so connection is now fully synchronized). Go + * to non-starred state, increment snd_una for ACK of SYN, + * and check if we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == + (TF_RCVD_SCALE | TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + /* Send window already scaled. */ + } + } + INP_WLOCK_ASSERT(tp->t_inpcb); + + acked = BYTES_THIS_ACK(tp, th); + TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs); + TCPSTAT_ADD(tcps_rcvackbyte, acked); + + /* + * If we just performed our first retransmit, and the ACK arrives + * within our recovery window, then it was a mistake to do the + * retransmit in the first place. Recover our original cwnd and + * ssthresh, and proceed to transmit where we left off. + */ + if (tp->t_flags & TF_PREVVALID) { + tp->t_flags &= ~TF_PREVVALID; + if (tp->t_rxtshift == 1 && + (int)(ticks - tp->t_badrxtwin) < 0) + bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); + } + SOCKBUF_LOCK(&so->so_snd); + acked_amount = min(acked, (int)sbavail(&so->so_snd)); + tp->snd_wnd -= acked_amount; + mfree = sbcut_locked(&so->so_snd, acked_amount); + /* NB: sowwakeup_locked() does an implicit unlock. */ + sowwakeup_locked(so); + m_freem(mfree); + if (SEQ_GT(th->th_ack, tp->snd_una)) { + bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp)); + } + tp->snd_una = th->th_ack; + bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, (bbr->r_ctl.rc_lost - lost)); + if (IN_RECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover) && + (SEQ_LT(th->th_ack, tp->snd_max))) { + tcp_bbr_partialack(tp); + } else { + bbr_post_recovery(tp); + } + } + if (SEQ_GT(tp->snd_una, tp->snd_recover)) { + tp->snd_recover = tp->snd_una; + } + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tp->snd_nxt = tp->snd_max; + } + if (tp->snd_una == tp->snd_max) { + /* Nothing left outstanding */ +nothing_left: + bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__); + if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) + bbr->rc_tp->t_acktime = 0; + if ((sbused(&so->so_snd) == 0) && + (tp->t_flags & TF_SENTFIN)) { + ourfinisacked = 1; + } + bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); + if (bbr->rc_in_persist == 0) { + bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime; + } + sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); + bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime); + /* + * We invalidate the last ack here since we + * don't want to transfer forward the time + * for our sum's calculations. + */ + if ((tp->t_state >= TCPS_FIN_WAIT_1) && + (sbavail(&so->so_snd) == 0) && + (tp->t_flags2 & TF2_DROP_AF_DATA)) { + /* + * The socket was gone and the peer sent data, time + * to reset him. 
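+ *
+ * TF2_DROP_AF_DATA was set when data arrived after the
+ * file descriptor was gone (see bbr_check_data_after_close());
+ * now that nothing is left outstanding we close the
+ * connection and answer with a reset below.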
+ */ + *ret_val = 1; + tp = tcp_close(tp); + ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); + BBR_STAT_INC(bbr_dropped_af_data); + return (1); + } + /* Set need output so persist might get set */ + bbr->r_wanted_output = 1; + } + if (ofia) + *ofia = ourfinisacked; + return (0); +} + +static void +bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line) +{ + if (bbr->rc_in_persist == 0) { + bbr_timer_cancel(bbr, __LINE__, cts); + bbr->r_ctl.rc_last_delay_val = 0; + tp->t_rxtshift = 0; + bbr->rc_in_persist = 1; + bbr->r_ctl.rc_went_idle_time = cts; + /* We should be capped when rw went to 0 but just in case */ + bbr_log_type_pesist(bbr, cts, 0, line, 1); + /* Time freezes for the state, so do the accounting now */ + if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { + uint32_t time_in; + + time_in = cts - bbr->r_ctl.rc_bbr_state_time; + if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { + int32_t idx; + + idx = bbr_state_val(bbr); + counter_u64_add(bbr_state_time[(idx + 5)], time_in); + } else { + counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); + } + } + bbr->r_ctl.rc_bbr_state_time = cts; + } +} + +static void +bbr_restart_after_idle(struct tcp_bbr *bbr, uint32_t cts, uint32_t idle_time) +{ + /* + * Note that if idle time does not exceed our + * threshold, we do nothing continuing the state + * transitions we were last walking through. + */ + if (idle_time >= bbr_idle_restart_threshold) { + if (bbr->rc_use_idle_restart) { + bbr->rc_bbr_state = BBR_STATE_IDLE_EXIT; + /* + * Set our target using BBR_UNIT, so + * we increase at a dramatic rate but + * we stop when we get the pipe + * full again for our current b/w estimate. + */ + bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; + bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; + bbr_set_state_target(bbr, __LINE__); + /* Now setup our gains to ramp up */ + bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; + bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; + bbr_log_type_statechange(bbr, cts, __LINE__); + } else { + bbr_substate_change(bbr, cts, __LINE__, 1); + } + } +} + +static void +bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line) +{ + uint32_t idle_time; + + if (bbr->rc_in_persist == 0) + return; + idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time); + bbr->rc_in_persist = 0; + bbr->rc_hit_state_1 = 0; + tp->t_flags &= ~TF_FORCEDATA; + bbr->r_ctl.rc_del_time = cts; + /* + * We invalidate the last ack here since we + * don't want to transfer forward the time + * for our sum's calculations. + */ + if (bbr->rc_inp->inp_in_hpts) { + tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); + bbr->rc_timer_first = 0; + bbr->r_ctl.rc_hpts_flags = 0; + bbr->r_ctl.rc_last_delay_val = 0; + bbr->r_ctl.rc_hptsi_agg_delay = 0; + bbr->r_agg_early_set = 0; + bbr->r_ctl.rc_agg_early = 0; + } + bbr_log_type_pesist(bbr, cts, idle_time, line, 0); + if (idle_time >= bbr_rtt_probe_time) { + /* + * This qualifies as a RTT_PROBE session since we drop the + * data outstanding to nothing and waited more than + * bbr_rtt_probe_time. + */ + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_PERSIST, 0); + bbr->r_ctl.last_in_probertt = bbr->r_ctl.rc_rtt_shrinks = cts; + } + tp->t_rxtshift = 0; + /* + * If in probeBW and we have persisted more than an RTT lets do + * special handling. + */ + /* Force a time based epoch */ + bbr_set_epoch(bbr, cts, __LINE__); + /* + * Setup the lost so we don't count anything against the guy + * we have been stuck with during persists. 
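+ *
+ * In other words, the loss count carried into the next state
+ * (bbr_lost_at_state) is re-anchored at the current rc_lost
+ * below, so losses taken while the peer's window was closed
+ * are not charged against that state.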
+ */ + bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; + /* Time un-freezes for the state */ + bbr->r_ctl.rc_bbr_state_time = cts; + if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) || + (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)) { + /* + * If we are going back to probe-bw + * or probe_rtt, we may need to possibly + * do a fast restart. + */ + bbr_restart_after_idle(bbr, cts, idle_time); + } +} + +static void +bbr_collapsed_window(struct tcp_bbr *bbr) +{ + /* + * Now we must walk the + * send map and divide the + * ones left stranded. These + * guys can't cause us to abort + * the connection and are really + * "unsent". However if a buggy + * client actually did keep some + * of the data i.e. collapsed the win + * and refused to ack and then opened + * the win and acked that data. We would + * get into an ack war, the simplier + * method then of just pretending we + * did not send those segments something + * won't work. + */ + struct bbr_sendmap *rsm, *nrsm; + tcp_seq max_seq; + uint32_t maxseg; + int can_split = 0; + int fnd = 0; + + maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; + max_seq = bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd; + bbr_log_type_rwnd_collapse(bbr, max_seq, 1, 0); + TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { + /* Find the first seq past or at maxseq */ + if (rsm->r_flags & BBR_RWND_COLLAPSED) + rsm->r_flags &= ~BBR_RWND_COLLAPSED; + if (SEQ_GEQ(max_seq, rsm->r_start) && + SEQ_GEQ(rsm->r_end, max_seq)) { + fnd = 1; + break; + } + } + bbr->rc_has_collapsed = 0; + if (!fnd) { + /* Nothing to do strange */ + return; + } + /* + * Now can we split? + * + * We don't want to split if splitting + * would generate too many small segments + * less we let an attacker fragment our + * send_map and leave us out of memory. + */ + if ((max_seq != rsm->r_start) && + (max_seq != rsm->r_end)){ + /* can we split? */ + int res1, res2; + + res1 = max_seq - rsm->r_start; + res2 = rsm->r_end - max_seq; + if ((res1 >= (maxseg/8)) && + (res2 >= (maxseg/8))) { + /* No small pieces here */ + can_split = 1; + } else if (bbr->r_ctl.rc_num_small_maps_alloced < bbr_sack_block_limit) { + /* We are under the limit */ + can_split = 1; + } + } + /* Ok do we need to split this rsm? */ + if (max_seq == rsm->r_start) { + /* It's this guy no split required */ + nrsm = rsm; + } else if (max_seq == rsm->r_end) { + /* It's the next one no split required. */ + nrsm = TAILQ_NEXT(rsm, r_next); + if (nrsm == NULL) { + /* Huh? */ + return; + } + } else if (can_split && SEQ_LT(max_seq, rsm->r_end)) { + /* yep we need to split it */ + nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); + if (nrsm == NULL) { + /* failed XXXrrs what can we do mark the whole? */ + nrsm = rsm; + goto no_split; + } + /* Clone it */ + bbr_log_type_rwnd_collapse(bbr, max_seq, 3, 0); + bbr_clone_rsm(bbr, nrsm, rsm, max_seq); + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + } else { + /* + * Split not allowed just start here just + * use this guy. 
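+ *
+ * We get here when max_seq lands inside this rsm but a split
+ * would create a runt piece while we are already at the
+ * small-map allocation limit, so the whole rsm is treated as
+ * collapsed instead.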
+ */ + nrsm = rsm; + } +no_split: + BBR_STAT_INC(bbr_collapsed_win); + /* reuse fnd as a count */ + fnd = 0; + TAILQ_FOREACH_FROM(nrsm, &bbr->r_ctl.rc_map, r_next) { + nrsm->r_flags |= BBR_RWND_COLLAPSED; + fnd++; + bbr->rc_has_collapsed = 1; + } + bbr_log_type_rwnd_collapse(bbr, max_seq, 4, fnd); +} + +static void +bbr_un_collapse_window(struct tcp_bbr *bbr) +{ + struct bbr_sendmap *rsm; + int cleared = 0; + + TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { + if (rsm->r_flags & BBR_RWND_COLLAPSED) { + /* Clear the flag */ + rsm->r_flags &= ~BBR_RWND_COLLAPSED; + cleared++; + } else + break; + } + bbr_log_type_rwnd_collapse(bbr, + (bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd), 0, cleared); + bbr->rc_has_collapsed = 0; +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. + */ +static int +bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + /* + * Update window information. Don't look at window if no ACK: TAC's + * send garbage on first SYN. + */ + uint16_t nsegs; + int32_t tfo_syn; + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + INP_WLOCK_ASSERT(tp->t_inpcb); + nsegs = max(1, m->m_pkthdr.lro_nsegs); + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + TCPSTAT_INC(tcps_rcvwinupd); + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + bbr->r_wanted_output = 1; + } else if (thflags & TH_ACK) { + if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + } + } + if (tp->snd_wnd < ctf_outstanding(tp)) + /* The peer collapsed its window on us */ + bbr_collapsed_window(bbr); + else if (bbr->rc_has_collapsed) + bbr_un_collapse_window(bbr); + /* Was persist timer active and now we have window space? */ + if ((bbr->rc_in_persist != 0) && + (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), + bbr_minseg(bbr)))) { + /* + * Make the rate persist at end of persist mode if idle long + * enough + */ + bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); + + /* Make sure we output to start the timer */ + bbr->r_wanted_output = 1; + } + /* Do we need to enter persist? */ + if ((bbr->rc_in_persist == 0) && + (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_max == tp->snd_una) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { + /* No send window.. we must enter persist */ + bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); + } + if (tp->t_flags2 & TF2_DROP_AF_DATA) { + m_freem(m); + return (0); + } + /* + * Process segments with URG. + */ + if ((thflags & TH_URG) && th->th_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept random + * urgent pointers, we'll crash in soreceive. It's hard to + * imagine someone actually wanting to send this much urgent + * data. 
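+ *
+ * Note the sanity check below does not drop the segment; it
+ * only forgets the urgent indication (th_urp = 0) and falls
+ * through to normal data processing.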
+ */ + SOCKBUF_LOCK(&so->so_rcv); + if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ + SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, then + * mark the data stream. This should not happen in + * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a + * FIN has been received from the remote side. In these + * states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), the urgent + * pointer points to the last octet of urgent data. We + * continue, however, to consider it to indicate the first + * octet of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; + so->so_oobmark = sbavail(&so->so_rcv) + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_rcv.sb_state |= SBS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + SOCKBUF_UNLOCK(&so->so_rcv); + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (th->th_urp <= (uint32_t)tlen && + !(so->so_options & SO_OOBINLINE)) { + /* hdr drop is delayed */ + tcp_pulloutofband(so, th, m, drop_hdrlen); + } + } else { + /* + * If no out of band data is expected, pull receive urgent + * pointer along with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; + } +dodata: /* XXX */ + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Process the segment text, merging it into the TCP sequencing + * queue, and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data is + * presented to the user (this happens in tcp_usrreq.c, case + * PRU_RCVD). If a FIN has already been received on this connection + * then we just ignore the text. + */ + tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && + IS_FASTOPEN(tp->t_flags)); + if ((tlen || (thflags & TH_FIN) || tfo_syn) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + tcp_seq save_start = th->th_seq; + tcp_seq save_rnxt = tp->rcv_nxt; + int save_tlen = tlen; + + m_adj(m, drop_hdrlen); /* delayed header drop */ + /* + * Insert segment which includes th into TCP reassembly + * queue with control block tp. Set thflags to whether + * reassembly now includes a segment with FIN. This handles + * the common case inline (segment is the next to be + * received on an established connection, and the queue is + * empty), avoiding linkage into and removal from the queue + * and repetition of various conversions. Set DELACK for + * segments received in order, but ack immediately when + * segments are out of order (so fast retransmit can work). 
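+ *
+ * The in-line case below requires th_seq == rcv_nxt, an empty
+ * reassembly queue and an established connection (or a TFO
+ * SYN in SYN_RECEIVED); everything else goes through
+ * tcp_reass().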
+ */ + if (th->th_seq == tp->rcv_nxt && + SEGQ_EMPTY(tp) && + (TCPS_HAVEESTABLISHED(tp->t_state) || + tfo_syn)) { +#ifdef NETFLIX_SB_LIMITS + u_int mcnt, appended; + + if (so->so_rcv.sb_shlim) { + mcnt = m_memcnt(m); + appended = 0; + if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, + CFO_NOSLEEP, NULL) == false) { + counter_u64_add(tcp_sb_shlim_fails, 1); + m_freem(m); + return (0); + } + } +#endif + if (DELAY_ACK(tp, bbr, nsegs) || tfo_syn) { + bbr->bbr_segs_rcvd += max(1, nsegs); + tp->t_flags |= TF_DELACK; + bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); + } else { + bbr->r_wanted_output = 1; + tp->t_flags |= TF_ACKNOW; + } + tp->rcv_nxt += tlen; + thflags = th->th_flags & TH_FIN; + TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); + TCPSTAT_ADD(tcps_rcvbyte, tlen); + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + m_freem(m); + else +#ifdef NETFLIX_SB_LIMITS + appended = +#endif + sbappendstream_locked(&so->so_rcv, m, 0); + /* NB: sorwakeup_locked() does an implicit unlock. */ + sorwakeup_locked(so); +#ifdef NETFLIX_SB_LIMITS + if (so->so_rcv.sb_shlim && appended != mcnt) + counter_fo_release(so->so_rcv.sb_shlim, + mcnt - appended); +#endif + } else { + /* + * XXX: Due to the header drop above "th" is + * theoretically invalid by now. Fortunately + * m_adj() doesn't actually frees any mbufs when + * trimming from the head. + */ + tcp_seq temp = save_start; + thflags = tcp_reass(tp, th, &temp, &tlen, m); + tp->t_flags |= TF_ACKNOW; + } + if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { + if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { + /* + * DSACK actually handled in the fastpath + * above. + */ + tcp_update_sack_list(tp, save_start, + save_start + save_tlen); + } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { + if ((tp->rcv_numsacks >= 1) && + (tp->sackblks[0].end == save_start)) { + /* + * Partial overlap, recorded at todrop + * above. + */ + tcp_update_sack_list(tp, + tp->sackblks[0].start, + tp->sackblks[0].end); + } else { + tcp_update_dsack_list(tp, save_start, + save_start + save_tlen); + } + } else if (tlen >= save_tlen) { + /* Update of sackblks. */ + tcp_update_dsack_list(tp, save_start, + save_start + save_tlen); + } else if (tlen > 0) { + tcp_update_dsack_list(tp, save_start, + save_start + tlen); + } + } + } else { + m_freem(m); + thflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know that the + * connection is closing. + */ + if (thflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized (ie NEEDSYN + * flag on) then delay ACK, so it may be piggybacked + * when SYN is sent. Otherwise, since we received a + * FIN then no more input can be expected, send ACK + * now. + */ + if (tp->t_flags & TF_NEEDSYN) { + tp->t_flags |= TF_DELACK; + bbr_timer_cancel(bbr, + __LINE__, bbr->r_ctl.rc_rcvtime); + } else { + tp->t_flags |= TF_ACKNOW; + } + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES enter the + * CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + case TCPS_ESTABLISHED: + tcp_state_change(tp, TCPS_CLOSE_WAIT); + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been + * acked so enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tcp_state_change(tp, TCPS_CLOSING); + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the + * other standard timers. 
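+ *
+ * tcp_twstart() takes over (and typically discards) the tcb,
+ * which is why this case returns 1 so the caller never
+ * touches tp again.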
+ */ + case TCPS_FIN_WAIT_2: + bbr->rc_timer_first = 1; + bbr_timer_cancel(bbr, + __LINE__, bbr->r_ctl.rc_rcvtime); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + tcp_twstart(tp); + return (1); + } + } + /* + * Return any desired output. + */ + if ((tp->t_flags & TF_ACKNOW) || + (sbavail(&so->so_snd) > ctf_outstanding(tp))) { + bbr->r_wanted_output = 1; + } + INP_WLOCK_ASSERT(tp->t_inpcb); + return (0); +} + +/* + * Here nothing is really faster, its just that we + * have broken out the fast-data path also just like + * the fast-ack. Return 1 if we processed the packet + * return 0 if you need to take the "slow-path". + */ +static int +bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t nxt_pkt) +{ + uint16_t nsegs; + int32_t newsize = 0; /* automatic sockbuf scaling */ + struct tcp_bbr *bbr; +#ifdef NETFLIX_SB_LIMITS + u_int mcnt, appended; +#endif +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. + */ + u_char tcp_saveipgen[IP6_HDR_LEN]; + struct tcphdr tcp_savetcp; + short ostate = 0; + +#endif + /* On the hpts and we would have called output */ + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + + /* + * If last ACK falls within this segment's sequence numbers, record + * the timestamp. NOTE that the test is modified according to the + * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if (bbr->r_ctl.rc_resend != NULL) { + return (0); + } + if (tiwin && tiwin != tp->snd_wnd) { + return (0); + } + if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { + return (0); + } + if (__predict_false((to->to_flags & TOF_TS) && + (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { + return (0); + } + if (__predict_false((th->th_ack != tp->snd_una))) { + return (0); + } + if (__predict_false(tlen > sbspace(&so->so_rcv))) { + return (0); + } + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent = to->to_tsval; + } + /* + * This is a pure, in-sequence data packet with nothing on the + * reassembly queue and we have enough buffer space to take it. + */ + nsegs = max(1, m->m_pkthdr.lro_nsegs); +#ifdef NETFLIX_SB_LIMITS + if (so->so_rcv.sb_shlim) { + mcnt = m_memcnt(m); + appended = 0; + if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, + CFO_NOSLEEP, NULL) == false) { + counter_u64_add(tcp_sb_shlim_fails, 1); + m_freem(m); + return (1); + } + } +#endif + /* Clean receiver SACK report if present */ + if (tp->rcv_numsacks) + tcp_clean_sackreport(tp); + TCPSTAT_INC(tcps_preddat); + tp->rcv_nxt += tlen; + /* + * Pull snd_wl1 up to prevent seq wrap relative to th_seq. + */ + tp->snd_wl1 = th->th_seq; + /* + * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. + */ + tp->rcv_up = tp->rcv_nxt; + TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); + TCPSTAT_ADD(tcps_rcvbyte, tlen); +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, &tcp_savetcp, 0); +#endif + newsize = tcp_autorcvbuf(m, th, so, tp, tlen); + + /* Add data to socket buffer. */ + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + m_freem(m); + } else { + /* + * Set new socket buffer size. Give up when limit is + * reached. 
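+ *
+ * A failed sbreserve_locked() only clears SB_AUTOSIZE; the
+ * segment itself is still appended below.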
+ */ + if (newsize) + if (!sbreserve_locked(&so->so_rcv, + newsize, so, NULL)) + so->so_rcv.sb_flags &= ~SB_AUTOSIZE; + m_adj(m, drop_hdrlen); /* delayed header drop */ +#ifdef NETFLIX_SB_LIMITS + appended = +#endif + sbappendstream_locked(&so->so_rcv, m, 0); + ctf_calc_rwin(so, tp); + } + /* NB: sorwakeup_locked() does an implicit unlock. */ + sorwakeup_locked(so); +#ifdef NETFLIX_SB_LIMITS + if (so->so_rcv.sb_shlim && mcnt != appended) + counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); +#endif + if (DELAY_ACK(tp, bbr, nsegs)) { + bbr->bbr_segs_rcvd += max(1, nsegs); + tp->t_flags |= TF_DELACK; + bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); + } else { + bbr->r_wanted_output = 1; + tp->t_flags |= TF_ACKNOW; + } + return (1); +} + +/* + * This subfunction is used to try to highly optimize the + * fast path. We again allow window updates that are + * in sequence to remain in the fast-path. We also add + * in the __predict's to attempt to help the compiler. + * Note that if we return a 0, then we can *not* process + * it and the caller should push the packet into the + * slow-path. If we return 1, then all is well and + * the packet is fully processed. + */ +static int +bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t nxt_pkt) +{ + int32_t acked; + uint16_t nsegs; + uint32_t sack_changed; +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. + */ + u_char tcp_saveipgen[IP6_HDR_LEN]; + struct tcphdr tcp_savetcp; + short ostate = 0; + +#endif + uint32_t prev_acked = 0; + struct tcp_bbr *bbr; + + if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { + /* Old ack, behind (or duplicate to) the last one rcv'd */ + return (0); + } + if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { + /* Above what we have sent? */ + return (0); + } + if (__predict_false(tiwin == 0)) { + /* zero window */ + return (0); + } + if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { + /* We need a SYN or a FIN, unlikely.. */ + return (0); + } + if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { + /* Timestamp is behind .. old ack with seq wrap? */ + return (0); + } + if (__predict_false(IN_RECOVERY(tp->t_flags))) { + /* Still recovering */ + return (0); + } + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + if (__predict_false(bbr->r_ctl.rc_resend != NULL)) { + /* We are retransmitting */ + return (0); + } + if (__predict_false(bbr->rc_in_persist != 0)) { + /* In persist mode */ + return (0); + } + if (bbr->r_ctl.rc_sacked) { + /* We have sack holes on our scoreboard */ + return (0); + } + /* Ok if we reach here, we can process a fast-ack */ + nsegs = max(1, m->m_pkthdr.lro_nsegs); + sack_changed = bbr_log_ack(tp, to, th, &prev_acked); + /* + * We never detect loss in fast ack [we can't + * have a sack and can't be in recovery so + * we always pass 0 (nothing detected)]. + */ + bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, 0); + /* Did the window get updated? */ + if (tiwin != tp->snd_wnd) { + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + } + /* Do we need to exit persists? */ + if ((bbr->rc_in_persist != 0) && + (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), + bbr_minseg(bbr)))) { + bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); + bbr->r_wanted_output = 1; + } + /* Do we need to enter persists? 
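+ * Same test as in bbr_process_data(): the offered window has
+ * shrunk below min(rc_high_rwnd/2, bbr_minseg()), nothing is
+ * outstanding, and more data is queued than the window
+ * allows.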
*/ + if ((bbr->rc_in_persist == 0) && + (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_max == tp->snd_una) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { + /* No send window.. we must enter persist */ + bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * the timestamp. NOTE that the test is modified according to the + * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = bbr->r_ctl.rc_rcvtime; + tp->ts_recent = to->to_tsval; + } + /* + * This is a pure ack for outstanding data. + */ + TCPSTAT_INC(tcps_predack); + + /* + * "bad retransmit" recovery. + */ + if (tp->t_flags & TF_PREVVALID) { + tp->t_flags &= ~TF_PREVVALID; + if (tp->t_rxtshift == 1 && + (int)(ticks - tp->t_badrxtwin) < 0) + bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); + } + /* + * Recalculate the transmit timer / rtt. + * + * Some boxes send broken timestamp replies during the SYN+ACK + * phase, ignore timestamps of 0 or we could calculate a huge RTT + * and blow up the retransmit timer. + */ + acked = BYTES_THIS_ACK(tp, th); + +#ifdef TCP_HHOOK + /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ + hhook_run_tcp_est_in(tp, th, to); +#endif + + TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs); + TCPSTAT_ADD(tcps_rcvackbyte, acked); + sbdrop(&so->so_snd, acked); + + if (SEQ_GT(th->th_ack, tp->snd_una)) + bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp)); + tp->snd_una = th->th_ack; + if (tp->snd_wnd < ctf_outstanding(tp)) + /* The peer collapsed its window on us */ + bbr_collapsed_window(bbr); + else if (bbr->rc_has_collapsed) + bbr_un_collapse_window(bbr); + + if (SEQ_GT(tp->snd_una, tp->snd_recover)) { + tp->snd_recover = tp->snd_una; + } + bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, 0); + /* + * Pull snd_wl2 up to prevent seq wrap relative to th_ack. + */ + tp->snd_wl2 = th->th_ack; + m_freem(m); + /* + * If all outstanding data are acked, stop retransmit timer, + * otherwise restart timer using current (possibly backed-off) + * value. If process is waiting for space, wakeup/selwakeup/signal. + * If data are ready to send, let tcp_output decide between more + * output or persist. + */ +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + /* Wake up the socket if we have room to write more */ + sowwakeup(so); + if (tp->snd_una == tp->snd_max) { + /* Nothing left outstanding */ + bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__); + if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) + bbr->rc_tp->t_acktime = 0; + bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); + if (bbr->rc_in_persist == 0) { + bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime; + } + sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); + bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime); + /* + * We invalidate the last ack here since we + * don't want to transfer forward the time + * for our sum's calculations. + */ + bbr->r_wanted_output = 1; + } + if (sbavail(&so->so_snd)) { + bbr->r_wanted_output = 1; + } + return (1); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. 
+ */ +static int +bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t todrop; + int32_t ourfinisacked = 0; + struct tcp_bbr *bbr; + int32_t ret_val = 0; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + ctf_calc_rwin(so, tp); + /* + * If the state is SYN_SENT: if seg contains an ACK, but not for our + * SYN, drop the input. if seg contains a RST, then drop the + * connection. if seg does not contain SYN, then drop it. Otherwise + * this is an acceptable SYN segment initialize tp->rcv_nxt and + * tp->irs if seg contains ack then advance tp->snd_una. BRR does + * not support ECN so we will not say we are capable. if SYN has + * been acked change to ESTABLISHED else SYN_RCVD state arrange for + * segment to be acked (eventually) continue processing rest of + * data/controls, beginning with URG + */ + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { + TCP_PROBE5(connect__refused, NULL, tp, + mtod(m, const char *), tp, th); + tp = tcp_drop(tp, ECONNREFUSED); + ctf_do_drop(m, tp); + return (1); + } + if (thflags & TH_RST) { + ctf_do_drop(m, tp); + return (1); + } + if (!(thflags & TH_SYN)) { + ctf_do_drop(m, tp); + return (1); + } + tp->irs = th->th_seq; + tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { + int tfo_partial = 0; + + TCPSTAT_INC(tcps_connects); + soisconnected(so); +#ifdef MAC + mac_socketpeer_set_from_mbuf(m, so); +#endif + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == + (TF_RCVD_SCALE | TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + } + tp->rcv_adv += min(tp->rcv_wnd, + TCP_MAXWIN << tp->rcv_scale); + /* + * If not all the data that was sent in the TFO SYN + * has been acked, resend the remainder right away. + */ + if (IS_FASTOPEN(tp->t_flags) && + (tp->snd_una != tp->snd_max)) { + tp->snd_nxt = th->th_ack; + tfo_partial = 1; + } + /* + * If there's data, delay ACK; if there's also a FIN ACKNOW + * will be turned on later. + */ + if (DELAY_ACK(tp, bbr, 1) && tlen != 0 && (tfo_partial == 0)) { + bbr->bbr_segs_rcvd += 1; + tp->t_flags |= TF_DELACK; + bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); + } else { + bbr->r_wanted_output = 1; + tp->t_flags |= TF_ACKNOW; + } + if (SEQ_GT(th->th_ack, tp->iss)) { + /* + * The SYN is acked + * handle it specially. + */ + bbr_log_syn(tp, to); + } + if (SEQ_GT(th->th_ack, tp->snd_una)) { + /* + * We advance snd_una for the + * fast open case. If th_ack is + * acknowledging data beyond + * snd_una we can't just call + * ack-processing since the + * data stream in our send-map + * will start at snd_una + 1 (one + * beyond the SYN). If its just + * equal we don't need to do that + * and there is no send_map. + */ + tp->snd_una++; + } + /* + * Received in SYN_SENT[*] state. Transitions: + * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tcp_state_change(tp, TCPS_FIN_WAIT_1); + tp->t_flags &= ~TF_NEEDFIN; + thflags &= ~TH_SYN; + } else { + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(connect__established, NULL, tp, + mtod(m, const char *), tp, th); + cc_conn_init(tp); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simultaneous + * open. 
If segment contains CC option and there is a + * cached CC, apply TAO test. If it succeeds, connection is * + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If + * there was no CC option, clear cached CC value. + */ + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + tcp_state_change(tp, TCPS_SYN_RECEIVED); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + /* + * Advance th->th_seq to correspond to first data byte. If data, + * trim to stay within window, dropping FIN if necessary. + */ + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; + m_adj(m, -todrop); + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; + TCPSTAT_INC(tcps_rcvpackafterwin); + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + } + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; + /* + * Client side of transaction: already sent SYN and data. If the + * remote host used T/TCP to validate the SYN, our data will be + * ACK'd; if so, enter normal data segment processing in the middle + * of step 5, ack processing. Otherwise, goto step 6. + */ + if (thflags & TH_ACK) { + if ((to->to_flags & TOF_TS) != 0) { + uint32_t t, rtt; + + t = tcp_tv_to_mssectick(&bbr->rc_tv); + if (TSTMP_GEQ(t, to->to_tsecr)) { + rtt = t - to->to_tsecr; + if (rtt == 0) { + rtt = 1; + } + rtt *= MS_IN_USEC; + tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0); + apply_filter_min_small(&bbr->r_ctl.rc_rttprop, + rtt, bbr->r_ctl.rc_rcvtime); + } + } + if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) + return (ret_val); + /* We may have changed to FIN_WAIT_1 above */ + if (tp->t_state == TCPS_FIN_WAIT_1) { + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now + * acknowledged then enter FIN_WAIT_2. + */ + if (ourfinisacked) { + /* + * If we can't receive any more data, then + * closing user can proceed. Starting the + * timer is contrary to the specification, + * but if we don't get a FIN we'll hang + * forever. + * + * XXXjl: we should release the tp also, and + * use a compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + tcp_state_change(tp, TCPS_FIN_WAIT_2); + } + } + } + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. + */ +static int +bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ourfinisacked = 0; + int32_t ret_val; + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + ctf_calc_rwin(so, tp); + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) { + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + if (IS_FASTOPEN(tp->t_flags)) { + /* + * When a TFO connection is in SYN_RECEIVED, the only valid + * packets are the initial SYN, a retransmit/copy of the + * initial SYN (possibly with a subset of the original + * data), a valid ACK, a FIN, or a RST. 
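+ *
+ * Concretely: a SYN|ACK here draws a reset, a bare
+ * retransmitted SYN is dropped while one of our
+ * retransmit/TLP/RACK timers is pending, and a segment with
+ * none of ACK, FIN or RST set is dropped as well.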
+ */ + if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } else if (thflags & TH_SYN) { + /* non-initial SYN is ignored */ + if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || + (bbr->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || + (bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { + ctf_do_drop(m, NULL); + return (0); + } + } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { + ctf_do_drop(m, NULL); + return (0); + } + } + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) + return (ret_val); + } + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know the + * sequence numbers haven't wrapped. This is a partial fix for the + * "LAND" DoS attack. + */ + if (SEQ_LT(th->th_seq, tp->irs)) { + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent = to->to_tsval; + } + tp->snd_wnd = tiwin; + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (IS_FASTOPEN(tp->t_flags)) { + cc_conn_init(tp); + } + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); + } + TCPSTAT_INC(tcps_connects); + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == + (TF_RCVD_SCALE | TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + } + /* + * ok for the first time in lets see if we can use the ts to figure + * out what the initial RTT was. 
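+ *
+ * The math below: rtt = now - TSecr in millisecond ticks,
+ * floored at 1 and scaled by MS_IN_USEC, then used to seed
+ * the transmit timer and the rc_rttprop filter.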
+ */ + if ((to->to_flags & TOF_TS) != 0) { + uint32_t t, rtt; + + t = tcp_tv_to_mssectick(&bbr->rc_tv); + if (TSTMP_GEQ(t, to->to_tsecr)) { + rtt = t - to->to_tsecr; + if (rtt == 0) { + rtt = 1; + } + rtt *= MS_IN_USEC; + tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0); + apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime); + } + } + /* Drop off any SYN in the send map (probably not there) */ + if (thflags & TH_ACK) + bbr_log_syn(tp, to); + if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { + + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + /* + * Account for the ACK of our SYN prior to regular + * ACK processing below. + */ + tp->snd_una++; + } + /* + * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> + * FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tcp_state_change(tp, TCPS_FIN_WAIT_1); + tp->t_flags &= ~TF_NEEDFIN; + } else { + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(accept__established, NULL, tp, + mtod(m, const char *), tp, th); + /* + * TFO connections call cc_conn_init() during SYN + * processing. Calling it again here for such connections + * is not harmless as it would undo the snd_cwnd reduction + * that occurs when a TFO SYN|ACK is retransmitted. + */ + if (!IS_FASTOPEN(tp->t_flags)) + cc_conn_init(tp); + } + /* + * If segment contains data or ACK, will call tcp_reass() later; if + * not, do so now to pass queued data to user. + */ + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, + (struct mbuf *)0); + tp->snd_wl1 = th->th_seq - 1; + if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (tp->t_state == TCPS_FIN_WAIT_1) { + /* We could have went to FIN_WAIT_1 (or EST) above */ + /* + * In FIN_WAIT_1 STATE in addition to the processing for the + * ESTABLISHED state if our FIN is now acknowledged then + * enter FIN_WAIT_2. + */ + if (ourfinisacked) { + /* + * If we can't receive any more data, then closing + * user can proceed. Starting the timer is contrary + * to the specification, but if we don't get a FIN + * we'll hang forever. + * + * XXXjl: we should release the tp also, and use a + * compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + tcp_state_change(tp, TCPS_FIN_WAIT_2); + } + } + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. + */ +static int +bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + struct tcp_bbr *bbr; + int32_t ret_val; + + /* + * Header prediction: check for the two common cases of a + * uni-directional data xfer. If the packet has no control flags, + * is in-sequence, the window didn't change and we're not + * retransmitting, it's a candidate. If the length is zero and the + * ack moved forward, we're the sender side of the xfer. Just free + * the data acked & wake any higher level process that was blocked + * waiting for space. If the length is non-zero and the ack didn't + * move, we're the receiver side. 
If we're getting packets in-order + * (the reassembly queue is empty), add the data toc The socket + * buffer and note that we need a delayed ack. Make sure that the + * hidden state-flags are also off. Since we check for + * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. + */ + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + if (bbr->r_ctl.rc_delivered < (4 * tp->t_maxseg)) { + /* + * If we have delived under 4 segments increase the initial + * window if raised by the peer. We use this to determine + * dynamic and static rwnd's at the end of a connection. + */ + bbr->r_ctl.rc_init_rwnd = max(tiwin, tp->snd_wnd); + } + if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && + __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && + __predict_true(SEGQ_EMPTY(tp)) && + __predict_true(th->th_seq == tp->rcv_nxt)) { + if (tlen == 0) { + if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen, + tiwin, nxt_pkt)) { + return (0); + } + } else { + if (bbr_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, + tiwin, nxt_pkt)) { + return (0); + } + } + } + ctf_calc_rwin(so, tp); + + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + ctf_challenge_ack(m, th, tp, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) + return (ret_val); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + bbr->r_wanted_output = 1; + return (ret_val); + } else { + ctf_do_drop(m, NULL); + return (0); + } + } + /* + * Ack processing. 
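+ *
+ * A non-zero return from bbr_process_ack() means the segment
+ * (and possibly the tcb) has already been disposed of, so we
+ * must not fall through to bbr_process_data(); ret_val is
+ * what we hand back to the caller in that case.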
+ */ + if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { + return (ret_val); + } + if (sbavail(&so->so_snd)) { + if (bbr_progress_timeout_check(bbr)) { + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + /* State changes only happen in bbr_process_data() */ + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. + */ +static int +bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + struct tcp_bbr *bbr; + int32_t ret_val; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + ctf_calc_rwin(so, tp); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + ctf_challenge_ack(m, th, tp, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) + return (ret_val); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + bbr->r_wanted_output = 1; + return (ret_val); + } else { + ctf_do_drop(m, NULL); + return (0); + } + } + /* + * Ack processing. 
+ */ + if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { + return (ret_val); + } + if (sbavail(&so->so_snd)) { + if (bbr_progress_timeout_check(bbr)) { + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); +} + +static int +bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr, + struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so) +{ + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + if (bbr->rc_allow_data_af_clo == 0) { +close_now: + tp = tcp_close(tp); + TCPSTAT_INC(tcps_rcvafterclose); + ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); + return (1); + } + if (sbavail(&so->so_snd) == 0) + goto close_now; + /* Ok we allow data that is ignored and a followup reset */ + tp->rcv_nxt = th->th_seq + *tlen; + tp->t_flags2 |= TF2_DROP_AF_DATA; + bbr->r_wanted_output = 1; + *tlen = 0; + return (0); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. + */ +static int +bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ourfinisacked = 0; + int32_t ret_val; + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + ctf_calc_rwin(so, tp); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + ctf_challenge_ack(m, th, tp, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) + return (ret_val); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If new data are received on a connection after the user processes + * are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && tlen) { + /* + * We call a new function now so we might continue and setup + * to reset at all data being ack'd. + */ + if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) + return (1); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. 
+ */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + bbr->r_wanted_output = 1; + return (ret_val); + } else { + ctf_do_drop(m, NULL); + return (0); + } + } + /* + * Ack processing. + */ + if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (ourfinisacked) { + /* + * If we can't receive any more data, then closing user can + * proceed. Starting the timer is contrary to the + * specification, but if we don't get a FIN we'll hang + * forever. + * + * XXXjl: we should release the tp also, and use a + * compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + tcp_state_change(tp, TCPS_FIN_WAIT_2); + } + if (sbavail(&so->so_snd)) { + if (bbr_progress_timeout_check(bbr)) { + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. + */ +static int +bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ourfinisacked = 0; + int32_t ret_val; + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + ctf_calc_rwin(so, tp); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + ctf_challenge_ack(m, th, tp, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) + return (ret_val); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If new data are received on a connection after the user processes + * are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && tlen) { + /* + * We call a new function now so we might continue and setup + * to reset at all data being ack'd. + */ + if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) + return (1); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 
2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + bbr->r_wanted_output = 1; + return (ret_val); + } else { + ctf_do_drop(m, NULL); + return (0); + } + } + /* + * Ack processing. + */ + if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (ourfinisacked) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + tcp_twstart(tp); + m_freem(m); + return (1); + } + if (sbavail(&so->so_snd)) { + if (bbr_progress_timeout_check(bbr)) { + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); +} + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. + */ +static int +bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ourfinisacked = 0; + int32_t ret_val; + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + ctf_calc_rwin(so, tp); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + ctf_challenge_ack(m, th, tp, &ret_val); + return (ret_val); + } + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) + return (ret_val); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If new data are received on a connection after the user processes + * are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && tlen) { + /* + * We call a new function now so we might continue and setup + * to reset at all data being ack'd. + */ + if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) + return (1); + } + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. 
NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + bbr->r_wanted_output = 1; + return (ret_val); + } else { + ctf_do_drop(m, NULL); + return (0); + } + } + /* + * case TCPS_LAST_ACK: Ack processing. + */ + if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (ourfinisacked) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + tp = tcp_close(tp); + ctf_do_drop(m, tp); + return (1); + } + if (sbavail(&so->so_snd)) { + if (bbr_progress_timeout_check(bbr)) { + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); +} + + +/* + * Return value of 1, the TCB is unlocked and most + * likely gone, return value of 0, the TCB is still + * locked. + */ +static int +bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, + uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) +{ + int32_t ourfinisacked = 0; + int32_t ret_val; + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + ctf_calc_rwin(so, tp); + /* Reset receive buffer auto scaling when not in bulk receive mode. */ + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); + + /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + ctf_challenge_ack(m, th, tp, &ret_val); + return (ret_val); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment and + * it's less than ts_recent, drop it. + */ + if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to->to_tsval, tp->ts_recent)) { + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) + return (ret_val); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + return (ret_val); + } + /* + * If new data are received on a connection after the user processes + * are gone, then we may RST the other end depending on the outcome + * of bbr_check_data_after_close. 
+ */ + if ((so->so_state & SS_NOFDREF) && + tlen) { + /* + * We call a new function now so we might continue and setup + * to reset at all data being ack'd. + */ + if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) + return (1); + } + INP_WLOCK_ASSERT(tp->t_inpcb); + /* + * If last ACK falls within this segment's sequence numbers, record + * its timestamp. NOTE: 1) That the test incorporates suggestions + * from the latest proposal of the tcplw@cray.com list (Braden + * 1993/04/26). 2) That updating only on newer timestamps interferes + * with our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 3) That we + * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ + * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + + * SEG.Len, This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated Vol. 2 + * p.869. In such cases, we can still calculate the RTT correctly + * when RCV.NXT == Last.ACK.Sent. + */ + INP_WLOCK_ASSERT(tp->t_inpcb); + if ((to->to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN | TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + tp->ts_recent = to->to_tsval; + } + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag + * is on (half-synchronized state), then queue data for later + * processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_flags & TF_NEEDSYN) { + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); + } else if (tp->t_flags & TF_ACKNOW) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + bbr->r_wanted_output = 1; + return (ret_val); + } else { + ctf_do_drop(m, NULL); + return (0); + } + } + /* + * Ack processing. + */ + INP_WLOCK_ASSERT(tp->t_inpcb); + if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + return (ret_val); + } + if (sbavail(&so->so_snd)) { + if (bbr_progress_timeout_check(bbr)) { + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + } + INP_WLOCK_ASSERT(tp->t_inpcb); + return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, + tiwin, thflags, nxt_pkt)); +} + +static void +bbr_stop_all_timers(struct tcpcb *tp) +{ + struct tcp_bbr *bbr; + + /* + * Assure no timers are running. 
+ */ + if (tcp_timer_active(tp, TT_PERSIST)) { + /* We enter in persists, set the flag appropriately */ + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + bbr->rc_in_persist = 1; + } + tcp_timer_suspend(tp, TT_PERSIST); + tcp_timer_suspend(tp, TT_REXMT); + tcp_timer_suspend(tp, TT_KEEP); + tcp_timer_suspend(tp, TT_DELACK); +} + +static void +bbr_google_mode_on(struct tcp_bbr *bbr) +{ + bbr->rc_use_google = 1; + bbr->rc_no_pacing = 0; + bbr->r_ctl.bbr_google_discount = bbr_google_discount; + bbr->r_use_policer = bbr_policer_detection_enabled; + bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10); + bbr->bbr_use_rack_cheat = 0; + bbr->r_ctl.rc_incr_tmrs = 0; + bbr->r_ctl.rc_inc_tcp_oh = 0; + bbr->r_ctl.rc_inc_ip_oh = 0; + bbr->r_ctl.rc_inc_enet_oh = 0; + reset_time(&bbr->r_ctl.rc_delrate, + BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT); + reset_time_small(&bbr->r_ctl.rc_rttprop, + (11 * USECS_IN_SECOND)); + tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv)); +} + +static void +bbr_google_mode_off(struct tcp_bbr *bbr) +{ + bbr->rc_use_google = 0; + bbr->r_ctl.bbr_google_discount = 0; + bbr->no_pacing_until = bbr_no_pacing_until; + bbr->r_use_policer = 0; + if (bbr->no_pacing_until) + bbr->rc_no_pacing = 1; + else + bbr->rc_no_pacing = 0; + if (bbr_use_rack_resend_cheat) + bbr->bbr_use_rack_cheat = 1; + else + bbr->bbr_use_rack_cheat = 0; + if (bbr_incr_timers) + bbr->r_ctl.rc_incr_tmrs = 1; + else + bbr->r_ctl.rc_incr_tmrs = 0; + if (bbr_include_tcp_oh) + bbr->r_ctl.rc_inc_tcp_oh = 1; + else + bbr->r_ctl.rc_inc_tcp_oh = 0; + if (bbr_include_ip_oh) + bbr->r_ctl.rc_inc_ip_oh = 1; + else + bbr->r_ctl.rc_inc_ip_oh = 0; + if (bbr_include_enet_oh) + bbr->r_ctl.rc_inc_enet_oh = 1; + else + bbr->r_ctl.rc_inc_enet_oh = 0; + bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; + reset_time(&bbr->r_ctl.rc_delrate, + bbr_num_pktepo_for_del_limit); + reset_time_small(&bbr->r_ctl.rc_rttprop, + (bbr_filter_len_sec * USECS_IN_SECOND)); + tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv)); +} +/* + * Return 0 on success, non-zero on failure + * which indicates the error (usually no memory). + */ +static int +bbr_init(struct tcpcb *tp) +{ + struct tcp_bbr *bbr = NULL; + struct inpcb *inp; + uint32_t cts; + + tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO)); + if (tp->t_fb_ptr == NULL) { + /* + * We need to allocate memory but cant. The INP and INP_INFO + * locks and they are recusive (happens during setup. 
So a + * scheme to drop the locks fails :( + * + */ + return (ENOMEM); + } + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + bbr->rtt_valid = 0; + inp = tp->t_inpcb; + inp->inp_flags2 |= INP_CANNOT_DO_ECN; + inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + TAILQ_INIT(&bbr->r_ctl.rc_map); + TAILQ_INIT(&bbr->r_ctl.rc_free); + TAILQ_INIT(&bbr->r_ctl.rc_tmap); + bbr->rc_tp = tp; + if (tp->t_inpcb) { + bbr->rc_inp = tp->t_inpcb; + } + cts = tcp_get_usecs(&bbr->rc_tv); + tp->t_acktime = 0; + bbr->rc_allow_data_af_clo = bbr_ignore_data_after_close; + bbr->r_ctl.rc_reorder_fade = bbr_reorder_fade; + bbr->rc_tlp_threshold = bbr_tlp_thresh; + bbr->r_ctl.rc_reorder_shift = bbr_reorder_thresh; + bbr->r_ctl.rc_pkt_delay = bbr_pkt_delay; + bbr->r_ctl.rc_min_to = bbr_min_to; + bbr->rc_bbr_state = BBR_STATE_STARTUP; + bbr->r_ctl.bbr_lost_at_state = 0; + bbr->r_ctl.rc_lost_at_startup = 0; + bbr->rc_all_timers_stopped = 0; + bbr->r_ctl.rc_bbr_lastbtlbw = 0; + bbr->r_ctl.rc_pkt_epoch_del = 0; + bbr->r_ctl.rc_pkt_epoch = 0; + bbr->r_ctl.rc_lowest_rtt = 0xffffffff; + bbr->r_ctl.rc_bbr_hptsi_gain = bbr_high_gain; + bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain; + bbr->r_ctl.rc_went_idle_time = cts; + bbr->rc_pacer_started = cts; + bbr->r_ctl.rc_pkt_epoch_time = cts; + bbr->r_ctl.rc_rcvtime = cts; + bbr->r_ctl.rc_bbr_state_time = cts; + bbr->r_ctl.rc_del_time = cts; + bbr->r_ctl.rc_tlp_rxt_last_time = cts; + bbr->r_ctl.last_in_probertt = cts; + bbr->skip_gain = 0; + bbr->gain_is_limited = 0; + bbr->no_pacing_until = bbr_no_pacing_until; + if (bbr->no_pacing_until) + bbr->rc_no_pacing = 1; + if (bbr_use_google_algo) { + bbr->rc_no_pacing = 0; + bbr->rc_use_google = 1; + bbr->r_ctl.bbr_google_discount = bbr_google_discount; + bbr->r_use_policer = bbr_policer_detection_enabled; + } else { + bbr->rc_use_google = 0; + bbr->r_ctl.bbr_google_discount = 0; + bbr->r_use_policer = 0; + } + if (bbr_ts_limiting) + bbr->rc_use_ts_limit = 1; + else + bbr->rc_use_ts_limit = 0; + if (bbr_ts_can_raise) + bbr->ts_can_raise = 1; + else + bbr->ts_can_raise = 0; + if (V_tcp_delack_enabled == 1) + tp->t_delayed_ack = 2; + else if (V_tcp_delack_enabled == 0) + tp->t_delayed_ack = 0; + else if (V_tcp_delack_enabled < 100) + tp->t_delayed_ack = V_tcp_delack_enabled; + else + tp->t_delayed_ack = 2; + if (bbr->rc_use_google == 0) + bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; + else + bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10); + bbr->r_ctl.rc_min_rto_ms = bbr_rto_min_ms; + bbr->rc_max_rto_sec = bbr_rto_max_sec; + bbr->rc_init_win = bbr_def_init_win; + if (tp->t_flags & TF_REQ_TSTMP) + bbr->rc_last_options = TCP_TS_OVERHEAD; + bbr->r_ctl.rc_pace_max_segs = tp->t_maxseg - bbr->rc_last_options; + bbr->r_ctl.rc_high_rwnd = tp->snd_wnd; + bbr->r_init_rtt = 1; + + counter_u64_add(bbr_flows_nohdwr_pacing, 1); + if (bbr_allow_hdwr_pacing) + bbr->bbr_hdw_pace_ena = 1; + else + bbr->bbr_hdw_pace_ena = 0; + if (bbr_sends_full_iwnd) + bbr->bbr_init_win_cheat = 1; + else + bbr->bbr_init_win_cheat = 0; + bbr->r_ctl.bbr_utter_max = bbr_hptsi_utter_max; + bbr->r_ctl.rc_drain_pg = bbr_drain_gain; + bbr->r_ctl.rc_startup_pg = bbr_high_gain; + bbr->rc_loss_exit = bbr_exit_startup_at_loss; + bbr->r_ctl.bbr_rttprobe_gain_val = bbr_rttprobe_gain; + bbr->r_ctl.bbr_hptsi_per_second = bbr_hptsi_per_second; + bbr->r_ctl.bbr_hptsi_segments_delay_tar = bbr_hptsi_segments_delay_tar; + bbr->r_ctl.bbr_hptsi_segments_max = bbr_hptsi_segments_max; + bbr->r_ctl.bbr_hptsi_segments_floor = bbr_hptsi_segments_floor; + bbr->r_ctl.bbr_hptsi_bytes_min = bbr_hptsi_bytes_min; + 
bbr->r_ctl.bbr_cross_over = bbr_cross_over; + bbr->r_ctl.rc_rtt_shrinks = cts; + if (bbr->rc_use_google) { + setup_time_filter(&bbr->r_ctl.rc_delrate, + FILTER_TYPE_MAX, + BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT); + setup_time_filter_small(&bbr->r_ctl.rc_rttprop, + FILTER_TYPE_MIN, (11 * USECS_IN_SECOND)); + } else { + setup_time_filter(&bbr->r_ctl.rc_delrate, + FILTER_TYPE_MAX, + bbr_num_pktepo_for_del_limit); + setup_time_filter_small(&bbr->r_ctl.rc_rttprop, + FILTER_TYPE_MIN, (bbr_filter_len_sec * USECS_IN_SECOND)); + } + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_INIT, 0); + if (bbr_uses_idle_restart) + bbr->rc_use_idle_restart = 1; + else + bbr->rc_use_idle_restart = 0; + bbr->r_ctl.rc_bbr_cur_del_rate = 0; + bbr->r_ctl.rc_initial_hptsi_bw = bbr_initial_bw_bps; + if (bbr_resends_use_tso) + bbr->rc_resends_use_tso = 1; +#ifdef NETFLIX_PEAKRATE + tp->t_peakrate_thr = tp->t_maxpeakrate; +#endif + if (tp->snd_una != tp->snd_max) { + /* Create a send map for the current outstanding data */ + struct bbr_sendmap *rsm; + + rsm = bbr_alloc(bbr); + if (rsm == NULL) { + uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); + tp->t_fb_ptr = NULL; + return (ENOMEM); + } + rsm->r_flags = BBR_OVERMAX; + rsm->r_tim_lastsent[0] = cts; + rsm->r_rtr_cnt = 1; + rsm->r_rtr_bytes = 0; + rsm->r_start = tp->snd_una; + rsm->r_end = tp->snd_max; + rsm->r_dupack = 0; + rsm->r_delivered = bbr->r_ctl.rc_delivered; + rsm->r_ts_valid = 0; + rsm->r_del_ack_ts = tp->ts_recent; + rsm->r_del_time = cts; + if (bbr->r_ctl.r_app_limited_until) + rsm->r_app_limited = 1; + else + rsm->r_app_limited = 0; + TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next); + TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 1; + if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) + rsm->r_bbr_state = bbr_state_val(bbr); + else + rsm->r_bbr_state = 8; + } + if (bbr_use_rack_resend_cheat && (bbr->rc_use_google == 0)) + bbr->bbr_use_rack_cheat = 1; + if (bbr_incr_timers && (bbr->rc_use_google == 0)) + bbr->r_ctl.rc_incr_tmrs = 1; + if (bbr_include_tcp_oh && (bbr->rc_use_google == 0)) + bbr->r_ctl.rc_inc_tcp_oh = 1; + if (bbr_include_ip_oh && (bbr->rc_use_google == 0)) + bbr->r_ctl.rc_inc_ip_oh = 1; + if (bbr_include_enet_oh && (bbr->rc_use_google == 0)) + bbr->r_ctl.rc_inc_enet_oh = 1; + + bbr_log_type_statechange(bbr, cts, __LINE__); + if (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->t_srtt)) { + uint32_t rtt; + + rtt = (TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); + apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); + } + /* announce the settings and state */ + bbr_log_settings_change(bbr, BBR_RECOVERY_LOWRTT); + tcp_bbr_tso_size_check(bbr, cts); + /* + * Now call the generic function to start a timer. This will place + * the TCB on the hptsi wheel if a timer is needed with appropriate + * flags. + */ + bbr_stop_all_timers(tp); + bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0); + return (0); +} + +/* + * Return 0 if we can accept the connection. Return + * non-zero if we can't handle the connection. A EAGAIN + * means you need to wait until the connection is up. + * a EADDRNOTAVAIL means we can never handle the connection + * (no SACK). + */ +static int +bbr_handoff_ok(struct tcpcb *tp) +{ + if ((tp->t_state == TCPS_CLOSED) || + (tp->t_state == TCPS_LISTEN)) { + /* Sure no problem though it may not stick */ + return (0); + } + if ((tp->t_state == TCPS_SYN_SENT) || + (tp->t_state == TCPS_SYN_RECEIVED)) { + /* + * We really don't know you have to get to ESTAB or beyond + * to tell. 
+ */ + return (EAGAIN); + } + if ((tp->t_flags & TF_SACK_PERMIT) || bbr_sack_not_required) { + return (0); + } + /* + * If we reach here we don't do SACK on this connection so we can + * never do rack. + */ + return (EINVAL); +} + +static void +bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged) +{ + if (tp->t_fb_ptr) { + uint32_t calc; + struct tcp_bbr *bbr; + struct bbr_sendmap *rsm; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + if (bbr->r_ctl.crte) + tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); + bbr_log_flowend(bbr); + bbr->rc_tp = NULL; + if (tp->t_inpcb) { + /* Backout any flags2 we applied */ + tp->t_inpcb->inp_flags2 &= ~INP_CANNOT_DO_ECN; + tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + } + if (bbr->bbr_hdrw_pacing) + counter_u64_add(bbr_flows_whdwr_pacing, -1); + else + counter_u64_add(bbr_flows_nohdwr_pacing, -1); + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + while (rsm) { + TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); + uma_zfree(bbr_zone, rsm); + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + } + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); + while (rsm) { + TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next); + uma_zfree(bbr_zone, rsm); + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); + } + calc = bbr->r_ctl.rc_high_rwnd - bbr->r_ctl.rc_init_rwnd; + if (calc > (bbr->r_ctl.rc_init_rwnd / 10)) + BBR_STAT_INC(bbr_dynamic_rwnd); + else + BBR_STAT_INC(bbr_static_rwnd); + bbr->r_ctl.rc_free_cnt = 0; + uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); + tp->t_fb_ptr = NULL; + } + /* Make sure snd_nxt is correctly set */ + tp->snd_nxt = tp->snd_max; +} + +static void +bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win) +{ + switch (tp->t_state) { + case TCPS_SYN_SENT: + bbr->r_state = TCPS_SYN_SENT; + bbr->r_substate = bbr_do_syn_sent; + break; + case TCPS_SYN_RECEIVED: + bbr->r_state = TCPS_SYN_RECEIVED; + bbr->r_substate = bbr_do_syn_recv; + break; + case TCPS_ESTABLISHED: + bbr->r_ctl.rc_init_rwnd = max(win, bbr->rc_tp->snd_wnd); + bbr->r_state = TCPS_ESTABLISHED; + bbr->r_substate = bbr_do_established; + break; + case TCPS_CLOSE_WAIT: + bbr->r_state = TCPS_CLOSE_WAIT; + bbr->r_substate = bbr_do_close_wait; + break; + case TCPS_FIN_WAIT_1: + bbr->r_state = TCPS_FIN_WAIT_1; + bbr->r_substate = bbr_do_fin_wait_1; + break; + case TCPS_CLOSING: + bbr->r_state = TCPS_CLOSING; + bbr->r_substate = bbr_do_closing; + break; + case TCPS_LAST_ACK: + bbr->r_state = TCPS_LAST_ACK; + bbr->r_substate = bbr_do_lastack; + break; + case TCPS_FIN_WAIT_2: + bbr->r_state = TCPS_FIN_WAIT_2; + bbr->r_substate = bbr_do_fin_wait_2; + break; + case TCPS_LISTEN: + case TCPS_CLOSED: + case TCPS_TIME_WAIT: + default: + break; + }; +} + +static void +bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog) +{ + /* + * Now what state are we going into now? Is there adjustments + * needed? 
+ */ + int32_t old_state, old_gain; + + + old_state = bbr_state_val(bbr); + old_gain = bbr->r_ctl.rc_bbr_hptsi_gain; + if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) { + /* Save the lowest srtt we saw in our end of the sub-state */ + bbr->rc_hit_state_1 = 0; + if (bbr->r_ctl.bbr_smallest_srtt_this_state != 0xffffffff) + bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state; + } + bbr->rc_bbr_substate++; + if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) { + /* Cycle back to first state-> gain */ + bbr->rc_bbr_substate = 0; + } + if (bbr_state_val(bbr) == BBR_SUB_GAIN) { + /* + * We enter the gain(5/4) cycle (possibly less if + * shallow buffer detection is enabled) + */ + if (bbr->skip_gain) { + /* + * Hardware pacing has set our rate to + * the max and limited our b/w just + * do level i.e. no gain. + */ + bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_LEVEL1]; + } else if (bbr->gain_is_limited && + bbr->bbr_hdrw_pacing && + bbr->r_ctl.crte) { + /* + * We can't gain above the hardware pacing + * rate which is less than our rate + the gain + * calculate the gain needed to reach the hardware + * pacing rate.. + */ + uint64_t bw, rate, gain_calc; + + bw = bbr_get_bw(bbr); + rate = bbr->r_ctl.crte->rate; + if ((rate > bw) && + (((bw * (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]) / (uint64_t)BBR_UNIT) > rate)) { + gain_calc = (rate * BBR_UNIT) / bw; + if (gain_calc < BBR_UNIT) + gain_calc = BBR_UNIT; + bbr->r_ctl.rc_bbr_hptsi_gain = (uint16_t)gain_calc; + } else { + bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN]; + } + } else + bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN]; + if ((bbr->rc_use_google == 0) && (bbr_gain_to_target == 0)) { + bbr->r_ctl.rc_bbr_state_atflight = cts; + } else + bbr->r_ctl.rc_bbr_state_atflight = 0; + } else if (bbr_state_val(bbr) == BBR_SUB_DRAIN) { + bbr->rc_hit_state_1 = 1; + bbr->r_ctl.rc_exta_time_gd = 0; + bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + if (bbr_state_drain_2_tar) { + bbr->r_ctl.rc_bbr_state_atflight = 0; + } else + bbr->r_ctl.rc_bbr_state_atflight = cts; + bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_DRAIN]; + } else { + /* All other cycles hit here 2-7 */ + if ((old_state == BBR_SUB_DRAIN) && bbr->rc_hit_state_1) { + if (bbr_sub_drain_slam_cwnd && + (bbr->rc_use_google == 0) && + (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { + bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + if ((cts - bbr->r_ctl.rc_bbr_state_time) > bbr_get_rtt(bbr, BBR_RTT_PROP)) + bbr->r_ctl.rc_exta_time_gd += ((cts - bbr->r_ctl.rc_bbr_state_time) - + bbr_get_rtt(bbr, BBR_RTT_PROP)); + else + bbr->r_ctl.rc_exta_time_gd = 0; + if (bbr->r_ctl.rc_exta_time_gd) { + bbr->r_ctl.rc_level_state_extra = bbr->r_ctl.rc_exta_time_gd; + /* Now chop up the time for each state (div by 7) */ + bbr->r_ctl.rc_level_state_extra /= 7; + if (bbr_rand_ot && bbr->r_ctl.rc_level_state_extra) { + /* Add a randomization */ + bbr_randomize_extra_state_time(bbr); + } + } + } + bbr->r_ctl.rc_bbr_state_atflight = max(1, cts); + bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[bbr_state_val(bbr)]; + } + if (bbr->rc_use_google) { + bbr->r_ctl.rc_bbr_state_atflight = max(1, cts); + } + bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; + bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain; + if (dolog) + bbr_log_type_statechange(bbr, cts, line); + + if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { + uint32_t time_in; + + 
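The hardware-pacing clamp a few lines above is the one place where the configured gain table is overridden. A small standalone sketch of that calculation (not part of the change; BBR_UNIT is assumed to be 256 here purely so the arithmetic prints, the real definition comes from the BBR headers):

#include <stdio.h>
#include <stdint.h>

#define BBR_UNIT 256    /* assumed value, for illustration only */

/* Sketch of the gain clamp: if bw * gain / BBR_UNIT would exceed the
 * hardware rate, recompute the gain so pacing targets the hardware rate. */
static uint16_t
clamp_gain(uint64_t bw, uint64_t rate, uint16_t gain)
{
        if (rate > bw && ((bw * (uint64_t)gain) / BBR_UNIT) > rate) {
                uint64_t gain_calc = (rate * BBR_UNIT) / bw;

                if (gain_calc < BBR_UNIT)
                        gain_calc = BBR_UNIT;
                return ((uint16_t)gain_calc);
        }
        return (gain);
}

int
main(void)
{
        uint64_t bw = 100 * 1000 * 1000 / 8;    /* 100 Mbps in bytes/sec */
        uint64_t rate = 110 * 1000 * 1000 / 8;  /* 110 Mbps hardware rate */
        uint16_t gain = 5 * BBR_UNIT / 4;       /* requested 5/4 gain */

        printf("effective gain = %u/%u\n", clamp_gain(bw, rate, gain),
            BBR_UNIT);  /* prints 281/256 */
        return (0);
}

With a 100 Mbps bandwidth estimate and a 110 Mbps hardware rate, a requested 5/4 gain collapses to roughly 1.1 (281/256), so pacing lands on the hardware rate rather than overshooting it.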
time_in = cts - bbr->r_ctl.rc_bbr_state_time; + if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { + counter_u64_add(bbr_state_time[(old_state + 5)], time_in); + } else { + counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); + } + } + bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff; + bbr_set_state_target(bbr, __LINE__); + if (bbr_sub_drain_slam_cwnd && + (bbr->rc_use_google == 0) && + (bbr_state_val(bbr) == BBR_SUB_DRAIN)) { + /* Slam down the cwnd */ + bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; + bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; + if (bbr_sub_drain_app_limit) { + /* Go app limited if we are on a long drain */ + bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + + ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes))); + } + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + if (bbr->rc_lt_use_bw) { + /* In policed mode we clamp pacing_gain to BBR_UNIT */ + bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; + } + /* Google changes TSO size every cycle */ + if (bbr->rc_use_google) + tcp_bbr_tso_size_check(bbr, cts); + bbr->r_ctl.gain_epoch = cts; + bbr->r_ctl.rc_bbr_state_time = cts; + bbr->r_ctl.substate_pe = bbr->r_ctl.rc_pkt_epoch; +} + +static void +bbr_set_probebw_google_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses) +{ + if ((bbr_state_val(bbr) == BBR_SUB_DRAIN) && + (google_allow_early_out == 1) && + (bbr->r_ctl.rc_flight_at_input <= bbr->r_ctl.rc_target_at_state)) { + /* We have reached out target flight size possibly early */ + goto change_state; + } + if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) { + return; + } + if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_get_rtt(bbr, BBR_RTT_PROP)) { + /* + * Must be a rttProp movement forward before + * we can change states. + */ + return; + } + if (bbr_state_val(bbr) == BBR_SUB_GAIN) { + /* + * The needed time has passed but for + * the gain cycle extra rules apply: + * 1) If we have seen loss, we exit + * 2) If we have not reached the target + * we stay in GAIN (gain-to-target). + */ + if (google_consider_lost && losses) + goto change_state; + if (bbr->r_ctl.rc_target_at_state > bbr->r_ctl.rc_flight_at_input) { + return; + } + } +change_state: + /* For gain we must reach our target, all others last 1 rttProp */ + bbr_substate_change(bbr, cts, __LINE__, 1); +} + +static void +bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses) +{ + uint32_t flight, bbr_cur_cycle_time; + + if (bbr->rc_use_google) { + bbr_set_probebw_google_gains(bbr, cts, losses); + return; + } + if (cts == 0) { + /* + * Never alow cts to be 0 we + * do this so we can judge if + * we have set a timestamp. 
+ */ + cts = 1; + } + if (bbr_state_is_pkt_epoch) + bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PKTRTT); + else + bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PROP); + + if (bbr->r_ctl.rc_bbr_state_atflight == 0) { + if (bbr_state_val(bbr) == BBR_SUB_DRAIN) { + flight = ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + if (bbr_sub_drain_slam_cwnd && bbr->rc_hit_state_1) { + /* Keep it slam down */ + if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state) { + bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + if (bbr_sub_drain_app_limit) { + /* Go app limited if we are on a long drain */ + bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + flight); + } + } + if (TSTMP_GT(cts, bbr->r_ctl.gain_epoch) && + (((cts - bbr->r_ctl.gain_epoch) > bbr_get_rtt(bbr, BBR_RTT_PROP)) || + (flight >= bbr->r_ctl.flightsize_at_drain))) { + /* + * Still here after the same time as + * the gain. We need to drain harder + * for the next srtt. Reduce by a set amount + * the gain drop is capped at DRAIN states + * value (88). + */ + bbr->r_ctl.flightsize_at_drain = flight; + if (bbr_drain_drop_mul && + bbr_drain_drop_div && + (bbr_drain_drop_mul < bbr_drain_drop_div)) { + /* Use your specific drop value (def 4/5 = 20%) */ + bbr->r_ctl.rc_bbr_hptsi_gain *= bbr_drain_drop_mul; + bbr->r_ctl.rc_bbr_hptsi_gain /= bbr_drain_drop_div; + } else { + /* You get drop of 20% */ + bbr->r_ctl.rc_bbr_hptsi_gain *= 4; + bbr->r_ctl.rc_bbr_hptsi_gain /= 5; + } + if (bbr->r_ctl.rc_bbr_hptsi_gain <= bbr_drain_floor) { + /* Reduce our gain again to the bottom */ + bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1); + } + bbr_log_exit_gain(bbr, cts, 4); + /* + * Extend out so we wait another + * epoch before dropping again. + */ + bbr->r_ctl.gain_epoch = cts; + } + if (flight <= bbr->r_ctl.rc_target_at_state) { + if (bbr_sub_drain_slam_cwnd && + (bbr->rc_use_google == 0) && + (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { + bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); + bbr_log_exit_gain(bbr, cts, 3); + } + } else { + /* Its a gain */ + if (bbr->r_ctl.rc_lost > bbr->r_ctl.bbr_lost_at_state) { + bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); + goto change_state; + } + if ((ctf_outstanding(bbr->rc_tp) >= bbr->r_ctl.rc_target_at_state) || + ((ctf_outstanding(bbr->rc_tp) + bbr->rc_tp->t_maxseg - 1) >= + bbr->rc_tp->snd_wnd)) { + bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); + bbr_log_exit_gain(bbr, cts, 2); + } + } + /** + * We fall through and return always one of two things has + * occured. + * 1) We are still not at target + * + * 2) We reached the target and set rc_bbr_state_atflight + * which means we no longer hit this block + * next time we are called. 
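The fallback path above shaves the pacing gain by the same fixed fraction every time a drain epoch passes without reaching the flight target. A standalone sketch of that decay (not part of the change; the 192 starting gain and the 88 floor are the values the surrounding comments quote, the real ones are tunables):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint32_t gain = 192;            /* drain gain quoted above */
        uint32_t floor_val = 88;        /* drain-state floor quoted above */

        for (int pass = 1; pass <= 5; pass++) {
                gain = gain * 4 / 5;    /* default 20% reduction */
                if (gain <= floor_val)
                        gain = floor_val;
                printf("pass %d: gain = %u\n", pass, gain);
        }
        return (0);
}

Each extra epoch spent above the target drops the gain by 20% (or by the configured mul/div pair) until it bottoms out at the drain floor: 153, 122, 97, 88, 88.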
+ */ + return; + } +change_state: + if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) + return; + if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_cur_cycle_time) { + /* Less than a full time-period has passed */ + return; + } + if (bbr->r_ctl.rc_level_state_extra && + (bbr_state_val(bbr) > BBR_SUB_DRAIN) && + ((cts - bbr->r_ctl.rc_bbr_state_time) < + (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) { + /* Less than a full time-period + extra has passed */ + return; + } + if (bbr_gain_gets_extra_too && + bbr->r_ctl.rc_level_state_extra && + (bbr_state_val(bbr) == BBR_SUB_GAIN) && + ((cts - bbr->r_ctl.rc_bbr_state_time) < + (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) { + /* Less than a full time-period + extra has passed */ + return; + } + bbr_substate_change(bbr, cts, __LINE__, 1); +} + +static uint32_t +bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain) +{ + uint32_t mss, tar; + + if (bbr->rc_use_google) { + /* Google just uses the cwnd target */ + tar = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), gain); + } else { + mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), + bbr->r_ctl.rc_pace_max_segs); + /* Get the base cwnd with gain rounded to a mss */ + tar = roundup(bbr_get_raw_target_cwnd(bbr, bbr_get_bw(bbr), + gain), mss); + /* Make sure it is within our min */ + if (tar < get_min_cwnd(bbr)) + return (get_min_cwnd(bbr)); + } + return (tar); +} + +static void +bbr_set_state_target(struct tcp_bbr *bbr, int line) +{ + uint32_t tar, meth; + + if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && + ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) { + /* Special case using old probe-rtt method */ + tar = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); + meth = 1; + } else { + /* Non-probe-rtt case and reduced probe-rtt */ + if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && + (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT)) { + /* For gain cycle we use the hptsi gain */ + tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain); + meth = 2; + } else if ((bbr_target_is_bbunit) || bbr->rc_use_google) { + /* + * If configured, or for google all other states + * get BBR_UNIT. + */ + tar = bbr_get_a_state_target(bbr, BBR_UNIT); + meth = 3; + } else { + /* + * Or we set a target based on the pacing gain + * for non-google mode and default (non-configured). + * Note we don't set a target goal below drain (192). 
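For the non-google path the raw gain-scaled window is always rounded up to whole pacing segments before it becomes the state target. A trivial standalone illustration of that rounding (not part of the change; the raw target and segment size are made-up numbers):

#include <stdio.h>
#include <stdint.h>

#define roundup(x, y)   ((((x) + ((y) - 1)) / (y)) * (y))

int
main(void)
{
        uint32_t raw_target = 14000;    /* hypothetical cwnd-with-gain */
        uint32_t mss = 1448;            /* hypothetical pacing segment */
        uint32_t tar = roundup(raw_target, mss);

        printf("target = %u bytes (%u segments)\n", tar, tar / mss);
        /* prints: target = 14480 bytes (10 segments) */
        return (0);
}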
+ */ + if (bbr->r_ctl.rc_bbr_hptsi_gain < bbr_hptsi_gain[BBR_SUB_DRAIN]) { + tar = bbr_get_a_state_target(bbr, bbr_hptsi_gain[BBR_SUB_DRAIN]); + meth = 4; + } else { + tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain); + meth = 5; + } + } + } + bbr_log_set_of_state_target(bbr, tar, line, meth); + bbr->r_ctl.rc_target_at_state = tar; +} + +static void +bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line) +{ + /* Change to probe_rtt */ + uint32_t time_in; + + bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; + bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.flightsize_at_drain + + bbr->r_ctl.rc_delivered); + /* Setup so we force feed the filter */ + if (bbr->rc_use_google || bbr_probertt_sets_rtt) + bbr->rc_prtt_set_ts = 1; + if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { + time_in = cts - bbr->r_ctl.rc_bbr_state_time; + counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); + } + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_ENTERPROBE, 0); + bbr->r_ctl.rc_rtt_shrinks = cts; + bbr->r_ctl.last_in_probertt = cts; + bbr->r_ctl.rc_probertt_srttchktim = cts; + bbr->r_ctl.rc_bbr_state_time = cts; + bbr->rc_bbr_state = BBR_STATE_PROBE_RTT; + /* We need to force the filter to update */ + + if ((bbr_sub_drain_slam_cwnd) && + bbr->rc_hit_state_1 && + (bbr->rc_use_google == 0) && + (bbr_state_val(bbr) == BBR_SUB_DRAIN)) { + if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_saved_cwnd) + bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; + } else + bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; + /* Update the lost */ + bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; + if ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google){ + /* Set to the non-configurable default of 4 (PROBE_RTT_MIN) */ + bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; + bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; + bbr_log_set_of_state_target(bbr, bbr->rc_tp->snd_cwnd, __LINE__, 6); + bbr->r_ctl.rc_target_at_state = bbr->rc_tp->snd_cwnd; + } else { + /* + * We bring it down slowly by using a hptsi gain that is + * probably 75%. This will slowly float down our outstanding + * without tampering with the cwnd. + */ + bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val; + bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; + bbr_set_state_target(bbr, __LINE__); + if (bbr_prtt_slam_cwnd && + (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { + bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + } + if (ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= + bbr->r_ctl.rc_target_at_state) { + /* We are at target */ + bbr->r_ctl.rc_bbr_enters_probertt = cts; + } else { + /* We need to come down to reach target before our time begins */ + bbr->r_ctl.rc_bbr_enters_probertt = 0; + } + bbr->r_ctl.rc_pe_of_prtt = bbr->r_ctl.rc_pkt_epoch; + BBR_STAT_INC(bbr_enter_probertt); + bbr_log_exit_gain(bbr, cts, 0); + bbr_log_type_statechange(bbr, cts, line); +} + +static void +bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts) +{ + /* + * Sanity check on probe-rtt intervals. 
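When probe-RTT runs with the old method (gain value of zero, or google mode), the congestion window is pinned to a fixed number of small segments, the default of 4 noted above. A standalone sketch of that target (not part of the change; the 12-byte figure assumes the usual timestamp option overhead):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint32_t maxseg = 1460;         /* hypothetical t_maxseg */
        uint32_t last_options = 12;     /* assumed timestamp overhead */
        uint32_t cwndtarg = 4;          /* default quoted above */
        uint32_t cwnd = cwndtarg * (maxseg - last_options);

        printf("probe-rtt cwnd = %u bytes\n", cwnd);    /* prints 5792 */
        return (0);
}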
+ * In crazy situations where we are competing + * against new-reno flows with huge buffers + * our rtt-prop interval could come to dominate + * things if we can't get through a full set + * of cycles, we need to adjust it. + */ + if (bbr_can_adjust_probertt && + (bbr->rc_use_google == 0)) { + uint16_t val = 0; + uint32_t cur_rttp, fval, newval, baseval; + + /* Are we to small and go into probe-rtt to often? */ + baseval = (bbr_get_rtt(bbr, BBR_RTT_PROP) * (BBR_SUBSTATE_COUNT + 1)); + cur_rttp = roundup(baseval, USECS_IN_SECOND); + fval = bbr_filter_len_sec * USECS_IN_SECOND; + if (bbr_is_ratio == 0) { + if (fval > bbr_rtt_probe_limit) + newval = cur_rttp + (fval - bbr_rtt_probe_limit); + else + newval = cur_rttp; + } else { + int mul; + + mul = fval / bbr_rtt_probe_limit; + newval = cur_rttp * mul; + } + if (cur_rttp > bbr->r_ctl.rc_probertt_int) { + bbr->r_ctl.rc_probertt_int = cur_rttp; + reset_time_small(&bbr->r_ctl.rc_rttprop, newval); + val = 1; + } else { + /* + * No adjustments were made + * do we need to shrink it? + */ + if (bbr->r_ctl.rc_probertt_int > bbr_rtt_probe_limit) { + if (cur_rttp <= bbr_rtt_probe_limit) { + /* + * Things have calmed down lets + * shrink all the way to default + */ + bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; + reset_time_small(&bbr->r_ctl.rc_rttprop, + (bbr_filter_len_sec * USECS_IN_SECOND)); + cur_rttp = bbr_rtt_probe_limit; + newval = (bbr_filter_len_sec * USECS_IN_SECOND); + val = 2; + } else { + /* + * Well does some adjustment make sense? + */ + if (cur_rttp < bbr->r_ctl.rc_probertt_int) { + /* We can reduce interval time some */ + bbr->r_ctl.rc_probertt_int = cur_rttp; + reset_time_small(&bbr->r_ctl.rc_rttprop, newval); + val = 3; + } + } + } + } + if (val) + bbr_log_rtt_shrinks(bbr, cts, cur_rttp, newval, __LINE__, BBR_RTTS_RESETS_VALUES, val); + } +} + +static void +bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) +{ + /* Exit probe-rtt */ + + if (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd) { + tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + bbr_log_exit_gain(bbr, cts, 1); + bbr->rc_hit_state_1 = 0; + bbr->r_ctl.rc_rtt_shrinks = cts; + bbr->r_ctl.last_in_probertt = cts; + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_RTTPROBE, 0); + bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; + bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + + bbr->r_ctl.rc_delivered); + if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { + uint32_t time_in; + + time_in = cts - bbr->r_ctl.rc_bbr_state_time; + counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); + } + if (bbr->rc_filled_pipe) { + /* Switch to probe_bw */ + bbr->rc_bbr_state = BBR_STATE_PROBE_BW; + bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); + bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain; + bbr_substate_change(bbr, cts, __LINE__, 0); + bbr_log_type_statechange(bbr, cts, __LINE__); + } else { + /* Back to startup */ + bbr->rc_bbr_state = BBR_STATE_STARTUP; + bbr->r_ctl.rc_bbr_state_time = cts; + /* + * We don't want to give a complete free 3 + * measurements until we exit, so we use + * the number of pe's we were in probe-rtt + * to add to the startup_epoch. That way + * we will still retain the old state. 
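The interval-widening check in bbr_check_probe_rtt_limits() above stretches the probe-RTT interval when one pass through the probe-bw sub-states no longer fits inside it. A standalone sketch of the candidate interval it computes (not part of the change; the sub-state count of eight is inferred from the GAIN, DRAIN and LEVEL1-6 names used elsewhere in this file, and the RTT value is made up):

#include <stdio.h>
#include <stdint.h>

#define USECS_IN_SECOND 1000000
#define roundup(x, y)   ((((x) + ((y) - 1)) / (y)) * (y))

int
main(void)
{
        uint32_t rttprop = 250000;      /* hypothetical 250 ms rtt-prop */
        uint32_t substates = 8;         /* inferred sub-state count */
        uint32_t baseval = rttprop * (substates + 1);
        uint32_t cur_rttp = roundup(baseval, USECS_IN_SECOND);

        /* 9 x 250 ms = 2.25 s, rounded up to whole seconds = 3 s. */
        printf("candidate probe-rtt interval = %u us\n", cur_rttp);
        return (0);
}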
+ */ + bbr->r_ctl.rc_bbr_last_startup_epoch += (bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_pe_of_prtt); + bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; + /* Make sure to use the lower pg when shifting back in */ + if (bbr->r_ctl.rc_lost && + bbr_use_lower_gain_in_startup && + (bbr->rc_use_google == 0)) + bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower; + else + bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; + bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; + /* Probably not needed but set it anyway */ + bbr_set_state_target(bbr, __LINE__); + bbr_log_type_statechange(bbr, cts, __LINE__); + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 0); + } + bbr_check_probe_rtt_limits(bbr, cts); +} + +static int32_t inline +bbr_should_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts) +{ + if ((bbr->rc_past_init_win == 1) && + (bbr->rc_in_persist == 0) && + (bbr_calc_time(cts, bbr->r_ctl.rc_rtt_shrinks) >= bbr->r_ctl.rc_probertt_int)) { + return (1); + } + if (bbr_can_force_probertt && + (bbr->rc_in_persist == 0) && + (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) && + ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) { + return (1); + } + return (0); +} + + +static int32_t +bbr_google_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t pkt_epoch) +{ + uint64_t btlbw, gain; + if (pkt_epoch == 0) { + /* + * Need to be on a pkt-epoch to continue. + */ + return (0); + } + btlbw = bbr_get_full_bw(bbr); + gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * + (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; + if (btlbw >= gain) { + bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch; + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3); + bbr->r_ctl.rc_bbr_lastbtlbw = btlbw; + } + if ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) + return (1); + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8); + return(0); +} + +static int32_t inline +bbr_state_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch) +{ + /* Have we gained 25% in the last 3 packet based epoch's? */ + uint64_t btlbw, gain; + int do_exit; + int delta, rtt_gain; + + if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) && + (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { + /* + * This qualifies as a RTT_PROBE session since we drop the + * data outstanding to nothing and waited more than + * bbr_rtt_probe_time. + */ + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); + bbr_set_reduced_rtt(bbr, cts, __LINE__); + } + if (bbr_should_enter_probe_rtt(bbr, cts)) { + bbr_enter_probe_rtt(bbr, cts, __LINE__); + return (0); + } + if (bbr->rc_use_google) + return (bbr_google_startup(bbr, cts, pkt_epoch)); + + if ((bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) && + (bbr_use_lower_gain_in_startup)) { + /* Drop to a lower gain 1.5 x since we saw loss */ + bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower; + } + if (pkt_epoch == 0) { + /* + * Need to be on a pkt-epoch to continue. + */ + return (0); + } + if (bbr_rtt_gain_thresh) { + /* + * Do we allow a flow to stay + * in startup with no loss and no + * gain in rtt over a set threshold? 
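The startup-exit rule sketched in the comment above (the google path and the regular path share the same shape) declares the pipe full when the bottleneck bandwidth estimate stops growing by bbr_start_exit percent, 25% per the comment, for three consecutive packet epochs. A standalone sketch of the growth test (not part of the change; the bandwidth figures are made up):

#include <stdio.h>
#include <stdint.h>

/* Growth test: did this epoch's bandwidth clear last epoch's by at least
 * start_exit percent? */
static int
grew_enough(uint64_t lastbtlbw, uint64_t btlbw, uint64_t start_exit)
{
        uint64_t need = (lastbtlbw * start_exit) / 100 + lastbtlbw;

        return (btlbw >= need);
}

int
main(void)
{
        /* With 10 MB/s last epoch, at least 12.5 MB/s is needed. */
        printf("%d\n", grew_enough(10000000, 13000000, 25));    /* prints 1 */
        printf("%d\n", grew_enough(10000000, 12000000, 25));    /* prints 0 */
        return (0);
}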
+ */ + if (bbr->r_ctl.rc_pkt_epoch_rtt && + bbr->r_ctl.startup_last_srtt && + (bbr->r_ctl.rc_pkt_epoch_rtt > bbr->r_ctl.startup_last_srtt)) { + delta = bbr->r_ctl.rc_pkt_epoch_rtt - bbr->r_ctl.startup_last_srtt; + rtt_gain = (delta * 100) / bbr->r_ctl.startup_last_srtt; + } else + rtt_gain = 0; + if ((bbr->r_ctl.startup_last_srtt == 0) || + (bbr->r_ctl.rc_pkt_epoch_rtt < bbr->r_ctl.startup_last_srtt)) + /* First time or new lower value */ + bbr->r_ctl.startup_last_srtt = bbr->r_ctl.rc_pkt_epoch_rtt; + + if ((bbr->r_ctl.rc_lost == 0) && + (rtt_gain < bbr_rtt_gain_thresh)) { + /* + * No loss, and we are under + * our gain threhold for + * increasing RTT. + */ + if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch) + bbr->r_ctl.rc_bbr_last_startup_epoch++; + bbr_log_startup_event(bbr, cts, rtt_gain, + delta, bbr->r_ctl.startup_last_srtt, 10); + return (0); + } + } + if ((bbr->r_ctl.r_measurement_count == bbr->r_ctl.last_startup_measure) && + (bbr->r_ctl.rc_lost_at_startup == bbr->r_ctl.rc_lost) && + (!IN_RECOVERY(bbr->rc_tp->t_flags))) { + /* + * We only assess if we have a new measurment when + * we have no loss and are not in recovery. + * Drag up by one our last_startup epoch so we will hold + * the number of non-gain we have already accumulated. + */ + if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch) + bbr->r_ctl.rc_bbr_last_startup_epoch++; + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 9); + return (0); + } + /* Case where we reduced the lost (bad retransmit) */ + if (bbr->r_ctl.rc_lost_at_startup > bbr->r_ctl.rc_lost) + bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; + bbr->r_ctl.last_startup_measure = bbr->r_ctl.r_measurement_count; + btlbw = bbr_get_full_bw(bbr); + if (bbr->r_ctl.rc_bbr_hptsi_gain == bbr_startup_lower) + gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * + (uint64_t)bbr_low_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; + else + gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * + (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; + do_exit = 0; + if (btlbw > bbr->r_ctl.rc_bbr_lastbtlbw) + bbr->r_ctl.rc_bbr_lastbtlbw = btlbw; + if (btlbw >= gain) { + bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch; + /* Update the lost so we won't exit in next set of tests */ + bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3); + } + if ((bbr->rc_loss_exit && + (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) && + (bbr->r_ctl.rc_pkt_epoch_loss_rate > bbr_startup_loss_thresh)) && + ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)) { + /* + * If we had no gain, we had loss and that loss was above + * our threshould, the rwnd is not constrained, and we have + * had at least 3 packet epochs exit. Note that this is + * switched off by sysctl. Google does not do this by the + * way. 
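The rtt_gain figure computed above is just the percentage increase of the packet-epoch RTT over the lowest startup RTT seen so far; a flow with no loss stays in startup only while that percentage sits under bbr_rtt_gain_thresh. A standalone sketch (not part of the change; the RTT values are made up):

#include <stdio.h>

int
main(void)
{
        int baseline_us = 20000;        /* hypothetical startup_last_srtt */
        int epoch_rtt_us = 23000;       /* hypothetical rc_pkt_epoch_rtt */
        int delta = epoch_rtt_us - baseline_us;
        int rtt_gain = (delta * 100) / baseline_us;

        printf("rtt_gain = %d%%\n", rtt_gain);  /* prints 15% */
        return (0);
}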
+ */ + if ((ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + + (2 * max(bbr->r_ctl.rc_pace_max_segs, bbr->rc_tp->t_maxseg))) <= bbr->rc_tp->snd_wnd) { + do_exit = 1; + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 4); + } else { + /* Just record an updated loss value */ + bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 5); + } + } else + bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; + if (((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) || + do_exit) { + /* Return 1 to exit the startup state. */ + return (1); + } + /* Stay in startup */ + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8); + return (0); +} + +static void +bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses) +{ + /* + * A tick occured in the rtt epoch do we need to do anything? + */ +#ifdef BBR_INVARIANTS + if ((bbr->rc_bbr_state != BBR_STATE_STARTUP) && + (bbr->rc_bbr_state != BBR_STATE_DRAIN) && + (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) && + (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) && + (bbr->rc_bbr_state != BBR_STATE_PROBE_BW)) { + /* Debug code? */ + panic("Unknown BBR state %d?\n", bbr->rc_bbr_state); + } +#endif + if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { + /* Do we exit the startup state? */ + if (bbr_state_startup(bbr, cts, epoch, pkt_epoch)) { + uint32_t time_in; + + bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, + bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 6); + bbr->rc_filled_pipe = 1; + bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; + if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { + + time_in = cts - bbr->r_ctl.rc_bbr_state_time; + counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); + } else + time_in = 0; + if (bbr->rc_no_pacing) + bbr->rc_no_pacing = 0; + bbr->r_ctl.rc_bbr_state_time = cts; + bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_drain_pg; + bbr->rc_bbr_state = BBR_STATE_DRAIN; + bbr_set_state_target(bbr, __LINE__); + if ((bbr->rc_use_google == 0) && + bbr_slam_cwnd_in_main_drain) { + /* Here we don't have to worry about probe-rtt */ + bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; + bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain; + bbr_log_type_statechange(bbr, cts, __LINE__); + if (ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= + bbr->r_ctl.rc_target_at_state) { + /* + * Switch to probe_bw if we are already + * there + */ + bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); + bbr_substate_change(bbr, cts, __LINE__, 0); + bbr->rc_bbr_state = BBR_STATE_PROBE_BW; + bbr_log_type_statechange(bbr, cts, __LINE__); + } + } + } else if (bbr->rc_bbr_state == BBR_STATE_IDLE_EXIT) { + uint32_t inflight; + struct tcpcb *tp; + + tp = bbr->rc_tp; + inflight = ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + if (inflight >= bbr->r_ctl.rc_target_at_state) { + /* We have reached a flight of the cwnd target */ + bbr->rc_bbr_state = BBR_STATE_PROBE_BW; + bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; + bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; + bbr_set_state_target(bbr, __LINE__); + /* + * Rig it so 
we don't do anything crazy and + * start fresh with a new randomization. + */ + bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff; + bbr->rc_bbr_substate = BBR_SUB_LEVEL6; + bbr_substate_change(bbr, cts, __LINE__, 1); + } + } else if (bbr->rc_bbr_state == BBR_STATE_DRAIN) { + /* Has in-flight reached the bdp (or less)? */ + uint32_t inflight; + struct tcpcb *tp; + + tp = bbr->rc_tp; + inflight = ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + if ((bbr->rc_use_google == 0) && + bbr_slam_cwnd_in_main_drain && + (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { + /* + * Here we don't have to worry about probe-rtt + * re-slam it, but keep it slammed down. + */ + bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + if (inflight <= bbr->r_ctl.rc_target_at_state) { + /* We have drained */ + bbr->rc_bbr_state = BBR_STATE_PROBE_BW; + bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; + if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { + uint32_t time_in; + + time_in = cts - bbr->r_ctl.rc_bbr_state_time; + counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); + } + if ((bbr->rc_use_google == 0) && + bbr_slam_cwnd_in_main_drain && + (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { + /* Restore the cwnd */ + tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + /* Setup probe-rtt has being done now RRS-HERE */ + bbr->r_ctl.rc_rtt_shrinks = cts; + bbr->r_ctl.last_in_probertt = cts; + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_LEAVE_DRAIN, 0); + /* Randomly pick a sub-state */ + bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); + bbr_substate_change(bbr, cts, __LINE__, 0); + bbr_log_type_statechange(bbr, cts, __LINE__); + } + } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) { + uint32_t flight; + + flight = ctf_flight_size(bbr->rc_tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + bbr->r_ctl.r_app_limited_until = (flight + bbr->r_ctl.rc_delivered); + if (((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google) && + (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { + /* + * We must keep cwnd at the desired MSS. + */ + bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } else if ((bbr_prtt_slam_cwnd) && + (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { + /* Re-slam it */ + bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); + } + if (bbr->r_ctl.rc_bbr_enters_probertt == 0) { + /* Has outstanding reached our target? 
*/ + if (flight <= bbr->r_ctl.rc_target_at_state) { + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_REACHTAR, 0); + bbr->r_ctl.rc_bbr_enters_probertt = cts; + /* If time is exactly 0, be 1usec off */ + if (bbr->r_ctl.rc_bbr_enters_probertt == 0) + bbr->r_ctl.rc_bbr_enters_probertt = 1; + if (bbr->rc_use_google == 0) { + /* + * Restore any lowering that as occured to + * reach here + */ + if (bbr->r_ctl.bbr_rttprobe_gain_val) + bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val; + else + bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; + } + } + if ((bbr->r_ctl.rc_bbr_enters_probertt == 0) && + (bbr->rc_use_google == 0) && + bbr->r_ctl.bbr_rttprobe_gain_val && + (((cts - bbr->r_ctl.rc_probertt_srttchktim) > bbr_get_rtt(bbr, bbr_drain_rtt)) || + (flight >= bbr->r_ctl.flightsize_at_drain))) { + /* + * We have doddled with our current hptsi + * gain an srtt and have still not made it + * to target, or we have increased our flight. + * Lets reduce the gain by xx% + * flooring the reduce at DRAIN (based on + * mul/div) + */ + int red; + + bbr->r_ctl.flightsize_at_drain = flight; + bbr->r_ctl.rc_probertt_srttchktim = cts; + red = max((bbr->r_ctl.bbr_rttprobe_gain_val / 10), 1); + if ((bbr->r_ctl.rc_bbr_hptsi_gain - red) > max(bbr_drain_floor, 1)) { + /* Reduce our gain again */ + bbr->r_ctl.rc_bbr_hptsi_gain -= red; + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG, 0); + } else if (bbr->r_ctl.rc_bbr_hptsi_gain > max(bbr_drain_floor, 1)) { + /* one more chance before we give up */ + bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1); + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG_FINAL, 0); + } else { + /* At the very bottom */ + bbr->r_ctl.rc_bbr_hptsi_gain = max((bbr_drain_floor-1), 1); + } + } + } + if (bbr->r_ctl.rc_bbr_enters_probertt && + (TSTMP_GT(cts, bbr->r_ctl.rc_bbr_enters_probertt)) && + ((cts - bbr->r_ctl.rc_bbr_enters_probertt) >= bbr_rtt_probe_time)) { + /* Time to exit probe RTT normally */ + bbr_exit_probe_rtt(bbr->rc_tp, bbr, cts); + } + } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { + if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) && + (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { + /* + * This qualifies as a RTT_PROBE session since we + * drop the data outstanding to nothing and waited + * more than bbr_rtt_probe_time. 
+ */ + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); + bbr_set_reduced_rtt(bbr, cts, __LINE__); + } + if (bbr_should_enter_probe_rtt(bbr, cts)) { + bbr_enter_probe_rtt(bbr, cts, __LINE__); + } else { + bbr_set_probebw_gains(bbr, cts, losses); + } + } +} + +static void +bbr_check_bbr_for_state(struct tcp_bbr *bbr, uint32_t cts, int32_t line, uint32_t losses) +{ + int32_t epoch = 0; + + if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) { + bbr_set_epoch(bbr, cts, line); + /* At each epoch doe lt bw sampling */ + epoch = 1; + } + bbr_state_change(bbr, cts, epoch, bbr->rc_is_pkt_epoch_now, losses); +} + +static int +bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, + int32_t nxt_pkt, struct timeval *tv) +{ + int32_t thflags, retval; + uint32_t cts, lcts; + uint32_t tiwin; + struct tcpopt to; + struct tcp_bbr *bbr; + struct bbr_sendmap *rsm; + struct timeval ltv; + int32_t did_out = 0; + int32_t in_recovery; + uint16_t nsegs; + int32_t prev_state; + uint32_t lost; + + nsegs = max(1, m->m_pkthdr.lro_nsegs); + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + /* add in our stats */ + kern_prefetch(bbr, &prev_state); + prev_state = 0; + thflags = th->th_flags; + /* + * If this is either a state-changing packet or current state isn't + * established, we require a write lock on tcbinfo. Otherwise, we + * allow the tcbinfo to be in either alocked or unlocked, as the + * caller may have unnecessarily acquired a write lock due to a + * race. + */ + INP_WLOCK_ASSERT(tp->t_inpcb); + KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", + __func__)); + KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", + __func__)); + + tp->t_rcvtime = ticks; + /* + * Unscale the window into a 32-bit value. For the SYN_SENT state + * the scale is zero. + */ + tiwin = th->th_win << tp->snd_scale; +#ifdef NETFLIX_STATS + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); +#endif + /* + * Parse options on any incoming segment. + */ + tcp_dooptions(&to, (u_char *)(th + 1), + (th->th_off << 2) - sizeof(struct tcphdr), + (thflags & TH_SYN) ? TO_SYN : 0); + + if (m->m_flags & M_TSTMP) { + /* Prefer the hardware timestamp if present */ + struct timespec ts; + + mbuf_tstmp2timespec(m, &ts); + bbr->rc_tv.tv_sec = ts.tv_sec; + bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; + bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); + } else if (m->m_flags & M_TSTMP_LRO) { + /* Next the arrival timestamp */ + struct timespec ts; + + mbuf_tstmp2timespec(m, &ts); + bbr->rc_tv.tv_sec = ts.tv_sec; + bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; + bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); + } else { + /* + * Ok just get the current time. + */ + bbr->r_ctl.rc_rcvtime = lcts = cts = tcp_get_usecs(&bbr->rc_tv); + } + /* + * If echoed timestamp is later than the current time, fall back to + * non RFC1323 RTT calculation. Normalize timestamp if syncookies + * were used when this connection was established. + */ + if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { + to.to_tsecr -= tp->ts_offset; + if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv))) + to.to_tsecr = 0; + } + /* + * If its the first time in we need to take care of options and + * verify we can do SACK for rack! + */ + if (bbr->r_state == 0) { + /* + * Process options only when we get SYN/ACK back. The SYN + * case for incoming connections is handled in tcp_syncache. 
+ * According to RFC1323 the window field in a SYN (i.e., a + * or ) segment itself is never scaled. XXX + * this is traditional behavior, may need to be cleaned up. + */ + if (bbr->rc_inp == NULL) { + bbr->rc_inp = tp->t_inpcb; + } + /* + * We need to init rc_inp here since its not init'd when + * bbr_init is called + */ + if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { + if ((to.to_flags & TOF_SCALE) && + (tp->t_flags & TF_REQ_SCALE)) { + tp->t_flags |= TF_RCVD_SCALE; + tp->snd_scale = to.to_wscale; + } + /* + * Initial send window. It will be updated with the + * next incoming segment to the scaled value. + */ + tp->snd_wnd = th->th_win; + if (to.to_flags & TOF_TS) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to.to_tsval; + tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); + } + if (to.to_flags & TOF_MSS) + tcp_mss(tp, to.to_mss); + if ((tp->t_flags & TF_SACK_PERMIT) && + (to.to_flags & TOF_SACKPERM) == 0) + tp->t_flags &= ~TF_SACK_PERMIT; + if (IS_FASTOPEN(tp->t_flags)) { + if (to.to_flags & TOF_FASTOPEN) { + uint16_t mss; + + if (to.to_flags & TOF_MSS) + mss = to.to_mss; + else + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + mss = TCP6_MSS; + else + mss = TCP_MSS; + tcp_fastopen_update_cache(tp, mss, + to.to_tfo_len, to.to_tfo_cookie); + } else + tcp_fastopen_disable_path(tp); + } + } + /* + * At this point we are at the initial call. Here we decide + * if we are doing RACK or not. We do this by seeing if + * TF_SACK_PERMIT is set, if not rack is *not* possible and + * we switch to the default code. + */ + if ((tp->t_flags & TF_SACK_PERMIT) == 0) { + /* Bail */ + tcp_switch_back_to_default(tp); + (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, + tlen, iptos); + return (1); + } + /* Set the flag */ + bbr->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; + tcp_set_hpts(tp->t_inpcb); + sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack); + } + if (thflags & TH_ACK) { + /* Track ack types */ + if (to.to_flags & TOF_SACK) + BBR_STAT_INC(bbr_acks_with_sacks); + else + BBR_STAT_INC(bbr_plain_acks); + } + /* + * This is the one exception case where we set the rack state + * always. All other times (timers etc) we must have a rack-state + * set (so we assure we have done the checks above for SACK). + */ + if (bbr->r_state != tp->t_state) + bbr_set_state(tp, bbr, tiwin); + + if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL) + kern_prefetch(rsm, &prev_state); + prev_state = bbr->r_state; + bbr->rc_ack_was_delayed = 0; + lost = bbr->r_ctl.rc_lost; + bbr->rc_is_pkt_epoch_now = 0; + if (m->m_flags & (M_TSTMP|M_TSTMP_LRO)) { + /* Get the real time into lcts and figure the real delay */ + lcts = tcp_get_usecs(<v); + if (TSTMP_GT(lcts, cts)) { + bbr->r_ctl.rc_ack_hdwr_delay = lcts - cts; + bbr->rc_ack_was_delayed = 1; + if (TSTMP_GT(bbr->r_ctl.rc_ack_hdwr_delay, + bbr->r_ctl.highest_hdwr_delay)) + bbr->r_ctl.highest_hdwr_delay = bbr->r_ctl.rc_ack_hdwr_delay; + } else { + bbr->r_ctl.rc_ack_hdwr_delay = 0; + bbr->rc_ack_was_delayed = 0; + } + } else { + bbr->r_ctl.rc_ack_hdwr_delay = 0; + bbr->rc_ack_was_delayed = 0; + } + bbr_log_ack_event(bbr, th, &to, tlen, nsegs, cts, nxt_pkt, m); + if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { + retval = 0; + m_freem(m); + goto done_with_input; + } + /* + * If a segment with the ACK-bit set arrives in the SYN-SENT state + * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 
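+	 * A segment whose ACK falls outside the iss/snd_max window is
+	 * dropped and answered with a reset.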
+ */ + if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { + ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return (1); + } + in_recovery = IN_RECOVERY(tp->t_flags); + if (tiwin > bbr->r_ctl.rc_high_rwnd) + bbr->r_ctl.rc_high_rwnd = tiwin; +#ifdef BBR_INVARIANTS + if ((tp->t_inpcb->inp_flags & INP_DROPPED) || + (tp->t_inpcb->inp_flags2 & INP_FREED)) { + panic("tp:%p bbr:%p given a dropped inp:%p", + tp, bbr, tp->t_inpcb); + } +#endif + bbr->r_ctl.rc_flight_at_input = ctf_flight_size(tp, + (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + bbr->rtt_valid = 0; + if (to.to_flags & TOF_TS) { + bbr->rc_ts_valid = 1; + bbr->r_ctl.last_inbound_ts = to.to_tsval; + } else { + bbr->rc_ts_valid = 0; + bbr->r_ctl.last_inbound_ts = 0; + } + retval = (*bbr->r_substate) (m, th, so, + tp, &to, drop_hdrlen, + tlen, tiwin, thflags, nxt_pkt); +#ifdef BBR_INVARIANTS + if ((retval == 0) && + (tp->t_inpcb == NULL)) { + panic("retval:%d tp:%p t_inpcb:NULL state:%d", + retval, tp, prev_state); + } +#endif + if (nxt_pkt == 0) + BBR_STAT_INC(bbr_rlock_left_ret0); + else + BBR_STAT_INC(bbr_rlock_left_ret1); + if (retval == 0) { + /* + * If retval is 1 the tcb is unlocked and most likely the tp + * is gone. + */ + INP_WLOCK_ASSERT(tp->t_inpcb); + tcp_bbr_xmit_timer_commit(bbr, tp, cts); + if (bbr->rc_is_pkt_epoch_now) + bbr_set_pktepoch(bbr, cts, __LINE__); + bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost)); + if (nxt_pkt == 0) { + if (bbr->r_wanted_output != 0) { + bbr->rc_output_starts_timer = 0; + did_out = 1; + (void)tp->t_fb->tfb_tcp_output(tp); + } else + bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0); + } + if ((nxt_pkt == 0) && + ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && + (SEQ_GT(tp->snd_max, tp->snd_una) || + (tp->t_flags & TF_DELACK) || + ((tcp_always_keepalive || bbr->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && + (tp->t_state <= TCPS_CLOSING)))) { + /* + * We could not send (probably in the hpts but + * stopped the timer)? + */ + if ((tp->snd_max == tp->snd_una) && + ((tp->t_flags & TF_DELACK) == 0) && + (bbr->rc_inp->inp_in_hpts) && + (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { + /* + * keep alive not needed if we are hptsi + * output yet + */ + ; + } else { + if (bbr->rc_inp->inp_in_hpts) { + tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); + if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && + (TSTMP_GT(lcts, bbr->rc_pacer_started))) { + uint32_t del; + + del = lcts - bbr->rc_pacer_started; + if (del > bbr->r_ctl.rc_last_delay_val) { + BBR_STAT_INC(bbr_force_timer_start); + bbr->r_ctl.rc_last_delay_val -= del; + bbr->rc_pacer_started = lcts; + } else { + /* We are late */ + BBR_STAT_INC(bbr_force_output); + (void)tp->t_fb->tfb_tcp_output(tp); + } + } + } + bbr_start_hpts_timer(bbr, tp, cts, 8, bbr->r_ctl.rc_last_delay_val, + 0); + } + } else if ((bbr->rc_output_starts_timer == 0) && (nxt_pkt == 0)) { + /* Do we have the correct timer running? 
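+			 * Let bbr_timer_audit() below check that the right
+			 * timer is in place and fix it up if not.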
*/ + bbr_timer_audit(tp, bbr, lcts, &so->so_snd); + } + /* Do we have a new state */ + if (bbr->r_state != tp->t_state) + bbr_set_state(tp, bbr, tiwin); +done_with_input: + bbr_log_doseg_done(bbr, cts, nxt_pkt, did_out); + if (did_out) + bbr->r_wanted_output = 0; +#ifdef BBR_INVARIANTS + if (tp->t_inpcb == NULL) { + panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", + did_out, + retval, tp, prev_state); + } +#endif + } + return (retval); +} + +static void +bbr_log_type_hrdwtso(struct tcpcb *tp, struct tcp_bbr *bbr, int len, int mod, int what_we_can_send) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + uint32_t cts; + + cts = tcp_get_usecs(&tv); + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + log.u_bbr.flex1 = bbr->r_ctl.rc_pace_min_segs; + log.u_bbr.flex2 = what_we_can_send; + log.u_bbr.flex3 = bbr->r_ctl.rc_pace_max_segs; + log.u_bbr.flex4 = len; + log.u_bbr.flex5 = 0; + log.u_bbr.flex7 = mod; + log.u_bbr.flex8 = 1; + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + TCP_HDWR_TLS, 0, + 0, &log, false, &tv); + } +} + +static void +bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) +{ + struct timeval tv; + int retval; + + /* First lets see if we have old packets */ + if (tp->t_in_pkt) { + if (ctf_do_queued_segments(so, tp, 1)) { + m_freem(m); + return; + } + } + if (m->m_flags & M_TSTMP_LRO) { + tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; + tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } else { + /* Should not be should we kassert instead? */ + tcp_get_usecs(&tv); + } + retval = bbr_do_segment_nounlock(m, th, so, tp, + drop_hdrlen, tlen, iptos, 0, &tv); + if (retval == 0) + INP_WUNLOCK(tp->t_inpcb); +} + +/* + * Return how much data can be sent without violating the + * cwnd or rwnd. + */ + +static inline uint32_t +bbr_what_can_we_send(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t sendwin, + uint32_t avail, int32_t sb_offset, uint32_t cts) +{ + uint32_t len; + + if (ctf_outstanding(tp) >= tp->snd_wnd) { + /* We never want to go over our peers rcv-window */ + len = 0; + } else { + uint32_t flight; + + flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); + if (flight >= sendwin) { + /* + * We have in flight what we are allowed by cwnd (if + * it was rwnd blocking it would have hit above out + * >= tp->snd_wnd). + */ + return (0); + } + len = sendwin - flight; + if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { + /* We would send too much (beyond the rwnd) */ + len = tp->snd_wnd - ctf_outstanding(tp); + } + if ((len + sb_offset) > avail) { + /* + * We don't have that much in the SB, how much is + * there? 
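+			 * Clamp the length to the bytes actually remaining
+			 * in the socket buffer past sb_offset.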
+ */ + len = avail - sb_offset; + } + } + return (len); +} + +static inline void +bbr_do_error_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error) +{ +#ifdef NETFLIX_STATS + TCPSTAT_INC(tcps_sndpack_error); + TCPSTAT_ADD(tcps_sndbyte_error, len); +#endif +} + +static inline void +bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error) +{ + if (error) { + bbr_do_error_accounting(tp, bbr, rsm, len, error); + return; + } + if ((tp->t_flags & TF_FORCEDATA) && len == 1) { + /* Window probe */ + TCPSTAT_INC(tcps_sndprobe); +#ifdef NETFLIX_STATS + stats_voi_update_abs_u32(tp->t_stats, + VOI_TCP_RETXPB, len); +#endif + } else if (rsm) { + if (rsm->r_flags & BBR_TLP) { + /* + * TLP should not count in retran count, but in its + * own bin + */ +#ifdef NETFLIX_STATS + tp->t_sndtlppack++; + tp->t_sndtlpbyte += len; + TCPSTAT_INC(tcps_tlpresends); + TCPSTAT_ADD(tcps_tlpresend_bytes, len); +#endif + } else { + /* Retransmit */ + tp->t_sndrexmitpack++; + TCPSTAT_INC(tcps_sndrexmitpack); + TCPSTAT_ADD(tcps_sndrexmitbyte, len); +#ifdef NETFLIX_STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, + len); +#endif + } + /* + * Logs in 0 - 8, 8 is all non probe_bw states 0-7 is + * sub-state + */ + counter_u64_add(bbr_state_lost[rsm->r_bbr_state], len); + if (bbr->rc_bbr_state != BBR_STATE_PROBE_BW) { + /* Non probe_bw log in 1, 2, or 4. */ + counter_u64_add(bbr_state_resend[bbr->rc_bbr_state], len); + } else { + /* + * Log our probe state 3, and log also 5-13 to show + * us the recovery sub-state for the send. This + * means that 3 == (5+6+7+8+9+10+11+12+13) + */ + counter_u64_add(bbr_state_resend[BBR_STATE_PROBE_BW], len); + counter_u64_add(bbr_state_resend[(bbr_state_val(bbr) + 5)], len); + } + /* Place in both 16's the totals of retransmitted */ + counter_u64_add(bbr_state_lost[16], len); + counter_u64_add(bbr_state_resend[16], len); + /* Place in 17's the total sent */ + counter_u64_add(bbr_state_resend[17], len); + counter_u64_add(bbr_state_lost[17], len); + + } else { + /* New sends */ + TCPSTAT_INC(tcps_sndpack); + TCPSTAT_ADD(tcps_sndbyte, len); + /* Place in 17's the total sent */ + counter_u64_add(bbr_state_resend[17], len); + counter_u64_add(bbr_state_lost[17], len); +#ifdef NETFLIX_STATS + stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, + len); +#endif + } +} + +static void +bbr_cwnd_limiting(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t in_level) +{ + if (bbr->rc_filled_pipe && bbr_target_cwnd_mult_limit && (bbr->rc_use_google == 0)) { + /* + * Limit the cwnd to not be above N x the target plus whats + * is outstanding. The target is based on the current b/w + * estimate. + */ + uint32_t target; + + target = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), BBR_UNIT); + target += ctf_outstanding(tp); + target *= bbr_target_cwnd_mult_limit; + if (tp->snd_cwnd > target) + tp->snd_cwnd = target; + bbr_log_type_cwndupd(bbr, 0, 0, 0, 10, 0, 0, __LINE__); + } +} + +static int +bbr_window_update_needed(struct tcpcb *tp, struct socket *so, uint32_t recwin, int32_t maxseg) +{ + /* + * "adv" is the amount we could increase the window, taking into + * account that we are limited by TCP_MAXWIN << tp->rcv_scale. 
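+	 * Space already advertised (rcv_adv past rcv_nxt) is subtracted
+	 * out below before deciding.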
+ */ + uint32_t adv; + int32_t oldwin; + + adv = min(recwin, TCP_MAXWIN << tp->rcv_scale); + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { + oldwin = (tp->rcv_adv - tp->rcv_nxt); + adv -= oldwin; + } else + oldwin = 0; + + /* + * If the new window size ends up being the same as the old size + * when it is scaled, then don't force a window update. + */ + if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) + return (0); + + if (adv >= (2 * maxseg) && + (adv >= (so->so_rcv.sb_hiwat / 4) || + recwin <= (so->so_rcv.sb_hiwat / 8) || + so->so_rcv.sb_hiwat <= 8 * maxseg)) { + return (1); + } + if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) + return (1); + return (0); +} + +/* + * Return 0 on success and a errno on failure to send. + * Note that a 0 return may not mean we sent anything + * if the TCB was on the hpts. A non-zero return + * does indicate the error we got from ip[6]_output. + */ +static int +bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) +{ + struct socket *so; + int32_t len; + uint32_t cts; + uint32_t recwin, sendwin; + int32_t sb_offset; + int32_t flags, abandon, error = 0; + struct tcp_log_buffer *lgb = NULL; + struct mbuf *m; + struct mbuf *mb; + uint32_t if_hw_tsomaxsegcount = 0; + uint32_t if_hw_tsomaxsegsize = 0; + uint32_t if_hw_tsomax = 0; + struct ip *ip = NULL; +#ifdef TCPDEBUG + struct ipovly *ipov = NULL; +#endif + struct tcp_bbr *bbr; + struct tcphdr *th; +#ifdef NETFLIX_TCPOUDP + struct udphdr *udp = NULL; +#endif + u_char opt[TCP_MAXOLEN]; + unsigned ipoptlen, optlen, hdrlen; +#ifdef NETFLIX_TCPOUDP + unsigned ulen; +#endif + uint32_t bbr_seq; + uint32_t delay_calc=0; + uint8_t doing_tlp = 0; + uint8_t local_options; +#ifdef BBR_INVARIANTS + uint8_t doing_retran_from = 0; + uint8_t picked_up_retran = 0; +#endif + uint8_t wanted_cookie = 0; + uint8_t more_to_rxt=0; + int32_t prefetch_so_done = 0; + int32_t prefetch_rsm = 0; + uint32_t what_we_can = 0; + uint32_t tot_len = 0; + uint32_t rtr_cnt = 0; + uint32_t maxseg, pace_max_segs, p_maxseg; + int32_t csum_flags; + int32_t hw_tls; +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + unsigned ipsec_optlen = 0; + +#endif + volatile int32_t sack_rxmit; + struct bbr_sendmap *rsm = NULL; + int32_t tso, mtu; + int force_tso = 0; + struct tcpopt to; + int32_t slot = 0; + struct inpcb *inp; + struct sockbuf *sb; + uint32_t hpts_calling; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int32_t isipv6; +#endif + uint8_t app_limited = BBR_JR_SENT_DATA; + uint8_t filled_all = 0; + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + /* We take a cache hit here */ + memcpy(&bbr->rc_tv, tv, sizeof(struct timeval)); + cts = tcp_tv_to_usectick(&bbr->rc_tv); + inp = bbr->rc_inp; + so = inp->inp_socket; + sb = &so->so_snd; +#ifdef KERN_TLS + if (sb->sb_flags & SB_TLS_IFNET) + hw_tls = 1; + else +#endif + hw_tls = 0; + kern_prefetch(sb, &maxseg); + maxseg = tp->t_maxseg - bbr->rc_last_options; + if (bbr_minseg(bbr) < maxseg) { + tcp_bbr_tso_size_check(bbr, cts); + } + /* Remove any flags that indicate we are pacing on the inp */ + pace_max_segs = bbr->r_ctl.rc_pace_max_segs; + p_maxseg = min(maxseg, pace_max_segs); + INP_WLOCK_ASSERT(inp); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + return (tcp_offload_output(tp)); +#endif + +#ifdef INET6 + if (bbr->r_state) { + /* Use the cache line loaded if possible */ + isipv6 = bbr->r_is_v6; + } else { + isipv6 = (inp->inp_vflag & INP_IPV6) != 0; + } +#endif + if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && + inp->inp_in_hpts) { + /* + * We are on the hpts for some timer but not hptsi 
output. + * Possibly remove from the hpts so we can send/recv etc. + */ + if ((tp->t_flags & TF_ACKNOW) == 0) { + /* + * No immediate demand right now to send an ack, but + * the user may have read, making room for new data + * (a window update). If so we may want to cancel + * whatever timer is running (KEEP/DEL-ACK?) and + * continue to send out a window update. Or we may + * have gotten more data into the socket buffer to + * send. + */ + recwin = min(max(sbspace(&so->so_rcv), 0), + TCP_MAXWIN << tp->rcv_scale); + if ((bbr_window_update_needed(tp, so, recwin, maxseg) == 0) && + ((sbavail(sb) + ((tcp_outflags[tp->t_state] & TH_FIN) ? 1 : 0)) <= + (tp->snd_max - tp->snd_una))) { + /* + * Nothing new to send and no window update + * is needed to send. Lets just return and + * let the timer-run off. + */ + return (0); + } + } + tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); + bbr_timer_cancel(bbr, __LINE__, cts); + } + if (bbr->r_ctl.rc_last_delay_val) { + /* Calculate a rough delay for early escape to sending */ + if (SEQ_GT(cts, bbr->rc_pacer_started)) + delay_calc = cts - bbr->rc_pacer_started; + if (delay_calc >= bbr->r_ctl.rc_last_delay_val) + delay_calc -= bbr->r_ctl.rc_last_delay_val; + else + delay_calc = 0; + } + /* Mark that we have called bbr_output(). */ + if ((bbr->r_timer_override) || + (tp->t_flags & TF_FORCEDATA) || + (tp->t_state < TCPS_ESTABLISHED)) { + /* Timeouts or early states are exempt */ + if (inp->inp_in_hpts) + tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); + } else if (inp->inp_in_hpts) { + if ((bbr->r_ctl.rc_last_delay_val) && + (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && + delay_calc) { + /* + * We were being paced for output and the delay has + * already exceeded when we were supposed to be + * called, lets go ahead and pull out of the hpts + * and call output. + */ + counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1); + bbr->r_ctl.rc_last_delay_val = 0; + tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); + } else if (tp->t_state == TCPS_CLOSED) { + bbr->r_ctl.rc_last_delay_val = 0; + tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); + } else { + /* + * On the hpts, you shall not pass! even if ACKNOW + * is on, we will when the hpts fires, unless of + * course we are overdue. + */ + counter_u64_add(bbr_out_size[TCP_MSS_ACCT_INPACE], 1); + return (0); + } + } + bbr->rc_cwnd_limited = 0; + if (bbr->r_ctl.rc_last_delay_val) { + /* recalculate the real delay and deal with over/under */ + if (SEQ_GT(cts, bbr->rc_pacer_started)) + delay_calc = cts - bbr->rc_pacer_started; + else + delay_calc = 0; + if (delay_calc >= bbr->r_ctl.rc_last_delay_val) + /* Setup the delay which will be added in */ + delay_calc -= bbr->r_ctl.rc_last_delay_val; + else { + /* + * We are early setup to adjust + * our slot time. 
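+			 * Accumulate how early we are in rc_agg_early so a
+			 * later pacing slot can absorb the difference.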
+ */ + bbr->r_ctl.rc_agg_early += (bbr->r_ctl.rc_last_delay_val - delay_calc); + bbr->r_ctl.rc_last_delay_val = 0; + bbr->r_agg_early_set = 1; + if (bbr->r_ctl.rc_hptsi_agg_delay) { + if (bbr->r_ctl.rc_hptsi_agg_delay >= bbr->r_ctl.rc_agg_early) { + /* Nope our previous late cancels out the early */ + bbr->r_ctl.rc_hptsi_agg_delay -= bbr->r_ctl.rc_agg_early; + bbr->r_agg_early_set = 0; + bbr->r_ctl.rc_agg_early = 0; + } else { + bbr->r_ctl.rc_agg_early -= bbr->r_ctl.rc_hptsi_agg_delay; + bbr->r_ctl.rc_hptsi_agg_delay = 0; + } + } + bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls, + bbr->r_ctl.rc_agg_early, cts, 3, 0, + bbr->r_agg_early_set, 3); + BBR_STAT_INC(bbr_early); + delay_calc = 0; + } + } else { + /* We were not delayed due to hptsi */ + if (bbr->r_agg_early_set) + bbr->r_ctl.rc_agg_early = 0; + bbr->r_agg_early_set = 0; + delay_calc = 0; + } + if (delay_calc) { + /* + * We had a hptsi delay which means we are falling behind on + * sending at the expected rate. Calculate an extra amount + * of data we can send, if any, to put us back on track. + */ + if ((bbr->r_ctl.rc_hptsi_agg_delay + delay_calc) < bbr->r_ctl.rc_hptsi_agg_delay) + bbr->r_ctl.rc_hptsi_agg_delay = 0xffffffff; + else + bbr->r_ctl.rc_hptsi_agg_delay += delay_calc; + } + sendwin = min(tp->snd_wnd, tp->snd_cwnd); + if ((tp->snd_una == tp->snd_max) && + (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) && + (sbavail(sb))) { + /* + * Ok we have been idle with nothing outstanding + * we possibly need to start fresh with either a new + * suite of states or a fast-ramp up. + */ + bbr_restart_after_idle(bbr, + cts, bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time)); + } + /* + * Now was there a hptsi delay where we are behind? We only count + * being behind if: a) We are not in recovery. b) There was a delay. + * c) We had room to send something. + * + */ + hpts_calling = inp->inp_hpts_calls; + inp->inp_hpts_calls = 0; + if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { + if (bbr_process_timers(tp, bbr, cts, hpts_calling)) { + counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1); + return (0); + } + } + bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + if (hpts_calling && + (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { + bbr->r_ctl.rc_last_delay_val = 0; + } + bbr->r_timer_override = 0; + bbr->r_wanted_output = 0; + /* + * For TFO connections in SYN_RECEIVED, only allow the initial + * SYN|ACK and those sent by the retransmit timer. + */ + if (IS_FASTOPEN(tp->t_flags) && + ((tp->t_state == TCPS_SYN_RECEIVED) || + (tp->t_state == TCPS_SYN_SENT)) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN or SYN|ACK sent */ + (tp->t_rxtshift == 0)) { /* not a retransmit */ + return (0); + } + /* + * Before sending anything check for a state update. For hpts + * calling without input this is important. If its input calling + * then this was already done. + */ + if (bbr->rc_use_google == 0) + bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); +again: + /* + * If we've recently taken a timeout, snd_max will be greater than + * snd_max. BBR in general does not pay much attention to snd_nxt + * for historic reasons the persist timer still uses it. This means + * we have to look at it. All retransmissions that are not persits + * use the rsm that needs to be sent so snd_nxt is ignored. At the + * end of this routine we pull snd_nxt always up to snd_max. 
+ */ + doing_tlp = 0; +#ifdef BBR_INVARIANTS + doing_retran_from = picked_up_retran = 0; +#endif + error = 0; + tso = 0; + slot = 0; + mtu = 0; + sendwin = min(tp->snd_wnd, tp->snd_cwnd); + sb_offset = tp->snd_max - tp->snd_una; + flags = tcp_outflags[tp->t_state]; + sack_rxmit = 0; + len = 0; + rsm = NULL; + if (flags & TH_RST) { + SOCKBUF_LOCK(sb); + goto send; + } +recheck_resend: + while (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) { + /* We need to always have one in reserve */ + rsm = bbr_alloc(bbr); + if (rsm == NULL) { + error = ENOMEM; + /* Lie to get on the hpts */ + tot_len = tp->t_maxseg; + if (hpts_calling) + /* Retry in a ms */ + slot = 1001; + goto just_return_nolock; + } + TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); + bbr->r_ctl.rc_free_cnt++; + rsm = NULL; + } + /* What do we send, a resend? */ + if (bbr->r_ctl.rc_resend == NULL) { + /* Check for rack timeout */ + bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); + if (bbr->r_ctl.rc_resend) { +#ifdef BBR_INVARIANTS + picked_up_retran = 1; +#endif + bbr_cong_signal(tp, NULL, CC_NDUPACK, bbr->r_ctl.rc_resend); + } + } + if (bbr->r_ctl.rc_resend) { + rsm = bbr->r_ctl.rc_resend; +#ifdef BBR_INVARIANTS + doing_retran_from = 1; +#endif + /* Remove any TLP flags its a RACK or T-O */ + rsm->r_flags &= ~BBR_TLP; + bbr->r_ctl.rc_resend = NULL; + if (SEQ_LT(rsm->r_start, tp->snd_una)) { +#ifdef BBR_INVARIANTS + panic("Huh, tp:%p bbr:%p rsm:%p start:%u < snd_una:%u\n", + tp, bbr, rsm, rsm->r_start, tp->snd_una); + goto recheck_resend; +#else + /* TSNH */ + rsm = NULL; + goto recheck_resend; +#endif + } + rtr_cnt++; + if (rsm->r_flags & BBR_HAS_SYN) { + /* Only retransmit a SYN by itself */ + len = 0; + if ((flags & TH_SYN) == 0) { + /* Huh something is wrong */ + rsm->r_start++; + if (rsm->r_start == rsm->r_end) { + /* Clean it up, somehow we missed the ack? */ + bbr_log_syn(tp, NULL); + } else { + /* TFO with data? */ + rsm->r_flags &= ~BBR_HAS_SYN; + len = rsm->r_end - rsm->r_start; + } + } else { + /* Retransmitting SYN */ + rsm = NULL; + SOCKBUF_LOCK(sb); + goto send; + } + } else + len = rsm->r_end - rsm->r_start; + if ((bbr->rc_resends_use_tso == 0) && +#ifdef KERN_TLS + ((sb->sb_flags & SB_TLS_IFNET) == 0) && +#endif + (len > maxseg)) { + len = maxseg; + more_to_rxt = 1; + } + sb_offset = rsm->r_start - tp->snd_una; + if (len > 0) { + sack_rxmit = 1; + TCPSTAT_INC(tcps_sack_rexmits); + TCPSTAT_ADD(tcps_sack_rexmit_bytes, + min(len, maxseg)); + } else { + /* I dont think this can happen */ + rsm = NULL; + goto recheck_resend; + } + BBR_STAT_INC(bbr_resends_set); + } else if (bbr->r_ctl.rc_tlp_send) { + /* + * Tail loss probe + */ + doing_tlp = 1; + rsm = bbr->r_ctl.rc_tlp_send; + bbr->r_ctl.rc_tlp_send = NULL; + sack_rxmit = 1; + len = rsm->r_end - rsm->r_start; + rtr_cnt++; + if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) + len = maxseg; + + if (SEQ_GT(tp->snd_una, rsm->r_start)) { +#ifdef BBR_INVARIANTS + panic("tp:%p bbc:%p snd_una:%u rsm:%p r_start:%u", + tp, bbr, tp->snd_una, rsm, rsm->r_start); +#else + /* TSNH */ + rsm = NULL; + goto recheck_resend; +#endif + } + sb_offset = rsm->r_start - tp->snd_una; + BBR_STAT_INC(bbr_tlp_set); + } + /* + * Enforce a connection sendmap count limit if set + * as long as we are not retransmiting. 
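+	 * If the limit has been reached we defer sending new data
+	 * instead of allocating more map entries.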
+ */ + if ((rsm == NULL) && + (bbr_tcp_map_entries_limit > 0) && + (bbr->r_ctl.rc_num_maps_alloced >= bbr_tcp_map_entries_limit)) { + BBR_STAT_INC(bbr_alloc_limited); + if (!bbr->alloc_limit_reported) { + bbr->alloc_limit_reported = 1; + BBR_STAT_INC(bbr_alloc_limited_conns); + } + goto just_return_nolock; + } +#ifdef BBR_INVARIANTS + if (rsm && SEQ_LT(rsm->r_start, tp->snd_una)) { + panic("tp:%p bbr:%p rsm:%p sb_offset:%u len:%u", + tp, bbr, rsm, sb_offset, len); + } +#endif + /* + * Get standard flags, and add SYN or FIN if requested by 'hidden' + * state flags. + */ + if (tp->t_flags & TF_NEEDFIN && (rsm == NULL)) + flags |= TH_FIN; + if (tp->t_flags & TF_NEEDSYN) + flags |= TH_SYN; + + if (rsm && (rsm->r_flags & BBR_HAS_FIN)) { + /* we are retransmitting the fin */ + len--; + if (len) { + /* + * When retransmitting data do *not* include the + * FIN. This could happen from a TLP probe if we + * allowed data with a FIN. + */ + flags &= ~TH_FIN; + } + } else if (rsm) { + if (flags & TH_FIN) + flags &= ~TH_FIN; + } + if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { + void *end_rsm; + + end_rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext); + if (end_rsm) + kern_prefetch(end_rsm, &prefetch_rsm); + prefetch_rsm = 1; + } + SOCKBUF_LOCK(sb); + /* + * If in persist timeout with window of 0, send 1 byte. Otherwise, + * if window is small but nonzero and time TF_SENTFIN expired, we + * will send what we can and go to transmit state. + */ + if (tp->t_flags & TF_FORCEDATA) { + if ((sendwin == 0) || (sendwin <= (tp->snd_max - tp->snd_una))) { + /* + * If we still have some data to send, then clear + * the FIN bit. Usually this would happen below + * when it realizes that we aren't sending all the + * data. However, if we have exactly 1 byte of + * unsent data, then it won't clear the FIN bit + * below, and if we are in persist state, we wind up + * sending the packet without recording that we sent + * the FIN bit. + * + * We can't just blindly clear the FIN bit, because + * if we don't have any more data to send then the + * probe will be the FIN itself. + */ + if (sb_offset < sbused(sb)) + flags &= ~TH_FIN; + sendwin = 1; + } else { + if ((bbr->rc_in_persist != 0) && + (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), + bbr_minseg(bbr)))) { + /* Exit persists if there is space */ + bbr_exit_persist(tp, bbr, cts, __LINE__); + } + if (rsm == NULL) { + /* + * If we are dropping persist mode then we + * need to correct sb_offset if not a + * retransmit. + */ + sb_offset = tp->snd_max - tp->snd_una; + } + } + } + /* + * If snd_nxt == snd_max and we have transmitted a FIN, the + * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a + * negative length. This can also occur when TCP opens up its + * congestion window while receiving additional duplicate acks after + * fast-retransmit because TCP will reset snd_nxt to snd_max after + * the fast-retransmit. + * + * In the normal retransmit-FIN-only case, however, snd_nxt will be + * set to snd_una, the sb_offset will be 0, and the length may wind + * up 0. + * + * If sack_rxmit is true we are retransmitting from the scoreboard + * in which case len is already set. 
+ */ + if (sack_rxmit == 0) { + uint32_t avail; + + avail = sbavail(sb); + if (SEQ_GT(tp->snd_max, tp->snd_una)) + sb_offset = tp->snd_max - tp->snd_una; + else + sb_offset = 0; + if (bbr->rc_tlp_new_data) { + /* TLP is forcing out new data */ + uint32_t tlplen; + + doing_tlp = 1; + tlplen = maxseg; + + if (tlplen > (uint32_t)(avail - sb_offset)) { + tlplen = (uint32_t)(avail - sb_offset); + } + if (tlplen > tp->snd_wnd) { + len = tp->snd_wnd; + } else { + len = tlplen; + } + bbr->rc_tlp_new_data = 0; + } else { + what_we_can = len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts); + if ((len < p_maxseg) && + (bbr->rc_in_persist == 0) && + (ctf_outstanding(tp) >= (2 * p_maxseg)) && + ((avail - sb_offset) >= p_maxseg)) { + /* + * We are not completing whats in the socket + * buffer (i.e. there is at least a segment + * waiting to send) and we have 2 or more + * segments outstanding. There is no sense + * of sending a little piece. Lets defer and + * and wait until we can send a whole + * segment. + */ + len = 0; + } + if ((tp->t_flags & TF_FORCEDATA) && (bbr->rc_in_persist)) { + /* + * We are in persists, figure out if + * a retransmit is available (maybe the previous + * persists we sent) or if we have to send new + * data. + */ + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + if (rsm) { + len = rsm->r_end - rsm->r_start; + if (rsm->r_flags & BBR_HAS_FIN) + len--; + if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) + len = maxseg; + if (len > 1) + BBR_STAT_INC(bbr_persist_reneg); + /* + * XXXrrs we could force the len to + * 1 byte here to cause the chunk to + * split apart.. but that would then + * mean we always retransmit it as + * one byte even after the window + * opens. + */ + sack_rxmit = 1; + sb_offset = rsm->r_start - tp->snd_una; + } else { + /* + * First time through in persists or peer + * acked our one byte. Though we do have + * to have something in the sb. + */ + len = 1; + sb_offset = 0; + if (avail == 0) + len = 0; + } + } + } + } + if (prefetch_so_done == 0) { + kern_prefetch(so, &prefetch_so_done); + prefetch_so_done = 1; + } + /* + * Lop off SYN bit if it has already been sent. However, if this is + * SYN-SENT state and if segment contains data and if we don't know + * that foreign host supports TAO, suppress sending segment. + */ + if ((flags & TH_SYN) && (rsm == NULL) && + SEQ_GT(tp->snd_max, tp->snd_una)) { + if (tp->t_state != TCPS_SYN_RECEIVED) + flags &= ~TH_SYN; + /* + * When sending additional segments following a TFO SYN|ACK, + * do not include the SYN bit. + */ + if (IS_FASTOPEN(tp->t_flags) && + (tp->t_state == TCPS_SYN_RECEIVED)) + flags &= ~TH_SYN; + sb_offset--, len++; + if (sbavail(sb) == 0) + len = 0; + } else if ((flags & TH_SYN) && rsm) { + /* + * Subtract one from the len for the SYN being + * retransmitted. + */ + len--; + } + /* + * Be careful not to send data and/or FIN on SYN segments. This + * measure is needed to prevent interoperability problems with not + * fully conformant TCP implementations. 
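+	 * With TF_NOOPT set we therefore strip any data and the FIN
+	 * from the SYN segment below.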
+ */ + if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { + len = 0; + flags &= ~TH_FIN; + } + /* + * On TFO sockets, ensure no data is sent in the following cases: + * + * - When retransmitting SYN|ACK on a passively-created socket + * - When retransmitting SYN on an actively created socket + * - When sending a zero-length cookie (cookie request) on an + * actively created socket + * - When the socket is in the CLOSED state (RST is being sent) + */ + if (IS_FASTOPEN(tp->t_flags) && + (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || + ((tp->t_state == TCPS_SYN_SENT) && + (tp->t_tfo_client_cookie_len == 0)) || + (flags & TH_RST))) { + len = 0; + sack_rxmit = 0; + rsm = NULL; + } + /* Without fast-open there should never be data sent on a SYN */ + if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) + len = 0; + if (len <= 0) { + /* + * If FIN has been sent but not acked, but we haven't been + * called to retransmit, len will be < 0. Otherwise, window + * shrank after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back to (closed) + * window, and set the persist timer if it isn't already + * going. If the window didn't close completely, just wait + * for an ACK. + * + * We also do a general check here to ensure that we will + * set the persist timer when we have data to send, but a + * 0-byte window. This makes sure the persist timer is set + * even if the packet hits one of the "goto send" lines + * below. + */ + len = 0; + if ((tp->snd_wnd == 0) && + (TCPS_HAVEESTABLISHED(tp->t_state)) && + (tp->snd_una == tp->snd_max) && + (sb_offset < (int)sbavail(sb))) { + /* + * Not enough room in the rwnd to send + * a paced segment out. + */ + bbr_enter_persist(tp, bbr, cts, __LINE__); + } + } else if ((rsm == NULL) && + (doing_tlp == 0) && + (len < bbr->r_ctl.rc_pace_max_segs)) { + /* + * We are not sending a full segment for + * some reason. Should we not send anything (think + * sws or persists)? + */ + if ((tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && + (TCPS_HAVEESTABLISHED(tp->t_state)) && + (len < (int)(sbavail(sb) - sb_offset))) { + /* + * Here the rwnd is less than + * the pacing size, this is not a retransmit, + * we are established and + * the send is not the last in the socket buffer + * lets not send, and possibly enter persists. + */ + len = 0; + if (tp->snd_max == tp->snd_una) + bbr_enter_persist(tp, bbr, cts, __LINE__); + } else if ((tp->snd_cwnd >= bbr->r_ctl.rc_pace_max_segs) && + (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) && + (len < (int)(sbavail(sb) - sb_offset)) && + (len < bbr_minseg(bbr))) { + /* + * Here we are not retransmitting, and + * the cwnd is not so small that we could + * not send at least a min size (rxt timer + * not having gone off), We have 2 segments or + * more already in flight, its not the tail end + * of the socket buffer and the cwnd is blocking + * us from sending out minimum pacing segment size. + * Lets not send anything. + */ + bbr->rc_cwnd_limited = 1; + len = 0; + } else if (((tp->snd_wnd - ctf_outstanding(tp)) < + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && + (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) && + (len < (int)(sbavail(sb) - sb_offset)) && + (TCPS_HAVEESTABLISHED(tp->t_state))) { + /* + * Here we have a send window but we have + * filled it up and we can't send another pacing segment. + * We also have in flight more than 2 segments + * and we are not completing the sb i.e. 
we allow + * the last bytes of the sb to go out even if + * its not a full pacing segment. + */ + len = 0; + } + } + /* len will be >= 0 after this point. */ + KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); + tcp_sndbuf_autoscale(tp, so, sendwin); + /* + * + */ + if (bbr->rc_in_persist && + len && + (rsm == NULL) && + (len < min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs))) { + /* + * We are in persist, not doing a retransmit and don't have enough space + * yet to send a full TSO. So is it at the end of the sb + * if so we need to send else nuke to 0 and don't send. + */ + int sbleft; + if (sbavail(sb) > sb_offset) + sbleft = sbavail(sb) - sb_offset; + else + sbleft = 0; + if (sbleft >= min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs)) { + /* not at end of sb lets not send */ + len = 0; + } + } + /* + * Decide if we can use TCP Segmentation Offloading (if supported by + * hardware). + * + * TSO may only be used if we are in a pure bulk sending state. The + * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP + * options prevent using TSO. With TSO the TCP header is the same + * (except for the sequence number) for all generated packets. This + * makes it impossible to transmit any options which vary per + * generated segment or packet. + * + * IPv4 handling has a clear separation of ip options and ip header + * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() + * does the right thing below to provide length of just ip options + * and thus checking for ipoptlen is enough to decide if ip options + * are present. + */ +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(inp); + else +#endif + if (inp->inp_options) + ipoptlen = inp->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + else + ipoptlen = 0; +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + /* + * Pre-calculate here as we save another lookup into the darknesses + * of IPsec that way and can actually decide if TSO is ok. + */ +#ifdef INET6 + if (isipv6 && IPSEC_ENABLED(ipv6)) + ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); +#ifdef INET + else +#endif +#endif /* INET6 */ +#ifdef INET + if (IPSEC_ENABLED(ipv4)) + ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); +#endif /* INET */ +#endif /* IPSEC */ +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + ipoptlen += ipsec_optlen; +#endif + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && + (len > maxseg) && + (tp->t_port == 0) && + ((tp->t_flags & TF_SIGNATURE) == 0) && + tp->rcv_numsacks == 0 && + ipoptlen == 0) + tso = 1; + + recwin = min(max(sbspace(&so->so_rcv), 0), + TCP_MAXWIN << tp->rcv_scale); + /* + * Sender silly window avoidance. We transmit under the following + * conditions when len is non-zero: + * + * - We have a full segment (or more with TSO) - This is the last + * buffer in a write()/send() and we are either idle or running + * NODELAY - we've timed out (e.g. persist timer) - we have more + * then 1/2 the maximum send window's worth of data (receiver may be + * limited the window size) - we need to retransmit + */ + if (rsm) + goto send; + if (len) { + if (sack_rxmit) + goto send; + if (len >= p_maxseg) + goto send; + /* + * NOTE! on localhost connections an 'ack' from the remote + * end may occur synchronously with the output and cause us + * to flush a buffer queued with moretocome. 
XXX + * + */ + if (((tp->t_flags & TF_MORETOCOME) == 0) && /* normal case */ + ((tp->t_flags & TF_NODELAY) || + ((uint32_t)len + (uint32_t)sb_offset) >= sbavail(&so->so_snd)) && + (tp->t_flags & TF_NOPUSH) == 0) { + goto send; + } + if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ + goto send; + } + if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ + goto send; + } + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { + goto send; + } + } + /* + * Sending of standalone window updates. + * + * Window updates are important when we close our window due to a + * full socket buffer and are opening it again after the application + * reads data from it. Once the window has opened again and the + * remote end starts to send again the ACK clock takes over and + * provides the most current window information. + * + * We must avoid the silly window syndrome whereas every read from + * the receive buffer, no matter how small, causes a window update + * to be sent. We also should avoid sending a flurry of window + * updates when the socket buffer had queued a lot of data and the + * application is doing small reads. + * + * Prevent a flurry of pointless window updates by only sending an + * update when we can increase the advertized window by more than + * 1/4th of the socket buffer capacity. When the buffer is getting + * full or is very small be more aggressive and send an update + * whenever we can increase by two mss sized segments. In all other + * situations the ACK's to new incoming data will carry further + * window increases. + * + * Don't send an independent window update if a delayed ACK is + * pending (it will get piggy-backed on it) or the remote side + * already has done a half-close and won't send more data. Skip + * this if the connection is in T/TCP half-open state. + */ + if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && + !(tp->t_flags & TF_DELACK) && + !TCPS_HAVERCVDFIN(tp->t_state)) { + /* Check to see if we should do a window update */ + if (bbr_window_update_needed(tp, so, recwin, maxseg)) + goto send; + } + /* + * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW + * is also a catch-all for the retransmit timer timeout case. + */ + if (tp->t_flags & TF_ACKNOW) { + goto send; + } + if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { + goto send; + } + if (SEQ_GT(tp->snd_up, tp->snd_una)) { + goto send; + } + /* + * If our state indicates that FIN should be sent and we have not + * yet done so, then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0)) { + goto send; + } + /* + * No reason to send a segment, just return. + */ +just_return: + SOCKBUF_UNLOCK(sb); +just_return_nolock: + if (tot_len) + slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); + if (bbr->rc_no_pacing) + slot = 0; + if (tot_len == 0) { + if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >= + tp->snd_wnd) { + BBR_STAT_INC(bbr_rwnd_limited); + app_limited = BBR_JR_RWND_LIMITED; + bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); + if ((bbr->rc_in_persist == 0) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_max == tp->snd_una) && + sbavail(&tp->t_inpcb->inp_socket->so_snd)) { + /* No send window.. 
we must enter persist */ + bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); + } + } else if (ctf_outstanding(tp) >= sbavail(sb)) { + BBR_STAT_INC(bbr_app_limited); + app_limited = BBR_JR_APP_LIMITED; + bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); + } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes)) + p_maxseg) >= tp->snd_cwnd) { + BBR_STAT_INC(bbr_cwnd_limited); + app_limited = BBR_JR_CWND_LIMITED; + bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes))); + bbr->rc_cwnd_limited = 1; + } else { + BBR_STAT_INC(bbr_app_limited); + app_limited = BBR_JR_APP_LIMITED; + bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); + } + bbr->r_ctl.rc_hptsi_agg_delay = 0; + bbr->r_agg_early_set = 0; + bbr->r_ctl.rc_agg_early = 0; + bbr->r_ctl.rc_last_delay_val = 0; + } else if (bbr->rc_use_google == 0) + bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); + /* Are we app limited? */ + if ((app_limited == BBR_JR_APP_LIMITED) || + (app_limited == BBR_JR_RWND_LIMITED)) { + /** + * We are application limited. + */ + bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered); + } + if (tot_len == 0) + counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1); + tp->t_flags &= ~TF_FORCEDATA; + /* Dont update the time if we did not send */ + bbr->r_ctl.rc_last_delay_val = 0; + bbr->rc_output_starts_timer = 1; + bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len); + bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len); + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + /* Make sure snd_nxt is drug up */ + tp->snd_nxt = tp->snd_max; + } + return (error); + +send: + if (doing_tlp == 0) { + /* + * Data not a TLP, and its not the rxt firing. If it is the + * rxt firing, we want to leave the tlp_in_progress flag on + * so we don't send another TLP. It has to be a rack timer + * or normal send (response to acked data) to clear the tlp + * in progress flag. + */ + bbr->rc_tlp_in_progress = 0; + bbr->rc_tlp_rtx_out = 0; + } else { + /* + * Its a TLP. + */ + bbr->rc_tlp_in_progress = 1; + } + bbr_timer_cancel(bbr, __LINE__, cts); + if (rsm == NULL) { + if (sbused(sb) > 0) { + /* + * This is sub-optimal. We only send a stand alone + * FIN on its own segment. + */ + if (flags & TH_FIN) { + flags &= ~TH_FIN; + if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) { + /* Lets not send this */ + slot = 0; + goto just_return; + } + } + } + } else { + /* + * We do *not* send a FIN on a retransmit if it has data. + * The if clause here where len > 1 should never come true. + */ + if ((len > 0) && + (((rsm->r_flags & BBR_HAS_FIN) == 0) && + (flags & TH_FIN))) { + flags &= ~TH_FIN; + len--; + } + } + SOCKBUF_LOCK_ASSERT(sb); + if (len > 0) { + if ((tp->snd_una == tp->snd_max) && + (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { + /* + * This qualifies as a RTT_PROBE session since we + * drop the data outstanding to nothing and waited + * more than bbr_rtt_probe_time. + */ + bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); + bbr_set_reduced_rtt(bbr, cts, __LINE__); + } + if (len >= maxseg) + tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; + else + tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; + } + /* + * Before ESTABLISHED, force sending of initial options unless TCP + * set not to do any options. 
NOTE: we assume that the IP/TCP header + * plus TCP options always fit in a single mbuf, leaving room for a + * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) + * + optlen <= MCLBYTES + */ + optlen = 0; +#ifdef INET6 + if (isipv6) + hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + else +#endif + hdrlen = sizeof(struct tcpiphdr); + + /* + * Compute options for segment. We only have to care about SYN and + * established connection segments. Options for SYN-ACK segments + * are handled in TCP syncache. + */ + to.to_flags = 0; + local_options = 0; + if ((tp->t_flags & TF_NOOPT) == 0) { + /* Maximum segment size. */ + if (flags & TH_SYN) { + to.to_mss = tcp_mssopt(&inp->inp_inc); +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) + to.to_mss -= V_tcp_udp_tunneling_overhead; +#endif + to.to_flags |= TOF_MSS; + /* + * On SYN or SYN|ACK transmits on TFO connections, + * only include the TFO option if it is not a + * retransmit, as the presence of the TFO option may + * have caused the original SYN or SYN|ACK to have + * been dropped by a middlebox. + */ + if (IS_FASTOPEN(tp->t_flags) && + (tp->t_rxtshift == 0)) { + if (tp->t_state == TCPS_SYN_RECEIVED) { + to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; + to.to_tfo_cookie = + (u_int8_t *)&tp->t_tfo_cookie.server; + to.to_flags |= TOF_FASTOPEN; + wanted_cookie = 1; + } else if (tp->t_state == TCPS_SYN_SENT) { + to.to_tfo_len = + tp->t_tfo_client_cookie_len; + to.to_tfo_cookie = + tp->t_tfo_cookie.client; + to.to_flags |= TOF_FASTOPEN; + wanted_cookie = 1; + } + } + } + /* Window scaling. */ + if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { + to.to_wscale = tp->request_r_scale; + to.to_flags |= TOF_SCALE; + } + /* Timestamps. */ + if ((tp->t_flags & TF_RCVD_TSTMP) || + ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { + to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset; + to.to_tsecr = tp->ts_recent; + to.to_flags |= TOF_TS; + local_options += TCPOLEN_TIMESTAMP + 2; + } + /* Set receive buffer autosizing timestamp. */ + if (tp->rfbuf_ts == 0 && + (so->so_rcv.sb_flags & SB_AUTOSIZE)) + tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv); + /* Selective ACK's. */ + if (flags & TH_SYN) + to.to_flags |= TOF_SACKPERM; + else if (TCPS_HAVEESTABLISHED(tp->t_state) && + tp->rcv_numsacks > 0) { + to.to_flags |= TOF_SACK; + to.to_nsacks = tp->rcv_numsacks; + to.to_sacks = (u_char *)tp->sackblks; + } +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + /* TCP-MD5 (RFC2385). */ + if (tp->t_flags & TF_SIGNATURE) + to.to_flags |= TOF_SIGNATURE; +#endif /* TCP_SIGNATURE */ + + /* Processing the options. */ + hdrlen += (optlen = tcp_addoptions(&to, opt)); + /* + * If we wanted a TFO option to be added, but it was unable + * to fit, ensure no data is sent. + */ + if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && + !(to.to_flags & TOF_FASTOPEN)) + len = 0; + } +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) { + if (V_tcp_udp_tunneling_port == 0) { + /* The port was removed?? */ + SOCKBUF_UNLOCK(&so->so_snd); + return (EHOSTUNREACH); + } + + hdrlen += sizeof(struct udphdr); + } +#endif +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(tp->t_inpcb); + else +#endif + if (tp->t_inpcb->inp_options) + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + else + ipoptlen = 0; + ipoptlen = 0; +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + ipoptlen += ipsec_optlen; +#endif + if (bbr->rc_last_options != local_options) { + /* + * Cache the options length this generally does not change + * on a connection. 
We use this to calculate TSO. + */ + bbr->rc_last_options = local_options; + } + maxseg = tp->t_maxseg - (ipoptlen + optlen); + p_maxseg = min(maxseg, pace_max_segs); + /* + * Adjust data length if insertion of options will bump the packet + * length beyond the t_maxseg length. Clear the FIN bit because we + * cut off the tail of the segment. + */ +#ifdef KERN_TLS + /* force TSO for so TLS offload can get mss */ + if (sb->sb_flags & SB_TLS_IFNET) { + force_tso = 1; + } +#endif + + if (len > maxseg) { + if (len != 0 && (flags & TH_FIN)) { + flags &= ~TH_FIN; + } + if (tso) { + uint32_t moff; + int32_t max_len; + + /* extract TSO information */ + if_hw_tsomax = tp->t_tsomax; + if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; + if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; + KASSERT(ipoptlen == 0, + ("%s: TSO can't do IP options", __func__)); + + /* + * Check if we should limit by maximum payload + * length: + */ + if (if_hw_tsomax != 0) { + /* compute maximum TSO length */ + max_len = (if_hw_tsomax - hdrlen - + max_linkhdr); + if (max_len <= 0) { + len = 0; + } else if (len > max_len) { + len = max_len; + } + } + /* + * Prevent the last segment from being fractional + * unless the send sockbuf can be emptied: + */ + if (((sb_offset + len) < sbavail(sb)) && + (hw_tls == 0)) { + moff = len % (uint32_t)maxseg; + if (moff != 0) { + len -= moff; + } + } + /* + * In case there are too many small fragments don't + * use TSO: + */ + if (len <= maxseg) { + len = maxseg; + tso = 0; + } + } else { + /* Not doing TSO */ + if (optlen + ipoptlen > tp->t_maxseg) { + /* + * Since we don't have enough space to put + * the IP header chain and the TCP header in + * one packet as required by RFC 7112, don't + * send it. + */ + SOCKBUF_UNLOCK(&so->so_snd); + error = EMSGSIZE; + sack_rxmit = 0; + goto out; + } + len = maxseg; + } + } else { + /* Not doing TSO */ + if_hw_tsomaxsegcount = 0; + tso = 0; + } + KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, + ("%s: len > IP_MAXPACKET", __func__)); +#ifdef DIAGNOSTIC +#ifdef INET6 + if (max_linkhdr + hdrlen > MCLBYTES) +#else + if (max_linkhdr + hdrlen > MHLEN) +#endif + panic("tcphdr too big"); +#endif + /* + * This KASSERT is here to catch edge cases at a well defined place. + * Before, those had triggered (random) panic conditions further + * down. + */ +#ifdef BBR_INVARIANTS + if (sack_rxmit) { + if (SEQ_LT(rsm->r_start, tp->snd_una)) { + panic("RSM:%p TP:%p bbr:%p start:%u is < snd_una:%u", + rsm, tp, bbr, rsm->r_start, tp->snd_una); + } + } +#endif + KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); + if ((len == 0) && + (flags & TH_FIN) && + (sbused(sb))) { + /* + * We have outstanding data, don't send a fin by itself!. + */ + slot = 0; + goto just_return; + } + /* + * Grab a header mbuf, attaching a copy of data to be transmitted, + * and initialize the header from the template for sends on this + * connection. + */ + if (len) { + uint32_t moff; + uint32_t orig_len; + + /* + * We place a limit on sending with hptsi. 
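+		 * New data (no rsm) is clipped so a single send never
+		 * exceeds pace_max_segs.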
+ */ + if ((rsm == NULL) && len > pace_max_segs) + len = pace_max_segs; + if (len <= maxseg) + tso = 0; +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + else +#endif + m = m_gethdr(M_NOWAIT, MT_DATA); + + if (m == NULL) { + BBR_STAT_INC(bbr_failed_mbuf_aloc); + bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); + SOCKBUF_UNLOCK(sb); + error = ENOBUFS; + sack_rxmit = 0; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + /* + * Start the m_copy functions from the closest mbuf to the + * sb_offset in the socket buffer chain. + */ + if ((sb_offset > sbavail(sb)) || ((len + sb_offset) > sbavail(sb))) { +#ifdef BBR_INVARIANTS + if ((len + sb_offset) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) + panic("tp:%p bbr:%p len:%u sb_offset:%u sbavail:%u rsm:%p %u:%u:%u", + tp, bbr, len, sb_offset, sbavail(sb), rsm, + doing_retran_from, + picked_up_retran, + doing_tlp); + +#endif + /* + * In this messed up situation we have two choices, + * a) pretend the send worked, and just start timers + * and what not (not good since that may lead us + * back here a lot). b) Send the lowest segment + * in the map. c) Drop the connection. Lets do + * which if it continues to happen will lead to + * via timeouts. + */ + BBR_STAT_INC(bbr_offset_recovery); + rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); + sb_offset = 0; + if (rsm == NULL) { + sack_rxmit = 0; + len = sbavail(sb); + } else { + sack_rxmit = 1; + if (rsm->r_start != tp->snd_una) { + /* + * Things are really messed up, + * is the only thing to do. + */ + BBR_STAT_INC(bbr_offset_drop); + tcp_set_inp_to_drop(inp, EFAULT); + return (0); + } + len = rsm->r_end - rsm->r_start; + } + if (len > sbavail(sb)) + len = sbavail(sb); + if (len > maxseg) + len = maxseg; + } + mb = sbsndptr_noadv(sb, sb_offset, &moff); + if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { + m_copydata(mb, moff, (int)len, + mtod(m, caddr_t)+hdrlen); + if (rsm == NULL) + sbsndptr_adv(sb, mb, len); + m->m_len += len; + } else { + struct sockbuf *msb; + + if (rsm) + msb = NULL; + else + msb = sb; +#ifdef BBR_INVARIANTS + if ((len + moff) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) { + if (rsm) { + panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u rsm:%p snd_una:%u rsm_start:%u flg:%x %u:%u:%u sr:%d ", + tp, bbr, len, moff, + sbavail(sb), rsm, + tp->snd_una, rsm->r_flags, rsm->r_start, + doing_retran_from, + picked_up_retran, + doing_tlp, sack_rxmit); + } else { + panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u sb_offset:%u snd_una:%u", + tp, bbr, len, moff, sbavail(sb), sb_offset, tp->snd_una); + } + } +#endif + orig_len = len; + m->m_next = tcp_m_copym( +#ifdef NETFLIX_COPY_ARGS + tp, +#endif + mb, moff, &len, + if_hw_tsomaxsegcount, + if_hw_tsomaxsegsize, msb, + ((rsm == NULL) ? hw_tls : 0) +#ifdef NETFLIX_COPY_ARGS + , &filled_all +#endif + ); + if (len <= maxseg && !force_tso) { + /* + * Must have ran out of mbufs for the copy + * shorten it to no longer need tso. Lets + * not put on sendalot since we are low on + * mbufs. 
+ */ + tso = 0; + } + if (m->m_next == NULL) { + SOCKBUF_UNLOCK(sb); + (void)m_free(m); + error = ENOBUFS; + sack_rxmit = 0; + goto out; + } + } +#ifdef BBR_INVARIANTS + if (tso && len < maxseg) { + panic("tp:%p tso on, but len:%d < maxseg:%d", + tp, len, maxseg); + } + if (tso && if_hw_tsomaxsegcount) { + int32_t seg_cnt = 0; + struct mbuf *foo; + + foo = m; + while (foo) { + seg_cnt++; + foo = foo->m_next; + } + if (seg_cnt > if_hw_tsomaxsegcount) { + panic("seg_cnt:%d > max:%d", seg_cnt, if_hw_tsomaxsegcount); + } + } +#endif + /* + * If we're sending everything we've got, set PUSH. (This + * will keep happy those implementations which only give + * data to the user when a buffer fills or a PUSH comes in.) + */ + if (sb_offset + len == sbused(sb) && + sbused(sb) && + !(flags & TH_SYN)) { + flags |= TH_PUSH; + } + SOCKBUF_UNLOCK(sb); + } else { + SOCKBUF_UNLOCK(sb); + if (tp->t_flags & TF_ACKNOW) + TCPSTAT_INC(tcps_sndacks); + else if (flags & (TH_SYN | TH_FIN | TH_RST)) + TCPSTAT_INC(tcps_sndctrl); + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + TCPSTAT_INC(tcps_sndurg); + else + TCPSTAT_INC(tcps_sndwinup); + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + BBR_STAT_INC(bbr_failed_mbuf_aloc); + bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); + error = ENOBUFS; + /* Fudge the send time since we could not send */ + sack_rxmit = 0; + goto out; + } +#ifdef INET6 + if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && + MHLEN >= hdrlen) { + M_ALIGN(m, hdrlen); + } else +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + SOCKBUF_UNLOCK_ASSERT(sb); + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef MAC + mac_inpcb_create_mbuf(inp, m); +#endif +#ifdef INET6 + if (isipv6) { + ip6 = mtod(m, struct ip6_hdr *); +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + ulen = hdrlen + len - sizeof(struct ip6_hdr); + udp->uh_ulen = htons(ulen); + th = (struct tcphdr *)(udp + 1); + } else { +#endif + th = (struct tcphdr *)(ip6 + 1); + +#ifdef NETFLIX_TCPOUDP + } +#endif + tcpip_fillheaders(inp, +#ifdef NETFLIX_TCPOUDP + tp->t_port, +#endif + ip6, th); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); +#ifdef TCPDEBUG + ipov = (struct ipovly *)ip; +#endif +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + ulen = hdrlen + len - sizeof(struct ip); + udp->uh_ulen = htons(ulen); + th = (struct tcphdr *)(udp + 1); + } else +#endif + th = (struct tcphdr *)(ip + 1); + tcpip_fillheaders(inp, +#ifdef NETFLIX_TCPOUDP + tp->t_port, +#endif + ip, th); + } + /* + * If we are doing retransmissions, then snd_nxt will not reflect + * the first unsent octet. For ACK only packets, we do not want the + * sequence number of the retransmitted packet, we want the sequence + * number of the next unsent octet. So, if there is no data (and no + * SYN or FIN), use snd_max instead of snd_nxt when filling in + * ti_seq. But if we are in persist state, snd_max might reflect + * one byte beyond the right edge of the window, so use snd_nxt in + * that case, since we know we aren't doing a retransmission. + * (retransmit and persist are mutually exclusive...) 
+ */ + if (sack_rxmit == 0) { + if (len && ((flags & (TH_FIN | TH_SYN | TH_RST)) == 0)) { + /* New data (including new persists) */ + th->th_seq = htonl(tp->snd_max); + bbr_seq = tp->snd_max; + } else if (flags & TH_SYN) { + /* Syn's always send from iss */ + th->th_seq = htonl(tp->iss); + bbr_seq = tp->iss; + } else if (flags & TH_FIN) { + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN) { + /* + * If we sent the fin already its 1 minus + * snd_max + */ + th->th_seq = (htonl(tp->snd_max - 1)); + bbr_seq = (tp->snd_max - 1); + } else { + /* First time FIN use snd_max */ + th->th_seq = htonl(tp->snd_max); + bbr_seq = tp->snd_max; + } + } else if (flags & TH_RST) { + /* + * For a Reset send the last cum ack in sequence + * (this like any other choice may still generate a + * challenge ack, if a ack-update packet is in + * flight). + */ + th->th_seq = htonl(tp->snd_una); + bbr_seq = tp->snd_una; + } else { + /* + * len == 0 and not persist we use snd_max, sending + * an ack unless we have sent the fin then its 1 + * minus. + */ + /* + * XXXRRS Question if we are in persists and we have + * nothing outstanding to send and we have not sent + * a FIN, we will send an ACK. In such a case it + * might be better to send (tp->snd_una - 1) which + * would force the peer to ack. + */ + if (tp->t_flags & TF_SENTFIN) { + th->th_seq = htonl(tp->snd_max - 1); + bbr_seq = (tp->snd_max - 1); + } else { + th->th_seq = htonl(tp->snd_max); + bbr_seq = tp->snd_max; + } + } + } else { + /* All retransmits use the rsm to guide the send */ + th->th_seq = htonl(rsm->r_start); + bbr_seq = rsm->r_start; + } + th->th_ack = htonl(tp->rcv_nxt); + if (optlen) { + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + } + th->th_flags = flags; + /* + * Calculate receive window. Don't shrink window, but avoid silly + * window syndrome. + */ + if ((flags & TH_RST) || ((recwin < (so->so_rcv.sb_hiwat / 4) && + recwin < maxseg))) + recwin = 0; + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && + recwin < (tp->rcv_adv - tp->rcv_nxt)) + recwin = (tp->rcv_adv - tp->rcv_nxt); + if (recwin > TCP_MAXWIN << tp->rcv_scale) + recwin = TCP_MAXWIN << tp->rcv_scale; + + /* + * According to RFC1323 the window field in a SYN (i.e., a or + * ) segment itself is never scaled. The case is + * handled in syncache. + */ + if (flags & TH_SYN) + th->th_win = htons((u_short) + (min(sbspace(&so->so_rcv), TCP_MAXWIN))); + else + th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); + /* + * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 + * window. This may cause the remote transmitter to stall. This + * flag tells soreceive() to disable delayed acknowledgements when + * draining the buffer. This can occur if the receiver is + * attempting to read more data than can be buffered prior to + * transmitting on the connection. + */ + if (th->th_win == 0) { + tp->t_sndzerowin++; + tp->t_flags |= TF_RXWIN0SENT; + } else + tp->t_flags &= ~TF_RXWIN0SENT; + if (SEQ_GT(tp->snd_up, tp->snd_max)) { + th->th_urp = htons((u_short)(tp->snd_up - tp->snd_max)); + th->th_flags |= TH_URG; + } else + /* + * If no urgent pointer to send, then we pull the urgent + * pointer to the left edge of the send window so that it + * doesn't drift into the send window on sequence number + * wraparound. + */ + tp->snd_up = tp->snd_una; /* drag it along */ + +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (to.to_flags & TOF_SIGNATURE) { + /* + * Calculate MD5 signature and put it into the place + * determined before. 
NOTE: since TCP options buffer doesn't + * point into mbuf's data, calculate offset and use it. + */ + if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, + (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { + /* + * Do not send segment if the calculation of MD5 + * digest has failed. + */ + goto out; + } + } +#endif + + /* + * Put TCP length in extended header, and then checksum extended + * header and data. + */ + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ +#ifdef INET6 + if (isipv6) { + /* + * ip6_plen is not need to be filled now, and will be filled + * in ip6_output. + */ +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); + th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); + } else { +#endif + csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + + optlen + len, IPPROTO_TCP, 0); +#ifdef NETFLIX_TCPOUDP + } +#endif + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + { +#ifdef NETFLIX_TCPOUDP + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); + th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); + } else { +#endif + csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + + IPPROTO_TCP + len + optlen)); +#ifdef NETFLIX_TCPOUDP + } +#endif + /* IP version must be set here for ipv4/ipv6 checking later */ + KASSERT(ip->ip_v == IPVERSION, + ("%s: IP version incorrect: %d", __func__, ip->ip_v)); + } +#endif + + /* + * Enable TSO and specify the size of the segments. The TCP pseudo + * header checksum is always provided. XXX: Fixme: This is currently + * not the case for IPv6. + */ + if (tso || force_tso) { + KASSERT(force_tso || len > maxseg, + ("%s: len:%d <= tso_segsz:%d", __func__, len, maxseg)); + m->m_pkthdr.csum_flags |= CSUM_TSO; + csum_flags |= CSUM_TSO; + m->m_pkthdr.tso_segsz = maxseg; + } + KASSERT(len + hdrlen == m_length(m, NULL), + ("%s: mbuf chain different than expected: %d + %u != %u", + __func__, len, hdrlen, m_length(m, NULL))); + +#ifdef TCP_HHOOK + /* Run HHOOK_TC_ESTABLISHED_OUT helper hooks. */ + hhook_run_tcp_est_out(tp, th, &to, len, tso); +#endif +#ifdef TCPDEBUG + /* + * Trace. 
+ */ + if (so->so_options & SO_DEBUG) { + u_short save = 0; + +#ifdef INET6 + if (!isipv6) +#endif + { + save = ipov->ih_len; + ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + + * (th->th_off << 2) */ ); + } + tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); +#ifdef INET6 + if (!isipv6) +#endif + ipov->ih_len = save; + } +#endif /* TCPDEBUG */ + + /* Log to the black box */ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); + /* Record info on type of transmission */ + log.u_bbr.flex1 = bbr->r_ctl.rc_hptsi_agg_delay; + log.u_bbr.flex2 = (bbr->r_recovery_bw << 3); + log.u_bbr.flex3 = maxseg; + log.u_bbr.flex4 = delay_calc; + /* Encode filled_all into the upper flex5 bit */ + log.u_bbr.flex5 = bbr->rc_past_init_win; + log.u_bbr.flex5 <<= 1; + log.u_bbr.flex5 |= bbr->rc_no_pacing; + log.u_bbr.flex5 <<= 29; + if (filled_all) + log.u_bbr.flex5 |= 0x80000000; + log.u_bbr.flex5 |= tp->t_maxseg; + log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs; + log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr); + /* lets poke in the low and the high here for debugging */ + log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg; + if (rsm || sack_rxmit) { + if (doing_tlp) + log.u_bbr.flex8 = 2; + else + log.u_bbr.flex8 = 1; + } else { + log.u_bbr.flex8 = 0; + } + lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, + len, &log, false, NULL, NULL, 0, tv); + } else { + lgb = NULL; + } + /* + * Fill in IP length and desired time to live and send to IP level. + * There should be a better way to handle ttl and tos; we could keep + * them in the template, but need a way to checksum without them. + */ + /* + * m->m_pkthdr.len should have been set before cksum calcuration, + * because in6_cksum() need it. + */ +#ifdef INET6 + if (isipv6) { + /* + * we separately set hoplimit for every segment, since the + * user might want to change the value via setsockopt. Also, + * desired default hop limit might be changed via Neighbor + * Discovery. + */ + ip6->ip6_hlim = in6_selecthlim(inp, NULL); + + /* + * Set the packet size here for the benefit of DTrace + * probes. ip6_output() will set it properly; it's supposed + * to include the option header lengths as well. + */ + ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); + + if (V_path_mtu_discovery && maxseg > V_tcp_minmss) + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + else + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + + if (tp->t_state == TCPS_SYN_SENT) + TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); + + TCP_PROBE5(send, NULL, tp, ip6, tp, th); + /* TODO: IPv6 IP6TOS_ECT bit on */ + error = ip6_output(m, inp->in6p_outputopts, + &inp->inp_route6, + ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), + NULL, NULL, inp); + + if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) + mtu = inp->inp_route6.ro_rt->rt_mtu; + } +#endif /* INET6 */ +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { + ip->ip_len = htons(m->m_pkthdr.len); +#ifdef INET6 + if (isipv6) + ip->ip_ttl = in6_selecthlim(inp, NULL); +#endif /* INET6 */ + /* + * If we do path MTU discovery, then we set DF on every + * packet. This might not be the best thing to do according + * to RFC3390 Section 2. However the tcp hostcache migitates + * the problem so it affects only the first tcp connection + * with a host. + * + * NB: Don't set DF on small MTU/MSS to have a safe + * fallback. 
+ */ + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + if (tp->t_port == 0 || len < V_tcp_minmss) { + ip->ip_off |= htons(IP_DF); + } + } else { + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + } + + if (tp->t_state == TCPS_SYN_SENT) + TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); + + TCP_PROBE5(send, NULL, tp, ip, tp, th); + + error = ip_output(m, inp->inp_options, &inp->inp_route, + ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, + inp); + if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) + mtu = inp->inp_route.ro_rt->rt_mtu; + } +#endif /* INET */ +out: + + if (lgb) { + lgb->tlb_errno = error; + lgb = NULL; + } + /* + * In transmit state, time the transmission and arrange for the + * retransmit. In persist state, just set snd_max. + */ + if (error == 0) { + if (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->t_flags & TF_SACK_PERMIT) && + tp->rcv_numsacks > 0) + tcp_clean_dsack_blocks(tp); + /* We sent an ack clear the bbr_segs_rcvd count */ + bbr->output_error_seen = 0; + bbr->oerror_cnt = 0; + bbr->bbr_segs_rcvd = 0; + if (len == 0) + counter_u64_add(bbr_out_size[TCP_MSS_ACCT_SNDACK], 1); + else if (hw_tls) { + if (filled_all || + (len >= bbr->r_ctl.rc_pace_max_segs)) + BBR_STAT_INC(bbr_meets_tso_thresh); + else { + if (doing_tlp) { + BBR_STAT_INC(bbr_miss_tlp); + bbr_log_type_hrdwtso(tp, bbr, len, 1, what_we_can); + + + } else if (rsm) { + BBR_STAT_INC(bbr_miss_retran); + bbr_log_type_hrdwtso(tp, bbr, len, 2, what_we_can); + } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > sbavail(sb)) { + BBR_STAT_INC(bbr_miss_tso_app); + bbr_log_type_hrdwtso(tp, bbr, len, 3, what_we_can); + } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_cwnd) { + BBR_STAT_INC(bbr_miss_tso_cwnd); + bbr_log_type_hrdwtso(tp, bbr, len, 4, what_we_can); + } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_wnd) { + BBR_STAT_INC(bbr_miss_tso_rwnd); + bbr_log_type_hrdwtso(tp, bbr, len, 5, what_we_can); + } else { + BBR_STAT_INC(bbr_miss_unknown); + bbr_log_type_hrdwtso(tp, bbr, len, 6, what_we_can); + } + } + } + /* Do accounting for new sends */ + if ((len > 0) && (rsm == NULL)) { + int idx; + if (tp->snd_una == tp->snd_max) { + /* + * Special case to match google, when + * nothing is in flight the delivered + * time does get updated to the current + * time (see tcp_rate_bsd.c). + */ + bbr->r_ctl.rc_del_time = cts; + } + if (len >= maxseg) { + idx = (len / maxseg) + 3; + if (idx >= TCP_MSS_ACCT_ATIMER) + counter_u64_add(bbr_out_size[(TCP_MSS_ACCT_ATIMER - 1)], 1); + else + counter_u64_add(bbr_out_size[idx], 1); + } else { + /* smaller than a MSS */ + idx = len / (bbr_hptsi_bytes_min - bbr->rc_last_options); + if (idx >= TCP_MSS_SMALL_MAX_SIZE_DIV) + idx = (TCP_MSS_SMALL_MAX_SIZE_DIV - 1); + counter_u64_add(bbr_out_size[(idx + TCP_MSS_SMALL_SIZE_OFF)], 1); + } + } + } + abandon = 0; + /* + * We must do the send accounting before we log the output, + * otherwise the state of the rsm could change and we account to the + * wrong bucket. + */ + if (len > 0) { + bbr_do_send_accounting(tp, bbr, rsm, len, error); + if (error == 0) { + if (tp->snd_una == tp->snd_max) + bbr->r_ctl.rc_tlp_rxt_last_time = cts; + } + } + bbr_log_output(bbr, tp, &to, len, bbr_seq, (uint8_t) flags, error, + cts, mb, &abandon, rsm, 0, sb); + if (abandon) { + /* + * If bbr_log_output destroys the TCB or sees a TH_RST being + * sent we should hit this condition. 
+ */ + return (0); + } + if (((tp->t_flags & TF_FORCEDATA) == 0) || + (bbr->rc_in_persist == 0)) { + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (error) + /* We don't log or do anything with errors */ + goto skip_upd; + + if (tp->snd_una == tp->snd_max && + (len || (flags & (TH_SYN | TH_FIN)))) { + /* + * Update the time we just added data since none was + * outstanding. + */ + bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__); + bbr->rc_tp->t_acktime = ticks; + } + if (flags & (TH_SYN | TH_FIN) && (rsm == NULL)) { + if (flags & TH_SYN) { + tp->snd_max++; + } + if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) { + tp->snd_max++; + tp->t_flags |= TF_SENTFIN; + } + } + if (sack_rxmit == 0) + tp->snd_max += len; +skip_upd: + if ((error == 0) && len) + tot_len += len; + } else { + /* Persists case */ + int32_t xlen = len; + + if (error) + goto nomore; + + if (flags & TH_SYN) + ++xlen; + if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) { + ++xlen; + tp->t_flags |= TF_SENTFIN; + } + if (xlen && (tp->snd_una == tp->snd_max)) { + /* + * Update the time we just added data since none was + * outstanding. + */ + bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__); + bbr->rc_tp->t_acktime = ticks; + } + if (sack_rxmit == 0) + tp->snd_max += xlen; + tot_len += (len + optlen + ipoptlen); + } +nomore: + if (error) { + /* + * Failures do not advance the seq counter above. For the + * case of ENOBUFS we will fall out and become ack-clocked. + * capping the cwnd at the current flight. + * Everything else will just have to retransmit with the timer + * (no pacer). + */ + SOCKBUF_UNLOCK_ASSERT(sb); + BBR_STAT_INC(bbr_saw_oerr); + /* Clear all delay/early tracks */ + bbr->r_ctl.rc_hptsi_agg_delay = 0; + bbr->r_ctl.rc_agg_early = 0; + bbr->r_agg_early_set = 0; + bbr->output_error_seen = 1; + if (bbr->oerror_cnt < 0xf) + bbr->oerror_cnt++; + if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) { + /* drop the session */ + tcp_set_inp_to_drop(inp, ENETDOWN); + } + switch (error) { + case ENOBUFS: + /* + * Make this guy have to get ack's to send + * more but lets make sure we don't + * slam him below a T-O (1MSS). + */ + if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { + tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes)) - maxseg; + if (tp->snd_cwnd < maxseg) + tp->snd_cwnd = maxseg; + } + slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; + BBR_STAT_INC(bbr_saw_enobuf); + if (bbr->bbr_hdrw_pacing) + counter_u64_add(bbr_hdwr_pacing_enobuf, 1); + else + counter_u64_add(bbr_nohdwr_pacing_enobuf, 1); + /* + * Here even in the enobuf's case we want to do our + * state update. The reason being we may have been + * called by the input function. If so we have had + * things change. + */ + error = 0; + goto enobufs; + case EMSGSIZE: + /* + * For some reason the interface we used initially + * to send segments changed to another or lowered + * its MTU. If TSO was active we either got an + * interface without TSO capabilits or TSO was + * turned off. If we obtained mtu from ip_output() + * then update it and try again. + */ + /* Turn on tracing (or try to) */ + { + int old_maxseg; + + old_maxseg = tp->t_maxseg; + BBR_STAT_INC(bbr_saw_emsgsiz); + bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, csum_flags, tso, cts); + if (mtu != 0) + tcp_mss_update(tp, -1, mtu, NULL, NULL); + if (old_maxseg <= tp->t_maxseg) { + /* Huh it did not shrink? 
*/ + tp->t_maxseg = old_maxseg - 40; + bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts); + } + tp->t_flags &= ~TF_FORCEDATA; + /* + * Nuke all other things that can interfere + * with slot + */ + if ((tot_len + len) && (len >= tp->t_maxseg)) { + slot = bbr_get_pacing_delay(bbr, + bbr->r_ctl.rc_bbr_hptsi_gain, + (tot_len + len), cts, 0); + if (slot < bbr_error_base_paceout) + slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; + } else + slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; + bbr->rc_output_starts_timer = 1; + bbr_start_hpts_timer(bbr, tp, cts, 10, slot, + tot_len); + return (error); + } + case EPERM: + tp->t_softerror = error; + /* Fall through */ + case EHOSTDOWN: + case EHOSTUNREACH: + case ENETDOWN: + case ENETUNREACH: + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + } + /* FALLTHROUGH */ + default: + tp->t_flags &= ~TF_FORCEDATA; + slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; + bbr->rc_output_starts_timer = 1; + bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); + return (error); + } +#ifdef NETFLIX_STATS + } else if (((tp->t_flags & TF_GPUTINPROG) == 0) && + len && + (rsm == NULL) && + (bbr->rc_in_persist == 0)) { + tp->gput_seq = bbr_seq; + tp->gput_ack = bbr_seq + + min(sbavail(&so->so_snd) - sb_offset, sendwin); + tp->gput_ts = cts; + tp->t_flags |= TF_GPUTINPROG; +#endif + } + TCPSTAT_INC(tcps_sndtotal); + if ((bbr->bbr_hdw_pace_ena) && + (bbr->bbr_attempt_hdwr_pace == 0) && + (bbr->rc_past_init_win) && + (bbr->rc_bbr_state != BBR_STATE_STARTUP) && + (get_filter_value(&bbr->r_ctl.rc_delrate)) && + (inp->inp_route.ro_rt && + inp->inp_route.ro_rt->rt_ifp)) { + /* + * We are past the initial window and + * have at least one measurement so we + * could use hardware pacing if its available. + * We have an interface and we have not attempted + * to setup hardware pacing, lets try to now. + */ + uint64_t rate_wanted; + int err = 0; + + rate_wanted = bbr_get_hardware_rate(bbr); + bbr->bbr_attempt_hdwr_pace = 1; + bbr->r_ctl.crte = tcp_set_pacing_rate(bbr->rc_tp, + inp->inp_route.ro_rt->rt_ifp, + rate_wanted, + (RS_PACING_GEQ|RS_PACING_SUB_OK), + &err); + if (bbr->r_ctl.crte) { + bbr_type_log_hdwr_pacing(bbr, + bbr->r_ctl.crte->ptbl->rs_ifp, + rate_wanted, + bbr->r_ctl.crte->rate, + __LINE__, cts, err); + BBR_STAT_INC(bbr_hdwr_rl_add_ok); + counter_u64_add(bbr_flows_nohdwr_pacing, -1); + counter_u64_add(bbr_flows_whdwr_pacing, 1); + bbr->bbr_hdrw_pacing = 1; + /* Now what is our gain status? */ + if (bbr->r_ctl.crte->rate < rate_wanted) { + /* We have a problem */ + bbr_setup_less_of_rate(bbr, cts, + bbr->r_ctl.crte->rate, rate_wanted); + } else { + /* We are good */ + bbr->gain_is_limited = 0; + bbr->skip_gain = 0; + } + tcp_bbr_tso_size_check(bbr, cts); + } else { + bbr_type_log_hdwr_pacing(bbr, + inp->inp_route.ro_rt->rt_ifp, + rate_wanted, + 0, + __LINE__, cts, err); + BBR_STAT_INC(bbr_hdwr_rl_add_fail); + } + } + if (bbr->bbr_hdrw_pacing) { + /* + * Worry about cases where the route + * changes or something happened that we + * lost our hardware pacing possibly during + * the last ip_output call. + */ + if (inp->inp_snd_tag == NULL) { + /* A change during ip output disabled hw pacing? */ + bbr->bbr_hdrw_pacing = 0; + } else if ((inp->inp_route.ro_rt == NULL) || + (inp->inp_route.ro_rt->rt_ifp != inp->inp_snd_tag->ifp)) { + /* + * We had an interface or route change, + * detach from the current hdwr pacing + * and setup to re-attempt next go + * round. 
+ */ + bbr->bbr_hdrw_pacing = 0; + bbr->bbr_attempt_hdwr_pace = 0; + tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); + tcp_bbr_tso_size_check(bbr, cts); + } + } + /* + * Data sent (as far as we can tell). If this advertises a larger + * window than any other segment, then remember the size of the + * advertised window. Any pending ACK has now been sent. + */ + if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + recwin; + + tp->last_ack_sent = tp->rcv_nxt; + if ((error == 0) && + (bbr->r_ctl.rc_pace_max_segs > tp->t_maxseg) && + (doing_tlp == 0) && + (tso == 0) && + (hw_tls == 0) && + (len > 0) && + ((flags & TH_RST) == 0) && + (IN_RECOVERY(tp->t_flags) == 0) && + (bbr->rc_in_persist == 0) && + ((tp->t_flags & TF_FORCEDATA) == 0) && + (tot_len < bbr->r_ctl.rc_pace_max_segs)) { + /* + * For non-tso we need to goto again until we have sent out + * enough data to match what we are hptsi out every hptsi + * interval. + */ + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + /* Make sure snd_nxt is drug up */ + tp->snd_nxt = tp->snd_max; + } + if (rsm != NULL) { + rsm = NULL; + goto skip_again; + } + rsm = NULL; + sack_rxmit = 0; + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA); + goto again; + } +skip_again: + if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) { + /* + * Calculate/Re-Calculate the hptsi slot in usecs based on + * what we have sent so far + */ + slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); + if (bbr->rc_no_pacing) + slot = 0; + } + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA); +enobufs: + if (bbr->rc_use_google == 0) + bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); + bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + + bbr->r_ctl.rc_lost_bytes))); + bbr->rc_output_starts_timer = 1; + if (bbr->bbr_use_rack_cheat && + (more_to_rxt || + ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) { + /* Rack cheats and shotguns out all rxt's 1ms apart */ + if (slot > 1000) + slot = 1000; + } + if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) { + /* + * We don't change the tso size until some number of sends + * to give the hardware commands time to get down + * to the interface. + */ + bbr->r_ctl.bbr_hdwr_cnt_noset_snt++; + if (bbr->r_ctl.bbr_hdwr_cnt_noset_snt >= bbr_hdwr_pacing_delay_cnt) { + bbr->hw_pacing_set = 1; + tcp_bbr_tso_size_check(bbr, cts); + } + } + bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len); + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + /* Make sure snd_nxt is drug up */ + tp->snd_nxt = tp->snd_max; + } + return (error); + +} + +/* + * See bbr_output_wtime() for return values. + */ +static int +bbr_output(struct tcpcb *tp) +{ + int32_t ret; + struct timeval tv; + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + INP_WLOCK_ASSERT(tp->t_inpcb); + (void)tcp_get_usecs(&tv); + ret = bbr_output_wtime(tp, &tv); + return (ret); +} + +static void +bbr_mtu_chg(struct tcpcb *tp) +{ + struct tcp_bbr *bbr; + struct bbr_sendmap *rsm, *frsm = NULL; + uint32_t maxseg; + + /* + * The MTU has changed. a) Clear the sack filter. b) Mark everything + * over the current size as SACK_PASS so a retransmit will occur. + */ + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + maxseg = tp->t_maxseg - bbr->rc_last_options; + sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); + TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { + /* Don't mess with ones acked (by sack?) 
*/ + if (rsm->r_flags & BBR_ACKED) + continue; + if ((rsm->r_end - rsm->r_start) > maxseg) { + /* + * We mark sack-passed on all the previous large + * sends we did. This will force them to retransmit. + */ + rsm->r_flags |= BBR_SACK_PASSED; + if (((rsm->r_flags & BBR_MARKED_LOST) == 0) && + bbr_is_lost(bbr, rsm, bbr->r_ctl.rc_rcvtime)) { + bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; + bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; + rsm->r_flags |= BBR_MARKED_LOST; + } + if (frsm == NULL) + frsm = rsm; + } + } + if (frsm) { + bbr->r_ctl.rc_resend = frsm; + } +} + +/* + * bbr_ctloutput() must drop the inpcb lock before performing copyin on + * socket option arguments. When it re-acquires the lock after the copy, it + * has to revalidate that the connection is still valid for the socket + * option. + */ +static int +bbr_set_sockopt(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr) +{ + int32_t error = 0, optval; + + switch (sopt->sopt_name) { + case TCP_RACK_PACE_MAX_SEG: + case TCP_RACK_MIN_TO: + case TCP_RACK_REORD_THRESH: + case TCP_RACK_REORD_FADE: + case TCP_RACK_TLP_THRESH: + case TCP_RACK_PKT_DELAY: + case TCP_BBR_ALGORITHM: + case TCP_BBR_TSLIMITS: + case TCP_BBR_IWINTSO: + case TCP_BBR_RECFORCE: + case TCP_BBR_STARTUP_PG: + case TCP_BBR_DRAIN_PG: + case TCP_BBR_RWND_IS_APP: + case TCP_BBR_PROBE_RTT_INT: + case TCP_BBR_PROBE_RTT_GAIN: + case TCP_BBR_PROBE_RTT_LEN: + case TCP_BBR_STARTUP_LOSS_EXIT: + case TCP_BBR_USEDEL_RATE: + case TCP_BBR_MIN_RTO: + case TCP_BBR_MAX_RTO: + case TCP_BBR_PACE_PER_SEC: + case TCP_DELACK: + case TCP_BBR_PACE_DEL_TAR: + case TCP_BBR_SEND_IWND_IN_TSO: + case TCP_BBR_EXTRA_STATE: + case TCP_BBR_UTTER_MAX_TSO: + case TCP_BBR_MIN_TOPACEOUT: + case TCP_BBR_FLOOR_MIN_TSO: + case TCP_BBR_TSTMP_RAISES: + case TCP_BBR_POLICER_DETECT: + case TCP_BBR_USE_RACK_CHEAT: + case TCP_DATA_AFTER_CLOSE: + case TCP_BBR_HDWR_PACE: + case TCP_BBR_PACE_SEG_MAX: + case TCP_BBR_PACE_SEG_MIN: + case TCP_BBR_PACE_CROSS: + case TCP_BBR_PACE_OH: +#ifdef NETFLIX_PEAKRATE + case TCP_MAXPEAKRATE: +#endif + case TCP_BBR_TMR_PACE_OH: + case TCP_BBR_RACK_RTT_USE: + case TCP_BBR_RETRAN_WTSO: + break; + default: + return (tcp_default_ctloutput(so, sopt, inp, tp)); + break; + } + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); + if (error) + return (error); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); + return (ECONNRESET); + } + tp = intotcpcb(inp); + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + switch (sopt->sopt_name) { + case TCP_BBR_PACE_PER_SEC: + BBR_OPTS_INC(tcp_bbr_pace_per_sec); + bbr->r_ctl.bbr_hptsi_per_second = optval; + break; + case TCP_BBR_PACE_DEL_TAR: + BBR_OPTS_INC(tcp_bbr_pace_del_tar); + bbr->r_ctl.bbr_hptsi_segments_delay_tar = optval; + break; + case TCP_BBR_PACE_SEG_MAX: + BBR_OPTS_INC(tcp_bbr_pace_seg_max); + bbr->r_ctl.bbr_hptsi_segments_max = optval; + break; + case TCP_BBR_PACE_SEG_MIN: + BBR_OPTS_INC(tcp_bbr_pace_seg_min); + bbr->r_ctl.bbr_hptsi_bytes_min = optval; + break; + case TCP_BBR_PACE_CROSS: + BBR_OPTS_INC(tcp_bbr_pace_cross); + bbr->r_ctl.bbr_cross_over = optval; + break; + case TCP_BBR_ALGORITHM: + BBR_OPTS_INC(tcp_bbr_algorithm); + if (optval && (bbr->rc_use_google == 0)) { + /* Turn on the google mode */ + bbr_google_mode_on(bbr); + if ((optval > 3) && (optval < 500)) { + /* + * Must be at least greater than .3% + * and must be less than 50.0%. 
+ */ + bbr->r_ctl.bbr_google_discount = optval; + } + } else if ((optval == 0) && (bbr->rc_use_google == 1)) { + /* Turn off the google mode */ + bbr_google_mode_off(bbr); + } + break; + case TCP_BBR_TSLIMITS: + BBR_OPTS_INC(tcp_bbr_tslimits); + if (optval == 1) + bbr->rc_use_ts_limit = 1; + else if (optval == 0) + bbr->rc_use_ts_limit = 0; + else + error = EINVAL; + break; + + case TCP_BBR_IWINTSO: + BBR_OPTS_INC(tcp_bbr_iwintso); + if ((optval >= 0) && (optval < 128)) { + uint32_t twin; + + bbr->rc_init_win = optval; + twin = bbr_initial_cwnd(bbr, tp); + if ((bbr->rc_past_init_win == 0) && (twin > tp->snd_cwnd)) + tp->snd_cwnd = twin; + else + error = EBUSY; + } else + error = EINVAL; + break; + case TCP_BBR_STARTUP_PG: + BBR_OPTS_INC(tcp_bbr_startup_pg); + if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) { + bbr->r_ctl.rc_startup_pg = optval; + if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { + bbr->r_ctl.rc_bbr_hptsi_gain = optval; + } + } else + error = EINVAL; + break; + case TCP_BBR_DRAIN_PG: + BBR_OPTS_INC(tcp_bbr_drain_pg); + if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) + bbr->r_ctl.rc_drain_pg = optval; + else + error = EINVAL; + break; + case TCP_BBR_PROBE_RTT_LEN: + BBR_OPTS_INC(tcp_bbr_probertt_len); + if (optval <= 1) + reset_time_small(&bbr->r_ctl.rc_rttprop, (optval * USECS_IN_SECOND)); + else + error = EINVAL; + break; + case TCP_BBR_PROBE_RTT_GAIN: + BBR_OPTS_INC(tcp_bbr_probertt_gain); + if (optval <= BBR_UNIT) + bbr->r_ctl.bbr_rttprobe_gain_val = optval; + else + error = EINVAL; + break; + case TCP_BBR_PROBE_RTT_INT: + BBR_OPTS_INC(tcp_bbr_probe_rtt_int); + if (optval > 1000) + bbr->r_ctl.rc_probertt_int = optval; + else + error = EINVAL; + break; + case TCP_BBR_MIN_TOPACEOUT: + BBR_OPTS_INC(tcp_bbr_topaceout); + if (optval == 0) { + bbr->no_pacing_until = 0; + bbr->rc_no_pacing = 0; + } else if (optval <= 0x00ff) { + bbr->no_pacing_until = optval; + if ((bbr->r_ctl.rc_pkt_epoch < bbr->no_pacing_until) && + (bbr->rc_bbr_state == BBR_STATE_STARTUP)){ + /* Turn on no pacing */ + bbr->rc_no_pacing = 1; + } + } else + error = EINVAL; + break; + case TCP_BBR_STARTUP_LOSS_EXIT: + BBR_OPTS_INC(tcp_bbr_startup_loss_exit); + bbr->rc_loss_exit = optval; + break; + case TCP_BBR_USEDEL_RATE: + error = EINVAL; + break; + case TCP_BBR_MIN_RTO: + BBR_OPTS_INC(tcp_bbr_min_rto); + bbr->r_ctl.rc_min_rto_ms = optval; + break; + case TCP_BBR_MAX_RTO: + BBR_OPTS_INC(tcp_bbr_max_rto); + bbr->rc_max_rto_sec = optval; + break; + case TCP_RACK_MIN_TO: + /* Minimum time between rack t-o's in ms */ + BBR_OPTS_INC(tcp_rack_min_to); + bbr->r_ctl.rc_min_to = optval; + break; + case TCP_RACK_REORD_THRESH: + /* RACK reorder threshold (shift amount) */ + BBR_OPTS_INC(tcp_rack_reord_thresh); + if ((optval > 0) && (optval < 31)) + bbr->r_ctl.rc_reorder_shift = optval; + else + error = EINVAL; + break; + case TCP_RACK_REORD_FADE: + /* Does reordering fade after ms time */ + BBR_OPTS_INC(tcp_rack_reord_fade); + bbr->r_ctl.rc_reorder_fade = optval; + break; + case TCP_RACK_TLP_THRESH: + /* RACK TLP theshold i.e. 
srtt+(srtt/N) */ + BBR_OPTS_INC(tcp_rack_tlp_thresh); + if (optval) + bbr->rc_tlp_threshold = optval; + else + error = EINVAL; + break; + case TCP_BBR_USE_RACK_CHEAT: + BBR_OPTS_INC(tcp_use_rackcheat); + if (bbr->rc_use_google) { + error = EINVAL; + break; + } + BBR_OPTS_INC(tcp_rack_cheat); + if (optval) + bbr->bbr_use_rack_cheat = 1; + else + bbr->bbr_use_rack_cheat = 0; + break; + case TCP_BBR_FLOOR_MIN_TSO: + BBR_OPTS_INC(tcp_utter_max_tso); + if ((optval >= 0) && (optval < 40)) + bbr->r_ctl.bbr_hptsi_segments_floor = optval; + else + error = EINVAL; + break; + case TCP_BBR_UTTER_MAX_TSO: + BBR_OPTS_INC(tcp_utter_max_tso); + if ((optval >= 0) && (optval < 0xffff)) + bbr->r_ctl.bbr_utter_max = optval; + else + error = EINVAL; + break; + + case TCP_BBR_EXTRA_STATE: + BBR_OPTS_INC(tcp_extra_state); + if (optval) + bbr->rc_use_idle_restart = 1; + else + bbr->rc_use_idle_restart = 0; + break; + case TCP_BBR_SEND_IWND_IN_TSO: + BBR_OPTS_INC(tcp_iwnd_tso); + if (optval) { + bbr->bbr_init_win_cheat = 1; + if (bbr->rc_past_init_win == 0) { + uint32_t cts; + cts = tcp_get_usecs(&bbr->rc_tv); + tcp_bbr_tso_size_check(bbr, cts); + } + } else + bbr->bbr_init_win_cheat = 0; + break; + case TCP_BBR_HDWR_PACE: + BBR_OPTS_INC(tcp_hdwr_pacing); + if (optval){ + bbr->bbr_hdw_pace_ena = 1; + bbr->bbr_attempt_hdwr_pace = 0; + } else { + bbr->bbr_hdw_pace_ena = 0; + if (bbr->bbr_hdrw_pacing) { + bbr->bbr_hdrw_pacing = 0; + in_pcbdetach_txrtlmt(bbr->rc_inp); + } + } + break; + + case TCP_DELACK: + BBR_OPTS_INC(tcp_delack); + if (optval < 100) { + if (optval == 0) /* off */ + tp->t_delayed_ack = 0; + else if (optval == 1) /* on which is 2 */ + tp->t_delayed_ack = 2; + else /* higher than 2 and less than 100 */ + tp->t_delayed_ack = optval; + if (tp->t_flags & TF_DELACK) { + tp->t_flags &= ~TF_DELACK; + tp->t_flags |= TF_ACKNOW; + bbr_output(tp); + } + } else + error = EINVAL; + break; + case TCP_RACK_PKT_DELAY: + /* RACK added ms i.e. 
rack-rtt + reord + N */ + BBR_OPTS_INC(tcp_rack_pkt_delay); + bbr->r_ctl.rc_pkt_delay = optval; + break; +#ifdef NETFLIX_PEAKRATE + case TCP_MAXPEAKRATE: + BBR_OPTS_INC(tcp_maxpeak); + error = tcp_set_maxpeakrate(tp, optval); + if (!error) + tp->t_peakrate_thr = tp->t_maxpeakrate; + break; +#endif + case TCP_BBR_RETRAN_WTSO: + BBR_OPTS_INC(tcp_retran_wtso); + if (optval) + bbr->rc_resends_use_tso = 1; + else + bbr->rc_resends_use_tso = 0; + break; + case TCP_DATA_AFTER_CLOSE: + BBR_OPTS_INC(tcp_data_ac); + if (optval) + bbr->rc_allow_data_af_clo = 1; + else + bbr->rc_allow_data_af_clo = 0; + break; + case TCP_BBR_POLICER_DETECT: + BBR_OPTS_INC(tcp_policer_det); + if (bbr->rc_use_google == 0) + error = EINVAL; + else if (optval) + bbr->r_use_policer = 1; + else + bbr->r_use_policer = 0; + break; + + case TCP_BBR_TSTMP_RAISES: + BBR_OPTS_INC(tcp_ts_raises); + if (optval) + bbr->ts_can_raise = 1; + else + bbr->ts_can_raise = 0; + break; + case TCP_BBR_TMR_PACE_OH: + BBR_OPTS_INC(tcp_pacing_oh_tmr); + if (bbr->rc_use_google) { + error = EINVAL; + } else { + if (optval) + bbr->r_ctl.rc_incr_tmrs = 1; + else + bbr->r_ctl.rc_incr_tmrs = 0; + } + break; + case TCP_BBR_PACE_OH: + BBR_OPTS_INC(tcp_pacing_oh); + if (bbr->rc_use_google) { + error = EINVAL; + } else { + if (optval > (BBR_INCL_TCP_OH| + BBR_INCL_IP_OH| + BBR_INCL_ENET_OH)) { + error = EINVAL; + break; + } + if (optval & BBR_INCL_TCP_OH) + bbr->r_ctl.rc_inc_tcp_oh = 1; + else + bbr->r_ctl.rc_inc_tcp_oh = 0; + if (optval & BBR_INCL_IP_OH) + bbr->r_ctl.rc_inc_ip_oh = 1; + else + bbr->r_ctl.rc_inc_ip_oh = 0; + if (optval & BBR_INCL_ENET_OH) + bbr->r_ctl.rc_inc_enet_oh = 1; + else + bbr->r_ctl.rc_inc_enet_oh = 0; + } + break; + default: + return (tcp_default_ctloutput(so, sopt, inp, tp)); + break; + } +#ifdef NETFLIX_STATS + tcp_log_socket_option(tp, sopt->sopt_name, optval, error); +#endif + INP_WUNLOCK(inp); + return (error); +} + +/* + * return 0 on success, error-num on failure + */ +static int +bbr_get_sockopt(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr) +{ + int32_t error, optval; + + /* + * Because all our options are either boolean or an int, we can just + * pull everything into optval and then unlock and copy. If we ever + * add a option that is not a int, then this will have quite an + * impact to this routine. 
+ */ + switch (sopt->sopt_name) { + case TCP_BBR_PACE_PER_SEC: + optval = bbr->r_ctl.bbr_hptsi_per_second; + break; + case TCP_BBR_PACE_DEL_TAR: + optval = bbr->r_ctl.bbr_hptsi_segments_delay_tar; + break; + case TCP_BBR_PACE_SEG_MAX: + optval = bbr->r_ctl.bbr_hptsi_segments_max; + break; + case TCP_BBR_MIN_TOPACEOUT: + optval = bbr->no_pacing_until; + break; + case TCP_BBR_PACE_SEG_MIN: + optval = bbr->r_ctl.bbr_hptsi_bytes_min; + break; + case TCP_BBR_PACE_CROSS: + optval = bbr->r_ctl.bbr_cross_over; + break; + case TCP_BBR_ALGORITHM: + optval = bbr->rc_use_google; + break; + case TCP_BBR_TSLIMITS: + optval = bbr->rc_use_ts_limit; + break; + case TCP_BBR_IWINTSO: + optval = bbr->rc_init_win; + break; + case TCP_BBR_STARTUP_PG: + optval = bbr->r_ctl.rc_startup_pg; + break; + case TCP_BBR_DRAIN_PG: + optval = bbr->r_ctl.rc_drain_pg; + break; + case TCP_BBR_PROBE_RTT_INT: + optval = bbr->r_ctl.rc_probertt_int; + break; + case TCP_BBR_PROBE_RTT_LEN: + optval = (bbr->r_ctl.rc_rttprop.cur_time_limit / USECS_IN_SECOND); + break; + case TCP_BBR_PROBE_RTT_GAIN: + optval = bbr->r_ctl.bbr_rttprobe_gain_val; + break; + case TCP_BBR_STARTUP_LOSS_EXIT: + optval = bbr->rc_loss_exit; + break; + case TCP_BBR_USEDEL_RATE: + error = EINVAL; + break; + case TCP_BBR_MIN_RTO: + optval = bbr->r_ctl.rc_min_rto_ms; + break; + case TCP_BBR_MAX_RTO: + optval = bbr->rc_max_rto_sec; + break; + case TCP_RACK_PACE_MAX_SEG: + /* Max segments in a pace */ + optval = bbr->r_ctl.rc_pace_max_segs; + break; + case TCP_RACK_MIN_TO: + /* Minimum time between rack t-o's in ms */ + optval = bbr->r_ctl.rc_min_to; + break; + case TCP_RACK_REORD_THRESH: + /* RACK reorder threshold (shift amount) */ + optval = bbr->r_ctl.rc_reorder_shift; + break; + case TCP_RACK_REORD_FADE: + /* Does reordering fade after ms time */ + optval = bbr->r_ctl.rc_reorder_fade; + break; + case TCP_BBR_USE_RACK_CHEAT: + /* Do we use the rack cheat for rxt */ + optval = bbr->bbr_use_rack_cheat; + break; + case TCP_BBR_FLOOR_MIN_TSO: + optval = bbr->r_ctl.bbr_hptsi_segments_floor; + break; + case TCP_BBR_UTTER_MAX_TSO: + optval = bbr->r_ctl.bbr_utter_max; + break; + case TCP_BBR_SEND_IWND_IN_TSO: + /* Do we send TSO size segments initially */ + optval = bbr->bbr_init_win_cheat; + break; + case TCP_BBR_EXTRA_STATE: + optval = bbr->rc_use_idle_restart; + break; + case TCP_RACK_TLP_THRESH: + /* RACK TLP theshold i.e. srtt+(srtt/N) */ + optval = bbr->rc_tlp_threshold; + break; + case TCP_RACK_PKT_DELAY: + /* RACK added ms i.e. 
rack-rtt + reord + N */ + optval = bbr->r_ctl.rc_pkt_delay; + break; + case TCP_BBR_RETRAN_WTSO: + optval = bbr->rc_resends_use_tso; + break; + case TCP_DATA_AFTER_CLOSE: + optval = bbr->rc_allow_data_af_clo; + break; + case TCP_DELACK: + optval = tp->t_delayed_ack; + break; + case TCP_BBR_HDWR_PACE: + optval = bbr->bbr_hdw_pace_ena; + break; + case TCP_BBR_POLICER_DETECT: + optval = bbr->r_use_policer; + break; + case TCP_BBR_TSTMP_RAISES: + optval = bbr->ts_can_raise; + break; + case TCP_BBR_TMR_PACE_OH: + optval = bbr->r_ctl.rc_incr_tmrs; + break; + case TCP_BBR_PACE_OH: + optval = 0; + if (bbr->r_ctl.rc_inc_tcp_oh) + optval |= BBR_INCL_TCP_OH; + if (bbr->r_ctl.rc_inc_ip_oh) + optval |= BBR_INCL_IP_OH; + if (bbr->r_ctl.rc_inc_enet_oh) + optval |= BBR_INCL_ENET_OH; + break; + default: + return (tcp_default_ctloutput(so, sopt, inp, tp)); + break; + } + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + return (error); +} + +/* + * return 0 on success, error-num on failure + */ +static int +bbr_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) +{ + int32_t error = EINVAL; + struct tcp_bbr *bbr; + + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + if (bbr == NULL) { + /* Huh? */ + goto out; + } + if (sopt->sopt_dir == SOPT_SET) { + return (bbr_set_sockopt(so, sopt, inp, tp, bbr)); + } else if (sopt->sopt_dir == SOPT_GET) { + return (bbr_get_sockopt(so, sopt, inp, tp, bbr)); + } +out: + INP_WUNLOCK(inp); + return (error); +} + + +struct tcp_function_block __tcp_bbr = { + .tfb_tcp_block_name = __XSTRING(STACKNAME), + .tfb_tcp_output = bbr_output, + .tfb_do_queued_segments = ctf_do_queued_segments, + .tfb_do_segment_nounlock = bbr_do_segment_nounlock, + .tfb_tcp_do_segment = bbr_do_segment, + .tfb_tcp_ctloutput = bbr_ctloutput, + .tfb_tcp_fb_init = bbr_init, + .tfb_tcp_fb_fini = bbr_fini, + .tfb_tcp_timer_stop_all = bbr_stopall, + .tfb_tcp_timer_activate = bbr_timer_activate, + .tfb_tcp_timer_active = bbr_timer_active, + .tfb_tcp_timer_stop = bbr_timer_stop, + .tfb_tcp_rexmit_tmr = bbr_remxt_tmr, + .tfb_tcp_handoff_ok = bbr_handoff_ok, + .tfb_tcp_mtu_chg = bbr_mtu_chg +}; + +static const char *bbr_stack_names[] = { + __XSTRING(STACKNAME), +#ifdef STACKALIAS + __XSTRING(STACKALIAS), +#endif +}; + +static bool bbr_mod_inited = false; + +static int +tcp_addbbr(module_t mod, int32_t type, void *data) +{ + int32_t err = 0; + int num_stacks; + + switch (type) { + case MOD_LOAD: + printf("Attempting to load " __XSTRING(MODNAME) "\n"); + bbr_zone = uma_zcreate(__XSTRING(MODNAME) "_map", + sizeof(struct bbr_sendmap), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + bbr_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", + sizeof(struct tcp_bbr), + NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); + sysctl_ctx_init(&bbr_sysctl_ctx); + bbr_sysctl_root = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_net_inet_tcp), + OID_AUTO, +#ifdef STACKALIAS + __XSTRING(STACKALIAS), +#else + __XSTRING(STACKNAME), +#endif + CTLFLAG_RW, 0, + ""); + if (bbr_sysctl_root == NULL) { + printf("Failed to add sysctl node\n"); + err = EFAULT; + goto free_uma; + } + bbr_init_sysctls(); + num_stacks = nitems(bbr_stack_names); + err = register_tcp_functions_as_names(&__tcp_bbr, M_WAITOK, + bbr_stack_names, &num_stacks); + if (err) { + printf("Failed to register %s stack name for " + "%s module\n", bbr_stack_names[num_stacks], + __XSTRING(MODNAME)); + sysctl_ctx_free(&bbr_sysctl_ctx); + free_uma: + uma_zdestroy(bbr_zone); + uma_zdestroy(bbr_pcb_zone); + bbr_counter_destroy(); + 
printf("Failed to register " __XSTRING(MODNAME) + " module err:%d\n", err); + return (err); + } + tcp_lro_reg_mbufq(); + bbr_mod_inited = true; + printf(__XSTRING(MODNAME) " is now available\n"); + break; + case MOD_QUIESCE: + err = deregister_tcp_functions(&__tcp_bbr, true, false); + break; + case MOD_UNLOAD: + err = deregister_tcp_functions(&__tcp_bbr, false, true); + if (err == EBUSY) + break; + if (bbr_mod_inited) { + uma_zdestroy(bbr_zone); + uma_zdestroy(bbr_pcb_zone); + sysctl_ctx_free(&bbr_sysctl_ctx); + bbr_counter_destroy(); + printf(__XSTRING(MODNAME) + " is now no longer available\n"); + bbr_mod_inited = false; + } + tcp_lro_dereg_mbufq(); + err = 0; + break; + default: + return (EOPNOTSUPP); + } + return (err); +} + +static moduledata_t tcp_bbr = { + .name = __XSTRING(MODNAME), + .evhand = tcp_addbbr, + .priv = 0 +}; + +MODULE_VERSION(MODNAME, 1); +DECLARE_MODULE(MODNAME, tcp_bbr, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); +MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 7ef1f3cc7832..f4a17e4dfc4b 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -1,6 +1,5 @@ /*- - * Copyright (c) 2016-2019 - * Netflix Inc. All rights reserved. + * Copyright (c) 2016-2019 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -32,7 +31,8 @@ __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" - +#include "opt_ratelimit.h" +#include "opt_kern_tls.h" #include #include #include @@ -45,18 +45,20 @@ __FBSDID("$FreeBSD$"); #include #include #include /* for proc0 declaration */ -#ifdef NETFLIX_STATS -#include -#endif #include #include +#ifdef KERN_TLS +#include +#endif #include #include -#include #ifdef NETFLIX_STATS +#include +#include #include /* Must come after qmath.h and tree.h */ #endif #include +#include #include #include #include @@ -79,8 +81,8 @@ __FBSDID("$FreeBSD$"); #include #include #include -#define TCPOUTFLAGS #include +#define TCPOUTFLAGS #include #include #include @@ -90,6 +92,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ @@ -163,25 +166,41 @@ struct sysctl_oid *rack_sysctl_root; * must maintain the new rack scoreboard. * */ -static int32_t rack_precache = 1; static int32_t rack_tlp_thresh = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 * - 60 seconds */ +/* Attack threshold detections */ +static uint32_t rack_highest_sack_thresh_seen = 0; +static uint32_t rack_highest_move_thresh_seen = 0; + static int32_t rack_pkt_delay = 1; -static int32_t rack_inc_var = 0;/* For TLP */ -static int32_t rack_reduce_largest_on_idle = 0; static int32_t rack_min_pace_time = 0; -static int32_t rack_min_pace_time_seg_req=6; static int32_t rack_early_recovery = 1; -static int32_t rack_early_recovery_max_seg = 6; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ -static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? 
*/ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; -static int32_t rack_map_entries_limit = 1024; -static int32_t rack_map_split_limit = 256; +static int32_t use_rack_cheat = 1; +static int32_t rack_persist_min = 250; /* 250ms */ +static int32_t rack_persist_max = 1000; /* 1 Second */ +static int32_t rack_sack_not_required = 0; /* set to one to allow non-sack to use rack */ +static int32_t rack_hw_tls_max_seg = 0; /* 0 means use hw-tls single segment */ + +/* Sack attack detection thresholds and such */ +static int32_t tcp_force_detection = 0; + +#ifdef NETFLIX_EXP_DETECTION +static int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ +static int32_t tcp_sack_to_move_thresh = 600; /* 60 % */ +static int32_t tcp_restoral_thresh = 650; /* 65 % (sack:2:ack -5%) */ +static int32_t tcp_attack_on_turns_on_logging = 0; +static int32_t tcp_map_minimum = 500; +#endif +static int32_t tcp_sad_decay_val = 800; +static int32_t tcp_sad_pacing_interval = 2000; +static int32_t tcp_sad_low_pps = 100; + /* * Currently regular tcp has a rto_min of 30ms @@ -191,11 +210,11 @@ static int32_t rack_map_split_limit = 256; */ static int32_t rack_tlp_min = 10; static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ -static int32_t rack_rto_max = 30000; /* 30 seconds */ +static int32_t rack_rto_max = 4000; /* 4 seconds */ static const int32_t rack_free_cache = 2; static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; -static int32_t rack_pace_every_seg = 1; +static int32_t rack_pace_every_seg = 0; static int32_t rack_delayed_ack_time = 200; /* 200ms */ static int32_t rack_slot_reduction = 4; static int32_t rack_lower_cwnd_at_tlp = 0; @@ -204,9 +223,12 @@ static int32_t rack_proportional_rate = 10; static int32_t rack_tlp_max_resend = 2; static int32_t rack_limited_retran = 0; static int32_t rack_always_send_oldest = 0; -static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; +static int32_t rack_per_of_gp = 50; +static int32_t rack_tcp_map_entries_limit = 1500; +static int32_t rack_tcp_map_split_limit = 256; + /* Rack specific counters */ counter_u64_t rack_badfr; @@ -217,8 +239,11 @@ counter_u64_t rack_timestamp_mismatch; counter_u64_t rack_reorder_seen; counter_u64_t rack_paced_segments; counter_u64_t rack_unpaced_segments; +counter_u64_t rack_calc_zero; +counter_u64_t rack_calc_nonzero; counter_u64_t rack_saw_enobuf; counter_u64_t rack_saw_enetunreach; +counter_u64_t rack_per_timer_hole; /* Tail loss probe counters */ counter_u64_t rack_tlp_tot; @@ -239,13 +264,34 @@ counter_u64_t rack_split_limited; counter_u64_t rack_sack_proc_all; counter_u64_t rack_sack_proc_short; counter_u64_t rack_sack_proc_restart; -counter_u64_t rack_runt_sacks; +counter_u64_t rack_sack_attacks_detected; +counter_u64_t rack_sack_attacks_reversed; +counter_u64_t rack_sack_used_next_merge; +counter_u64_t rack_sack_splits; +counter_u64_t rack_sack_used_prev_merge; +counter_u64_t rack_sack_skipped_acked; +counter_u64_t rack_ack_total; +counter_u64_t rack_express_sack; +counter_u64_t rack_sack_total; +counter_u64_t rack_move_none; +counter_u64_t rack_move_some; + counter_u64_t rack_used_tlpmethod; counter_u64_t rack_used_tlpmethod2; counter_u64_t rack_enter_tlp_calc; counter_u64_t rack_input_idle_reduces; +counter_u64_t rack_collapsed_win; counter_u64_t rack_tlp_does_nada; +/* Counters for HW TLS */ +counter_u64_t rack_tls_rwnd; +counter_u64_t rack_tls_cwnd; +counter_u64_t 
rack_tls_app; +counter_u64_t rack_tls_other; +counter_u64_t rack_tls_filled; +counter_u64_t rack_tls_rxt; +counter_u64_t rack_tls_tlp; + /* Temp CPU counters */ counter_u64_t rack_find_high; @@ -253,21 +299,12 @@ counter_u64_t rack_progress_drops; counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; -/* - * This was originally defined in tcp_timer.c, but is now reproduced here given - * the unification of the SYN and non-SYN retransmit timer exponents combined - * with wanting to retain previous behaviour for previously deployed stack - * versions. - */ -int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = - { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; - static void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); static int rack_process_ack(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp, struct tcpopt *to, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); static int rack_process_data(struct mbuf *m, struct tcphdr *th, @@ -320,17 +357,13 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm); -static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); +static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num); static int32_t rack_output(struct tcpcb *tp); -static void -rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, - uint8_t iptos, int32_t nxt_pkt, struct timeval *tv); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, - uint32_t cts); + uint32_t cts, int *moved_two); static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); static void rack_remxt_tmr(struct tcpcb *tp); static int @@ -354,9 +387,6 @@ static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); static int32_t tcp_addrack(module_t mod, int32_t type, void *data); -static void -rack_challenge_ack(struct mbuf *m, struct tcphdr *th, - struct tcpcb *tp, int32_t * ret_val); static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -365,13 +395,6 @@ static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); -static void rack_do_drop(struct mbuf *m, struct tcpcb *tp); -static void -rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); -static void -rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t rstreason, int32_t tlen); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -400,13 +423,6 @@ static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); -static int -rack_drop_checks(struct tcpopt *to, struct mbuf *m, - struct tcphdr *th, struct tcpcb *tp, 
int32_t * tlenp, int32_t * thf, - int32_t * drop_hdrlen, int32_t * ret_val); -static int -rack_process_rst(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp); struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); @@ -414,10 +430,6 @@ static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); -static int -rack_ts_check(struct mbuf *m, struct tcphdr *th, - struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); - int32_t rack_clear_counter=0; @@ -453,9 +465,12 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_to_arm_rack); counter_u64_zero(rack_to_arm_tlp); counter_u64_zero(rack_paced_segments); + counter_u64_zero(rack_calc_zero); + counter_u64_zero(rack_calc_nonzero); counter_u64_zero(rack_unpaced_segments); counter_u64_zero(rack_saw_enobuf); counter_u64_zero(rack_saw_enetunreach); + counter_u64_zero(rack_per_timer_hole); counter_u64_zero(rack_to_alloc_hard); counter_u64_zero(rack_to_alloc_emerg); counter_u64_zero(rack_sack_proc_all); @@ -466,12 +481,31 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_alloc_limited_conns); counter_u64_zero(rack_split_limited); counter_u64_zero(rack_find_high); - counter_u64_zero(rack_runt_sacks); + counter_u64_zero(rack_tls_rwnd); + counter_u64_zero(rack_tls_cwnd); + counter_u64_zero(rack_tls_app); + counter_u64_zero(rack_tls_other); + counter_u64_zero(rack_tls_filled); + counter_u64_zero(rack_tls_rxt); + counter_u64_zero(rack_tls_tlp); + counter_u64_zero(rack_sack_attacks_detected); + counter_u64_zero(rack_sack_attacks_reversed); + counter_u64_zero(rack_sack_used_next_merge); + counter_u64_zero(rack_sack_used_prev_merge); + counter_u64_zero(rack_sack_splits); + counter_u64_zero(rack_sack_skipped_acked); + counter_u64_zero(rack_ack_total); + counter_u64_zero(rack_express_sack); + counter_u64_zero(rack_sack_total); + counter_u64_zero(rack_move_none); + counter_u64_zero(rack_move_some); counter_u64_zero(rack_used_tlpmethod); counter_u64_zero(rack_used_tlpmethod2); counter_u64_zero(rack_enter_tlp_calc); counter_u64_zero(rack_progress_drops); counter_u64_zero(rack_tlp_does_nada); + counter_u64_zero(rack_collapsed_win); + } rack_clear_counter = 0; return (0); @@ -482,48 +516,60 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) static void rack_init_sysctls() { - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "map_limit", CTLFLAG_RW, - &rack_map_entries_limit , 1024, - "Is there a limit on how big the sendmap can grow? "); - - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "map_splitlimit", CTLFLAG_RW, - &rack_map_split_limit , 256, - "Is there a limit on how much splitting a peer can do?"); - + struct sysctl_oid *rack_counters; + struct sysctl_oid *rack_attack; + SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, &rack_rate_sample_method , USE_RTT_LOW, "What method should we use for rate sampling 0=high, 1=low "); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "hw_tlsmax", CTLFLAG_RW, + &rack_hw_tls_max_seg , 0, + "Do we have a multplier of TLS records we can send as a max (0=1 TLS record)? 
"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "data_after_close", CTLFLAG_RW, &rack_ignore_data_after_close, 0, "Do we hold off sending a RST until all pending data is ack'd"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "cheat_rxt", CTLFLAG_RW, + &use_rack_cheat, 1, + "Do we use the rxt cheat for rack?"); + + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "persmin", CTLFLAG_RW, + &rack_persist_min, 250, + "What is the minimum time in milliseconds between persists"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "persmax", CTLFLAG_RW, + &rack_persist_max, 1000, + "What is the largest delay in milliseconds between persists"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "no_sack_needed", CTLFLAG_RW, + &rack_sack_not_required, 0, + "Do we allow rack to run on connections not supporting SACK?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlpmethod", CTLFLAG_RW, &rack_tlp_threshold_use, TLP_USE_TWO_ONE, "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "gp_percentage", CTLFLAG_RW, + &rack_per_of_gp, 50, + "Do we pace to percentage of goodput (0=old method)?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "min_pace_time", CTLFLAG_RW, &rack_min_pace_time, 0, "Should we enforce a minimum pace time of 1ms"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "min_pace_segs", CTLFLAG_RW, - &rack_min_pace_time_seg_req, 6, - "How many segments have to be in the len to enforce min-pace-time"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "idle_reduce_high", CTLFLAG_RW, - &rack_reduce_largest_on_idle, 0, - "Should we reduce the largest cwnd seen to IW on idle reduction"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "bb_verbose", CTLFLAG_RW, @@ -544,26 +590,11 @@ rack_init_sysctls() OID_AUTO, "tlpminto", CTLFLAG_RW, &rack_tlp_min, 10, "TLP minimum timeout per the specification (10ms)"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "precache", CTLFLAG_RW, - &rack_precache, 0, - "Where should we precache the mcopy (0 is not at all)"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "sblklimit", CTLFLAG_RW, - &rack_sack_block_limit, 128, - "When do we start paying attention to small sack blocks"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "send_oldest", CTLFLAG_RW, &rack_always_send_oldest, 1, "Should we always send the oldest TLP and RACK-TLP"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, - &rack_tlp_in_recovery, 1, - "Can we do a TLP during recovery?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_tlimit", CTLFLAG_RW, @@ -607,12 +638,12 @@ rack_init_sysctls() SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, - &rack_pace_every_seg, 1, - "Should we pace out every segment hptsi"); + &rack_pace_every_seg, 0, + "Should we use the original pacing mechanism that did not pace much?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, 
"hptsi_seg_max", CTLFLAG_RW, - &rack_hptsi_segments, 6, + &rack_hptsi_segments, 40, "Should we pace out only a limited size of segments"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -624,11 +655,6 @@ rack_init_sysctls() OID_AUTO, "minto", CTLFLAG_RW, &rack_min_to, 1, "Minimum rack timeout in milliseconds"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, - &rack_early_recovery_max_seg, 6, - "Max segments in early recovery"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "earlyrecovery", CTLFLAG_RW, @@ -654,221 +680,376 @@ rack_init_sysctls() OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, + + rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "inc_var", CTLFLAG_RW, - &rack_inc_var, 0, - "Should rack add to the TLP timer the variance in rtt calculation"); + OID_AUTO, + "stats", + CTLFLAG_RW, 0, + "Rack Counters"); rack_badfr = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "badfr", CTLFLAG_RD, &rack_badfr, "Total number of bad FRs"); rack_badfr_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "badfr_bytes", CTLFLAG_RD, &rack_badfr_bytes, "Total number of bad FRs"); rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prrsndret", CTLFLAG_RD, &rack_rtm_prr_retran, "Total number of prr based retransmits"); rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prrsndnew", CTLFLAG_RD, &rack_rtm_prr_newdata, "Total number of prr based new transmits"); rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tsnf", CTLFLAG_RD, &rack_timestamp_mismatch, "Total number of timestamps that we could not find the reported ts"); rack_find_high = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "findhigh", CTLFLAG_RD, &rack_find_high, "Total number of FIN causing find-high"); rack_reorder_seen = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "reordering", CTLFLAG_RD, &rack_reorder_seen, "Total number of times we added delay due to reordering"); rack_tlp_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_to_total", CTLFLAG_RD, &rack_tlp_tot, "Total number of tail loss probe expirations"); rack_tlp_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_new", CTLFLAG_RD, &rack_tlp_newdata, "Total number of tail loss probe sending new data"); rack_tlp_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - 
SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran", CTLFLAG_RD, &rack_tlp_retran, "Total number of tail loss probe sending retransmitted data"); rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, &rack_tlp_retran_bytes, "Total bytes of tail loss probe sending retransmitted data"); rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, &rack_tlp_retran_fail, "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); rack_to_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "rack_to_tot", CTLFLAG_RD, &rack_to_tot, "Total number of times the rack to expired?"); rack_to_arm_rack = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "arm_rack", CTLFLAG_RD, &rack_to_arm_rack, "Total number of times the rack timer armed?"); rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "arm_tlp", CTLFLAG_RD, &rack_to_arm_tlp, "Total number of times the tlp timer armed?"); + + rack_calc_zero = counter_u64_alloc(M_WAITOK); + rack_calc_nonzero = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "calc_zero", CTLFLAG_RD, + &rack_calc_zero, + "Total number of times pacing time worked out to zero?"); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "calc_nonzero", CTLFLAG_RD, + &rack_calc_nonzero, + "Total number of times pacing time worked out to non-zero?"); rack_paced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "paced", CTLFLAG_RD, &rack_paced_segments, "Total number of times a segment send caused hptsi"); rack_unpaced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "unpaced", CTLFLAG_RD, &rack_unpaced_segments, "Total number of times a segment did not cause hptsi"); rack_saw_enobuf = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "saw_enobufs", CTLFLAG_RD, &rack_saw_enobuf, "Total number of times a segment did not cause hptsi"); rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "saw_enetunreach", CTLFLAG_RD, &rack_saw_enetunreach, "Total number of times a segment did not cause hptsi"); rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allocs", CTLFLAG_RD, &rack_to_alloc, "Total allocations of tracking structures"); rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + 
SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allochard", CTLFLAG_RD, &rack_to_alloc_hard, "Total allocations done with sleeping the hard way"); rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allocemerg", CTLFLAG_RD, &rack_to_alloc_emerg, "Total allocations done from emergency cache"); rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "alloc_limited", CTLFLAG_RD, &rack_to_alloc_limited, "Total allocations dropped due to limit"); rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, &rack_alloc_limited_conns, "Connections with allocations dropped due to limit"); rack_split_limited = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "split_limited", CTLFLAG_RD, &rack_split_limited, "Split allocations dropped due to limit"); rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_long", CTLFLAG_RD, &rack_sack_proc_all, "Total times we had to walk whole list for sack processing"); rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_restart", CTLFLAG_RD, &rack_sack_proc_restart, "Total times we had to walk whole list due to a restart"); rack_sack_proc_short = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_short", CTLFLAG_RD, &rack_sack_proc_short, "Total times we took shortcut for sack processing"); rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, &rack_enter_tlp_calc, "Total times we called calc-tlp"); rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "hit_tlp_method", CTLFLAG_RD, &rack_used_tlpmethod, "Total number of runt sacks"); rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, &rack_used_tlpmethod2, - "Total number of runt sacks 2"); - rack_runt_sacks = counter_u64_alloc(M_WAITOK); - SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + "Total number of times we hit TLP method 2"); + /* Sack Attacker detection stuff */ + rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "runtsacks", CTLFLAG_RD, - &rack_runt_sacks, - "Total number of runt sacks"); + OID_AUTO, + "sack_attack", + CTLFLAG_RW, 0, + "Rack Sack Attack Counters and Controls"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "detect_highsackratio", CTLFLAG_RW, + &rack_highest_sack_thresh_seen, 0, + "Highest sack to ack ratio seen"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + 
SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, + &rack_highest_move_thresh_seen, 0, + "Highest move to non-move ratio seen"); + rack_ack_total = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "acktotal", CTLFLAG_RD, + &rack_ack_total, + "Total number of Ack's"); + + rack_express_sack = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "exp_sacktotal", CTLFLAG_RD, + &rack_express_sack, + "Total expresss number of Sack's"); + rack_sack_total = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "sacktotal", CTLFLAG_RD, + &rack_sack_total, + "Total number of SACK's"); + rack_move_none = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "move_none", CTLFLAG_RD, + &rack_move_none, + "Total number of SACK index reuse of postions under threshold"); + rack_move_some = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "move_some", CTLFLAG_RD, + &rack_move_some, + "Total number of SACK index reuse of postions over threshold"); + rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "attacks", CTLFLAG_RD, + &rack_sack_attacks_detected, + "Total number of SACK attackers that had sack disabled"); + rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "reversed", CTLFLAG_RD, + &rack_sack_attacks_reversed, + "Total number of SACK attackers that were later determined false positive"); + rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "nextmerge", CTLFLAG_RD, + &rack_sack_used_next_merge, + "Total number of times we used the next merge"); + rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "prevmerge", CTLFLAG_RD, + &rack_sack_used_prev_merge, + "Total number of times we used the prev merge"); + rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "skipacked", CTLFLAG_RD, + &rack_sack_skipped_acked, + "Total number of times we skipped previously sacked"); + rack_sack_splits = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "ofsplit", CTLFLAG_RD, + &rack_sack_splits, + "Total number of times we did the old fashion tree split"); rack_progress_drops = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prog_drops", CTLFLAG_RD, &rack_progress_drops, "Total number of progress drops"); rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, &rack_input_idle_reduces, "Total number of idle reductions on input"); + rack_collapsed_win = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "collapsed_win", CTLFLAG_RD, + 
&rack_collapsed_win, + "Total number of collapsed windows"); rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_nada", CTLFLAG_RD, &rack_tlp_does_nada, "Total number of nada tlp calls"); + + rack_tls_rwnd = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_rwnd", CTLFLAG_RD, + &rack_tls_rwnd, + "Total hdwr tls rwnd limited"); + + rack_tls_cwnd = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_cwnd", CTLFLAG_RD, + &rack_tls_cwnd, + "Total hdwr tls cwnd limited"); + + rack_tls_app = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_app", CTLFLAG_RD, + &rack_tls_app, + "Total hdwr tls app limited"); + + rack_tls_other = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_other", CTLFLAG_RD, + &rack_tls_other, + "Total hdwr tls other limited"); + + rack_tls_filled = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_filled", CTLFLAG_RD, + &rack_tls_filled, + "Total hdwr tls filled"); + + rack_tls_rxt = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_rxt", CTLFLAG_RD, + &rack_tls_rxt, + "Total hdwr rxt"); + + rack_tls_tlp = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_tlp", CTLFLAG_RD, + &rack_tls_tlp, + "Total hdwr tls tlp"); + rack_per_timer_hole = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "timer_hole", CTLFLAG_RD, + &rack_per_timer_hole, + "Total persists start in timer hole"); + COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, @@ -883,10 +1064,52 @@ rack_init_sysctls() &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); } +static __inline int +rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) +{ + if (SEQ_GEQ(b->r_start, a->r_start) && + SEQ_LT(b->r_start, a->r_end)) { + /* + * The entry b is within the + * block a. i.e.: + * a -- |-------------| + * b -- |----| + * + * b -- |------| + * + * b -- |-----------| + */ + return (0); + } else if (SEQ_GEQ(b->r_start, a->r_end)) { + /* + * b falls as either the next + * sequence block after a so a + * is said to be smaller than b. + * i.e: + * a -- |------| + * b -- |--------| + * or + * b -- |-----| + */ + return (1); + } + /* + * Whats left is where a is + * larger than b. 
i.e: + * a -- |-------| + * b -- |---| + * or even possibly + * b -- |--------------| + */ + return (-1); +} + +RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); +RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); + static inline int32_t rack_progress_timeout_check(struct tcpcb *tp) { -#ifdef NETFLIX_PROGRESS if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { /* @@ -897,21 +1120,52 @@ rack_progress_timeout_check(struct tcpcb *tp) struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; counter_u64_add(rack_progress_drops, 1); +#ifdef NETFLIX_STATS TCPSTAT_INC(tcps_progdrops); +#endif rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); return (1); } } -#endif return (0); } + +static void +rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = tsused; + log.u_bbr.flex2 = thresh; + log.u_bbr.flex3 = rsm->r_flags; + log.u_bbr.flex4 = rsm->r_dupack; + log.u_bbr.flex5 = rsm->r_start; + log.u_bbr.flex6 = rsm->r_end; + log.u_bbr.flex8 = mod; + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_SETTINGS_CHG, 0, + 0, &log, false, &tv); + } +} + + + static void rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); @@ -920,22 +1174,27 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot log.u_bbr.flex4 = slot; log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; + log.u_bbr.flex7 = rack->rc_in_persist; log.u_bbr.flex8 = which; + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERSTAR, 0, - 0, &log, false); + 0, &log, false, &tv); } } static void -rack_log_to_event(struct tcp_rack *rack, int32_t to_num) +rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -943,11 +1202,15 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num) log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.flex2 = rack->rc_rack_rtt; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.flex3 = no; + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + 
TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_RTO, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -957,6 +1220,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -969,11 +1233,14 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; - TCP_LOG_EVENT(tp, NULL, + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRRTT, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -989,10 +1256,16 @@ rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) union tcp_log_stackspecific log; struct timeval tv; - memset(&log, 0, sizeof(log)); /* Convert our ms to a microsecond */ + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = rtt * 1000; + log.u_bbr.flex2 = rack->r_ctl.ack_count; + log.u_bbr.flex3 = rack->r_ctl.sack_count; + log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; + log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; + log.u_bbr.flex8 = rack->sack_attack_disable; log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1007,6 +1280,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, { if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -1016,11 +1290,13 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, log.u_bbr.flex3 = tp->t_maxunacktime; log.u_bbr.flex4 = tp->t_acktime; log.u_bbr.flex8 = event; - TCP_LOG_EVENT(tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -1029,18 +1305,22 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_ { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; + log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -1049,41 +1329,76 @@ rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_ { if 
(rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex7 = rack->r_wanted_output; log.u_bbr.flex8 = rack->rc_in_persist; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, - 0, &log, false); + 0, &log, false, &tv); } } +static void +rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + uint32_t cts; + memset(&log, 0, sizeof(log)); + cts = tcp_get_usecs(&tv); + log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; + log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex4 = len; + log.u_bbr.flex5 = orig_len; + log.u_bbr.flex6 = rack->r_ctl.rc_sacked; + log.u_bbr.flex7 = mod; + log.u_bbr.flex8 = frm; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + TCP_HDWR_TLS, 0, + 0, &log, false, &tv); + } +} + static void rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex7 = hpts_calling; log.u_bbr.flex8 = rack->rc_in_persist; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, - tlen, &log, false); + tlen, &log, false, &tv); } } @@ -1092,6 +1407,7 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -1100,13 +1416,16 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) log.u_bbr.flex2 = 0; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = 0; + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex8 = hpts_removed; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -1115,6 +1434,7 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t { if (rack->rc_tp->t_logstate != 
TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = timers; @@ -1122,14 +1442,75 @@ rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, - 0, &log, false); + 0, &log, false, &tv); } } +static void +rack_log_to_prr(struct tcp_rack *rack, int frm) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; + log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; + log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; + log.u_bbr.flex5 = rack->r_ctl.rc_sacked; + log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; + log.u_bbr.flex8 = frm; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_BBRUPD, 0, + 0, &log, false, &tv); + } +} + +#ifdef NETFLIX_EXP_DETECTION +static void +rack_log_sad(struct tcp_rack *rack, int event) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = rack->r_ctl.sack_count; + log.u_bbr.flex2 = rack->r_ctl.ack_count; + log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; + log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; + log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; + log.u_bbr.flex6 = tcp_sack_to_ack_thresh; + log.u_bbr.pkts_out = tcp_sack_to_move_thresh; + log.u_bbr.lt_epoch = (tcp_force_detection << 8); + log.u_bbr.lt_epoch |= rack->do_detection; + log.u_bbr.applimited = tcp_map_minimum; + log.u_bbr.flex7 = rack->sack_attack_disable; + log.u_bbr.flex8 = event; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.delivered = tcp_sad_decay_val; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_SAD_DETECTION, 0, + 0, &log, false, &tv); + } +} +#endif + static void rack_counter_destroy() { @@ -1158,14 +1539,15 @@ rack_counter_destroy() counter_u64_free(rack_sack_proc_restart); counter_u64_free(rack_to_alloc); counter_u64_free(rack_to_alloc_limited); + counter_u64_free(rack_alloc_limited_conns); counter_u64_free(rack_split_limited); counter_u64_free(rack_find_high); - counter_u64_free(rack_runt_sacks); counter_u64_free(rack_enter_tlp_calc); counter_u64_free(rack_used_tlpmethod); counter_u64_free(rack_used_tlpmethod2); counter_u64_free(rack_progress_drops); counter_u64_free(rack_input_idle_reduces); + counter_u64_free(rack_collapsed_win); counter_u64_free(rack_tlp_does_nada); COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); @@ -1185,7 +1567,7 @@ rack_alloc(struct tcp_rack *rack) if (rack->rc_free_cnt) { counter_u64_add(rack_to_alloc_emerg, 1); rsm = 
TAILQ_FIRST(&rack->r_ctl.rc_free); - TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); + TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); rack->rc_free_cnt--; return (rsm); } @@ -1195,8 +1577,9 @@ rack_alloc(struct tcp_rack *rack) static struct rack_sendmap * rack_alloc_full_limit(struct tcp_rack *rack) { - if ((rack_map_entries_limit > 0) && - (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { + if ((rack_tcp_map_entries_limit > 0) && + (rack->do_detection == 0) && + (rack->r_ctl.rc_num_maps_alloced >= rack_tcp_map_entries_limit)) { counter_u64_add(rack_to_alloc_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; @@ -1215,8 +1598,9 @@ rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) if (limit_type) { /* currently there is only one limit type */ - if (rack_map_split_limit > 0 && - rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) { + if (rack_tcp_map_split_limit > 0 && + (rack->do_detection == 0) && + rack->r_ctl.rc_num_split_allocs >= rack_tcp_map_split_limit) { counter_u64_add(rack_split_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; @@ -1244,13 +1628,11 @@ rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) } if (rack->r_ctl.rc_tlpsend == rsm) rack->r_ctl.rc_tlpsend = NULL; - if (rack->r_ctl.rc_next == rsm) - rack->r_ctl.rc_next = NULL; if (rack->r_ctl.rc_sacklast == rsm) rack->r_ctl.rc_sacklast = NULL; if (rack->rc_free_cnt < rack_free_cache) { memset(rsm, 0, sizeof(struct rack_sendmap)); - TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); rsm->r_limit_type = 0; rack->rc_free_cnt++; return; @@ -1271,13 +1653,12 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui #endif INP_WLOCK_ASSERT(tp->t_inpcb); - tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { uint32_t max; - max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; + max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); if (tp->ccv->bytes_this_ack > max) { tp->ccv->bytes_this_ack = max; } @@ -1295,6 +1676,12 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui SEQ_GEQ(th->th_ack, tp->gput_ack)) { gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); + /* We store it in bytes per ms (or kbytes per sec) */ + rack->r_ctl.rc_gp_history[rack->r_ctl.rc_gp_hist_idx] = gput / 8; + rack->r_ctl.rc_gp_hist_idx++; + if (rack->r_ctl.rc_gp_hist_idx >= RACK_GP_HIST) + rack->r_ctl.rc_gp_hist_filled = 1; + rack->r_ctl.rc_gp_hist_idx %= RACK_GP_HIST; stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* @@ -1309,6 +1696,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; + if (tp->t_maxpeakrate) { /* * We update t_peakrate_thr. 
This gives us roughly @@ -1320,7 +1708,7 @@ rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, ui #endif if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, - nsegs * V_tcp_abc_l_var * tp->t_maxseg); + nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; @@ -1392,9 +1780,12 @@ rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) /* Suck the next prr cnt back into cwnd */ tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; rack->r_ctl.rc_prr_sndcnt = 0; + rack_log_to_prr(rack, 1); } tp->snd_recover = tp->snd_una; EXIT_RECOVERY(tp->t_flags); + + } static void @@ -1407,13 +1798,15 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) rack = (struct tcp_rack *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: -/* rack->r_ctl.rc_ssthresh_set = 1;*/ + tp->t_flags &= ~TF_WASFRECOVERY; + tp->t_flags &= ~TF_WASCRECOVERY; if (!IN_FASTRECOVERY(tp->t_flags)) { rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; rack->r_ctl.rc_loss_count = 0; - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 2); rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) @@ -1433,8 +1826,8 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / - tp->t_maxseg) * tp->t_maxseg; - tp->snd_cwnd = tp->t_maxseg; + ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); + tp->snd_cwnd = ctf_fixed_maxseg(tp); break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); @@ -1442,10 +1835,14 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; - if (tp->t_flags & TF_WASFRECOVERY) + if (tp->t_flags & TF_WASFRECOVERY) { ENTER_FASTRECOVERY(tp->t_flags); - if (tp->t_flags & TF_WASCRECOVERY) + tp->t_flags &= ~TF_WASFRECOVERY; + } + if (tp->t_flags & TF_WASCRECOVERY) { ENTER_CONGRECOVERY(tp->t_flags); + tp->t_flags &= ~TF_WASCRECOVERY; + } tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; break; @@ -1461,7 +1858,7 @@ rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) static inline void -rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) +rack_cc_after_idle(struct tcpcb *tp) { uint32_t i_cwnd; @@ -1475,29 +1872,11 @@ rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); - if (V_tcp_initcwnd_segments) - i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), - max(2 * tp->t_maxseg, 14600)); - else if (V_tcp_do_rfc3390) - i_cwnd = min(4 * tp->t_maxseg, - max(2 * tp->t_maxseg, 4380)); - else { - /* Per RFC5681 Section 3.1 */ - if (tp->t_maxseg > 2190) - i_cwnd = 2 * tp->t_maxseg; - else if (tp->t_maxseg > 1095) - i_cwnd = 3 * tp->t_maxseg; - else - i_cwnd = 4 * tp->t_maxseg; - } - if (reduce_largest) { - /* - * Do we reduce the largest cwnd to make - * rack play nice on restart hptsi wise? 
- */ - if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd) - ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd; - } + if (tp->snd_cwnd == 1) + i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ + else + i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp)); + /* * Being idle is no differnt than the initial window. If the cc * clamps it down below the initial window raise it to the initial @@ -1526,320 +1905,6 @@ rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) (tlen <= tp->t_maxseg) && \ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) -static inline void -rack_calc_rwin(struct socket *so, struct tcpcb *tp) -{ - int32_t win; - - /* - * Calculate amount of space in receive window, and then do TCP - * input processing. Receive window is amount of space in rcv queue, - * but not less than advertised window. - */ - win = sbspace(&so->so_rcv); - if (win < 0) - win = 0; - tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); -} - -static void -rack_do_drop(struct mbuf *m, struct tcpcb *tp) -{ - /* - * Drop space held by incoming segment and return. - */ - if (tp != NULL) - INP_WUNLOCK(tp->t_inpcb); - if (m) - m_freem(m); -} - -static void -rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) -{ - if (tp != NULL) { - tcp_dropwithreset(m, th, tp, tlen, rstreason); - INP_WUNLOCK(tp->t_inpcb); - } else - tcp_dropwithreset(m, th, NULL, tlen, rstreason); -} - -/* - * The value in ret_val informs the caller - * if we dropped the tcb (and lock) or not. - * 1 = we dropped it, 0 = the TCB is still locked - * and valid. - */ -static void -rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) -{ - /* - * Generate an ACK dropping incoming segment if it occupies sequence - * space, where the ACK reflects our state. - * - * We can now skip the test for the RST flag since all paths to this - * code happen after packets containing RST have been dropped. - * - * In the SYN-RECEIVED state, don't send an ACK unless the segment - * we received passes the SYN-RECEIVED ACK test. If it fails send a - * RST. This breaks the loop in the "LAND" DoS attack, and also - * prevents an ACK storm between two listening ports that have been - * sent forged SYN segments, each with the source address of the - * other. - */ - struct tcp_rack *rack; - - if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && - (SEQ_GT(tp->snd_una, th->th_ack) || - SEQ_GT(th->th_ack, tp->snd_max))) { - *ret_val = 1; - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); - return; - } else - *ret_val = 0; - rack = (struct tcp_rack *)tp->t_fb_ptr; - rack->r_wanted_output++; - tp->t_flags |= TF_ACKNOW; - if (m) - m_freem(m); -} - - -static int -rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) -{ - /* - * RFC5961 Section 3.2 - * - * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in - * window, we send challenge ACK. - * - * Note: to take into account delayed ACKs, we should test against - * last_ack_sent instead of rcv_nxt. Note 2: we handle special case - * of closed window, not covered by the RFC. 
- */ - int dropped = 0; - - if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && - SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || - (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { - - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(tp->t_state != TCPS_SYN_SENT, - ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", - __func__, th, tp)); - - if (V_tcp_insecure_rst || - (tp->last_ack_sent == th->th_seq) || - (tp->rcv_nxt == th->th_seq) || - ((tp->last_ack_sent - 1) == th->th_seq)) { - TCPSTAT_INC(tcps_drops); - /* Drop the connection. */ - switch (tp->t_state) { - case TCPS_SYN_RECEIVED: - so->so_error = ECONNREFUSED; - goto close; - case TCPS_ESTABLISHED: - case TCPS_FIN_WAIT_1: - case TCPS_FIN_WAIT_2: - case TCPS_CLOSE_WAIT: - case TCPS_CLOSING: - case TCPS_LAST_ACK: - so->so_error = ECONNRESET; - close: - tcp_state_change(tp, TCPS_CLOSED); - /* FALLTHROUGH */ - default: - tp = tcp_close(tp); - } - dropped = 1; - rack_do_drop(m, tp); - } else { - TCPSTAT_INC(tcps_badrst); - /* Send challenge ACK. */ - tcp_respond(tp, mtod(m, void *), th, m, - tp->rcv_nxt, tp->snd_nxt, TH_ACK); - tp->last_ack_sent = tp->rcv_nxt; - } - } else { - m_freem(m); - } - return (dropped); -} - -/* - * The value in ret_val informs the caller - * if we dropped the tcb (and lock) or not. - * 1 = we dropped it, 0 = the TCB is still locked - * and valid. - */ -static void -rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) -{ - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - - TCPSTAT_INC(tcps_badsyn); - if (V_tcp_insecure_syn && - SEQ_GEQ(th->th_seq, tp->last_ack_sent) && - SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { - tp = tcp_drop(tp, ECONNRESET); - *ret_val = 1; - rack_do_drop(m, tp); - } else { - /* Send challenge ACK. */ - tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, - tp->snd_nxt, TH_ACK); - tp->last_ack_sent = tp->rcv_nxt; - m = NULL; - *ret_val = 0; - rack_do_drop(m, NULL); - } -} - -/* - * rack_ts_check returns 1 for you should not proceed. It places - * in ret_val what should be returned 1/0 by the caller. The 1 indicates - * that the TCB is unlocked and probably dropped. The 0 indicates the - * TCB is still valid and locked. - */ -static int -rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) -{ - - /* Check to see if ts_recent is over 24 days old. */ - if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { - /* - * Invalidate ts_recent. If this segment updates ts_recent, - * the age will be reset later and ts_recent will get a - * valid value. If it does not, setting ts_recent to zero - * will at least satisfy the requirement that zero be placed - * in the timestamp echo reply when ts_recent isn't valid. - * The age isn't reset until we get a valid ts_recent - * because we don't want out-of-order segments to be dropped - * when ts_recent is old. - */ - tp->ts_recent = 0; - } else { - TCPSTAT_INC(tcps_rcvduppack); - TCPSTAT_ADD(tcps_rcvdupbyte, tlen); - TCPSTAT_INC(tcps_pawsdrop); - *ret_val = 0; - if (tlen) { - rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); - } else { - rack_do_drop(m, NULL); - } - return (1); - } - return (0); -} - -/* - * rack_drop_checks returns 1 for you should not proceed. It places - * in ret_val what should be returned 1/0 by the caller. The 1 indicates - * that the TCB is unlocked and probably dropped. The 0 indicates the - * TCB is still valid and locked. 
- */ -static int -rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) -{ - int32_t todrop; - int32_t thflags; - int32_t tlen; - - thflags = *thf; - tlen = *tlenp; - todrop = tp->rcv_nxt - th->th_seq; - if (todrop > 0) { - if (thflags & TH_SYN) { - thflags &= ~TH_SYN; - th->th_seq++; - if (th->th_urp > 1) - th->th_urp--; - else - thflags &= ~TH_URG; - todrop--; - } - /* - * Following if statement from Stevens, vol. 2, p. 960. - */ - if (todrop > tlen - || (todrop == tlen && (thflags & TH_FIN) == 0)) { - /* - * Any valid FIN must be to the left of the window. - * At this point the FIN must be a duplicate or out - * of sequence; drop it. - */ - thflags &= ~TH_FIN; - /* - * Send an ACK to resynchronize and drop any data. - * But keep on processing for RST or ACK. - */ - tp->t_flags |= TF_ACKNOW; - todrop = tlen; - TCPSTAT_INC(tcps_rcvduppack); - TCPSTAT_ADD(tcps_rcvdupbyte, todrop); - } else { - TCPSTAT_INC(tcps_rcvpartduppack); - TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); - } - if (tp->t_flags & TF_SACK_PERMIT) { - /* - * record the left, to-be-dropped edge of data - * here, for use as dsack block further down - */ - tcp_update_sack_list(tp, th->th_seq, - th->th_seq + todrop); - /* - * ACK now, as the next in-sequence segment - * will clear the DSACK block again - */ - tp->t_flags |= TF_ACKNOW; - } - *drop_hdrlen += todrop; /* drop from the top afterwards */ - th->th_seq += todrop; - tlen -= todrop; - if (th->th_urp > todrop) - th->th_urp -= todrop; - else { - thflags &= ~TH_URG; - th->th_urp = 0; - } - } - /* - * If segment ends after window, drop trailing data (and PUSH and - * FIN); if nothing left, just ACK. - */ - todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); - if (todrop > 0) { - TCPSTAT_INC(tcps_rcvpackafterwin); - if (todrop >= tlen) { - TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); - /* - * If window is closed can only take segments at - * window edge, and have to drop data and PUSH from - * incoming segments. Continue processing, but - * remember to ack. Otherwise, drop segment and - * ack. 
- */ - if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { - tp->t_flags |= TF_ACKNOW; - TCPSTAT_INC(tcps_rcvwinprobe); - } else { - rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); - return (1); - } - } else - TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); - m_adj(m, -todrop); - tlen -= todrop; - thflags &= ~(TH_PUSH | TH_FIN); - } - *thf = thflags; - *tlenp = tlen; - return (0); -} - static struct rack_sendmap * rack_find_lowest_rsm(struct tcp_rack *rack) { @@ -1872,7 +1937,7 @@ rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) */ counter_u64_add(rack_find_high, 1); prsm = rsm; - TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { + RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { continue; } @@ -1944,7 +2009,6 @@ rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) thresh += 1; } /* We don't let the rack timeout be above a RTO */ - if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); } @@ -1971,7 +2035,7 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, thresh = (srtt * 2); /* Get the previous sent packet, if any */ - maxseg = tcp_maxseg(tp); + maxseg = ctf_fixed_maxseg(tp); counter_u64_add(rack_enter_tlp_calc, 1); len = rsm->r_end - rsm->r_start; if (rack->rack_tlp_threshold_use == TLP_USE_ID) { @@ -2044,6 +2108,24 @@ rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, return (thresh); } +static uint32_t +rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) +{ + /* + * We want the rack_rtt which is the + * last rtt we measured. However if that + * does not exist we fallback to the srtt (which + * we probably will never do) and then as a last + * resort we use RACK_INITIAL_RTO if no srtt is + * yet set. 
+ */ + if (rack->rc_rack_rtt) + return(rack->rc_rack_rtt); + else if (tp->t_srtt == 0) + return(RACK_INITIAL_RTO); + return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); +} + static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) { @@ -2055,17 +2137,12 @@ rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t idx; - uint32_t srtt_cur, srtt, thresh; + uint32_t srtt, thresh; rack = (struct tcp_rack *)tp->t_fb_ptr; - if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { + if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { return (NULL); } - srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; - srtt = TICKS_2_MSEC(srtt_cur); - if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) - srtt = rack->rc_rack_rtt; - rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) return (NULL); @@ -2076,6 +2153,7 @@ rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) return (NULL); } idx = rsm->r_rtr_cnt - 1; + srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_rack(rack, srtt, tsused); if (tsused < rsm->r_tim_lastsent[idx]) { return (NULL); @@ -2100,7 +2178,7 @@ rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], - tcp_persmin, tcp_persmax); + rack_persist_min, rack_persist_max); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; @@ -2109,7 +2187,7 @@ rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) } static uint32_t -rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) { /* * Start the FR timer, we do this based on getting the first one in @@ -2117,7 +2195,7 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * events we need to stop the running timer (if its running) before * starting the new one. */ - uint32_t thresh, exp, to, srtt, time_since_sent; + uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; uint32_t srtt_cur; int32_t idx; int32_t is_tlp_timer = 0; @@ -2131,13 +2209,31 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) /* We can't start any timer in persists */ return (rack_get_persists_timer_val(tp, rack)); } + if ((tp->t_state < TCPS_ESTABLISHED) || + ((tp->t_flags & TF_SACK_PERMIT) == 0)) + goto activate_rxt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); - if (rsm == NULL) { + if ((rsm == NULL) || sup_rack) { /* Nothing on the send map */ activate_rxt: + time_since_sent = 0; + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rsm) { + idx = rsm->r_rtr_cnt - 1; + if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) + tstmp_touse = rsm->r_tim_lastsent[idx]; + else + tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; + if (TSTMP_GT(tstmp_touse, cts)) + time_since_sent = cts - tstmp_touse; + } if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; to = TICKS_2_MSEC(tp->t_rxtcur); + if (to > time_since_sent) + to -= time_since_sent; + else + to = rack->r_ctl.rc_min_to; if (to == 0) to = 1; return (to); @@ -2151,6 +2247,16 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) goto activate_rxt; } } + if (rack->sack_attack_disable) { + /* + * We don't want to do + * any TLP's if you are an attacker. 
+ * Though if you are doing what + * is expected you may still have + * SACK-PASSED marks. + */ + goto activate_rxt; + } /* Convert from ms to usecs */ if (rsm->r_flags & RACK_SACK_PASSED) { if ((tp->t_flags & TF_SENTFIN) && @@ -2162,12 +2268,20 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) */ goto activate_rxt; } - if (tp->t_srtt) { - srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); - srtt = TICKS_2_MSEC(srtt_cur); - } else - srtt = RACK_INITIAL_RTO; - + if ((rack->use_rack_cheat == 0) && + (IN_RECOVERY(tp->t_flags)) && + (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { + /* + * We are not cheating, in recovery and + * not enough ack's to yet get our next + * retransmission out. + * + * Note that classified attackers do not + * get to use the rack-cheat. + */ + goto activate_tlp; + } + srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_rack(rack, srtt, cts); idx = rsm->r_rtr_cnt - 1; exp = rsm->r_tim_lastsent[idx] + thresh; @@ -2181,6 +2295,7 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) } } else { /* Ok we need to do a TLP not RACK */ +activate_tlp: if ((rack->rc_tlp_in_progress != 0) || (rack->r_ctl.rc_tlp_rtx_out != 0)) { /* @@ -2189,12 +2304,6 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) */ goto activate_rxt; } - if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) { - /* - * Peer collapsed rwnd, don't do TLP. - */ - goto activate_rxt; - } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with. */ @@ -2206,10 +2315,13 @@ rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) goto activate_rxt; } idx = rsm->r_rtr_cnt - 1; - if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) - time_since_sent = cts - rsm->r_tim_lastsent[idx]; - else - time_since_sent = 0; + time_since_sent = 0; + if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) + tstmp_touse = rsm->r_tim_lastsent[idx]; + else + tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; + if (TSTMP_GT(tstmp_touse, cts)) + time_since_sent = cts - tstmp_touse; is_tlp_timer = 1; if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); @@ -2260,10 +2372,6 @@ static void rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_in_persist == 0) { - if (((tp->t_flags & TF_SENTFIN) == 0) && - (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) - /* Must need to send more data to enter persist */ - return; rack->r_ctl.rc_went_idle_time = cts; rack_timer_cancel(tp, rack, cts, __LINE__); tp->t_rxtshift = 0; @@ -2285,8 +2393,8 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) } static void -rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, - int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) +rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, + int32_t slot, uint32_t tot_len_this_send, int sup_rack) { struct inpcb *inp; uint32_t delayed_ack = 0; @@ -2299,7 +2407,6 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int /* A previous call is already set up */ return; } - if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { return; @@ -2308,6 +2415,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { left = rack->r_ctl.rc_timer_exp - cts; } + rack->tlp_timer_up = 0; rack->r_ctl.rc_timer_exp = 0; 
if (rack->rc_inp->inp_in_hpts == 0) { rack->r_ctl.rc_hpts_flags = 0; @@ -2325,23 +2433,21 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int else slot = 1; } - if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { - /* No send window.. we must enter persist */ - rack_enter_persist(tp, rack, cts); - } else if ((frm_out_sbavail && - (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && - (tp->snd_wnd < tp->t_maxseg)) && - TCPS_HAVEESTABLISHED(tp->t_state)) { + hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); + if (rack->sack_attack_disable && + (slot < USEC_TO_MSEC(tcp_sad_pacing_interval))) { /* - * If we have no window or we can't send a segment (and have - * data to send.. we cheat here and frm_out_sbavail is - * passed in with the sbavail(sb) only from bbr_output) and - * we are established, then we must enter persits (if not - * already in persits). + * We have a potential attacker on + * the line. We have possibly some + * (or now) pacing time set. We want to + * slow down the processing of sacks by some + * amount (if it is an attacker). Set the default + * slot for attackers in place (unless the orginal + * interval is longer). Its stored in + * micro-seconds, so lets convert to msecs. */ - rack_enter_persist(tp, rack, cts); + slot = USEC_TO_MSEC(tcp_sad_pacing_interval); } - hpts_timeout = rack_timer_start(tp, rack, cts); if (tp->t_flags & TF_DELACK) { delayed_ack = TICKS_2_MSEC(tcp_delacktime); rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; @@ -2398,6 +2504,11 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } if (slot) { + rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + else + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; rack->r_ctl.rc_last_output_to = cts + slot; if ((hpts_timeout == 0) || (hpts_timeout > slot)) { if (rack->rc_inp->inp_in_hpts == 0) @@ -2413,6 +2524,15 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } } else if (hpts_timeout) { + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { + /* For a rack timer, don't wake us */ + rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + } else { + /* All other timers wake us up */ + rack->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + } if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); @@ -2448,7 +2568,7 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * settings. 
*/ struct rack_sendmap *rsm; - int32_t recovery; + int32_t recovery, ll; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); @@ -2457,12 +2577,16 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) /* Its not time yet */ return (0); } - rack_log_to_event(rack, RACK_TO_FRM_RACK); recovery = IN_RECOVERY(tp->t_flags); counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); rsm = rack_check_recovery_mode(tp, cts); + if (rsm) + ll = rsm->r_end - rsm->r_start; + else + ll = 0; + rack_log_to_event(rack, RACK_TO_FRM_RACK, ll); if (rsm) { uint32_t rtt; @@ -2470,23 +2594,23 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) if (rtt == 0) rtt = 1; if ((recovery == 0) && - (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { + (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { /* * The rack-timeout that enter's us into recovery * will force out one MSS and set us up so that we * can do one more send in 2*rtt (transitioning the * rack timeout into a rack-tlp). */ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; - } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && - ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 3); + } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && + rack->use_rack_cheat) { /* - * When a rack timer goes, we have to send at - * least one segment. They will be paced a min of 1ms - * apart via the next rack timer (or further - * if the rack timer dictates it). + * When a rack timer goes, if the rack cheat is + * on, arrange it so we can send a full segment. */ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 4); } } else { /* This is a case that should happen rarely if ever */ @@ -2500,6 +2624,24 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) return (0); } +static __inline void +rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, + struct rack_sendmap *rsm, uint32_t start) +{ + int idx; + + nrsm->r_start = start; + nrsm->r_end = rsm->r_end; + nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_flags = rsm->r_flags; + nrsm->r_dupack = rsm->r_dupack; + nrsm->r_rtr_bytes = 0; + rsm->r_end = nrsm->r_start; + for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { + nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + } +} + static struct rack_sendmap * rack_merge_rsm(struct tcp_rack *rack, struct rack_sendmap *l_rsm, @@ -2515,19 +2657,32 @@ rack_merge_rsm(struct tcp_rack *rack, * is any reason we need to try to find * the oldest (or last oldest retransmitted). 
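[Editorial sketch, not part of the patch] rack_clone_rsm() above splits one tracked send-map block into two at a given sequence: the original keeps the head, the new entry takes the tail and inherits the send history. A toy, self-contained version of that split over a simplified struct (field names are stand-ins, not the kernel's struct rack_sendmap):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_RTR 3

struct blk {                 /* toy stand-in for a send-map entry */
    uint32_t start, end;     /* bytes covered, [start, end) */
    uint8_t  rtr_cnt;        /* how many times (re)sent */
    uint32_t lastsent[MAX_RTR];
    uint16_t flags;
};

/*
 * Split 'old' at 'at': 'old' keeps [start, at) and 'newb' takes
 * [at, end), copying the per-transmission timestamps, which is the
 * same idea as rack_clone_rsm() in the hunk above.
 */
static void
blk_split(struct blk *newb, struct blk *old, uint32_t at)
{
    int i;

    newb->start = at;
    newb->end = old->end;
    newb->rtr_cnt = old->rtr_cnt;
    newb->flags = old->flags;
    for (i = 0; i < newb->rtr_cnt; i++)
        newb->lastsent[i] = old->lastsent[i];
    old->end = at;           /* the original shrinks in place */
}

int
main(void)
{
    struct blk a = { 1000, 3000, 1, { 555 }, 0 }, b;

    memset(&b, 0, sizeof(b));
    blk_split(&b, &a, 2000);
    printf("a=[%u,%u) b=[%u,%u)\n", a.start, a.end, b.start, b.end);
    return (0);
}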
*/ + struct rack_sendmap *rm; + l_rsm->r_end = r_rsm->r_end; + if (l_rsm->r_dupack < r_rsm->r_dupack) + l_rsm->r_dupack = r_rsm->r_dupack; if (r_rsm->r_rtr_bytes) l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; if (r_rsm->r_in_tmap) { /* This really should not happen */ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); + r_rsm->r_in_tmap = 0; } /* Now the flags */ if (r_rsm->r_flags & RACK_HAS_FIN) l_rsm->r_flags |= RACK_HAS_FIN; if (r_rsm->r_flags & RACK_TLP) l_rsm->r_flags |= RACK_TLP; - TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next); + if (r_rsm->r_flags & RACK_RWND_COLLAPSED) + l_rsm->r_flags |= RACK_RWND_COLLAPSED; + rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); +#ifdef INVARIANTS + if (rm != r_rsm) { + panic("removing head in rack:%p rsm:%p rm:%p", + rack, r_rsm, rm); + } +#endif if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { /* Transfer the split limit to the map we free */ r_rsm->r_limit_type = l_rsm->r_limit_type; @@ -2552,9 +2707,11 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * Tail Loss Probe. */ struct rack_sendmap *rsm = NULL; + struct rack_sendmap *insret; struct socket *so; uint32_t amm, old_prr_snd = 0; uint32_t out, avail; + int collapsed_win = 0; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); @@ -2571,14 +2728,28 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. */ - rack_log_to_event(rack, RACK_TO_FRM_TLP); + rack_log_to_event(rack, RACK_TO_FRM_TLP, 0); counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); so = tp->t_inpcb->inp_socket; +#ifdef KERN_TLS + if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { + /* + * For hardware TLS we do *not* want to send + * new data, lets instead just do a retransmission. + */ + goto need_retran; + } +#endif avail = sbavail(&so->so_snd); out = tp->snd_max - tp->snd_una; - rack->rc_timer_up = 1; + rack->tlp_timer_up = 1; + if (out > tp->snd_wnd) { + /* special case, we need a retransmission */ + collapsed_win = 1; + goto need_retran; + } /* * If we are in recovery we can jazz out a segment if new data is * present simply by setting rc_prr_sndcnt to a segment. @@ -2587,18 +2758,19 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { /* New data is available */ amm = avail - out; - if (amm > tp->t_maxseg) { - amm = tp->t_maxseg; - } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { + if (amm > ctf_fixed_maxseg(tp)) { + amm = ctf_fixed_maxseg(tp); + } else if ((amm < ctf_fixed_maxseg(tp)) && ((tp->t_flags & TF_NODELAY) == 0)) { /* not enough to fill a MTU and no-delay is off */ goto need_retran; } if (IN_RECOVERY(tp->t_flags)) { /* Unlikely */ old_prr_snd = rack->r_ctl.rc_prr_sndcnt; - if (out + amm <= tp->snd_wnd) + if (out + amm <= tp->snd_wnd) { rack->r_ctl.rc_prr_sndcnt = amm; - else + rack_log_to_prr(rack, 4); + } else goto need_retran; } else { /* Set the send-new override */ @@ -2618,28 +2790,52 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * Ok we need to arrange the last un-acked segment to be re-sent, or * optionally the first un-acked segment. 
*/ - if (rack_always_send_oldest) - rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); - else { - rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); - if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { - rsm = rack_find_high_nonack(rack, rsm); + if (collapsed_win == 0) { + if (rack_always_send_oldest) + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + else { + rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { + rsm = rack_find_high_nonack(rack, rsm); + } + } + if (rsm == NULL) { + counter_u64_add(rack_tlp_does_nada, 1); +#ifdef TCP_BLACKBOX + tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); +#endif + goto out; + } + } else { + /* + * We must find the last segment + * that was acceptable by the client. + */ + RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { + if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { + /* Found one */ + break; + } + } + if (rsm == NULL) { + /* None? if so send the first */ + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rsm == NULL) { + counter_u64_add(rack_tlp_does_nada, 1); +#ifdef TCP_BLACKBOX + tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); +#endif + goto out; + } } } - if (rsm == NULL) { - counter_u64_add(rack_tlp_does_nada, 1); -#ifdef TCP_BLACKBOX - tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); -#endif - goto out; - } - if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { + if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { /* * We need to split this the last segment in two. */ - int32_t idx; struct rack_sendmap *nrsm; + nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* @@ -2649,17 +2845,15 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) counter_u64_add(rack_tlp_does_nada, 1); goto out; } - nrsm->r_start = (rsm->r_end - tp->t_maxseg); - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - nrsm->r_rtr_bytes = 0; - rsm->r_end = nrsm->r_start; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + rack_clone_rsm(rack, nrsm, rsm, + (rsm->r_end - ctf_fixed_maxseg(tp))); + insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); +#ifdef INVARIANTS + if (insret != NULL) { + panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + nrsm, insret, rack, rsm); } - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); +#endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; @@ -2684,11 +2878,12 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * peer in max times. We need the retransmit timer to take * over. 
*/ -restore: + restore: rack->r_ctl.rc_tlpsend = NULL; if (rsm) rsm->r_flags &= ~RACK_TLP; rack->r_ctl.rc_prr_sndcnt = old_prr_snd; + rack_log_to_prr(rack, 5); counter_u64_add(rack_tlp_retran_fail, 1); goto out; } else if (rsm) { @@ -2708,7 +2903,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); out: - rack->rc_timer_up = 0; + rack->tlp_timer_up = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } @@ -2727,7 +2922,7 @@ rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } - rack_log_to_event(rack, RACK_TO_FRM_DELACK); + rack_log_to_event(rack, RACK_TO_FRM_DELACK, 0); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); @@ -2746,8 +2941,9 @@ rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) static int rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { + struct tcptemp *t_template; struct inpcb *inp; - int32_t retval = 0; + int32_t retval = 1; inp = tp->t_inpcb; @@ -2795,9 +2991,22 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } - tp->t_flags |= TF_FORCEDATA; + t_template = tcpip_maketemplate(rack->rc_inp); + if (t_template) { + tcp_respond(tp, t_template->tt_ipgen, + &t_template->tt_t, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); + /* This sends an ack */ + if (tp->t_flags & TF_DELACK) + tp->t_flags &= ~TF_DELACK; + free(t_template, M_TEMP); + } + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; out: - rack_log_to_event(rack, RACK_TO_FRM_PERSIST); + rack_log_to_event(rack, RACK_TO_FRM_PERSIST, 0); + rack_start_hpts_timer(rack, tp, cts, + 0, 0, 0); return (retval); } @@ -2818,7 +3027,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; - rack_log_to_event(rack, RACK_TO_FRM_KEEP); + rack_log_to_event(rack, RACK_TO_FRM_KEEP, 0); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. @@ -2849,7 +3058,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) free(t_template, M_TEMP); } } - rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); + rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); return (1); dropit: TCPSTAT_INC(tcps_keepdrops); @@ -2874,7 +3083,7 @@ rack_remxt_tmr(struct tcpcb *tp) rack = (struct tcp_rack *)tp->t_fb_ptr; rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); - rack_log_to_event(rack, RACK_TO_FRM_TMR); + rack_log_to_event(rack, RACK_TO_FRM_TMR, 0); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); /* @@ -2885,10 +3094,11 @@ rack_remxt_tmr(struct tcpcb *tp) * so for now we will just let the normal rxt timer * and tlp timer take care of it. 
*/ - TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { + RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { if (rsm->r_flags & RACK_ACKED) { cnt++; - rsm->r_sndcnt = 0; + rsm->r_dupack = 0; + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); if (rsm->r_in_tmap == 0) { /* We must re-add it back to the tlist */ if (trsm == NULL) { @@ -2897,9 +3107,9 @@ rack_remxt_tmr(struct tcpcb *tp) TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); } rsm->r_in_tmap = 1; - trsm = rsm; } } + trsm = rsm; rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); } /* Clear the count (we just un-acked them) */ @@ -2907,10 +3117,9 @@ rack_remxt_tmr(struct tcpcb *tp) /* Clear the tlp rtx mark */ rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_tlp_seg_send_cnt = 0; - rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); - /* Setup so we send one segment */ - if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + rack->r_ctl.rc_prr_sndcnt = 0; + rack_log_to_prr(rack, 6); rack->r_timer_override = 1; } @@ -2944,7 +3153,18 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + rack_remxt_tmr(tp); + if ((rack->r_ctl.rc_resend == NULL) || + ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { + /* + * If the rwnd collapsed on + * the one we are retransmitting + * it does not count against the + * rxt count. + */ + tp->t_rxtshift++; + } + if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); retval = 1; @@ -2952,7 +3172,6 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; } - rack_remxt_tmr(tp); if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be limited @@ -2987,7 +3206,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) - rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); + rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, @@ -3088,16 +3307,6 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) } } } - /* - * Disable RFC1323 and SACK if we haven't got any response to our - * third SYN to work-around some broken terminal servers (most of - * which have hopefully been retired) that have bad VJ header - * compression code which trashes TCP segments containing - * unknown-to-them TCP options. - */ - if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && - (tp->t_rxtshift == 3)) - tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. 
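[Editorial sketch, not part of the patch] The rack_timeout_rxt() hunk above keeps the usual clamped exponential backoff for the retransmit interval, now using the regular tcp_backoff[] table for SYN retransmits as well. A compact sketch of that backoff-and-clamp computation; the table values and bounds below are illustrative, not the kernel's tunables:

#include <stdint.h>
#include <stdio.h>

/* Illustrative multipliers, same shape as tcp_backoff[]. */
static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64 };

/*
 * Base RTO scaled by the shift-indexed backoff table, then clamped
 * to [min, max] the way TCPT_RANGESET() does it.
 */
static uint32_t
next_rexmt(uint32_t base_rto, unsigned shift, uint32_t min, uint32_t max)
{
    uint32_t rexmt;

    if (shift >= sizeof(backoff) / sizeof(backoff[0]))
        shift = sizeof(backoff) / sizeof(backoff[0]) - 1;
    rexmt = base_rto * backoff[shift];
    if (rexmt < min)
        rexmt = min;
    else if (rexmt > max)
        rexmt = max;
    return (rexmt);
}

int
main(void)
{
    for (unsigned s = 0; s < 6; s++)
        printf("shift %u -> %u ms\n", s, next_rexmt(200, s, 30, 64000));
    return (0);
}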
* Clobber it so we'll take the next rtt measurement as our srtt; @@ -3168,10 +3377,13 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 if (timers & PACE_TMR_DELACK) { ret = rack_timeout_delack(tp, rack, cts); } else if (timers & PACE_TMR_RACK) { + rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { + rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_tlp(tp, rack, cts); } else if (timers & PACE_TMR_RXT) { + rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_rxt(tp, rack, cts); } else if (timers & PACE_TMR_PERSIT) { ret = rack_timeout_persist(tp, rack, cts); @@ -3262,7 +3474,8 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, int32_t idx; rsm->r_rtr_cnt++; - rsm->r_sndcnt++; + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + rsm->r_dupack = 0; if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; rsm->r_flags |= RACK_OVERMAX; @@ -3280,6 +3493,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, } if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; @@ -3288,23 +3502,20 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, rsm->r_flags &= ~RACK_SACK_PASSED; rsm->r_flags |= RACK_WAS_SACKPASS; } - /* Update memory for next rtr */ - rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); } static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) + struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) { /* * We (re-)transmitted starting at rsm->r_start for some length * (possibly less than r_end. */ - struct rack_sendmap *nrsm; + struct rack_sendmap *nrsm, *insret; uint32_t c_end; int32_t len; - int32_t idx; len = *lenp; c_end = rsm->r_start + len; @@ -3346,17 +3557,16 @@ rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to * 1, 6 and the new piece will be 6, 11. */ - nrsm->r_start = c_end; - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - nrsm->r_rtr_bytes = 0; - rsm->r_end = c_end; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + rack_clone_rsm(rack, nrsm, rsm, c_end); + nrsm->r_dupack = 0; + rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); + insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); +#ifdef INVARIANTS + if (insret != NULL) { + panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + nrsm, insret, rack, rsm); } - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); +#endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; @@ -3374,9 +3584,8 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint8_t pass, struct rack_sendmap *hintrsm) { struct tcp_rack *rack; - struct rack_sendmap *rsm, *nrsm; + struct rack_sendmap *rsm, *nrsm, *insret, fe; register uint32_t snd_max, snd_una; - int32_t idx; /* * Add to the RACK log of packets in flight or retransmitted. 
If @@ -3426,7 +3635,10 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, end = seq_out + len; seq_out = snd_una; - len = end - seq_out; + if (SEQ_GEQ(end, seq_out)) + len = end - seq_out; + else + len = 0; } snd_max = tp->snd_max; if (th_flags & (TH_SYN | TH_FIN)) { @@ -3456,8 +3668,9 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, if (IN_RECOVERY(tp->t_flags)) { rack->r_ctl.rc_prr_out += len; } - /* First question is it a retransmission? */ + /* First question is it a retransmission or new? */ if (seq_out == snd_max) { + /* Its new */ again: rsm = rack_alloc(rack); if (rsm == NULL) { @@ -3475,10 +3688,24 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, rsm->r_tim_lastsent[0] = ts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; - rsm->r_start = seq_out; - rsm->r_end = rsm->r_start + len; - rsm->r_sndcnt = 0; - TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); + if (th_flags & TH_SYN) { + /* The data space is one beyond snd_una */ + rsm->r_start = seq_out + 1; + rsm->r_end = rsm->r_start + (len - 1); + } else { + /* Normal case */ + rsm->r_start = seq_out; + rsm->r_end = rsm->r_start + len; + } + rsm->r_dupack = 0; + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); +#ifdef INVARIANTS + if (insret != NULL) { + panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + nrsm, insret, rack, rsm); + } +#endif TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; return; @@ -3486,22 +3713,16 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, /* * If we reach here its a retransmission and we need to find it. */ + memset(&fe, 0, sizeof(fe)); more: if (hintrsm && (hintrsm->r_start == seq_out)) { rsm = hintrsm; hintrsm = NULL; - } else if (rack->r_ctl.rc_next) { - /* We have a hint from a previous run */ - rsm = rack->r_ctl.rc_next; } else { /* No hints sorry */ rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { - /* - * We used rc_next or hintrsm to retransmit, hopefully the - * likely case. - */ seq_out = rack_update_entry(tp, rack, rsm, ts, &len); if (len == 0) { return; @@ -3510,14 +3731,16 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, } } /* Ok it was not the last pointer go through it the hard way. */ - TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { +refind: + fe.r_start = seq_out; + rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + if (rsm) { if (rsm->r_start == seq_out) { seq_out = rack_update_entry(tp, rack, rsm, ts, &len); - rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); if (len == 0) { return; } else { - continue; + goto refind; } } if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { @@ -3535,17 +3758,14 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, * copy rsm to nrsm and then trim the front of rsm * to not include this part. 
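[Editorial sketch, not part of the patch] The rack_log_output() hunk above replaces the linear walk of the old TAILQ with an RB_FIND() keyed on r_start to locate the send-map entry a retransmission falls into. A rough equivalent of that lookup over a sorted array with binary search, only to show the invariant being relied on (entries are non-overlapping and ordered by start); the struct and names are invented for the sketch, and sequence-number wraparound is ignored here:

#include <stdint.h>
#include <stdio.h>

struct range {               /* stand-in for a send-map entry */
    uint32_t start, end;     /* [start, end) */
};

/*
 * Find the entry whose range contains 'seq' in an array sorted by
 * 'start'.  The patch does the same job with RB_FIND() on a
 * red-black tree and wrap-safe SEQ_*() comparisons.
 */
static struct range *
find_containing(struct range *a, int n, uint32_t seq)
{
    int lo = 0, hi = n - 1;

    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;

        if (seq < a[mid].start)
            hi = mid - 1;
        else if (seq >= a[mid].end)
            lo = mid + 1;
        else
            return (&a[mid]);   /* start <= seq < end */
    }
    return (NULL);
}

int
main(void)
{
    struct range map[] = { {0, 1448}, {1448, 2896}, {2896, 4000} };
    struct range *r = find_containing(map, 3, 1500);

    if (r != NULL)
        printf("seq 1500 is in [%u,%u)\n", r->start, r->end);
    return (0);
}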
*/ - nrsm->r_start = seq_out; - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - nrsm->r_rtr_bytes = 0; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + rack_clone_rsm(rack, nrsm, rsm, seq_out); + insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); +#ifdef INVARIANTS + if (insret != NULL) { + panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + nrsm, insret, rack, rsm); } - rsm->r_end = nrsm->r_start; - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); +#endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; @@ -3568,7 +3788,7 @@ rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); - TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { + RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { printf("rsm:%p start:%u end:%u\n", rsm, rsm->r_start, rsm->r_end); } @@ -3799,7 +4019,7 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, rack->r_ctl.rc_rack_min_rtt = 1; } } - tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); + tcp_rack_xmit_timer(rack, t + 1); if ((rsm->r_flags & RACK_TLP) && (!IN_RECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ @@ -3812,9 +4032,9 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, * When we enter recovery we need to assure * we send one packet. */ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; - } else - rack->r_ctl.rc_tlp_rtx_out = 0; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 7); + } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ @@ -3833,7 +4053,7 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, if ((to->to_flags & TOF_TS) && (ack_type == CUM_ACKED) && (to->to_tsecr) && - ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { + ((rsm->r_flags & RACK_OVERMAX) == 0)) { /* * Now which timestamp does it match? In this block the ACK * must be coming from a previous transmission. @@ -3930,11 +4150,7 @@ rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *nrsm; - uint32_t ts; - int32_t idx; - idx = rsm->r_rtr_cnt - 1; - ts = rsm->r_tim_lastsent[idx]; nrsm = rsm; TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, rack_head, r_tnext) { @@ -3943,7 +4159,11 @@ rack_log_sack_passed(struct tcpcb *tp, continue; } if (nrsm->r_flags & RACK_ACKED) { - /* Skip ack'd segments */ + /* + * Skip ack'd segments, though we + * should not see these, since tmap + * should not have ack'd segments. + */ continue; } if (nrsm->r_flags & RACK_SACK_PASSED) { @@ -3954,146 +4174,351 @@ rack_log_sack_passed(struct tcpcb *tp, */ break; } - idx = nrsm->r_rtr_cnt - 1; - if (ts == nrsm->r_tim_lastsent[idx]) { - /* - * For this case lets use seq no, if we sent in a - * big block (TSO) we would have a bunch of segments - * sent at the same time. - * - * We would only get a report if its SEQ is earlier. - * If we have done multiple retransmits the times - * would not be equal. 
- */ - if (SEQ_LT(nrsm->r_start, rsm->r_start)) { - nrsm->r_flags |= RACK_SACK_PASSED; - nrsm->r_flags &= ~RACK_WAS_SACKPASS; - } - } else { - /* - * Here they were sent at different times, not a big - * block. Since we transmitted this one later and - * see it sack'd then this must also be missing (or - * we would have gotten a sack block for it) - */ - nrsm->r_flags |= RACK_SACK_PASSED; - nrsm->r_flags &= ~RACK_WAS_SACKPASS; - } + nrsm->r_flags |= RACK_SACK_PASSED; + nrsm->r_flags &= ~RACK_WAS_SACKPASS; } } static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, - struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) + struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) { - int32_t idx; - int32_t times = 0; uint32_t start, end, changed = 0; - struct rack_sendmap *rsm, *nrsm; + struct rack_sendmap stack_map; + struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; int32_t used_ref = 1; + int moved = 0; start = sack->start; end = sack->end; rsm = *prsm; - if (rsm && SEQ_LT(start, rsm->r_start)) { - TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { - if (SEQ_GEQ(start, rsm->r_start) && - SEQ_LT(start, rsm->r_end)) { - goto do_rest_ofb; - } - } - } - if (rsm == NULL) { -start_at_beginning: - rsm = NULL; - used_ref = 0; - } - /* First lets locate the block where this guy is */ - TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { - if (SEQ_GEQ(start, rsm->r_start) && - SEQ_LT(start, rsm->r_end)) { - break; - } - } + memset(&fe, 0, sizeof(fe)); do_rest_ofb: - if (rsm == NULL) { - /* - * This happens when we get duplicate sack blocks with the - * same end. For example SACK 4: 100 SACK 3: 100 The sort - * will not change there location so we would just start at - * the end of the first one and get lost. + if ((rsm == NULL) || + (SEQ_LT(end, rsm->r_start)) || + (SEQ_GEQ(start, rsm->r_end)) || + (SEQ_LT(start, rsm->r_start))) { + /* + * We are not in the right spot, + * find the correct spot in the tree. */ - if (tp->t_flags & TF_SENTFIN) { - /* - * Check to see if we have not logged the FIN that - * went out. + used_ref = 0; + fe.r_start = start; + rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + moved++; + } + if (rsm == NULL) { + /* TSNH */ + goto out; + } + /* Ok we have an ACK for some piece of this rsm */ + if (rsm->r_start != start) { + if ((rsm->r_flags & RACK_ACKED) == 0) { + /** + * Need to split this in two pieces the before and after, + * the before remains in the map, the after must be + * added. In other words we have: + * rsm |--------------| + * sackblk |-------> + * rsm will become + * rsm |---| + * and nrsm will be the sacked piece + * nrsm |----------| + * + * But before we start down that path lets + * see if the sack spans over on top of + * the next guy and it is already sacked. */ - nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); - if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { - /* - * Ok we did not get the FIN logged. + next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + if (next && (next->r_flags & RACK_ACKED) && + SEQ_GEQ(end, next->r_start)) { + /** + * So the next one is already acked, and + * we can thus by hookery use our stack_map + * to reflect the piece being sacked and + * then adjust the two tree entries moving + * the start and ends around. 
So we start like: + * rsm |------------| (not-acked) + * next |-----------| (acked) + * sackblk |--------> + * We want to end like so: + * rsm |------| (not-acked) + * next |-----------------| (acked) + * nrsm |-----| + * Where nrsm is a temporary stack piece we + * use to update all the gizmos. */ - nrsm->r_end++; + /* Copy up our fudge block */ + nrsm = &stack_map; + memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); + /* Now adjust our tree blocks */ + rsm->r_end = start; + next->r_start = start; + /* Clear out the dup ack count of the remainder */ + rsm->r_dupack = 0; + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + /* Now lets make sure our fudge block is right */ + nrsm->r_start = start; + /* Now lets update all the stats and such */ + rack_update_rtt(tp, rack, nrsm, to, cts, SACKED); + changed += (nrsm->r_end - nrsm->r_start); + rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); + if (nrsm->r_flags & RACK_SACK_PASSED) { + counter_u64_add(rack_reorder_seen, 1); + rack->r_ctl.rc_reorder_ts = cts; + } + /* + * Now we want to go up from rsm (the + * one left un-acked) to the next one + * in the tmap. We do this so when + * we walk backwards we include marking + * sack-passed on rsm (The one passed in + * is skipped since it is generally called + * on something sacked before removing it + * from the tmap). + */ + if (rsm->r_in_tmap) { + nrsm = TAILQ_NEXT(rsm, r_tnext); + /* + * Now that we have the next + * one walk backwards from there. + */ + if (nrsm && nrsm->r_in_tmap) + rack_log_sack_passed(tp, rack, nrsm); + } + /* Now are we done? */ + if (SEQ_LT(end, next->r_end) || + (end == next->r_end)) { + /* Done with block */ + goto out; + } + counter_u64_add(rack_sack_used_next_merge, 1); + /* Postion for the next block */ + start = next->r_end; + rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); + if (rsm == NULL) + goto out; + } else { + /** + * We can't use any hookery here, so we + * need to split the map. We enter like + * so: + * rsm |--------| + * sackblk |-----> + * We will add the new block nrsm and + * that will be the new portion, and then + * fall through after reseting rsm. So we + * split and look like this: + * rsm |----| + * sackblk |-----> + * nrsm |---| + * We then fall through reseting + * rsm to nrsm, so the next block + * picks it up. + */ + nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); + if (nrsm == NULL) { + /* + * failed XXXrrs what can we do but loose the sack + * info? + */ + goto out; + } + counter_u64_add(rack_sack_splits, 1); + rack_clone_rsm(rack, nrsm, rsm, start); + insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); +#ifdef INVARIANTS + if (insret != NULL) { + panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + nrsm, insret, rack, rsm); + } +#endif + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + rsm->r_flags &= (~RACK_HAS_FIN); + /* Position us to point to the new nrsm that starts the sack blk */ rsm = nrsm; - goto do_rest_ofb; + } + } else { + /* Already sacked this piece */ + counter_u64_add(rack_sack_skipped_acked, 1); + moved++; + if (end == rsm->r_end) { + /* Done with block */ + rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + goto out; + } else if (SEQ_LT(end, rsm->r_end)) { + /* A partial sack to a already sacked block */ + moved++; + rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + goto out; + } else { + /* + * The end goes beyond this guy + * repostion the start to the + * next block. 
+ */ + start = rsm->r_end; + rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + if (rsm == NULL) + goto out; } } - if (times == 1) { -#ifdef INVARIANTS - panic("tp:%p rack:%p sack:%p to:%p prsm:%p", - tp, rack, sack, to, prsm); -#else - goto out; -#endif - } - times++; - counter_u64_add(rack_sack_proc_restart, 1); - goto start_at_beginning; - } - /* Ok we have an ACK for some piece of rsm */ - if (rsm->r_start != start) { - /* - * Need to split this in two pieces the before and after. - */ - nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); - if (nrsm == NULL) { - /* - * failed XXXrrs what can we do but loose the sack - * info? - */ - goto out; - } - nrsm->r_start = start; - nrsm->r_rtr_bytes = 0; - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; - } - rsm->r_end = nrsm->r_start; - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); - if (rsm->r_in_tmap) { - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); - nrsm->r_in_tmap = 1; - } - rsm->r_flags &= (~RACK_HAS_FIN); - rsm = nrsm; } if (SEQ_GEQ(end, rsm->r_end)) { - /* + /** * The end of this block is either beyond this guy or right - * at this guy. + * at this guy. I.e.: + * rsm --- |-----| + * end |-----| + * + * end |---------| */ - + if (rsm->r_flags & RACK_TLP) + rack->r_ctl.rc_tlp_rtx_out = 0; if ((rsm->r_flags & RACK_ACKED) == 0) { rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); - rack_log_sack_passed(tp, rack, rsm); + if (rsm->r_in_tmap) /* should be true */ + rack_log_sack_passed(tp, rack, rsm); /* Is Reordering occuring? */ if (rsm->r_flags & RACK_SACK_PASSED) { + rsm->r_flags &= ~RACK_SACK_PASSED; + counter_u64_add(rack_reorder_seen, 1); + rack->r_ctl.rc_reorder_ts = cts; + } + rsm->r_flags |= RACK_ACKED; + rsm->r_flags &= ~RACK_TLP; + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } + } else { + counter_u64_add(rack_sack_skipped_acked, 1); + moved++; + } + if (end == rsm->r_end) { + /* This block only - done, setup for next */ + goto out; + } + /* + * There is more not coverend by this rsm move on + * to the next block in the RB tree. + */ + nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + start = rsm->r_end; + rsm = nrsm; + if (rsm == NULL) + goto out; + goto do_rest_ofb; + } + /** + * The end of this sack block is smaller than + * our rsm i.e.: + * rsm --- |-----| + * end |--| + */ + if ((rsm->r_flags & RACK_ACKED) == 0) { + prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + if (prev && (prev->r_flags & RACK_ACKED)) { + /** + * Goal, we want the right remainder of rsm to shrink + * in place and span from (rsm->r_start = end) to rsm->r_end. + * We want to expand prev to go all the way + * to prev->r_end <- end. + * so in the tree we have before: + * prev |--------| (acked) + * rsm |-------| (non-acked) + * sackblk |-| + * We churn it so we end up with + * prev |----------| (acked) + * rsm |-----| (non-acked) + * nrsm |-| (temporary) + */ + nrsm = &stack_map; + memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); + prev->r_end = end; + rsm->r_start = end; + /* Now adjust nrsm (stack copy) to be + * the one that is the small + * piece that was "sacked". 
+ */ + nrsm->r_end = end; + rsm->r_dupack = 0; + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + /* + * Now nrsm is our new little piece + * that is acked (which was merged + * to prev). Update the rtt and changed + * based on that. Also check for reordering. + */ + rack_update_rtt(tp, rack, nrsm, to, cts, SACKED); + changed += (nrsm->r_end - nrsm->r_start); + rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); + if (nrsm->r_flags & RACK_SACK_PASSED) { + counter_u64_add(rack_reorder_seen, 1); + rack->r_ctl.rc_reorder_ts = cts; + } + rsm = prev; + counter_u64_add(rack_sack_used_prev_merge, 1); + } else { + /** + * This is the case where our previous + * block is not acked either, so we must + * split the block in two. + */ + nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); + if (nrsm == NULL) { + /* failed rrs what can we do but loose the sack info? */ + goto out; + } + /** + * In this case nrsm becomes + * nrsm->r_start = end; + * nrsm->r_end = rsm->r_end; + * which is un-acked. + * + * rsm->r_end = nrsm->r_start; + * i.e. the remaining un-acked + * piece is left on the left + * hand side. + * + * So we start like this + * rsm |----------| (not acked) + * sackblk |---| + * build it so we have + * rsm |---| (acked) + * nrsm |------| (not acked) + */ + counter_u64_add(rack_sack_splits, 1); + rack_clone_rsm(rack, nrsm, rsm, end); + rsm->r_flags &= (~RACK_HAS_FIN); + insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); +#ifdef INVARIANTS + if (insret != NULL) { + panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + nrsm, insret, rack, rsm); + } +#endif + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + nrsm->r_dupack = 0; + rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); + if (rsm->r_flags & RACK_TLP) + rack->r_ctl.rc_tlp_rtx_out = 0; + rack_update_rtt(tp, rack, rsm, to, cts, SACKED); + changed += (rsm->r_end - rsm->r_start); + rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); + if (rsm->r_in_tmap) /* should be true */ + rack_log_sack_passed(tp, rack, rsm); + /* Is Reordering occuring? */ + if (rsm->r_flags & RACK_SACK_PASSED) { + rsm->r_flags &= ~RACK_SACK_PASSED; counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } @@ -4104,79 +4529,38 @@ rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack rsm->r_in_tmap = 0; } } - if (end == rsm->r_end) { - /* This block only - done */ - goto out; - } - /* There is more not coverend by this rsm move on */ - start = rsm->r_end; - nrsm = TAILQ_NEXT(rsm, r_next); - rsm = nrsm; - times = 0; - goto do_rest_ofb; - } - /* Ok we need to split off this one at the tail */ - nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); - if (nrsm == NULL) { - /* failed rrs what can we do but loose the sack info? 
*/ - goto out; - } - /* Clone it */ - nrsm->r_start = end; - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_bytes = 0; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; - } - /* The sack block does not cover this guy fully */ - rsm->r_flags &= (~RACK_HAS_FIN); - rsm->r_end = end; - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); - if (rsm->r_in_tmap) { - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); - nrsm->r_in_tmap = 1; - } - if (rsm->r_flags & RACK_ACKED) { - /* Been here done that */ - goto out; - } - rack_update_rtt(tp, rack, rsm, to, cts, SACKED); - changed += (rsm->r_end - rsm->r_start); - rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); - rack_log_sack_passed(tp, rack, rsm); - /* Is Reordering occuring? */ - if (rsm->r_flags & RACK_SACK_PASSED) { - counter_u64_add(rack_reorder_seen, 1); - rack->r_ctl.rc_reorder_ts = cts; - } - rsm->r_flags |= RACK_ACKED; - rsm->r_flags &= ~RACK_TLP; - if (rsm->r_in_tmap) { - TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); - rsm->r_in_tmap = 0; + } else if (start != end){ + /* + * The block was already acked. + */ + counter_u64_add(rack_sack_skipped_acked, 1); + moved++; } out: if (rsm && (rsm->r_flags & RACK_ACKED)) { /* - * Now can we merge this newly acked - * block with either the previous or + * Now can we merge where we worked + * with either the previous or * next block? */ - nrsm = TAILQ_NEXT(rsm, r_next); - if (nrsm && - (nrsm->r_flags & RACK_ACKED)) { + next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + while (next) { + if (next->r_flags & RACK_ACKED) { /* yep this and next can be merged */ - rsm = rack_merge_rsm(rack, rsm, nrsm); + rsm = rack_merge_rsm(rack, rsm, next); + next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + } else + break; } /* Now what about the previous? */ - nrsm = TAILQ_PREV(rsm, rack_head, r_next); - if (nrsm && - (nrsm->r_flags & RACK_ACKED)) { + prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + while (prev) { + if (prev->r_flags & RACK_ACKED) { /* yep the previous and this can be merged */ - rsm = rack_merge_rsm(rack, nrsm, rsm); + rsm = rack_merge_rsm(rack, prev, rsm); + prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + } else + break; } } if (used_ref == 0) { @@ -4184,12 +4568,14 @@ rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack } else { counter_u64_add(rack_sack_proc_short, 1); } - /* Save off where we last were */ - if (rsm) - rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); + /* Save off the next one for quick reference. */ + if (rsm) + nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); else - rack->r_ctl.rc_sacklast = NULL; - *prsm = rsm; + nrsm = NULL; + *prsm = rack->r_ctl.rc_sacklast = nrsm; + /* Pass back the moved. 
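[Editorial sketch, not part of the patch] A large part of the reworked rack_proc_sack_blk() above avoids allocating a new map entry when a SACK butts up against an already-acked neighbor: instead of splitting, the boundary between the two existing entries is slid so the acked neighbor absorbs the newly SACKed bytes, with the temporary stack copy used only for the RTT/stat bookkeeping. A minimal sketch of that boundary move for the "next block is already acked" case, with plain structs and no tree or stats handling:

#include <stdint.h>
#include <stdio.h>

struct ent {
    uint32_t start, end;     /* [start, end) */
    int acked;
};

/*
 * 'cur' is un-acked, 'next' is its acked right neighbor, and a SACK
 * arrives covering [sack_start, cur->end).  Rather than carving a
 * new entry out of 'cur', shrink 'cur' and grow 'next' leftwards,
 * the same boundary shuffle the patch performs.  Returns the number
 * of bytes that became newly acked.
 */
static uint32_t
absorb_into_next(struct ent *cur, struct ent *next, uint32_t sack_start)
{
    uint32_t newly = cur->end - sack_start;

    cur->end = sack_start;
    next->start = sack_start;
    return (newly);
}

int
main(void)
{
    struct ent cur = { 1000, 3000, 0 }, next = { 3000, 5000, 1 };

    printf("newly acked: %u\n", absorb_into_next(&cur, &next, 2500));
    printf("cur=[%u,%u) next=[%u,%u)\n",
        cur.start, cur.end, next.start, next.end);
    return (0);
}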
*/ + *moved_two = moved; return (changed); } @@ -4218,7 +4604,7 @@ rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ac tmap = rsm; } tmap->r_in_tmap = 1; - rsm = TAILQ_NEXT(rsm, r_next); + rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); } /* * Now lets possibly clear the sack filter so we start @@ -4229,17 +4615,73 @@ rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ac } +static void +rack_do_decay(struct tcp_rack *rack) +{ + struct timeval res; + +#define timersub(tvp, uvp, vvp) \ + do { \ + (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ + (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ + if ((vvp)->tv_usec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_usec += 1000000; \ + } \ + } while (0) + + timersub(&rack->r_ctl.rc_last_ack, &rack->r_ctl.rc_last_time_decay, &res); +#undef timersub + + rack->r_ctl.input_pkt++; + if ((rack->rc_in_persist) || + (res.tv_sec >= 1) || + (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { + /* + * Check for decay of non-SAD, + * we want all SAD detection metrics to + * decay 1/4 per second (or more) passed. + */ + uint32_t pkt_delta; + + pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; + /* Update our saved tracking values */ + rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; + rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack; + /* Now do we escape without decay? */ + if (rack->rc_in_persist || + (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || + (pkt_delta < tcp_sad_low_pps)){ + /* + * We don't decay idle connections + * or ones that have a low input pps. + */ + return; + } + /* Decay the counters */ + rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, + tcp_sad_decay_val); + rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, + tcp_sad_decay_val); + rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, + tcp_sad_decay_val); + rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, + tcp_sad_decay_val); + } +} + static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) { - uint32_t changed, last_seq, entered_recovery = 0; + uint32_t changed, entered_recovery = 0; struct tcp_rack *rack; - struct rack_sendmap *rsm; + struct rack_sendmap *rsm, *rm; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; register uint32_t th_ack; int32_t i, j, k, num_sack_blks = 0; uint32_t cts, acked, ack_point, sack_changed = 0; - + int loop_start = 0, moved_two = 0; + INP_WLOCK_ASSERT(tp->t_inpcb); if (th->th_flags & TH_RST) { /* We don't log resets */ @@ -4247,10 +4689,31 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) } rack = (struct tcp_rack *)tp->t_fb_ptr; cts = tcp_ts_getticks(); - rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); changed = 0; th_ack = th->th_ack; + if (rack->sack_attack_disable == 0) + rack_do_decay(rack); + if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { + /* + * You only get credit for + * MSS and greater (and you get extra + * credit for larger cum-ack moves). + */ + int ac; + ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); + rack->r_ctl.ack_count += ac; + counter_u64_add(rack_ack_total, ac); + } + if (rack->r_ctl.ack_count > 0xfff00000) { + /* + * reduce the number to keep us under + * a uint32_t. 
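[Editorial sketch, not part of the patch] rack_do_decay() above periodically scales down the SACK-attack bookkeeping counters (ack/sack counts and the two "moved" counters) so that old history fades out. The exact scaling lives in ctf_decay_count(), which is not shown in this hunk; the helper below is a plausible stand-in that keeps a fixed percentage of the old value, and both the percentage and the helper name are assumptions made only for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical decay helper: keep 'pct' percent of the old value.
 * The kernel's ctf_decay_count() may compute this differently; this
 * only illustrates the "shrink the history every interval" idea.
 */
static uint32_t
decay_count(uint32_t val, uint32_t pct)
{
    uint64_t v = (uint64_t)val * pct;

    return ((uint32_t)(v / 100));
}

int
main(void)
{
    uint32_t sack_count = 4000, ack_count = 1000;

    /* One decay interval at 75%: both counters shrink together. */
    sack_count = decay_count(sack_count, 75);
    ack_count = decay_count(ack_count, 75);
    printf("sack=%u ack=%u\n", sack_count, ack_count);
    return (0);
}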
+ */ + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } if (SEQ_GT(th_ack, tp->snd_una)) { rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); tp->t_acktime = ticks; @@ -4264,8 +4727,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) * RTT's. */ rack->r_wanted_output++; -more: - rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + more: + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm == NULL) { if ((th_ack - 1) == tp->iss) { /* @@ -4282,9 +4745,9 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) } #ifdef INVARIANTS panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", - tp, - th, tp->t_state, rack, - tp->snd_una, tp->snd_max, tp->snd_nxt, changed); + tp, + th, tp->t_state, rack, + tp->snd_una, tp->snd_max, tp->snd_nxt, changed); #endif goto proc_sack; } @@ -4292,8 +4755,8 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) /* Huh map is missing this */ #ifdef INVARIANTS printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", - rsm->r_start, - th_ack, tp->t_state, rack->r_state); + rsm->r_start, + th_ack, tp->t_state, rack->r_state); #endif goto proc_sack; } @@ -4305,15 +4768,19 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; - TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); + if (rsm->r_flags & RACK_TLP) + rack->r_ctl.rc_tlp_rtx_out = 0; + rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); +#ifdef INVARIANTS + if (rm != rsm) { + panic("removing head in rack:%p rsm:%p rm:%p", + rack, rsm, rm); + } +#endif if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } - if (rack->r_ctl.rc_next == rsm) { - /* scoot along the marker */ - rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); - } if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove @@ -4322,10 +4789,11 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); } else if (rsm->r_flags & RACK_SACK_PASSED) { /* - * There are acked segments ACKED on the + * There are segments ACKED on the * scoreboard further up. We are seeing * reordering. */ + rsm->r_flags &= ~RACK_SACK_PASSED; counter_u64_add(rack_reorder_seen, 1); rsm->r_flags |= RACK_ACKED; rack->r_ctl.rc_reorder_ts = cts; @@ -4357,13 +4825,31 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) */ rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); } - rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; - rsm->r_rtr_bytes = 0; + /* + * Clear the dup ack count for + * the piece that remains. + */ + rsm->r_dupack = 0; + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + if (rsm->r_rtr_bytes) { + /* + * It was retransmitted adjust the + * sack holes for what was acked. 
+ */ + int ack_am; + + ack_am = (th_ack - rsm->r_start); + if (ack_am >= rsm->r_rtr_bytes) { + rack->r_ctl.rc_holes_rxt -= ack_am; + rsm->r_rtr_bytes -= ack_am; + } + } + /* Update where the piece starts */ rsm->r_start = th_ack; } proc_sack: /* Check for reneging */ - rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { /* * The peer has moved snd_una up to @@ -4382,15 +4868,9 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) rack_peer_reneges(rack, rsm, th->th_ack); } if ((to->to_flags & TOF_SACK) == 0) { - /* We are done nothing left to log */ + /* We are done nothing left */ goto out; } - rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); - if (rsm) { - last_seq = rsm->r_end; - } else { - last_seq = tp->snd_max; - } /* Sack block processing */ if (SEQ_GT(th_ack, tp->snd_una)) ack_point = th_ack; @@ -4398,7 +4878,7 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) ack_point = tp->snd_una; for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), - &sack, sizeof(sack)); + &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && @@ -4406,28 +4886,19 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, ack_point) && SEQ_LEQ(sack.end, tp->snd_max)) { - if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && - (SEQ_LT(sack.end, last_seq)) && - ((sack.end - sack.start) < (tp->t_maxseg / 8))) { - /* - * Not the last piece and its smaller than - * 1/8th of a MSS. We ignore this. - */ - counter_u64_add(rack_runt_sacks, 1); - continue; - } sack_blocks[num_sack_blks] = sack; num_sack_blks++; +#ifdef NETFLIX_STATS } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { /* * Its a D-SACK block. */ -/* tcp_record_dsack(sack.start, sack.end); */ + tcp_record_dsack(sack.start, sack.end); +#endif } + } - if (num_sack_blks == 0) - goto out; /* * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. @@ -4437,7 +4908,12 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) num_sack_blks, th->th_ack); ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); } + if (num_sack_blks == 0) { + /* Nothing to sack (DSACKs?) */ + goto out_with_totals; + } if (num_sack_blks < 2) { + /* Only one, we don't need to sort */ goto do_sack_work; } /* Sort the sacks */ @@ -4452,9 +4928,11 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) } /* * Now are any of the sack block ends the same (yes some - * implememtations send these)? + * implementations send these)? */ again: + if (num_sack_blks == 0) + goto out_with_totals; if (num_sack_blks > 1) { for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { @@ -4488,16 +4966,230 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) } } do_sack_work: - rsm = rack->r_ctl.rc_sacklast; - for (i = 0; i < num_sack_blks; i++) { - acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); + /* + * First lets look to see if + * we have retransmitted and + * can use the transmit next? + */ + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rsm && + SEQ_GT(sack_blocks[0].end, rsm->r_start) && + SEQ_LT(sack_blocks[0].start, rsm->r_end)) { + /* + * We probably did the FR and the next + * SACK in continues as we would expect. 
+ */ + acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); if (acked) { rack->r_wanted_output++; changed += acked; sack_changed += acked; } + if (num_sack_blks == 1) { + /* + * This is what we would expect from + * a normal implementation to happen + * after we have retransmitted the FR, + * i.e the sack-filter pushes down + * to 1 block and the next to be retransmitted + * is the sequence in the sack block (has more + * are acked). Count this as ACK'd data to boost + * up the chances of recovering any false positives. + */ + rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); + counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); + counter_u64_add(rack_express_sack, 1); + if (rack->r_ctl.ack_count > 0xfff00000) { + /* + * reduce the number to keep us under + * a uint32_t. + */ + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } + goto out_with_totals; + } else { + /* + * Start the loop through the + * rest of blocks, past the first block. + */ + moved_two = 0; + loop_start = 1; + } + } + /* Its a sack of some sort */ + rack->r_ctl.sack_count++; + if (rack->r_ctl.sack_count > 0xfff00000) { + /* + * reduce the number to keep us under + * a uint32_t. + */ + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } + counter_u64_add(rack_sack_total, 1); + if (rack->sack_attack_disable) { + /* An attacker disablement is in place */ + if (num_sack_blks > 1) { + rack->r_ctl.sack_count += (num_sack_blks - 1); + rack->r_ctl.sack_moved_extra++; + counter_u64_add(rack_move_some, 1); + if (rack->r_ctl.sack_moved_extra > 0xfff00000) { + rack->r_ctl.sack_moved_extra /= 2; + rack->r_ctl.sack_noextra_move /= 2; + } + } + goto out; + } + rsm = rack->r_ctl.rc_sacklast; + for (i = loop_start; i < num_sack_blks; i++) { + acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); + if (acked) { + rack->r_wanted_output++; + changed += acked; + sack_changed += acked; + } + if (moved_two) { + /* + * If we did not get a SACK for at least a MSS and + * had to move at all, or if we moved more than our + * threshold, it counts against the "extra" move. + */ + rack->r_ctl.sack_moved_extra += moved_two; + counter_u64_add(rack_move_some, 1); + } else { + /* + * else we did not have to move + * any more than we would expect. + */ + rack->r_ctl.sack_noextra_move++; + counter_u64_add(rack_move_none, 1); + } + if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { + /* + * If the SACK was not a full MSS then + * we add to sack_count the number of + * MSS's (or possibly more than + * a MSS if its a TSO send) we had to skip by. + */ + rack->r_ctl.sack_count += moved_two; + counter_u64_add(rack_sack_total, moved_two); + } + /* + * Now we need to setup for the next + * round. First we make sure we won't + * exceed the size of our uint32_t on + * the various counts, and then clear out + * moved_two. + */ + if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || + (rack->r_ctl.sack_noextra_move > 0xfff00000)) { + rack->r_ctl.sack_moved_extra /= 2; + rack->r_ctl.sack_noextra_move /= 2; + } + if (rack->r_ctl.sack_count > 0xfff00000) { + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } + moved_two = 0; + } +out_with_totals: + if (num_sack_blks > 1) { + /* + * You get an extra stroke if + * you have more than one sack-blk, this + * could be where we are skipping forward + * and the sack-filter is still working, or + * it could be an attacker constantly + * moving us. 
+ */ + rack->r_ctl.sack_moved_extra++; + counter_u64_add(rack_move_some, 1); } out: +#ifdef NETFLIX_EXP_DETECTION + if ((rack->do_detection || tcp_force_detection) && + tcp_sack_to_ack_thresh && + tcp_sack_to_move_thresh && + ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { + /* + * We have thresholds set to find + * possible attackers and disable sack. + * Check them. + */ + uint64_t ackratio, moveratio, movetotal; + + /* Log detecting */ + rack_log_sad(rack, 1); + ackratio = (uint64_t)(rack->r_ctl.sack_count); + ackratio *= (uint64_t)(1000); + if (rack->r_ctl.ack_count) + ackratio /= (uint64_t)(rack->r_ctl.ack_count); + else { + /* We really should not hit here */ + ackratio = 1000; + } + if ((rack->sack_attack_disable == 0) && + (ackratio > rack_highest_sack_thresh_seen)) + rack_highest_sack_thresh_seen = (uint32_t)ackratio; + movetotal = rack->r_ctl.sack_moved_extra; + movetotal += rack->r_ctl.sack_noextra_move; + moveratio = rack->r_ctl.sack_moved_extra; + moveratio *= (uint64_t)1000; + if (movetotal) + moveratio /= movetotal; + else { + /* No moves, thats pretty good */ + moveratio = 0; + } + if ((rack->sack_attack_disable == 0) && + (moveratio > rack_highest_move_thresh_seen)) + rack_highest_move_thresh_seen = (uint32_t)moveratio; + if (rack->sack_attack_disable == 0) { + if ((ackratio > tcp_sack_to_ack_thresh) && + (moveratio > tcp_sack_to_move_thresh)) { + /* Disable sack processing */ + rack->sack_attack_disable = 1; + if (rack->r_rep_attack == 0) { + rack->r_rep_attack = 1; + counter_u64_add(rack_sack_attacks_detected, 1); + } + if (tcp_attack_on_turns_on_logging) { + /* + * Turn on logging, used for debugging + * false positives. + */ + rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; + } + /* Clamp the cwnd at flight size */ + rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; + rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + rack_log_sad(rack, 2); + } + } else { + /* We are sack-disabled check for false positives */ + if ((ackratio <= tcp_restoral_thresh) || + (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { + rack->sack_attack_disable = 0; + rack_log_sad(rack, 3); + /* Restart counting */ + rack->r_ctl.sack_count = 0; + rack->r_ctl.sack_moved_extra = 0; + rack->r_ctl.sack_noextra_move = 1; + rack->r_ctl.ack_count = max(1, + (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); + + if (rack->r_rep_reverse == 0) { + rack->r_rep_reverse = 1; + counter_u64_add(rack_sack_attacks_reversed, 1); + } + /* Restore the cwnd */ + if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) + rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; + } + } + } +#endif if (changed) { /* Something changed cancel the rack timer */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); @@ -4523,12 +5215,13 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) * When we enter recovery we need to assure we send * one packet. 
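For reference, the detection block above scales both counters by 1000 so they can be compared against integer sysctl-style thresholds. Below is a minimal standalone sketch of that arithmetic; it is not part of the patch, the helper name is invented, and only the zero-division fallbacks mirror the logic above.

#include <stdint.h>

/*
 * Illustrative sketch of the sack-attack detection math: a connection is
 * flagged when both the sack-to-ack ratio and the moved-to-total-move
 * ratio exceed their thresholds (all values scaled by 1000).
 */
static int
sad_would_trigger(uint64_t ack_count, uint64_t sack_count,
    uint64_t moved_extra, uint64_t noextra_move,
    uint64_t ack_thresh, uint64_t move_thresh)
{
	uint64_t ackratio, moveratio, movetotal;

	ackratio = sack_count * 1000;
	if (ack_count)
		ackratio /= ack_count;
	else
		ackratio = 1000;	/* should not happen; same fallback as above */
	movetotal = moved_extra + noextra_move;
	moveratio = moved_extra * 1000;
	if (movetotal)
		moveratio /= movetotal;
	else
		moveratio = 0;		/* no moves at all is a good sign */
	return ((ackratio > ack_thresh) && (moveratio > move_thresh));
}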
*/ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 8); rack->r_timer_override = 1; } } if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { - /* Deal with changed an PRR here (in recovery only) */ + /* Deal with changed and PRR here (in recovery only) */ uint32_t pipe, snd_una; rack->r_ctl.rc_prr_delivered += changed; @@ -4547,6 +5240,7 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; else { rack->r_ctl.rc_prr_sndcnt = 0; + rack_log_to_prr(rack, 9); sndcnt = 0; } sndcnt++; @@ -4555,6 +5249,7 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) else sndcnt = 0; rack->r_ctl.rc_prr_sndcnt = sndcnt; + rack_log_to_prr(rack, 10); } else { uint32_t limit; @@ -4564,19 +5259,38 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) limit = 0; if (changed > limit) limit = changed; - limit += tp->t_maxseg; + limit += ctf_fixed_maxseg(tp); if (tp->snd_ssthresh > pipe) { rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); + rack_log_to_prr(rack, 11); } else { rack->r_ctl.rc_prr_sndcnt = min(0, limit); + rack_log_to_prr(rack, 12); } } - if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { + if (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) { rack->r_timer_override = 1; } } } +static void +rack_strike_dupack(struct tcp_rack *rack) +{ + struct rack_sendmap *rsm; + + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rsm && (rsm->r_dupack < 0xff)) { + rsm->r_dupack++; + if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { + rack->r_wanted_output = 1; + rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); + } else { + rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); + } + } +} + /* * Return value of 1, we do not need to call rack_process_data(). * return value of 0, rack_process_data can be called. @@ -4598,10 +5312,15 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, rack = (struct tcp_rack *)tp->t_fb_ptr; if (SEQ_GT(th->th_ack, tp->snd_max)) { - rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + rack->r_wanted_output++; return (1); } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { + if (rack->rc_in_persist) + tp->t_rxtshift = 0; + if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) + rack_strike_dupack(rack); rack_log_ack(tp, to, th); } if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { @@ -4675,9 +5394,6 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack->r_wanted_output++; } - /* - * If no data (only SYN) was ACK'd, skip rest of ACK processing. 
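The recovery branch above computes a Proportional Rate Reduction quota while the amount in flight still exceeds ssthresh. The following is an illustrative, self-contained sketch of that proportional branch; the helper name and parameter names are local to the sketch, not the stack's API.

#include <stdint.h>

/*
 * PRR (RFC 6937 style) quota: send roughly ssthresh/recovery_fs bytes for
 * every byte newly delivered, minus what recovery has already sent.
 */
static uint32_t
prr_quota(uint64_t prr_delivered, uint64_t prr_out,
    uint64_t recovery_fs, uint64_t ssthresh)
{
	int64_t sndcnt;

	if (recovery_fs == 0)
		return (0);			/* same guard as the code above */
	sndcnt = (int64_t)((prr_delivered * ssthresh) / recovery_fs);
	sndcnt++;				/* round up by one byte, as above */
	if (sndcnt > (int64_t)prr_out)
		sndcnt -= (int64_t)prr_out;
	else
		sndcnt = 0;
	return ((uint32_t)sndcnt);
}

For example, with ssthresh 32768, recovery_fs 65536, 8192 bytes newly delivered and 2048 bytes already sent in recovery, the quota is 8192 * 32768 / 65536 + 1 - 2048 = 2049 bytes.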
- */ if (acked == 0) { if (ofia) *ofia = ourfinisacked; @@ -4732,7 +5448,8 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); - tp->t_acktime = 0; + if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) + tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); /* Set need output so persist might get set */ rack->r_wanted_output++; @@ -4748,7 +5465,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, */ *ret_val = 1; tp = tcp_close(tp); - rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); return (1); } } @@ -4757,6 +5474,91 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, return (0); } +static void +rack_collapsed_window(struct tcp_rack *rack) +{ + /* + * Now we must walk the + * send map and divide the + * ones left stranded. These + * guys can't cause us to abort + * the connection and are really + * "unsent". However if a buggy + * client actually did keep some + * of the data i.e. collapsed the win + * and refused to ack and then opened + * the win and acked that data. We would + * get into an ack war, the simplier + * method then of just pretending we + * did not send those segments something + * won't work. + */ + struct rack_sendmap *rsm, *nrsm, fe, *insret; + tcp_seq max_seq; + uint32_t maxseg; + + max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; + maxseg = ctf_fixed_maxseg(rack->rc_tp); + memset(&fe, 0, sizeof(fe)); + fe.r_start = max_seq; + /* Find the first seq past or at maxseq */ + rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + if (rsm == NULL) { + /* Nothing to do strange */ + rack->rc_has_collapsed = 0; + return; + } + /* + * Now do we need to split at + * the collapse point? + */ + if (SEQ_GT(max_seq, rsm->r_start)) { + nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); + if (nrsm == NULL) { + /* We can't get a rsm, mark all? */ + nrsm = rsm; + goto no_split; + } + /* Clone it */ + rack_clone_rsm(rack, nrsm, rsm, max_seq); + insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); +#ifdef INVARIANTS + if (insret != NULL) { + panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", + nrsm, insret, rack, rsm); + } +#endif + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + /* + * Set in the new RSM as the + * collapsed starting point + */ + rsm = nrsm; + } +no_split: + counter_u64_add(rack_collapsed_win, 1); + RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { + nrsm->r_flags |= RACK_RWND_COLLAPSED; + rack->rc_has_collapsed = 1; + } +} + +static void +rack_un_collapse_window(struct tcp_rack *rack) +{ + struct rack_sendmap *rsm; + + RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { + if (rsm->r_flags & RACK_RWND_COLLAPSED) + rsm->r_flags &= ~RACK_RWND_COLLAPSED; + else + break; + } + rack->rc_has_collapsed = 0; +} /* * Return value of 1, the TCB is unlocked and most @@ -4773,11 +5575,7 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, * send garbage on first SYN. 
*/ int32_t nsegs; -#ifdef TCP_RFC7413 int32_t tfo_syn; -#else -#define tfo_syn (FALSE) -#endif struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -4804,13 +5602,36 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_wl2 = th->th_ack; } } + if (tp->snd_wnd < ctf_outstanding(tp)) + /* The peer collapsed the window */ + rack_collapsed_window(rack); + else if (rack->rc_has_collapsed) + rack_un_collapse_window(rack); /* Was persist timer active and now we have window space? */ - if ((rack->rc_in_persist != 0) && tp->snd_wnd) { + if ((rack->rc_in_persist != 0) && + (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), + rack->r_ctl.rc_pace_min_segs))) { rack_exit_persist(tp, rack); tp->snd_nxt = tp->snd_max; /* Make sure we output to start the timer */ rack->r_wanted_output++; } + /* Do we enter persists? */ + if ((rack->rc_in_persist == 0) && + (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_max == tp->snd_una) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { + /* + * Here the rwnd is less than + * the pacing size, we are established, + * nothing is outstanding, and there is + * data to send. Enter persists. + */ + tp->snd_nxt = tp->snd_una; + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + } if (tp->t_flags2 & TF2_DROP_AF_DATA) { m_freem(m); return (0); @@ -4886,10 +5707,8 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, * PRU_RCVD). If a FIN has already been received on this connection * then we just ignore the text. */ -#ifdef TCP_RFC7413 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && - (tp->t_flags & TF_FASTOPEN)); -#endif + IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; @@ -4912,6 +5731,20 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { +#ifdef NETFLIX_SB_LIMITS + u_int mcnt, appended; + + if (so->so_rcv.sb_shlim) { + mcnt = m_memcnt(m); + appended = 0; + if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, + CFO_NOSLEEP, NULL) == false) { + counter_u64_add(tcp_sb_shlim_fails, 1); + m_freem(m); + return (0); + } + } +#endif if (DELAY_ACK(tp, tlen) || tfo_syn) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); @@ -4925,12 +5758,20 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); - else - sbappendstream_locked(&so->so_rcv, m, 0); + } else +#ifdef NETFLIX_SB_LIMITS + appended = +#endif + sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. 
*/ sorwakeup_locked(so); +#ifdef NETFLIX_SB_LIMITS + if (so->so_rcv.sb_shlim && appended != mcnt) + counter_fo_release(so->so_rcv.sb_shlim, + mcnt - appended); +#endif } else { /* * XXX: Due to the header drop above "th" is @@ -5063,6 +5904,9 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t nsegs; int32_t newsize = 0; /* automatic sockbuf scaling */ struct tcp_rack *rack; +#ifdef NETFLIX_SB_LIMITS + u_int mcnt, appended; +#endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -5112,10 +5956,21 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, */ nsegs = max(1, m->m_pkthdr.lro_nsegs); - +#ifdef NETFLIX_SB_LIMITS + if (so->so_rcv.sb_shlim) { + mcnt = m_memcnt(m); + appended = 0; + if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, + CFO_NOSLEEP, NULL) == false) { + counter_u64_add(tcp_sb_shlim_fails, 1); + m_freem(m); + return (1); + } + } +#endif /* Clean receiver SACK report if present */ if (tp->rcv_numsacks) - tcp_clean_sackreport(tp); + tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* @@ -5149,11 +6004,18 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ - sbappendstream_locked(&so->so_rcv, m, 0); - rack_calc_rwin(so, tp); +#ifdef NETFLIX_SB_LIMITS + appended = +#endif + sbappendstream_locked(&so->so_rcv, m, 0); + ctf_calc_rwin(so, tp); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); +#ifdef NETFLIX_SB_LIMITS + if (so->so_rcv.sb_shlim && mcnt != appended) + counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); +#endif if (DELAY_ACK(tp, tlen)) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; @@ -5231,6 +6093,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, /* Ok if we reach here, we can process a fast-ack */ nsegs = max(1, m->m_pkthdr.lro_nsegs); rack_log_ack(tp, to, th); + /* + * We made progress, clear the tlp + * out flag so we could start a TLP + * again. + */ + rack->r_ctl.rc_tlp_rtx_out = 0; /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; @@ -5238,9 +6106,28 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } - if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { + /* Do we exit persists? */ + if ((rack->rc_in_persist != 0) && + (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), + rack->r_ctl.rc_pace_min_segs))) { rack_exit_persist(tp, rack); } + /* Do we enter persists? */ + if ((rack->rc_in_persist == 0) && + (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_max == tp->snd_una) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { + /* + * Here the rwnd is less than + * the pacing size, we are established, + * nothing is outstanding, and there is + * data to send. Enter persists. + */ + tp->snd_nxt = tp->snd_una; + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + } /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. 
NOTE that the test is modified according to the @@ -5290,6 +6177,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); tp->snd_una = th->th_ack; + if (tp->snd_wnd < ctf_outstanding(tp)) { + /* The peer collapsed the window */ + rack_collapsed_window(rack); + } else if (rack->rc_has_collapsed) + rack_un_collapse_window(rack); + /* * Pull snd_wl2 up to prevent seq wrap relative to th_ack. */ @@ -5313,7 +6206,8 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, #endif if (tp->snd_una == tp->snd_max) { rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); - tp->t_acktime = 0; + if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) + tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } /* Wake up the socket if we have room to write more */ @@ -5337,8 +6231,9 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t todrop; int32_t ourfinisacked = 0; + struct tcp_rack *rack; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); /* * If the state is SYN_SENT: if seg contains an ACK, but not for our * SYN, drop the input. if seg contains a RST, then drop the @@ -5353,27 +6248,30 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); - rack_do_drop(m, tp); + ctf_do_drop(m, tp); return (1); } if (thflags & TH_RST) { - rack_do_drop(m, tp); + ctf_do_drop(m, tp); return (1); } if (!(thflags & TH_SYN)) { - rack_do_drop(m, tp); + ctf_do_drop(m, tp); return (1); } tp->irs = th->th_seq; tcp_rcvseqinit(tp); + rack = (struct tcp_rack *)tp->t_fb_ptr; if (thflags & TH_ACK) { + int tfo_partial = 0; + TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC @@ -5386,23 +6284,48 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); + /* + * If not all the data that was sent in the TFO SYN + * has been acked, resend the remainder right away. + */ + if (IS_FASTOPEN(tp->t_flags) && + (tp->snd_una != tp->snd_max)) { + tp->snd_nxt = th->th_ack; + tfo_partial = 1; + } /* * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. */ - if (DELAY_ACK(tp, tlen) && tlen != 0) { - rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, - ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); + if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { + rack_timer_cancel(tp, rack, + rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + rack->r_wanted_output++; tp->t_flags |= TF_ACKNOW; } - if ((thflags & TH_ECE) && V_tcp_do_ecn) { + if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && + V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } + if (SEQ_GT(th->th_ack, tp->snd_una)) { + /* + * We advance snd_una for the + * fast open case. If th_ack is + * acknowledging data beyond + * snd_una we can't just call + * ack-processing since the + * data stream in our send-map + * will start at snd_una + 1 (one + * beyond the SYN). 
If its just + * equal we don't need to do that + * and there is no send_map. + */ + tp->snd_una++; + } /* * Received in SYN_SENT[*] state. Transitions: * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 @@ -5454,6 +6377,16 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, * of step 5, ack processing. Otherwise, goto step 6. */ if (thflags & TH_ACK) { + /* For syn-sent we need to possibly update the rtt */ + if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { + uint32_t t; + + t = tcp_ts_getticks() - to->to_tsecr; + if (!tp->t_rttlow || tp->t_rttlow > t) + tp->t_rttlow = t; + tcp_rack_xmit_timer(rack, t + 1); + tcp_rack_xmit_timer_commit(rack, tp); + } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) return (ret_val); /* We may have changed to FIN_WAIT_1 above */ @@ -5486,7 +6419,7 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } /* @@ -5499,62 +6432,52 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { + struct tcp_rack *rack; int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); - + ctf_calc_rwin(so, tp); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } -#ifdef TCP_RFC7413 - if (tp->t_flags & TF_FASTOPEN) { + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (IS_FASTOPEN(tp->t_flags)) { /* - * When a TFO connection is in SYN_RECEIVED, the only valid - * packets are the initial SYN, a retransmit/copy of the - * initial SYN (possibly with a subset of the original - * data), a valid ACK, a FIN, or a RST. + * When a TFO connection is in SYN_RECEIVED, the + * only valid packets are the initial SYN, a + * retransmit/copy of the initial SYN (possibly with + * a subset of the original data), a valid ACK, a + * FIN, or a RST. */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ - struct tcp_rack *rack; - - rack = (struct tcp_rack *)tp->t_fb_ptr; if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } -#endif - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); - /* - * RFC5961 Section 4.2 Send challenge ACK for any SYN in - * synchronized state. - */ - if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); - return (ret_val); - } + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. 
*/ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } /* @@ -5565,10 +6488,10 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -5592,18 +6515,16 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } + tp->snd_wnd = tiwin; /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { -#ifdef TCP_RFC7413 - if (tp->t_flags & TF_FASTOPEN) { - tp->snd_wnd = tiwin; + if (IS_FASTOPEN(tp->t_flags)) { cc_conn_init(tp); } -#endif return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } @@ -5613,13 +6534,22 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; - tp->snd_wnd = tiwin; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; + if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + + /* + * Account for the ACK of our SYN prior to + * regular ACK processing below. + */ + tp->snd_una++; + } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; @@ -5627,25 +6557,13 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); -#ifdef TCP_RFC7413 - if (tp->t_tfo_pending) { - tcp_fastopen_decrement_counter(tp->t_tfo_pending); - tp->t_tfo_pending = NULL; - - /* - * Account for the ACK of our SYN prior to regular - * ACK processing below. - */ - tp->snd_una++; - } /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ - if (!(tp->t_flags & TF_FASTOPEN)) -#endif + if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); } /* @@ -5653,9 +6571,19 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, * not, do so now to pass queued data to user. 
*/ if (tlen == 0 && (thflags & TH_FIN) == 0) - (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, + (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; + /* For syn-recv we need to possibly update the rtt */ + if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { + uint32_t t; + + t = tcp_ts_getticks() - to->to_tsecr; + if (!tp->t_rttlow || tp->t_rttlow > t) + tp->t_rttlow = t; + tcp_rack_xmit_timer(rack, t + 1); + tcp_rack_xmit_timer_commit(rack, tp); + } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } @@ -5735,17 +6663,18 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, } } } - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -5754,10 +6683,10 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -5793,10 +6722,11 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -5809,7 +6739,7 @@ rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -5830,15 +6760,16 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, { int32_t ret_val = 0; - rack_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + ctf_calc_rwin(so, tp); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -5847,10 +6778,10 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -5885,10 +6816,11 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -5901,7 +6833,7 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -5913,7 +6845,7 @@ static int rack_check_data_after_close(struct mbuf *m, struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) { - struct tcp_rack *rack; + struct tcp_rack *rack; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -5921,7 +6853,7 @@ rack_check_data_after_close(struct mbuf *m, close_now: tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); - rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); + ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); return (1); } if (sbavail(&so->so_snd) == 0) @@ -5947,16 +6879,17 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -5965,10 +6898,10 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -6010,10 +6943,11 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -6045,7 +6979,7 @@ rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -6066,16 +7000,17 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -6084,10 +7019,10 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -6129,10 +7064,11 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -6151,7 +7087,7 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -6172,16 +7108,17 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -6190,10 +7127,10 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -6235,10 +7172,11 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -6251,13 +7189,13 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); - rack_do_drop(m, tp); + ctf_do_drop(m, tp); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -6279,17 +7217,18 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -6298,10 +7237,10 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -6344,10 +7283,11 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -6360,7 +7300,7 @@ rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -6377,10 +7317,43 @@ rack_clear_rate_sample(struct tcp_rack *rack) rack->r_ctl.rack_rs.rs_rtt_tot = 0; } +static void +rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack) +{ + uint32_t tls_seg = 0; + +#ifdef KERN_TLS + if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { + tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd); + rack->r_ctl.rc_pace_min_segs = tls_seg; + } else +#endif + rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); + rack->r_ctl.rc_pace_max_segs = ctf_fixed_maxseg(tp) * rack->rc_pace_max_segs; + if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) + rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; +#ifdef KERN_TLS + if (tls_seg != 0) { + if (rack_hw_tls_max_seg > 1) { + rack->r_ctl.rc_pace_max_segs /= tls_seg; + if (rack_hw_tls_max_seg < rack->r_ctl.rc_pace_max_segs) + rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg; + } else { + rack->r_ctl.rc_pace_max_segs = 1; + } + if (rack->r_ctl.rc_pace_max_segs == 0) + rack->r_ctl.rc_pace_max_segs = 1; + rack->r_ctl.rc_pace_max_segs *= tls_seg; + } +#endif + rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, 0, 2); +} + static int rack_init(struct tcpcb *tp) { struct tcp_rack *rack = NULL; + struct rack_sendmap *insret; tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); if (tp->t_fb_ptr == NULL) { @@ -6395,13 +7368,14 @@ rack_init(struct tcpcb *tp) memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); rack = (struct tcp_rack *)tp->t_fb_ptr; - TAILQ_INIT(&rack->r_ctl.rc_map); + RB_INIT(&rack->r_ctl.rc_mtree); TAILQ_INIT(&rack->r_ctl.rc_free); TAILQ_INIT(&rack->r_ctl.rc_tmap); rack->rc_tp = tp; if (tp->t_inpcb) { rack->rc_inp = tp->t_inpcb; } + tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; /* Probably not needed but lets be sure */ rack_clear_rate_sample(rack); rack->r_cpu = 0; @@ -6409,27 +7383,36 @@ rack_init(struct tcpcb *tp) rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; rack->rc_pace_reduce = rack_slot_reduction; + if 
(use_rack_cheat) + rack->use_rack_cheat = 1; if (V_tcp_delack_enabled) tp->t_delayed_ack = 1; else tp->t_delayed_ack = 0; rack->rc_pace_max_segs = rack_hptsi_segments; - rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; - rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; rack->r_enforce_min_pace = rack_min_pace_time; - rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; rack->r_ctl.rc_prop_rate = rack_proportional_rate; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; rack->r_ctl.rc_early_recovery = rack_early_recovery; rack->rc_always_pace = rack_pace_every_seg; + rack_set_pace_segments(tp, rack); + rack->r_ctl.rc_high_rwnd = tp->snd_wnd; rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; rack->rack_tlp_threshold_use = rack_tlp_threshold_use; rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; - rack->r_ctl.rc_prr_inc_var = rack_inc_var; + rack->rack_per_of_gp = rack_per_of_gp; + microuptime(&rack->r_ctl.rc_last_ack); + rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack; + rack->r_ctl.rc_tlp_rxt_last_time = tcp_ts_getticks(); + /* Do we force on detection? */ + if (tcp_force_detection) + rack->do_detection = 1; + else + rack->do_detection = 0; if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; @@ -6441,18 +7424,24 @@ rack_init(struct tcpcb *tp) return (ENOMEM); } rsm->r_flags = RACK_OVERMAX; - rsm->r_tim_lastsent[0] = tcp_ts_getticks(); + rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = tp->snd_una; rsm->r_end = tp->snd_max; - rsm->r_sndcnt = 0; - TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); + rsm->r_dupack = 0; + insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); +#ifdef INVARIANTS + if (insret != NULL) { + panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", + insret, rack, rsm); + } +#endif TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } rack_stop_all_timers(tp); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); return (0); } @@ -6472,7 +7461,7 @@ rack_handoff_ok(struct tcpcb *tp) */ return (EAGAIN); } - if (tp->t_flags & TF_SACK_PERMIT) { + if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ return (0); } /* @@ -6487,21 +7476,28 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) { if (tp->t_fb_ptr) { struct tcp_rack *rack; - struct rack_sendmap *rsm; - + struct rack_sendmap *rsm, *nrsm, *rm; + if (tp->t_inpcb) { + tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + } rack = (struct tcp_rack *)tp->t_fb_ptr; #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif - rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); - while (rsm) { - TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); + RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { + rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); +#ifdef INVARIANTS + if (rm != rsm) { + panic("At fini, rack:%p rsm:%p rm:%p", + rack, rsm, rm); + } +#endif uma_zfree(rack_zone, rsm); - rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); } rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); while (rsm) { - TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); + 
TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); uma_zfree(rack_zone, rsm); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); } @@ -6513,6 +7509,7 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) tp->snd_nxt = tp->snd_max; } + static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) { @@ -6526,6 +7523,7 @@ rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) rack->r_substate = rack_do_syn_recv; break; case TCPS_ESTABLISHED: + rack_set_pace_segments(tp, rack); rack->r_state = TCPS_ESTABLISHED; rack->r_substate = rack_do_established; break; @@ -6600,21 +7598,13 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) return; } } - if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { - if ((tp->t_flags & TF_SENTFIN) && - ((tp->snd_max - tp->snd_una) == 1) && - (rsm->r_flags & RACK_HAS_FIN)) { - /* needs to be a RXT */ - if (tmr_up == PACE_TMR_RXT) - return; - } else if (tmr_up == PACE_TMR_RACK) - return; - } else if (SEQ_GT(tp->snd_max,tp->snd_una) && + if (SEQ_GT(tp->snd_max, tp->snd_una) && ((tmr_up == PACE_TMR_TLP) || + (tmr_up == PACE_TMR_RACK) || (tmr_up == PACE_TMR_RXT))) { /* - * Either a TLP or RXT is fine if no sack-passed - * is in place and data is outstanding. + * Either a Rack, TLP or RXT is fine if we + * have outstanding data. */ return; } else if (tmr_up == PACE_TMR_DELACK) { @@ -6633,11 +7623,11 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) * with the slot set to what was in the saved slot. */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); } -static void -rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, +static int +rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv) { @@ -6650,6 +7640,10 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct rack_sendmap *rsm; int32_t prev_state = 0; + if (m->m_flags & M_TSTMP_LRO) { + tv->tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; + tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } cts = tcp_tv_to_mssectick(tv); rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -6662,34 +7656,55 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * allow the tcbinfo to be in either locked or unlocked, as the * caller may have unnecessarily acquired a lock due to a race. 
*/ + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || + tp->t_state != TCPS_ESTABLISHED) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); - { + if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; - TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, - tlen, &log, true); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; + TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, + tlen, &log, true, &tv); + } + if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { + way_out = 4; + retval = 0; + goto done_with_input; + } + /* + * If a segment with the ACK-bit set arrives in the SYN-SENT state + * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. + */ + if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return(1); } /* * Segment received on connection. Reset idle time and keep-alive * timer. XXX: This should be done after segment validation to * ignore broken/spoofed segs. */ - if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { - if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { - counter_u64_add(rack_input_idle_reduces, 1); - rack_cc_after_idle(tp, - (rack->r_idle_reduce_largest ? 1 :0)); - } + if (tp->t_idle_reduce && + (tp->snd_max == tp->snd_una) && + ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { + counter_u64_add(rack_input_idle_reduces, 1); + rack_cc_after_idle(tp); } - rack->r_ctl.rc_rcvtime = cts; tp->t_rcvtime = ticks; /* @@ -6700,6 +7715,8 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, #ifdef NETFLIX_STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif + if (tiwin > rack->r_ctl.rc_high_rwnd) + rack->r_ctl.rc_high_rwnd = tiwin; /* * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move * this to occur after we've validated the segment. @@ -6782,6 +7799,22 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; + if (IS_FASTOPEN(tp->t_flags)) { + if (to.to_flags & TOF_FASTOPEN) { + uint16_t mss; + + if (to.to_flags & TOF_MSS) + mss = to.to_mss; + else + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + mss = TCP6_MSS; + else + mss = TCP_MSS; + tcp_fastopen_update_cache(tp, mss, + to.to_tfo_len, to.to_tfo_cookie); + } else + tcp_fastopen_disable_path(tp); + } } /* * At this point we are at the initial call. Here we decide @@ -6793,7 +7826,7 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, tlen, iptos); - return; + return (1); } /* Set the flag */ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; @@ -6805,9 +7838,12 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * always. 
All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK). */ + memcpy(&rack->r_ctl.rc_last_ack, tv, sizeof(struct timeval)); + rack->r_ctl.rc_rcvtime = cts; if (rack->r_state != tp->t_state) rack_set_state(tp, rack); - if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) + if (SEQ_GT(th->th_ack, tp->snd_una) && + (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = rack->r_state; rack->r_ctl.rc_tlp_send_cnt = 0; @@ -6828,15 +7864,24 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * is gone. */ INP_WLOCK_ASSERT(tp->t_inpcb); + if (rack->set_pacing_done_a_iw == 0) { + /* How much has been acked? */ + if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { + /* We have enough to set in the pacing segment size */ + rack->set_pacing_done_a_iw = 1; + rack_set_pace_segments(tp, rack); + } + } tcp_rack_xmit_timer_commit(rack, tp); - if (nxt_pkt == 0) { + if ((nxt_pkt == 0) || (IN_RECOVERY(tp->t_flags))) { if (rack->r_wanted_output != 0) { did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } - rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); + rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); } - if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && + if ((nxt_pkt == 0) && + ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && (SEQ_GT(tp->snd_max, tp->snd_una) || (tp->t_flags & TF_DELACK) || ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && @@ -6844,20 +7889,24 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* We could not send (probably in the hpts but stopped the timer earlier)? */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && + (rack->rc_inp->inp_in_hpts) && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* keep alive not needed if we are hptsi output yet */ ; } else { - if (rack->rc_inp->inp_in_hpts) + if (rack->rc_inp->inp_in_hpts) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + counter_u64_add(rack_per_timer_hole, 1); + } + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); } way_out = 1; - } else { + } else if (nxt_pkt == 0) { /* Do we have the correct timer running? 
*/ rack_timer_audit(tp, rack, &so->so_snd); way_out = 2; } + done_with_input: rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); if (did_out) rack->r_wanted_output = 0; @@ -6868,8 +7917,8 @@ rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, retval, tp, prev_state); } #endif - INP_WUNLOCK(tp->t_inpcb); } + return (retval); } void @@ -6877,29 +7926,24 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { struct timeval tv; -#ifdef RSS - struct tcp_function_block *tfb; - struct tcp_rack *rack; - struct inpcb *inp; - rack = (struct tcp_rack *)tp->t_fb_ptr; - if (rack->r_state == 0) { - /* - * Initial input (ACK to SYN-ACK etc)lets go ahead and get - * it processed - */ - tcp_get_usecs(&tv); - rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, - tlen, iptos, 0, &tv); - return; + /* First lets see if we have old packets */ + if (tp->t_in_pkt) { + if (ctf_do_queued_segments(so, tp, 1)) { + m_freem(m); + return; + } } - tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); - INP_WUNLOCK(tp->t_inpcb); -#else - tcp_get_usecs(&tv); - rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, - tlen, iptos, 0, &tv); -#endif + if (m->m_flags & M_TSTMP_LRO) { + tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; + tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } else { + /* Should not be should we kassert instead? */ + tcp_get_usecs(&tv); + } + if(rack_do_segment_nounlock(m, th, so, tp, + drop_hdrlen, tlen, iptos, 0, &tv) == 0) + INP_WUNLOCK(tp->t_inpcb); } struct rack_sendmap * @@ -6907,10 +7951,10 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) { struct rack_sendmap *rsm = NULL; int32_t idx; - uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; + uint32_t srtt = 0, thresh = 0, ts_low = 0; /* Return the next guy to be re-transmitted */ - if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { + if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { return (NULL); } if (tp->t_flags & TF_SENTFIN) { @@ -6927,10 +7971,6 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) return (NULL); } check_it: - srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; - srtt = TICKS_2_MSEC(srtt_cur); - if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) - srtt = rack->rc_rack_rtt; if (rsm->r_flags & RACK_ACKED) { return (NULL); } @@ -6938,18 +7978,133 @@ tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) /* Its not yet ready */ return (NULL); } + srtt = rack_grab_rtt(tp, rack); idx = rsm->r_rtr_cnt - 1; ts_low = rsm->r_tim_lastsent[idx]; thresh = rack_calc_thresh_rack(rack, srtt, tsused); - if (tsused <= ts_low) { + if ((tsused == ts_low) || + (TSTMP_LT(tsused, ts_low))) { + /* No time since sending */ return (NULL); } - if ((tsused - ts_low) >= thresh) { + if ((tsused - ts_low) < thresh) { + /* It has not been long enough yet */ + return (NULL); + } + if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || + ((rsm->r_flags & RACK_SACK_PASSED) && + (rack->sack_attack_disable == 0))) { + /* + * We have passed the dup-ack threshold + * a SACK has indicated this is missing. + * Note that if you are a declared attacker + * it is only the dup-ack threshold that + * will cause retransmits. 
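A hypothetical standalone sketch of the eligibility test in tcp_rack_output() above: a segment is offered for retransmit only when enough time has elapsed since its last send and either its per-segment dup-ack counter reached the threshold (assumed here to be the usual three duplicate ACKs) or a later SACK passed over it, with the SACK path ignored while processing is disabled for a suspected attacker. The helper below is illustrative only.

#include <stdint.h>

static int
rsm_ready_for_rxt(uint32_t now, uint32_t last_sent, uint32_t thresh,
    uint8_t dupacks, int sack_passed, int attacker_disabled)
{
	if ((int32_t)(now - last_sent) <= 0)
		return (0);	/* nothing has elapsed since the send */
	if ((now - last_sent) < thresh)
		return (0);	/* not outstanding long enough yet */
	return ((dupacks >= 3) ||
	    (sack_passed && !attacker_disabled));
}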
+	 */
+	/* log retransmit reason */
+	rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
 		return (rsm);
 	}
 	return (NULL);
 }
 
+static int32_t
+rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len)
+{
+	int32_t slot = 0;
+
+	if ((rack->rack_per_of_gp == 0) ||
+	    (rack->rc_always_pace == 0)) {
+		/*
+		 * We use the most optimistic possible cwnd/srtt for
+		 * sending calculations. This will make our
+		 * calculation anticipate getting more through
+		 * quicker than possible. But that's ok we don't want
+		 * the peer to have a gap in data sending.
+		 */
+		uint32_t srtt, cwnd, tr_perms = 0;
+
+old_method:
+		if (rack->r_ctl.rc_rack_min_rtt)
+			srtt = rack->r_ctl.rc_rack_min_rtt;
+		else
+			srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
+		if (rack->r_ctl.rc_rack_largest_cwnd)
+			cwnd = rack->r_ctl.rc_rack_largest_cwnd;
+		else
+			cwnd = tp->snd_cwnd;
+		tr_perms = cwnd / srtt;
+		if (tr_perms == 0) {
+			tr_perms = ctf_fixed_maxseg(tp);
+		}
+		/*
+		 * Calculate how long this will take to drain, if
+		 * the calculation comes out to zero, that's ok we
+		 * will use send_a_lot to possibly spin around for
+		 * more increasing tot_len_this_send to the point
+		 * that it's going to require a pace, or we hit the
+		 * cwnd. Which in that case we are just waiting for
+		 * an ACK.
+		 */
+		slot = len / tr_perms;
+		/* Now do we reduce the time so we don't run dry? */
+		if (slot && rack->rc_pace_reduce) {
+			int32_t reduce;
+
+			reduce = (slot / rack->rc_pace_reduce);
+			if (reduce < slot) {
+				slot -= reduce;
+			} else
+				slot = 0;
+		}
+	} else {
+		int cnt;
+		uint64_t bw_est, bw_raise, res, lentim;
+
+		bw_est = 0;
+		for (cnt = 0; cnt < RACK_GP_HIST; cnt++) {
+			if ((rack->r_ctl.rc_gp_hist_filled == 0) &&
+			    (rack->r_ctl.rc_gp_history[cnt] == 0))
+				break;
+			bw_est += rack->r_ctl.rc_gp_history[cnt];
+		}
+		if (bw_est == 0) {
+			/*
+			 * No way yet to make a b/w estimate
+			 * (no goodput est yet).
+			 */
+			goto old_method;
+		}
+		/* Convert to bytes per second */
+		bw_est *= MSEC_IN_SECOND;
+		/*
+		 * Now ratchet it up by our percentage. Note
+		 * that the minimum you can do is 1 which would
+		 * get you 101% of the average last N goodput estimates.
+		 * The max you can do is 256 which would yield you
+		 * 356% of the last N goodput estimates.
+ */ + bw_raise = bw_est * (uint64_t)rack->rack_per_of_gp; + bw_est += bw_raise; + /* average by the number we added */ + bw_est /= cnt; + /* Now calculate a rate based on this b/w */ + lentim = (uint64_t) len * (uint64_t)MSEC_IN_SECOND; + res = lentim / bw_est; + slot = (uint32_t)res; + } + if (rack->r_enforce_min_pace && + (slot == 0)) { + /* We are enforcing a minimum pace time of 1ms */ + slot = rack->r_enforce_min_pace; + } + if (slot) + counter_u64_add(rack_calc_nonzero, 1); + else + counter_u64_add(rack_calc_zero, 1); + return (slot); +} + static int rack_output(struct tcpcb *tp) { @@ -6961,22 +8116,19 @@ rack_output(struct tcpcb *tp) struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize; + int32_t maxseg; long tot_len_this_send = 0; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif -#ifdef NETFLIX_TCP_O_UDP struct udphdr *udp = NULL; -#endif struct tcp_rack *rack; struct tcphdr *th; uint8_t pass = 0; + uint8_t wanted_cookie = 0; u_char opt[TCP_MAXOLEN]; - unsigned ipoptlen, optlen, hdrlen; -#ifdef NETFLIX_TCP_O_UDP - unsigned ulen; -#endif + unsigned ipoptlen, optlen, hdrlen, ulen=0; uint32_t rack_seq; #if defined(IPSEC) || defined(IPSEC_SUPPORT) @@ -6987,13 +8139,16 @@ rack_output(struct tcpcb *tp) int32_t sub_from_prr = 0; volatile int32_t sack_rxmit; struct rack_sendmap *rsm = NULL; - int32_t tso, mtu, would_have_fin = 0; + int32_t tso, mtu; struct tcpopt to; int32_t slot = 0; + int32_t sup_rack = 0; uint32_t cts; - uint8_t hpts_calling, doing_tlp = 0; + uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; int32_t do_a_prefetch; int32_t prefetch_rsm = 0; + int force_tso = 0; + int32_t orig_len; int32_t prefetch_so_done = 0; struct tcp_log_buffer *lgb = NULL; struct inpcb *inp; @@ -7002,11 +8157,8 @@ rack_output(struct tcpcb *tp) struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif -#ifdef KERN_TLS - const bool hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; -#else - const bool hw_tls = false; -#endif + uint8_t filled_all = 0; + bool hw_tls = false; /* setup and take the cache hits here */ rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -7015,24 +8167,26 @@ rack_output(struct tcpcb *tp) sb = &so->so_snd; kern_prefetch(sb, &do_a_prefetch); do_a_prefetch = 1; + +#ifdef KERN_TLS + hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; +#endif INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif - -#ifdef TCP_RFC7413 + maxseg = ctf_fixed_maxseg(tp); /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ - if ((tp->t_flags & TF_FASTOPEN) && + if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED) && - SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ - (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ + SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ + (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ return (0); -#endif #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ @@ -7074,6 +8228,17 @@ rack_output(struct tcpcb *tp) } rack->r_wanted_output = 0; rack->r_timer_override = 0; + /* + * For TFO connections in SYN_SENT or SYN_RECEIVED, + * only allow the initial SYN or SYN|ACK and those sent + * by the retransmit timer. 
+ */ + if (IS_FASTOPEN(tp->t_flags) && + ((tp->t_state == TCPS_SYN_RECEIVED) || + (tp->t_state == TCPS_SYN_SENT)) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ + (tp->t_rxtshift == 0)) /* not a retransmit */ + return (0); /* * Determine length of data that should be transmitted, and flags * that will be used. If there is some data or critical controls @@ -7083,8 +8248,7 @@ rack_output(struct tcpcb *tp) idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) - rack_cc_after_idle(tp, - (rack->r_idle_reduce_largest ? 1 :0)); + rack_cc_after_idle(tp); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { @@ -7107,17 +8271,6 @@ rack_output(struct tcpcb *tp) sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; - /* - * Send any SACK-generated retransmissions. If we're explicitly - * trying to send out new data (when sendalot is 1), bypass this - * function. If we retransmit in fast recovery mode, decrement - * snd_cwnd, since we're replacing a (future) new transmission with - * a retransmission now, and we previously incremented snd_cwnd in - * tcp_input(). - */ - /* - * Still in sack recovery , reset rxmit flag to zero. - */ while (rack->rc_free_cnt < rack_free_cache) { rsm = rack_alloc(rack); if (rsm == NULL) { @@ -7126,7 +8279,7 @@ rack_output(struct tcpcb *tp) slot = 1; goto just_return_nolock; } - TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); + TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); rack->rc_free_cnt++; rsm = NULL; } @@ -7145,18 +8298,24 @@ rack_output(struct tcpcb *tp) long tlen; doing_tlp = 1; - rsm = rack->r_ctl.rc_tlpsend; + /* + * Check if we can do a TLP with a RACK'd packet + * this can happen if we are not doing the rack + * cheat and we skipped to a TLP and it + * went off. 
+ */ + rsm = tcp_rack_output(tp, rack, cts); + if (rsm == NULL) + rsm = rack->r_ctl.rc_tlpsend; rack->r_ctl.rc_tlpsend = NULL; sack_rxmit = 1; tlen = rsm->r_end - rsm->r_start; - if (tlen > tp->t_maxseg) - tlen = tp->t_maxseg; -#ifdef INVARIANTS - if (SEQ_GT(tp->snd_una, rsm->r_start)) { - panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", - tp, rack, tp->snd_una, rsm, rsm->r_start); - } -#endif + if (tlen > ctf_fixed_maxseg(tp)) + tlen = ctf_fixed_maxseg(tp); + KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), + ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", + __func__, __LINE__, + rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; cwin = min(tp->snd_wnd, tlen); len = cwin; @@ -7167,16 +8326,19 @@ rack_output(struct tcpcb *tp) len = rsm->r_end - rsm->r_start; sack_rxmit = 1; sendalot = 0; + KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), + ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", + __func__, __LINE__, + rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; - if (len >= tp->t_maxseg) { - len = tp->t_maxseg; + if (len >= ctf_fixed_maxseg(tp)) { + len = ctf_fixed_maxseg(tp); } - KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", - __func__, sb_offset)); } else if ((rack->rc_in_persist == 0) && ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { - long tlen; + int maxseg; + maxseg = ctf_fixed_maxseg(tp); if ((!IN_RECOVERY(tp->t_flags)) && ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { /* Enter recovery if not induced by a time-out */ @@ -7188,7 +8350,8 @@ rack_output(struct tcpcb *tp) * When we enter recovery we need to assure we send * one packet. */ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 13); } #ifdef INVARIANTS if (SEQ_LT(rsm->r_start, tp->snd_una)) { @@ -7196,39 +8359,57 @@ rack_output(struct tcpcb *tp) tp, rack, rsm, rsm->r_start, tp->snd_una); } #endif - tlen = rsm->r_end - rsm->r_start; + len = rsm->r_end - rsm->r_start; + KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), + ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", + __func__, __LINE__, + rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; - if (tlen > rack->r_ctl.rc_prr_sndcnt) { - len = rack->r_ctl.rc_prr_sndcnt; - } else { - len = tlen; - } - if (len >= tp->t_maxseg) { - sendalot = 1; - len = tp->t_maxseg; - } else { - sendalot = 0; - if ((rack->rc_timer_up == 0) && - (len < tlen)) { + /* Can we send it within the PRR boundary? */ + if ((rack->use_rack_cheat == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { + /* It does not fit */ + if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && + (rack->r_ctl.rc_prr_sndcnt < maxseg)) { /* - * If its not a timer don't send a partial - * segment. + * prr is less than a segment, we + * have more acks due in besides + * what we need to resend. Lets not send + * to avoid sending small pieces of + * what we need to retransmit. 
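For illustration with assumed numbers (a sketch, not part of the patch): with the rack cheat disabled, maxseg of 1460 bytes, a marked segment of 2920 bytes, rc_prr_sndcnt of only 500 and roughly 14 kB still in flight, the retransmission does not fit the PRR budget; because the in-flight data exceeds the segment and the PRR credit is below one maxseg, nothing is sent and the stack waits for further ACKs to grow the credit rather than emitting a 500-byte sliver. Had the credit been, say, 2000 bytes, len would instead be set to the credit and then clamped to one maxseg.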
*/ len = 0; goto just_return_nolock; } + len = rack->r_ctl.rc_prr_sndcnt; + } + sendalot = 0; + if (len >= maxseg) { + len = maxseg; } - KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", - __func__, sb_offset)); if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, - min(len, tp->t_maxseg)); + min(len, ctf_fixed_maxseg(tp))); counter_u64_add(rack_rtm_prr_retran, 1); } } + /* + * Enforce a connection sendmap count limit if set + * as long as we are not retransmiting. + */ + if ((rsm == NULL) && + (rack->do_detection == 0) && + (rack_tcp_map_entries_limit > 0) && + (rack->r_ctl.rc_num_maps_alloced >= rack_tcp_map_entries_limit)) { + counter_u64_add(rack_to_alloc_limited, 1); + if (!rack->alloc_limit_reported) { + rack->alloc_limit_reported = 1; + counter_u64_add(rack_alloc_limited_conns, 1); + } + goto just_return_nolock; + } if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { /* we are retransmitting the fin */ len--; @@ -7244,20 +8425,6 @@ rack_output(struct tcpcb *tp) /* For debugging */ rack->r_ctl.rc_rsm_at_retran = rsm; #endif - /* - * Enforce a connection sendmap count limit if set - * as long as we are not retransmiting. - */ - if ((rsm == NULL) && - (rack_map_entries_limit > 0) && - (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { - counter_u64_add(rack_to_alloc_limited, 1); - if (!rack->alloc_limit_reported) { - rack->alloc_limit_reported = 1; - counter_u64_add(rack_alloc_limited_conns, 1); - } - goto just_return_nolock; - } /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. @@ -7299,7 +8466,9 @@ rack_output(struct tcpcb *tp) flags &= ~TH_FIN; sendwin = 1; } else { - if (rack->rc_in_persist) + if ((rack->rc_in_persist != 0) && + (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), + rack->r_ctl.rc_pace_min_segs))) rack_exit_persist(tp, rack); /* * If we are dropping persist mode then we need to @@ -7328,7 +8497,7 @@ rack_output(struct tcpcb *tp) uint32_t avail; avail = sbavail(sb); - if (SEQ_GT(tp->snd_nxt, tp->snd_una)) + if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) sb_offset = tp->snd_nxt - tp->snd_una; else sb_offset = 0; @@ -7343,7 +8512,7 @@ rack_output(struct tcpcb *tp) else len = rack->r_ctl.rc_tlp_new_data; rack->r_ctl.rc_tlp_new_data = 0; - doing_tlp = 1; + new_data_tlp = doing_tlp = 1; } else { if (sendwin > avail) { /* use the available */ @@ -7387,13 +8556,12 @@ rack_output(struct tcpcb *tp) if (len > 0) { if (len > rack->r_ctl.rc_prr_sndcnt) len = rack->r_ctl.rc_prr_sndcnt; - if (len > 0) { sub_from_prr = 1; counter_u64_add(rack_rtm_prr_newdata, 1); } } - if (len > tp->t_maxseg) { + if (len > ctf_fixed_maxseg(tp)) { /* * We should never send more than a MSS when * retransmitting or sending new data in prr @@ -7402,8 +8570,8 @@ rack_output(struct tcpcb *tp) * let us send a lot as well :-) */ if (rack->r_ctl.rc_prr_sendalot == 0) - len = tp->t_maxseg; - } else if (len < tp->t_maxseg) { + len = ctf_fixed_maxseg(tp); + } else if (len < ctf_fixed_maxseg(tp)) { /* * Do we send any? The idea here is if the * send empty's the socket buffer we want to @@ -7429,19 +8597,18 @@ rack_output(struct tcpcb *tp) * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. 
*/ - if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { - if ((tp->t_state != TCPS_SYN_RECEIVED) && - (tp->t_state != TCPS_SYN_SENT)) + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && + ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { + if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; -#ifdef TCP_RFC7413 /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ - if ((tp->t_flags & TF_FASTOPEN) && + if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; -#endif + sb_offset--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. This @@ -7452,16 +8619,30 @@ rack_output(struct tcpcb *tp) len = 0; flags &= ~TH_FIN; } -#ifdef TCP_RFC7413 /* - * When retransmitting SYN|ACK on a passively-created TFO socket, - * don't include data, as the presence of data may have caused the - * original SYN|ACK to have been dropped by a middlebox. + * On TFO sockets, ensure no data is sent in the following cases: + * + * - When retransmitting SYN|ACK on a passively-created socket + * + * - When retransmitting SYN on an actively created socket + * + * - When sending a zero-length cookie (cookie request) on an + * actively created socket + * + * - When the socket is in the CLOSED state (RST is being sent) */ - if ((tp->t_flags & TF_FASTOPEN) && - ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) + if (IS_FASTOPEN(tp->t_flags) && + (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || + ((tp->t_state == TCPS_SYN_SENT) && + (tp->t_tfo_client_cookie_len == 0)) || + (flags & TH_RST))) { + sack_rxmit = 0; len = 0; -#endif + } + /* Without fast-open there should never be data sent on a SYN */ + if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) + len = 0; + orig_len = len; if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been @@ -7481,10 +8662,68 @@ rack_output(struct tcpcb *tp) len = 0; if ((tp->snd_wnd == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && + (tp->snd_una == tp->snd_max) && (sb_offset < (int)sbavail(sb))) { tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, cts); } + } else if ((rsm == NULL) && + ((doing_tlp == 0) || (new_data_tlp == 1)) && + (len < rack->r_ctl.rc_pace_max_segs)) { + /* + * We are not sending a full segment for + * some reason. Should we not send anything (think + * sws or persists)? + */ + if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + (TCPS_HAVEESTABLISHED(tp->t_state)) && + (len < (int)(sbavail(sb) - sb_offset))) { + /* + * Here the rwnd is less than + * the pacing size, this is not a retransmit, + * we are established and + * the send is not the last in the socket buffer + * we send nothing, and may enter persists. + */ + len = 0; + if (tp->snd_max == tp->snd_una) { + /* + * Nothing out we can + * go into persists. + */ + rack_enter_persist(tp, rack, cts); + tp->snd_nxt = tp->snd_una; + } + } else if ((tp->snd_cwnd >= max(rack->r_ctl.rc_pace_min_segs, (maxseg * 4))) && + (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) && + (len < (int)(sbavail(sb) - sb_offset)) && + (len < rack->r_ctl.rc_pace_min_segs)) { + /* + * Here we are not retransmitting, and + * the cwnd is not so small that we could + * not send at least a min size (rxt timer + * not having gone off), We have 2 segments or + * more already in flight, its not the tail end + * of the socket buffer and the cwnd is blocking + * us from sending out a minimum pacing segment size. + * Lets not send anything. 
+ */ + len = 0; + } else if (((tp->snd_wnd - ctf_outstanding(tp)) < + min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) && + (len < (int)(sbavail(sb) - sb_offset)) && + (TCPS_HAVEESTABLISHED(tp->t_state))) { + /* + * Here we have a send window but we have + * filled it up and we can't send another pacing segment. + * We also have in flight more than 2 segments + * and we are not completing the sb i.e. we allow + * the last bytes of the sb to go out even if + * its not a full pacing segment. + */ + len = 0; + } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); @@ -7537,10 +8776,8 @@ rack_output(struct tcpcb *tp) #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif - if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && -#ifdef NETFLIX_TCP_O_UDP + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > ctf_fixed_maxseg(tp) && (tp->t_port == 0) && -#endif ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0) @@ -7556,16 +8793,7 @@ rack_output(struct tcpcb *tp) */ outstanding--; } - if (outstanding > 0) { - /* - * This is sub-optimal. We only send a stand alone - * FIN on its own segment. - */ - if (flags & TH_FIN) { - flags &= ~TH_FIN; - would_have_fin = 1; - } - } else if (sack_rxmit) { + if (sack_rxmit) { if ((rsm->r_flags & RACK_HAS_FIN) == 0) flags &= ~TH_FIN; } else { @@ -7587,7 +8815,7 @@ rack_output(struct tcpcb *tp) * limited the window size) - we need to retransmit */ if (len) { - if (len >= tp->t_maxseg) { + if (len >= ctf_fixed_maxseg(tp)) { pass = 1; goto send; } @@ -7677,10 +8905,10 @@ rack_output(struct tcpcb *tp) if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; - if (adv >= (int32_t)(2 * tp->t_maxseg) && + if (adv >= (int32_t)(2 * ctf_fixed_maxseg(tp)) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || - so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { + so->so_rcv.sb_hiwat <= 8 * ctf_fixed_maxseg(tp))) { pass = 7; goto send; } @@ -7709,13 +8937,10 @@ rack_output(struct tcpcb *tp) * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ - if (flags & TH_FIN) { - if ((tp->t_flags & TF_SENTFIN) || - (((tp->t_flags & TF_SENTFIN) == 0) && - (tp->snd_nxt == tp->snd_una))) { - pass = 11; - goto send; - } + if ((flags & TH_FIN) && + (tp->snd_nxt == tp->snd_una)) { + pass = 11; + goto send; } /* * No reason to send a segment, just return. 
@@ -7725,12 +8950,38 @@ rack_output(struct tcpcb *tp) just_return_nolock: if (tot_len_this_send == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); - rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); + if (slot) { + /* set the rack tcb into the slot N */ + counter_u64_add(rack_paced_segments, 1); + } else if (tot_len_this_send) { + counter_u64_add(rack_unpaced_segments, 1); + } + /* Check if we need to go into persists or not */ + if ((rack->rc_in_persist == 0) && + (tp->snd_max == tp->snd_una) && + TCPS_HAVEESTABLISHED(tp->t_state) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd) && + (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) { + /* Yes lets make sure to move to persist before timer-start */ + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + } + rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); tp->t_flags &= ~TF_FORCEDATA; return (0); send: + if ((flags & TH_FIN) && + sbavail(&tp->t_inpcb->inp_socket->so_snd)) { + /* + * We do not transmit a FIN + * with data outstanding. We + * need to make it so all data + * is acked first. + */ + flags &= ~TH_FIN; + } if (doing_tlp == 0) { /* * Data not a TLP, and its not the rxt firing. If it is the @@ -7743,7 +8994,7 @@ rack_output(struct tcpcb *tp) } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { - if (len >= tp->t_maxseg) + if (len >= ctf_fixed_maxseg(tp)) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; @@ -7774,27 +9025,44 @@ rack_output(struct tcpcb *tp) if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); -#ifdef NETFLIX_TCP_O_UDP +#ifdef NETFLIX_TCPOUDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; -#ifdef TCP_RFC7413 + /* - * Only include the TFO option on the first - * transmission of the SYN|ACK on a - * passively-created TFO socket, as the presence of - * the TFO option may have caused the original - * SYN|ACK to have been dropped by a middlebox. + * On SYN or SYN|ACK transmits on TFO connections, + * only include the TFO option if it is not a + * retransmit, as the presence of the TFO option may + * have caused the original SYN or SYN|ACK to have + * been dropped by a middlebox. */ - if ((tp->t_flags & TF_FASTOPEN) && - (tp->t_state == TCPS_SYN_RECEIVED) && + if (IS_FASTOPEN(tp->t_flags) && (tp->t_rxtshift == 0)) { - to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; - to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; - to.to_flags |= TOF_FASTOPEN; + if (tp->t_state == TCPS_SYN_RECEIVED) { + to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; + to.to_tfo_cookie = + (u_int8_t *)&tp->t_tfo_cookie.server; + to.to_flags |= TOF_FASTOPEN; + wanted_cookie = 1; + } else if (tp->t_state == TCPS_SYN_SENT) { + to.to_tfo_len = + tp->t_tfo_client_cookie_len; + to.to_tfo_cookie = + tp->t_tfo_cookie.client; + to.to_flags |= TOF_FASTOPEN; + wanted_cookie = 1; + /* + * If we wind up having more data to + * send with the SYN than can fit in + * one segment, don't send any more + * until the SYN|ACK comes back from + * the other end. + */ + sendalot = 0; + } } -#endif } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { @@ -7829,8 +9097,15 @@ rack_output(struct tcpcb *tp) /* Processing the options. 
*/ hdrlen += optlen = tcp_addoptions(&to, opt); + /* + * If we wanted a TFO option to be added, but it was unable + * to fit, ensure no data is sent. + */ + if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && + !(to.to_flags & TOF_FASTOPEN)) + len = 0; } -#ifdef NETFLIX_TCP_O_UDP +#ifdef NETFLIX_TCPOUDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ @@ -7854,16 +9129,18 @@ rack_output(struct tcpcb *tp) ipoptlen += ipsec_optlen; #endif +#ifdef KERN_TLS + /* force TSO for so TLS offload can get mss */ + if (sb->sb_flags & SB_TLS_IFNET) { + force_tso = 1; + } +#endif /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. Clear the FIN bit because we * cut off the tail of the segment. */ if (len + optlen + ipoptlen > tp->t_maxseg) { - if (flags & TH_FIN) { - would_have_fin = 1; - flags &= ~TH_FIN; - } if (tso) { uint32_t if_hw_tsomax; uint32_t moff; @@ -7896,18 +9173,19 @@ rack_output(struct tcpcb *tp) * unless the send sockbuf can be emptied: */ max_len = (tp->t_maxseg - optlen); - if ((sb_offset + len) < sbavail(sb)) { + if (((sb_offset + len) < sbavail(sb)) && + (hw_tls == 0)) { moff = len % (u_int)max_len; if (moff != 0) { len -= moff; sendalot = 1; } - } - /* + } + /* * In case there are too many small fragments don't * use TSO: */ - if (len <= max_len) { + if (len <= maxseg) { len = max_len; sendalot = 1; tso = 0; @@ -7974,9 +9252,11 @@ rack_output(struct tcpcb *tp) uint32_t moff; if (rack->rc_pace_max_segs) - max_val = rack->rc_pace_max_segs * tp->t_maxseg; + max_val = rack->rc_pace_max_segs * ctf_fixed_maxseg(tp); else max_val = len; + if (rack->r_ctl.rc_pace_max_segs < max_val) + max_val = rack->r_ctl.rc_pace_max_segs; /* * We allow a limit on sending with hptsi. */ @@ -8017,9 +9297,17 @@ rack_output(struct tcpcb *tp) msb = NULL; else msb = sb; - m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len, - if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, - hw_tls /*, NULL */); + m->m_next = tcp_m_copym( +#ifdef NETFLIX_COPY_ARGS + tp, +#endif + mb, moff, &len, + if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, + ((rsm == NULL) ? hw_tls : 0) +#ifdef NETFLIX_COPY_ARGS + , &filled_all +#endif + ); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy @@ -8053,8 +9341,6 @@ rack_output(struct tcpcb *tp) * TLP should not count in retran count, but * in its own bin */ -/* tp->t_sndtlppack++;*/ -/* tp->t_sndtlpbyte += len;*/ counter_u64_add(rack_tlp_retran, 1); counter_u64_add(rack_tlp_retran_bytes, len); } else { @@ -8085,7 +9371,7 @@ rack_output(struct tcpcb *tp) flags |= TH_PUSH; /* - * Are we doing hptsi, if so we must calculate the slot. We + * Are we doing pacing, if so we must calculate the slot. We * only do hptsi in ESTABLISHED and with no RESET being * sent where we have data to send. */ @@ -8094,56 +9380,10 @@ rack_output(struct tcpcb *tp) ((tp->t_state == TCPS_FIN_WAIT_1) && ((tp->t_flags & TF_SENTFIN) == 0) && ((flags & TH_FIN) == 0))) && - ((flags & TH_RST) == 0) && - (rack->rc_always_pace)) { - /* - * We use the most optimistic possible cwnd/srtt for - * sending calculations. This will make our - * calculation anticipate getting more through - * quicker then possible. But thats ok we don't want - * the peer to have a gap in data sending. 
- */ - uint32_t srtt, cwnd, tr_perms = 0; - - if (rack->r_ctl.rc_rack_min_rtt) - srtt = rack->r_ctl.rc_rack_min_rtt; - else - srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); - if (rack->r_ctl.rc_rack_largest_cwnd) - cwnd = rack->r_ctl.rc_rack_largest_cwnd; - else - cwnd = tp->snd_cwnd; - tr_perms = cwnd / srtt; - if (tr_perms == 0) { - tr_perms = tp->t_maxseg; - } + ((flags & TH_RST) == 0)) { + /* Get our pacing rate */ tot_len_this_send += len; - /* - * Calculate how long this will take to drain, if - * the calculation comes out to zero, thats ok we - * will use send_a_lot to possibly spin around for - * more increasing tot_len_this_send to the point - * that its going to require a pace, or we hit the - * cwnd. Which in that case we are just waiting for - * a ACK. - */ - slot = tot_len_this_send / tr_perms; - /* Now do we reduce the time so we don't run dry? */ - if (slot && rack->rc_pace_reduce) { - int32_t reduce; - - reduce = (slot / rack->rc_pace_reduce); - if (reduce < slot) { - slot -= reduce; - } else - slot = 0; - } - if (rack->r_enforce_min_pace && - (slot == 0) && - (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { - /* We are enforcing a minimum pace time of 1ms */ - slot = rack->r_enforce_min_pace; - } + slot = rack_get_pacing_delay(rack, tp, tot_len_this_send); } SOCKBUF_UNLOCK(sb); } else { @@ -8180,7 +9420,7 @@ rack_output(struct tcpcb *tp) #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); -#ifdef NETFLIX_TCP_O_UDP +#ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -8188,10 +9428,14 @@ rack_output(struct tcpcb *tp) ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); - } else + } else #endif th = (struct tcphdr *)(ip6 + 1); - tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th); + tcpip_fillheaders(inp, +#ifdef NETFLIX_TCPOUDP + tp->t_port, +#endif + ip6, th); } else #endif /* INET6 */ { @@ -8199,7 +9443,7 @@ rack_output(struct tcpcb *tp) #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif -#ifdef NETFLIX_TCP_O_UDP +#ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -8210,7 +9454,11 @@ rack_output(struct tcpcb *tp) } else #endif th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(inp,/*tp->t_port, */ ip, th); + tcpip_fillheaders(inp, +#ifdef NETFLIX_TCPOUDP + tp->t_port, +#endif + ip, th); } /* * Fill in fields, remembering maximum advertised window for use in @@ -8301,15 +9549,20 @@ rack_output(struct tcpcb *tp) /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. + * If a RST segment is sent, advertise a window of zero. 
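For illustration with assumed values (not part of the patch): with so_rcv.sb_hiwat = 65535 and a fixed maxseg of 1460, a computed recwin of 1000 bytes is below both sb_hiwat/4 (about 16 kB) and one maxseg, so it is rounded down to zero rather than advertising a silly window; recwin is also never allowed to shrink below what was already advertised (rcv_adv - rcv_nxt) and is capped at TCP_MAXWIN << rcv_scale, while a RST advertises zero unconditionally.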
*/ - if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && - recwin < (long)tp->t_maxseg) + if (flags & TH_RST) { recwin = 0; - if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && - recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) - recwin = (long)(tp->rcv_adv - tp->rcv_nxt); - if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) - recwin = (long)TCP_MAXWIN << tp->rcv_scale; + } else { + if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && + recwin < (long)ctf_fixed_maxseg(tp)) + recwin = 0; + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && + recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) + recwin = (long)(tp->rcv_adv - tp->rcv_nxt); + if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) + recwin = (long)TCP_MAXWIN << tp->rcv_scale; + } /* * According to RFC1323 the window field in a SYN (i.e., a or @@ -8376,7 +9629,6 @@ rack_output(struct tcpcb *tp) * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ -#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); @@ -8384,15 +9636,12 @@ rack_output(struct tcpcb *tp) th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { -#endif m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); -#ifdef NETFLIX_TCP_O_UDP } -#endif } #endif #if defined(INET6) && defined(INET) @@ -8400,7 +9649,6 @@ rack_output(struct tcpcb *tp) #endif #ifdef INET { -#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); @@ -8409,28 +9657,24 @@ rack_output(struct tcpcb *tp) th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { -#endif m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); -#ifdef NETFLIX_TCP_O_UDP } -#endif /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif - /* * Enable TSO and specify the size of the segments. The TCP pseudo * header checksum is always provided. XXX: Fixme: This is currently * not the case for IPv6. */ - if (tso) { - KASSERT(len > tp->t_maxseg - optlen, + if (tso || force_tso) { + KASSERT(force_tso || len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; @@ -8443,7 +9687,6 @@ rack_output(struct tcpcb *tp) /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif - #ifdef TCPDEBUG /* * Trace. @@ -8470,18 +9713,29 @@ rack_output(struct tcpcb *tp) /* We're getting ready to send; log now. 
*/ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; + log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex4 = orig_len; + if (filled_all) + log.u_bbr.flex5 = 0x80000000; + else + log.u_bbr.flex5 = 0; if (rsm || sack_rxmit) { log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } + log.u_bbr.pkts_out = tp->t_maxseg; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, - len, &log, false, NULL, NULL, 0, NULL); + len, &log, false, NULL, NULL, 0, &tv); } else lgb = NULL; @@ -8585,7 +9839,7 @@ rack_output(struct tcpcb *tp) if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) - tcp_clean_dsack_blocks(tp); + tcp_clean_dsack_blocks(tp); if (len == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (len == 1) { @@ -8593,12 +9847,38 @@ rack_output(struct tcpcb *tp) } else if (len > 1) { int idx; - idx = (len / tp->t_maxseg) + 3; + idx = (len / ctf_fixed_maxseg(tp)) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); else counter_u64_add(rack_out_size[idx], 1); } + if (hw_tls && len > 0) { + if (filled_all) { + counter_u64_add(rack_tls_filled, 1); + rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1); + } else { + if (rsm) { + counter_u64_add(rack_tls_rxt, 1); + rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1); + } else if (doing_tlp) { + counter_u64_add(rack_tls_tlp, 1); + rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1); + } else if ( (ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > sbavail(sb)) { + counter_u64_add(rack_tls_app, 1); + rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1); + } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + rack->r_ctl.rc_pace_min_segs) > tp->snd_cwnd) { + counter_u64_add(rack_tls_cwnd, 1); + rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1); + } else if ((ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > tp->snd_wnd) { + counter_u64_add(rack_tls_rwnd, 1); + rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1); + } else { + rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1); + counter_u64_add(rack_tls_other, 1); + } + } + } } if (sub_from_prr && (error == 0)) { if (rack->r_ctl.rc_prr_sndcnt >= len) @@ -8609,17 +9889,20 @@ rack_output(struct tcpcb *tp) sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, pass, rsm); + if ((error == 0) && + (len > 0) && + (tp->snd_una == tp->snd_max)) + rack->r_ctl.rc_tlp_rxt_last_time = cts; if ((tp->t_flags & TF_FORCEDATA) == 0 || (rack->rc_in_persist == 0)) { -#ifdef NETFLIX_STATS tcp_seq startseq = tp->snd_nxt; -#endif + /* * Advance snd_nxt over sequence space of this segment. 
*/ if (error) /* We don't log or do anything with errors */ - goto timer; + goto nomore; if (flags & (TH_SYN | TH_FIN)) { if (flags & TH_SYN) @@ -8631,7 +9914,7 @@ rack_output(struct tcpcb *tp) } /* In the ENOBUFS case we do *not* update snd_max */ if (sack_rxmit) - goto timer; + goto nomore; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { @@ -8644,6 +9927,17 @@ rack_output(struct tcpcb *tp) tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + * This is only relevant in case of switching back to + * the base stack. + */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + TCPSTAT_INC(tcps_segstimed); + } #ifdef NETFLIX_STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; @@ -8654,26 +9948,6 @@ rack_output(struct tcpcb *tp) } #endif } - /* - * Set retransmit timer if not currently set, and not doing - * a pure ack or a keep-alive probe. Initial value for - * retransmit timer is smoothed round-trip time + 2 * - * round-trip time variance. Initialize shift counter which - * is used for backoff of retransmit time. - */ -timer: - if ((tp->snd_wnd == 0) && - TCPS_HAVEESTABLISHED(tp->t_state)) { - /* - * If the persists timer was set above (right before - * the goto send), and still needs to be on. Lets - * make sure all is canceled. If the persist timer - * is not running, we want to get it up. - */ - if (rack->rc_in_persist == 0) { - rack_enter_persist(tp, rack, cts); - } - } } else { /* * Persist case, update snd_max but since we are in persist @@ -8755,7 +10029,7 @@ rack_output(struct tcpcb *tp) goto again; } slot = 10; - rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); + rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); tp->t_flags &= ~TF_FORCEDATA; return (error); case ENETUNREACH: @@ -8769,7 +10043,7 @@ rack_output(struct tcpcb *tp) /* FALLTHROUGH */ default: slot = 10; - rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); + rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); tp->t_flags &= ~TF_FORCEDATA; return (error); } @@ -8789,15 +10063,22 @@ rack_output(struct tcpcb *tp) tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: rack->r_tlp_running = 0; - if ((flags & TH_RST) || (would_have_fin == 1)) { + if (flags & TH_RST) { /* - * We don't send again after a RST. We also do *not* send - * again if we would have had a find, but now have - * outstanding data. + * We don't send again after sending a RST. */ slot = 0; sendalot = 0; } + if (rsm && (slot == 0)) { + /* + * Dup ack retransmission possibly, so + * lets assure we have at least min rack + * time, if its a rack resend then the rack + * to will also be set to this. 
+ */ + slot = rack->r_ctl.rc_min_to; + } if (slot) { /* set the rack tcb into the slot N */ counter_u64_add(rack_paced_segments, 1); @@ -8811,7 +10092,7 @@ rack_output(struct tcpcb *tp) counter_u64_add(rack_unpaced_segments, 1); } tp->t_flags &= ~TF_FORCEDATA; - rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); + rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); return (error); } @@ -8847,8 +10128,10 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, case TCP_RACK_TLP_INC_VAR: case TCP_RACK_IDLE_REDUCE_HIGH: case TCP_RACK_MIN_PACE: - case TCP_RACK_MIN_PACE_SEG: + case TCP_RACK_GP_INCREASE: case TCP_BBR_RACK_RTT_USE: + case TCP_BBR_USE_RACK_CHEAT: + case TCP_RACK_DO_DETECTION: case TCP_DATA_AFTER_CLOSE: break; default: @@ -8867,6 +10150,13 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, tp = intotcpcb(inp); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (sopt->sopt_name) { + case TCP_RACK_DO_DETECTION: + RACK_OPTS_INC(tcp_rack_no_sack); + if (optval == 0) + rack->do_detection = 0; + else + rack->do_detection = 1; + break; case TCP_RACK_PROP_RATE: if ((optval <= 0) || (optval >= 100)) { error = EINVAL; @@ -8919,6 +10209,7 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, /* Max segments in a pace */ RACK_OPTS_INC(tcp_rack_max_seg); rack->rc_pace_max_segs = optval; + rack_set_pace_segments(tp, rack); break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ @@ -8956,6 +10247,13 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, else error = EINVAL; break; + case TCP_BBR_USE_RACK_CHEAT: + RACK_OPTS_INC(tcp_rack_cheat); + if (optval) + rack->use_rack_cheat = 1; + else + rack->use_rack_cheat = 0; + break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. rack-rtt + reord + N */ RACK_OPTS_INC(tcp_rack_pkt_delay); @@ -8963,15 +10261,10 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ - RACK_OPTS_INC(tcp_rack_tlp_inc_var); - rack->r_ctl.rc_prr_inc_var = optval; + return (EINVAL); break; case TCP_RACK_IDLE_REDUCE_HIGH: - RACK_OPTS_INC(tcp_rack_idle_reduce_high); - if (optval) - rack->r_idle_reduce_largest = 1; - else - rack->r_idle_reduce_largest = 0; + return (EINVAL); break; case TCP_DELACK: if (optval == 0) @@ -8991,12 +10284,13 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, else rack->r_enforce_min_pace = optval; break; - case TCP_RACK_MIN_PACE_SEG: - RACK_OPTS_INC(tcp_rack_min_pace_seg); - if (optval >= 16) - rack->r_min_pace_seg_thresh = 15; + case TCP_RACK_GP_INCREASE: + if ((optval >= 0) && + (optval <= 256)) + rack->rack_per_of_gp = optval; else - rack->r_min_pace_seg_thresh = optval; + error = EINVAL; + break; case TCP_BBR_RACK_RTT_USE: if ((optval != USE_RTT_HIGH) && @@ -9016,7 +10310,9 @@ rack_set_sockopt(struct socket *so, struct sockopt *sopt, return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } -/* tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/ +#ifdef NETFLIX_STATS + tcp_log_socket_option(tp, sopt->sopt_name, optval, error); +#endif INP_WUNLOCK(inp); return (error); } @@ -9034,6 +10330,10 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, * impact to this routine. 
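As a minimal usage sketch (not part of the patch) for the per-connection knobs handled above: it assumes the TCP_RACK_DO_DETECTION, TCP_BBR_USE_RACK_CHEAT and TCP_RACK_GP_INCREASE constants are exported to userland via <netinet/tcp.h> and that the connection has already been switched to the rack stack; options this stack does not handle fall through to tcp_default_ctloutput() as the default case shows.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Illustrative only: enable sack-attack detection and the rack
 * retransmit "cheat", and raise the goodput-based pacing gain. */
static int
rack_tune(int fd)
{
	int on = 1;
	int gp = 50;	/* accepted range is 0..256 per the set handler above */

	if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_DO_DETECTION, &on, sizeof(on)) == -1)
		return (-1);
	if (setsockopt(fd, IPPROTO_TCP, TCP_BBR_USE_RACK_CHEAT, &on, sizeof(on)) == -1)
		return (-1);
	return (setsockopt(fd, IPPROTO_TCP, TCP_RACK_GP_INCREASE, &gp, sizeof(gp)));
}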
*/ switch (sopt->sopt_name) { + case TCP_RACK_DO_DETECTION: + optval = rack->do_detection; + break; + case TCP_RACK_PROP_RATE: optval = rack->r_ctl.rc_prop_rate; break; @@ -9081,6 +10381,10 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, /* Does reordering fade after ms time */ optval = rack->r_ctl.rc_reorder_fade; break; + case TCP_BBR_USE_RACK_CHEAT: + /* Do we use the rack cheat for rxt */ + optval = rack->use_rack_cheat; + break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ optval = rack->r_ctl.rc_tlp_threshold; @@ -9094,16 +10398,16 @@ rack_get_sockopt(struct socket *so, struct sockopt *sopt, break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ - optval = rack->r_ctl.rc_prr_inc_var; + return (EINVAL); break; case TCP_RACK_IDLE_REDUCE_HIGH: - optval = rack->r_idle_reduce_largest; + return (EINVAL); break; case TCP_RACK_MIN_PACE: optval = rack->r_enforce_min_pace; break; - case TCP_RACK_MIN_PACE_SEG: - optval = rack->r_min_pace_seg_thresh; + case TCP_RACK_GP_INCREASE: + optval = rack->rack_per_of_gp; break; case TCP_BBR_RACK_RTT_USE: optval = rack->r_ctl.rc_rate_sample_method; @@ -9145,9 +10449,11 @@ rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struc } -struct tcp_function_block __tcp_rack = { +static struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, + .tfb_do_queued_segments = ctf_do_queued_segments, + .tfb_do_segment_nounlock = rack_do_segment_nounlock, .tfb_tcp_do_segment = rack_do_segment, .tfb_tcp_ctloutput = rack_ctloutput, .tfb_tcp_fb_init = rack_init, @@ -9202,7 +10508,11 @@ tcp_addrack(module_t mod, int32_t type, void *data) rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp), OID_AUTO, +#ifdef STACKALIAS + __XSTRING(STACKALIAS), +#else __XSTRING(STACKNAME), +#endif CTLFLAG_RW, 0, ""); if (rack_sysctl_root == NULL) { @@ -9226,6 +10536,7 @@ tcp_addrack(module_t mod, int32_t type, void *data) printf("Failed to register rack module -- err:%d\n", err); return (err); } + tcp_lro_reg_mbufq(); rack_mod_inited = true; break; case MOD_QUIESCE: @@ -9242,6 +10553,7 @@ tcp_addrack(module_t mod, int32_t type, void *data) rack_counter_destroy(); rack_mod_inited = false; } + tcp_lro_dereg_mbufq(); err = 0; break; default: diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c index 96553320cd97..a61c8c4eedf1 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.c +++ b/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -40,7 +40,7 @@ __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" -/*#include "opt_kern_tls.h"*/ +#include "opt_kern_tls.h" #include #include #include @@ -50,20 +50,25 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #ifdef KERN_TLS -#include +#include #endif #include #include #include +#ifdef NETFLIX_STATS +#include /* Must come after qmath.h and tree.h */ +#endif #include #include #include #include #include #include +#include #include #include #include @@ -85,6 +90,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#define TCPOUTFLAGS #include #include #include @@ -133,14 +139,14 @@ __FBSDID("$FreeBSD$"); uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) { - struct sbtls_info *tls; + struct ktls_session *tls; uint32_t len; again: tls = so->so_snd.sb_tls_info; - len = tls->sb_params.sb_maxlen; /* max tls payload */ - len += tls->sb_params.sb_tls_hlen; /* 
tls header len */ - len += tls->sb_params.sb_tls_tlen; /* tls trailer len */ + len = tls->params.max_frame_len; /* max tls payload */ + len += tls->params.tls_hlen; /* tls header len */ + len += tls->params.tls_tlen; /* tls trailer len */ if ((len * 4) > rwnd) { /* * Stroke this will suck counter and what @@ -148,10 +154,10 @@ ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) * TCP perspective I am not sure * what should be done... */ - if (tls->sb_params.sb_maxlen > 4096) { - tls->sb_params.sb_maxlen -= 4096; - if (tls->sb_params.sb_maxlen < 4096) - tls->sb_params.sb_maxlen = 4096; + if (tls->params.max_frame_len > 4096) { + tls->params.max_frame_len -= 4096; + if (tls->params.max_frame_len < 4096) + tls->params.max_frame_len = 4096; goto again; } } @@ -414,7 +420,13 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int * have been called (if we can). */ m->m_pkthdr.lro_nsegs = 1; - tcp_get_usecs(&tv); + if (m->m_flags & M_TSTMP_LRO) { + tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; + tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } else { + /* Should not be should we kassert instead? */ + tcp_get_usecs(&tv); + } /* Now what about next packet? */ if (m_save || has_pkt) nxt_pkt = 1; @@ -425,7 +437,7 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int if (retval) { /* We lost the lock and tcb probably */ m = m_save; - while (m) { + while(m) { m_save = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); @@ -434,7 +446,7 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int if (no_vn == 0) CURVNET_RESTORE(); INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - return (retval); + return(retval); } skipped_pkt: m = m_save; @@ -442,7 +454,7 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int if (no_vn == 0) CURVNET_RESTORE(); INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - return (retval); + return(retval); } int @@ -457,7 +469,7 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt) tp->t_tail_pkt = NULL; if (ctf_process_inbound_raw(tp, so, m, have_pkt)) { /* We lost the tcpcb (maybe a RST came in)? */ - return (1); + return(1); } } return (0); @@ -466,14 +478,14 @@ ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt) uint32_t ctf_outstanding(struct tcpcb *tp) { - return (tp->snd_max - tp->snd_una); + return(tp->snd_max - tp->snd_una); } uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked) { if (rc_sacked <= ctf_outstanding(tp)) - return (ctf_outstanding(tp) - rc_sacked); + return(ctf_outstanding(tp) - rc_sacked); else { /* TSNH */ #ifdef INVARIANTS @@ -908,5 +920,5 @@ ctf_decay_count(uint32_t count, uint32_t decay) * count decay value. */ decayed_count = count - (uint32_t)perc_count; - return (decayed_count); + return(decayed_count); } diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.h b/sys/netinet/tcp_stacks/rack_bbr_common.h index 822208338d67..6cb2fed7c2fa 100644 --- a/sys/netinet/tcp_stacks/rack_bbr_common.h +++ b/sys/netinet/tcp_stacks/rack_bbr_common.h @@ -1,7 +1,7 @@ #ifndef __pacer_timer_h__ #define __pacer_timer_h__ /*- - * Copyright (c) 2017 Netflix, Inc. + * Copyright (c) 2017-9 Netflix, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c index 2ef0eadfa944..c4b35d5b8ca8 100644 --- a/sys/netinet/tcp_stacks/sack_filter.c +++ b/sys/netinet/tcp_stacks/sack_filter.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2017 Netflix, Inc. + * Copyright (c) 2017-9 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -140,6 +140,7 @@ static int32_t is_sack_on_board(struct sack_filter *sf, struct sackblk *b) { int32_t i, cnt; + for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) { if (sack_blk_used(sf, i)) { if (SEQ_LT(b->start, sf->sf_ack)) { @@ -150,8 +151,9 @@ is_sack_on_board(struct sack_filter *sf, struct sackblk *b) /* End back behind too */ b->end = sf->sf_ack; } - if (b->start == b->end) + if (b->start == b->end) { return(1); + } /* Jonathans Rule 1 */ if (SEQ_LEQ(sf->sf_blks[i].start, b->start) && SEQ_GEQ(sf->sf_blks[i].end, b->end)) { @@ -312,21 +314,22 @@ sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq if (num == 0) return(num); - /* Now what we are left is either + /* Now what we are left with is either * completely merged on to the board - * from the above steps, or are new + * from the above steps, or is new * and need to be added to the board * with the last one updated to current. * - * First copy it out we want to return that + * First copy it out, we want to return that * to our caller for processing. */ memcpy(in, blkboard, (num * sizeof(struct sackblk))); numblks = num; /* Now go through and add to our board as needed */ for(i=(num-1); i>=0; i--) { - if (is_sack_on_board(sf, &blkboard[i])) + if (is_sack_on_board(sf, &blkboard[i])) { continue; + } /* Add this guy its not listed */ sf->sf_cur++; sf->sf_cur %= SACK_FILTER_BLOCKS; @@ -462,65 +465,6 @@ sack_board_collapse(struct sack_filter *sf) } } -#ifndef _KERNEL -static -#endif -int -sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) -{ - int32_t i, ret; - - if (numblks > TCP_MAX_SACK) { - panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n", - sf, in, - numblks); - return(numblks); - } - if ((sf->sf_used == 0) && numblks) { - /* - * We are brand new add the blocks in - * reverse order. Note we can see more - * than one in new, since ack's could be lost. 
- */ - sf->sf_ack = th_ack; - for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) { - memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); - sf->sf_bits = sack_blk_set(sf, sf->sf_cur); - sf->sf_cur++; - sf->sf_cur %= SACK_FILTER_BLOCKS; - sf->sf_used++; -#ifndef _KERNEL - if (sf->sf_used > highest_used) - highest_used = sf->sf_used; -#endif - } - if (sf->sf_cur) - sf->sf_cur--; - return(numblks); - } - if (SEQ_GT(th_ack, sf->sf_ack)) { - sack_filter_prune(sf, th_ack); - } - if (numblks) { - if (SEQ_GEQ(th_ack, sf->sf_ack)) { - ret = sack_filter_new(sf, in, numblks, th_ack); - } else { - ret = sack_filter_old(sf, in, numblks); - } - } else - ret = 0; -#ifndef _KERNEL - if ((sf->sf_used > 1) && (no_collapse == 0)) - sack_board_collapse(sf); - -#else - if (sf->sf_used > 1) - sack_board_collapse(sf); - -#endif - return (ret); -} - #ifndef _KERNEL uint64_t saved=0; uint64_t tot_sack_blks=0; @@ -541,6 +485,133 @@ sack_filter_dump(FILE *out, struct sack_filter *sf) } } } +#endif + +#ifndef _KERNEL +static +#endif +int +sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, + tcp_seq th_ack) +{ + int32_t i, ret; + + if (numblks > TCP_MAX_SACK) { +#ifdef _KERNEL + panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n", + sf, in, + numblks); +#endif + return(numblks); + } +#ifndef _KERNEL + if ((sf->sf_used > 1) && (no_collapse == 0)) + sack_board_collapse(sf); + +#else + if (sf->sf_used > 1) + sack_board_collapse(sf); +#endif + if ((sf->sf_used == 0) && numblks) { + /* + * We are brand new add the blocks in + * reverse order. Note we can see more + * than one in new, since ack's could be lost. + */ + int cnt_added = 0; + + sf->sf_ack = th_ack; + for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) { + memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); + sf->sf_bits = sack_blk_set(sf, sf->sf_cur); + sf->sf_cur++; + sf->sf_cur %= SACK_FILTER_BLOCKS; + sf->sf_used++; + cnt_added++; +#ifndef _KERNEL + if (sf->sf_used > highest_used) + highest_used = sf->sf_used; +#endif + } + if (sf->sf_cur) + sf->sf_cur--; + + return (cnt_added); + } + if (SEQ_GT(th_ack, sf->sf_ack)) { + sack_filter_prune(sf, th_ack); + } + if (numblks) { + if (SEQ_GEQ(th_ack, sf->sf_ack)) { + ret = sack_filter_new(sf, in, numblks, th_ack); + } else { + ret = sack_filter_old(sf, in, numblks); + } + } else + ret = 0; + return (ret); +} + +void +sack_filter_reject(struct sack_filter *sf, struct sackblk *in) +{ + /* + * Given a specified block (that had made + * it past the sack filter). Reject that + * block triming it off any sack-filter block + * that has it. Usually because the block was + * too small and did not cover a whole send. + * + * This function will only "undo" sack-blocks + * that are fresh and touch the edges of + * blocks in our filter. 
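For illustration (assumed numbers, not part of the patch): if the filter currently remembers [1000, 2000] and the caller rejects in = [1500, 2000], the end edges match and in->start lies inside the remembered block, so that entry is trimmed back to [1000, 1500]; rejecting [900, 2000] instead covers the whole remembered block, so its slot is cleared and sf_used is decremented. A rejected block whose edges do not coincide with any remembered block leaves the filter unchanged.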
+ */ + int i; + + for(i=0; isf_blks[i].end == in->end) { + /* The end moves back to start */ + if (SEQ_GT(in->start, sf->sf_blks[i].start)) + /* in-blk |----| */ + /* sf-blk |---------| */ + sf->sf_blks[i].end = in->start; + else { + /* It consumes this block */ + /* in-blk |---------| */ + /* sf-blk |------| */ + /* */ + /* sf-blk |---------| */ + sf->sf_bits = sack_blk_clr(sf, i); + sf->sf_used--; + } + continue; + } + if (sf->sf_blks[i].start == in->start) { + if (SEQ_LT(in->end, sf->sf_blks[i].end)) { + /* in-blk |----| */ + /* sf-blk |---------| */ + sf->sf_blks[i].start = in->end; + } else { + /* It consumes this block */ + /* in-blk |----------| */ + /* sf-blk |-------| */ + /* */ + /* sf-blk |----------| */ + sf->sf_bits = sack_blk_clr(sf, i); + sf->sf_used--; + } + continue; + } + } +} + +#ifndef _KERNEL int main(int argc, char **argv) @@ -548,12 +619,12 @@ main(int argc, char **argv) char buffer[512]; struct sackblk blks[TCP_MAX_SACK]; FILE *err; - tcp_seq th_ack, snd_una; + tcp_seq th_ack, snd_una, snd_max = 0; struct sack_filter sf; int32_t numblks,i; int snd_una_set=0; double a, b, c; - int invalid_sack_print = 0; + int invalid_sack_print = 0; uint32_t chg_remembered=0; uint32_t sack_chg=0; char line_buf[10][256]; @@ -604,7 +675,11 @@ main(int argc, char **argv) line_buf_at++; if (strncmp(buffer, "QUIT", 4) == 0) { break; - } else if (strncmp(buffer, "DONE", 4) == 0) { + } else if (strncmp(buffer, "DUMP", 4) == 0) { + sack_filter_dump(out, &sf); + } else if (strncmp(buffer, "MAX:", 4) == 0) { + snd_max = strtoul(&buffer[4], NULL, 0); + } else if (strncmp(buffer, "COMMIT", 6) == 0) { int nn, ii; if (numblks) { uint32_t szof, tot_chg; @@ -660,6 +735,7 @@ main(int argc, char **argv) char *end=NULL; uint32_t start; uint32_t endv; + start = strtoul(&buffer[5], &end, 0); if (end) { endv = strtoul(&end[1], NULL, 0); @@ -667,6 +743,8 @@ main(int argc, char **argv) fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start); continue; } + if (SEQ_GT(endv, snd_max)) + snd_max = endv; if (SEQ_LT(endv, start)) { fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start); continue; @@ -678,6 +756,28 @@ main(int argc, char **argv) blks[numblks].start = start; blks[numblks].end = endv; numblks++; + } else if (strncmp(buffer, "REJ:n:n", 4) == 0) { + struct sackblk in; + char *end=NULL; + + in.start = strtoul(&buffer[4], &end, 0); + if (end) { + in.end = strtoul(&end[1], NULL, 0); + sack_filter_reject(&sf, &in); + } else + fprintf(out, "Invalid input END:A:B\n"); + } else if (strncmp(buffer, "HELP", 4) == 0) { + fprintf(out, "You can input:\n"); + fprintf(out, "SACK:S:E -- to define a sack block\n"); + fprintf(out, "RXT -- to clear the filter without changing the remembered\n"); + fprintf(out, "EXIT -- To clear the sack filter and start all fresh\n"); + fprintf(out, "ACK:N -- To advance the cum-ack to N\n"); + fprintf(out, "MAX:N -- To set send-max to N\n"); + fprintf(out, "COMMIT -- To apply the sack you built to the filter and dump the filter\n"); + fprintf(out, "DUMP -- To display the current contents of the sack filter\n"); + fprintf(out, "QUIT -- To exit this program\n"); + } else { + fprintf(out, "Command %s unknown\n", buffer); } memset(buffer, 0, sizeof(buffer)); } diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h index 3ef986209566..2d01a0c15471 100644 --- a/sys/netinet/tcp_stacks/sack_filter.h +++ b/sys/netinet/tcp_stacks/sack_filter.h @@ -1,7 +1,7 @@ #ifndef __sack_filter_h__ #define __sack_filter_h__ /*- - * Copyright (c) 2017 
Netflix, Inc. + * Copyright (c) 2017-9 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -50,7 +50,8 @@ struct sack_filter { }; #ifdef _KERNEL void sack_filter_clear(struct sack_filter *sf, tcp_seq seq); -int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack); - +int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, + tcp_seq th_ack); +void sack_filter_reject(struct sack_filter *sf, struct sackblk *in); #endif #endif diff --git a/sys/netinet/tcp_stacks/tcp_bbr.h b/sys/netinet/tcp_stacks/tcp_bbr.h new file mode 100644 index 000000000000..f09e25a18390 --- /dev/null +++ b/sys/netinet/tcp_stacks/tcp_bbr.h @@ -0,0 +1,845 @@ +/*- + * Copyright (c) 2016-9 + * Netflix Inc. All rights reserved. + * Author Randall R. Stewart + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_BBR_H_ +#define _NETINET_TCP_BBR_H_ + +#define BBR_INITIAL_RTO 1000000 /* 1 second in micro-seconds */ +/* Send map flags */ +#define BBR_ACKED 0x0001 /* The remote endpoint acked this */ +#define BBR_WAS_RENEGED 0x0002 /* The peer reneged the ack */ +#define BBR_RXT_CLEARED 0x0004 /* ACK Cleared by the RXT timer */ +#define BBR_OVERMAX 0x0008 /* We have more retran's then we can + * fit */ +#define BBR_SACK_PASSED 0x0010 /* A sack was done above this block */ +#define BBR_WAS_SACKPASS 0x0020 /* We retransmitted due to SACK pass */ +#define BBR_HAS_FIN 0x0040 /* segment is sent with fin */ +#define BBR_TLP 0x0080 /* segment sent as tail-loss-probe */ +#define BBR_HAS_SYN 0x0100 /* segment has the syn */ +#define BBR_MARKED_LOST 0x0200 /* + * This segments is lost and + * totaled into bbr->rc_ctl.rc_lost + */ +#define BBR_RWND_COLLAPSED 0x0400 /* The peer collapsed the rwnd on the segment */ +#define BBR_NUM_OF_RETRANS 7 + +/* Defines for socket options to set pacing overheads */ +#define BBR_INCL_ENET_OH 0x01 +#define BBR_INCL_IP_OH 0x02 +#define BBR_INCL_TCP_OH 0x03 + +/* + * With the addition of both measurement algorithms + * I had to move over the size of a + * cache line (unfortunately). For now there is + * no way around this. We may be able to cut back + * at some point I hope. 
+ */ +struct bbr_sendmap { + TAILQ_ENTRY(bbr_sendmap) r_next; /* seq number arrayed next */ + TAILQ_ENTRY(bbr_sendmap) r_tnext; /* Time of tmit based next */ + uint32_t r_start; /* Sequence number of the segment */ + uint32_t r_end; /* End seq, this is 1 beyond actually */ + + uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ + uint32_t r_delivered; /* Delivered amount at send */ + + uint32_t r_del_time; /* The time of the last delivery update */ + uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time + * sent */ + unused_bit:1, + r_is_drain:1, /* In a draining cycle */ + r_app_limited:1,/* We went app limited */ + r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */ + uint8_t r_dupack; /* Dup ack count */ + uint8_t r_in_tmap:1, /* Flag to see if its in the r_tnext array */ + r_is_smallmap:1,/* Was logged as a small-map send-map item */ + r_is_gain:1, /* Was in gain cycle */ + r_bbr_state:5; /* The BBR state at send */ + uint8_t r_limit_type; /* is this entry counted against a limit? */ + + uint16_t r_flags; /* Flags as defined above */ + uint16_t r_spare16; + uint32_t r_del_ack_ts; /* At send what timestamp of peer was (if r_ts_valid set) */ + /****************Cache line*****************/ + uint32_t r_tim_lastsent[BBR_NUM_OF_RETRANS]; + /* + * Question, should we instead just grab the sending b/w + * from the filter with the gain and store it in a + * uint64_t instead? + */ + uint32_t r_first_sent_time; /* Time of first pkt in flight sent */ + uint32_t r_pacing_delay; /* pacing delay of this send */ + uint32_t r_flight_at_send; /* flight at the time of the send */ +#ifdef _KERNEL +} __aligned(CACHE_LINE_SIZE); +#else +}; +#endif +#define BBR_LIMIT_TYPE_SPLIT 1 + +TAILQ_HEAD(bbr_head, bbr_sendmap); + +#define BBR_SEGMENT_TIME_SIZE 1500 /* How many bytes in time_between */ + +#define BBR_MIN_SEG 1460 /* MSS size */ +#define BBR_MAX_GAIN_VALUE 0xffff + +#define BBR_TIMER_FUDGE 1500 /* 1.5ms in micro seconds */ + +/* BW twiddle secret codes */ +#define BBR_RED_BW_CONGSIG 0 /* We enter recovery and set using b/w */ +#define BBR_RED_BW_RATECAL 1 /* We are calculating the loss rate */ +#define BBR_RED_BW_USELRBW 2 /* We are dropping the lower b/w with + * cDR */ +#define BBR_RED_BW_SETHIGHLOSS 3 /* We have set our highloss value at + * exit from probe-rtt */ +#define BBR_RED_BW_PE_CLREARLY 4 /* We have decided to clear the + * reduction early */ +#define BBR_RED_BW_PE_CLAFDEL 5 /* We are clearing it on schedule + * delayed */ +#define BBR_RED_BW_REC_ENDCLL 6 /* Recover exits save high if needed + * an clear to start measuring */ +#define BBR_RED_BW_PE_NOEARLY_OUT 7 /* Set pkt epoch judged that we do not + * get out of jail early */ +/* codes for just-return */ +#define BBR_JR_SENT_DATA 0 +#define BBR_JR_CWND_LIMITED 1 +#define BBR_JR_RWND_LIMITED 2 +#define BBR_JR_APP_LIMITED 3 +#define BBR_JR_ASSESSING 4 +/* For calculating a rate */ +#define BBR_CALC_BW 1 +#define BBR_CALC_LOSS 2 + +#define BBR_RTT_BY_TIMESTAMP 0 +#define BBR_RTT_BY_EXACTMATCH 1 +#define BBR_RTT_BY_EARLIER_RET 2 +#define BBR_RTT_BY_THIS_RETRAN 3 +#define BBR_RTT_BY_SOME_RETRAN 4 +#define BBR_RTT_BY_TSMATCHING 5 + +/* Markers to track where we enter persists from */ +#define BBR_PERSISTS_FROM_1 1 +#define BBR_PERSISTS_FROM_2 2 +#define BBR_PERSISTS_FROM_3 3 +#define BBR_PERSISTS_FROM_4 4 +#define BBR_PERSISTS_FROM_5 5 + +/* magic cookies to ask for the RTT */ +#define BBR_RTT_PROP 0 +#define BBR_RTT_RACK 1 +#define BBR_RTT_PKTRTT 2 +#define BBR_SRTT 3 + +#define BBR_SACKED 0 +#define 
BBR_CUM_ACKED 1 + +/* threshold in useconds where we consider we need a higher min cwnd */ +#define BBR_HIGH_SPEED 1000 +#define BBR_HIGHSPEED_NUM_MSS 12 + +#define MAX_REDUCE_RXT 3 /* What is the maximum times we are willing to + * reduce b/w in RTX's. Setting this has a + * multiplicative effect e.g. if we are + * reducing by 20% then setting it to 3 means + * you will have reduced the b/w estimate by > + * 60% before you stop. */ +/* + * We use the rate sample structure to + * assist in single sack/ack rate and rtt + * calculation. In the future we will expand + * this in BBR to do forward rate sample + * b/w estimation. + */ +#define BBR_RS_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */ +#define BBR_RS_BW_EMPTY 0x00000002 /* Nothing yet stored in cDR */ +#define BBR_RS_RTT_VALID 0x00000004 /* We have at least one valid RTT */ +#define BBR_RS_BW_VAILD 0x00000008 /* We have a valid cDR */ +#define BBR_RS_EMPTY (BBR_RS_RTT_EMPTY|BBR_RS_BW_EMPTY) +struct bbr_rtt_sample { + uint32_t rs_flags; + uint32_t rs_rtt_lowest; + uint32_t rs_rtt_lowest_sendtime; + uint32_t rs_rtt_low_seq_start; + + uint32_t rs_rtt_highest; + uint32_t rs_rtt_cnt; + + uint64_t rs_rtt_tot; + uint32_t cur_rtt; + uint32_t cur_rtt_bytecnt; + + uint32_t cur_rtt_rsmcnt; + uint32_t rc_crtt_set:1, + avail_bits:31; + uint64_t rs_cDR; +}; + +/* RTT shrink reasons */ +#define BBR_RTTS_INIT 0 +#define BBR_RTTS_NEWRTT 1 +#define BBR_RTTS_RTTPROBE 2 +#define BBR_RTTS_WASIDLE 3 +#define BBR_RTTS_PERSIST 4 +#define BBR_RTTS_REACHTAR 5 +#define BBR_RTTS_ENTERPROBE 6 +#define BBR_RTTS_SHRINK_PG 7 +#define BBR_RTTS_SHRINK_PG_FINAL 8 +#define BBR_RTTS_NEW_TARGET 9 +#define BBR_RTTS_LEAVE_DRAIN 10 +#define BBR_RTTS_RESETS_VALUES 11 + +#define BBR_NUM_RATES 5 +/* Rate flags */ +#define BBR_RT_FLAG_FREE 0x00 /* Is on the free list */ +#define BBR_RT_FLAG_INUSE 0x01 /* Has been allocated */ +#define BBR_RT_FLAG_READY 0x02 /* Ready to initiate a measurement. */ +#define BBR_RT_FLAG_CAPPED_PRE 0x04 /* Ready to cap if we send the next segment */ +#define BBR_RT_FLAG_CAPPED 0x08 /* Measurement is capped */ +#define BBR_RT_FLAG_PASTFA 0x10 /* Past the first ack. 
*/ +#define BBR_RT_FLAG_LIMITED 0x20 /* Saw application/cwnd or rwnd limited period */ +#define BBR_RT_SEEN_A_ACK 0x40 /* A ack has been saved */ +#define BBR_RT_PREV_RTT_SET 0x80 /* There was a RTT set in */ +#define BBR_RT_PREV_SEND_TIME 0x100 /* + *There was a RTT send time set that can be used + * no snd_limits + */ +#define BBR_RT_SET_GRADIENT 0x200 +#define BBR_RT_TS_VALID 0x400 + + +struct bbr_log { + union { + struct bbr_sendmap *rsm; /* For alloc/free */ + uint64_t sb_acc; /* For out/ack or t-o */ + }; + struct tcpcb *tp; + uint32_t t_flags; + uint32_t th_seq; + uint32_t th_ack; + uint32_t snd_una; + uint32_t snd_nxt; + uint32_t snd_max; + uint32_t snd_cwnd; + uint32_t snd_wnd; + uint32_t rc_lost; + uint32_t target_cwnd; /* UU */ + uint32_t inflight; /* UU */ + uint32_t applimited; /* UU */ + /* Things for BBR */ + uint32_t delivered; /* UU */ + uint64_t cur_del_rate; /* UU */ + uint64_t delRate; /* UU */ + uint64_t rttProp; /* UU */ + uint64_t lt_bw; /* UU */ + uint32_t timeStamp; + uint32_t time; + uint32_t slot; /* UU */ + uint32_t delayed_by; + uint32_t exp_del; + uint32_t pkts_out; + uint32_t new_win; + uint32_t hptsi_gain; /* UU */ + uint32_t cwnd_gain; /* UU */ + uint32_t epoch; /* UU */ + uint32_t lt_epoch; /* UU */ + /* Sack fun */ + uint32_t blk_start[4]; /* xx */ + uint32_t blk_end[4]; + uint32_t len; /* Timeout T3=1, TLP=2, RACK=3 */ + uint8_t type; + uint8_t n_sackblks; + uint8_t applied; /* UU */ + uint8_t inhpts; /* UU */ + uint8_t ininput; /* UU */ + uint8_t use_lt_bw; /* UU */ +}; + +struct bbr_log_sysctl_out { + uint32_t bbr_log_at; + uint32_t bbr_log_max; + struct bbr_log entries[0]; +}; + +/* + * Magic numbers for logging timeout events if the + * logging is enabled. + */ +#define BBR_TO_FRM_TMR 1 +#define BBR_TO_FRM_TLP 2 +#define BBR_TO_FRM_RACK 3 +#define BBR_TO_FRM_KEEP 4 +#define BBR_TO_FRM_PERSIST 5 +#define BBR_TO_FRM_DELACK 6 + +#define BBR_SEES_STRETCH_ACK 1 +#define BBR_SEES_COMPRESSED_ACKS 2 + + +/* + * As we get each SACK we wade through the + * rc_map and mark off what is acked. + * We also increment rc_sacked as well. + * + * We also pay attention to missing entries + * based on the time and possibly mark them + * for retransmit. If we do and we are not already + * in recovery we enter recovery. In doing + * so we claer prr_delivered/holes_rxt and prr_sent_dur_rec. + * We also setup rc_next/rc_snd_nxt/rc_send_end so + * we will know where to send from. When not in + * recovery rc_next will be NULL and rc_snd_nxt should + * equal snd_max. + * + * Whenever we retransmit from recovery we increment + * rc_holes_rxt as we retran a block and mark it as retransmitted + * with the time it was sent. During non-recovery sending we + * add to our map and note the time down of any send expanding + * the rc_map at the tail and moving rc_snd_nxt up with snd_max. + * + * In recovery during SACK/ACK processing if a chunk has + * been retransmitted and it is now acked, we decrement rc_holes_rxt. + * When we retransmit from the scoreboard we use + * rc_next and rc_snd_nxt/rc_send_end to help us + * find what needs to be retran. + * + * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt + * This gets us the effect of RFC6675 pipe, counting twice for + * bytes retransmitted. 
+ */ + +#define TT_BBR_FR_TMR 0x2001 + +#define BBR_SCALE 8 +#define BBR_UNIT (1 << BBR_SCALE) + +#define BBR_NUM_RTTS_FOR_DEL_LIMIT 8 /* How many pkt-rtts do we keep + * Delivery rate for */ +#define BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT 10 /* How many pkt-rtts do we keep + * Delivery rate for google */ + +#define BBR_SECONDS_NO_RTT 10 /* 10 seconds with no RTT shrinkage */ +#define BBR_PROBERTT_MAX 200 /* 200ms */ +#define BBR_PROBERTT_NUM_MSS 4 +#define BBR_STARTUP_EPOCHS 3 +#define USECS_IN_MSEC 1000 +#define BBR_TIME_TO_SECONDS(a) (a / USECS_IN_SECOND) +#define BBR_TIME_TO_MILLI(a) (a / MS_IN_USEC) + + +/* BBR keeps time in usec's so we divide by 1000 and round up */ +#define BBR_TS_TO_MS(t) ((t+999)/MS_IN_USEC) + +/* + * Locking for the rack control block. + * a) Locked by INP_WLOCK + * b) Locked by the hpts-mutex + * + */ +#define BBR_STATE_STARTUP 0x01 +#define BBR_STATE_DRAIN 0x02 +#define BBR_STATE_PROBE_BW 0x03 +#define BBR_STATE_PROBE_RTT 0x04 +#define BBR_STATE_IDLE_EXIT 0x05 + +/* Substate defines for STATE == PROBE_BW */ +#define BBR_SUB_GAIN 0 /* State 0 where we are 5/4 BBR_UNIT */ +#define BBR_SUB_DRAIN 1 /* State 1 where we are at 3/4 BBR_UNIT */ +#define BBR_SUB_LEVEL1 2 /* State 1 first BBR_UNIT */ +#define BBR_SUB_LEVEL2 3 /* State 2nd BBR_UNIT */ +#define BBR_SUB_LEVEL3 4 /* State 3rd BBR_UNIT */ +#define BBR_SUB_LEVEL4 5 /* State 4th BBR_UNIT */ +#define BBR_SUB_LEVEL5 6 /* State 5th BBR_UNIT */ +#define BBR_SUB_LEVEL6 7 /* State last BBR_UNIT */ +#define BBR_SUBSTATE_COUNT 8 + +/* Single remaining reduce log */ +#define BBR_REDUCE_AT_FR 5 + +#define BBR_BIG_LOG_SIZE 300000 + +/* Bits per second in bytes per second */ +#define FORTY_EIGHT_MBPS 6000000 /* 48 megabits in bytes */ +#define THIRTY_MBPS 3750000 /* 30 megabits in bytes */ +#define TWENTY_THREE_MBPS 2896000 +#define FIVETWELVE_MBPS 64000000 /* 512 megabits in bytes */ +#define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ + +struct bbr_stats { + uint64_t bbr_badfr; /* 0 */ + uint64_t bbr_badfr_bytes; /* 1 */ + uint64_t bbr_saw_oerr; /* 2 */ + uint64_t bbr_saw_emsgsiz; /* 3 */ + uint64_t bbr_reorder_seen; /* 4 */ + uint64_t bbr_tlp_tot; /* 5 */ + uint64_t bbr_tlp_newdata; /* 6 */ + uint64_t bbr_offset_recovery; /* 7 */ + uint64_t bbr_tlp_retran_fail; /* 8 */ + uint64_t bbr_to_tot; /* 9 */ + uint64_t bbr_to_arm_rack; /* 10 */ + uint64_t bbr_enter_probertt; /* 11 */ + uint64_t bbr_tlp_set; /* 12 */ + uint64_t bbr_resends_set; /* 13 */ + uint64_t bbr_force_output; /* 14 */ + uint64_t bbr_to_arm_tlp; /* 15 */ + uint64_t bbr_paced_segments; /* 16 */ + uint64_t bbr_saw_enobuf; /* 17 */ + uint64_t bbr_to_alloc_failed; /* 18 */ + uint64_t bbr_to_alloc_emerg; /* 19 */ + uint64_t bbr_sack_proc_all; /* 20 */ + uint64_t bbr_sack_proc_short; /* 21 */ + uint64_t bbr_sack_proc_restart; /* 22 */ + uint64_t bbr_to_alloc; /* 23 */ + uint64_t bbr_offset_drop; /* 24 */ + uint64_t bbr_runt_sacks; /* 25 */ + uint64_t bbr_sack_passed; /* 26 */ + uint64_t bbr_rlock_left_ret0; /* 27 */ + uint64_t bbr_rlock_left_ret1; /* 28 */ + uint64_t bbr_dynamic_rwnd; /* 29 */ + uint64_t bbr_static_rwnd; /* 30 */ + uint64_t bbr_sack_blocks; /* 31 */ + uint64_t bbr_sack_blocks_skip; /* 32 */ + uint64_t bbr_sack_search_both; /* 33 */ + uint64_t bbr_sack_search_fwd; /* 34 */ + uint64_t bbr_sack_search_back; /* 35 */ + uint64_t bbr_plain_acks; /* 36 */ + uint64_t bbr_acks_with_sacks; /* 37 */ + uint64_t bbr_progress_drops; /* 38 */ + uint64_t bbr_early; /* 39 */ + uint64_t bbr_reneges_seen; /* 40 */ + uint64_t bbr_persist_reneg; /* 41 */ + uint64_t 
bbr_dropped_af_data; /* 42 */ + uint64_t bbr_failed_mbuf_aloc; /* 43 */ + uint64_t bbr_cwnd_limited; /* 44 */ + uint64_t bbr_rwnd_limited; /* 45 */ + uint64_t bbr_app_limited; /* 46 */ + uint64_t bbr_force_timer_start; /* 47 */ + uint64_t bbr_hpts_min_time; /* 48 */ + uint64_t bbr_meets_tso_thresh; /* 49 */ + uint64_t bbr_miss_tso_rwnd; /* 50 */ + uint64_t bbr_miss_tso_cwnd; /* 51 */ + uint64_t bbr_miss_tso_app; /* 52 */ + uint64_t bbr_miss_retran; /* 53 */ + uint64_t bbr_miss_tlp; /* 54 */ + uint64_t bbr_miss_unknown; /* 55 */ + uint64_t bbr_hdwr_rl_add_ok; /* 56 */ + uint64_t bbr_hdwr_rl_add_fail; /* 57 */ + uint64_t bbr_hdwr_rl_mod_ok; /* 58 */ + uint64_t bbr_hdwr_rl_mod_fail; /* 59 */ + uint64_t bbr_collapsed_win; /* 60 */ + uint64_t bbr_alloc_limited; /* 61 */ + uint64_t bbr_alloc_limited_conns; /* 62 */ + uint64_t bbr_split_limited; /* 63 */ +}; + +/* + * The structure bbr_opt_stats is a simple + * way to see how many options are being + * changed in the stack. + */ +struct bbr_opts_stats { + uint64_t tcp_bbr_pace_per_sec; + uint64_t tcp_bbr_pace_del_tar; + uint64_t tcp_bbr_pace_seg_max; + uint64_t tcp_bbr_pace_seg_min; + uint64_t tcp_bbr_pace_cross; + uint64_t tcp_bbr_drain_inc_extra; + uint64_t tcp_bbr_unlimited; + uint64_t tcp_bbr_iwintso; + uint64_t tcp_bbr_rec_over_hpts; + uint64_t tcp_bbr_recforce; + uint64_t tcp_bbr_startup_pg; + uint64_t tcp_bbr_drain_pg; + uint64_t tcp_bbr_rwnd_is_app; + uint64_t tcp_bbr_probe_rtt_int; + uint64_t tcp_bbr_one_retran; + uint64_t tcp_bbr_startup_loss_exit; + uint64_t tcp_bbr_use_lowgain; + uint64_t tcp_bbr_lowgain_thresh; + uint64_t tcp_bbr_lowgain_half; + uint64_t tcp_bbr_lowgain_fd; + uint64_t tcp_bbr_usedel_rate; + uint64_t tcp_bbr_min_rto; + uint64_t tcp_bbr_max_rto; + uint64_t tcp_rack_pace_max_seg; + uint64_t tcp_rack_min_to; + uint64_t tcp_rack_reord_thresh; + uint64_t tcp_rack_reord_fade; + uint64_t tcp_rack_tlp_thresh; + uint64_t tcp_rack_pkt_delay; + uint64_t tcp_bbr_startup_exit_epoch; + uint64_t tcp_bbr_ack_comp_alg; + uint64_t tcp_rack_cheat; + uint64_t tcp_iwnd_tso; + uint64_t tcp_utter_max_tso; + uint64_t tcp_hdwr_pacing; + uint64_t tcp_extra_state; + uint64_t tcp_floor_min_tso; + /* New */ + uint64_t tcp_bbr_algorithm; + uint64_t tcp_bbr_tslimits; + uint64_t tcp_bbr_probertt_len; + uint64_t tcp_bbr_probertt_gain; + uint64_t tcp_bbr_topaceout; + uint64_t tcp_use_rackcheat; + uint64_t tcp_delack; + uint64_t tcp_maxpeak; + uint64_t tcp_retran_wtso; + uint64_t tcp_data_ac; + uint64_t tcp_ts_raises; + uint64_t tcp_pacing_oh_tmr; + uint64_t tcp_pacing_oh; + uint64_t tcp_policer_det; +}; + + +#ifdef _KERNEL +#define BBR_STAT_SIZE (sizeof(struct bbr_stats)/sizeof(uint64_t)) +extern counter_u64_t bbr_stat_arry[BBR_STAT_SIZE]; +#define BBR_STAT_ADD(name, amm) counter_u64_add(bbr_stat_arry[(offsetof(struct bbr_stats, name)/sizeof(uint64_t))], (amm)) +#define BBR_STAT_INC(name) BBR_STAT_ADD(name, 1) +#define BBR_OPTS_SIZE (sizeof(struct bbr_stats)/sizeof(uint64_t)) +extern counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE]; +#define BBR_OPTS_ADD(name, amm) counter_u64_add(bbr_opts_arry[(offsetof(struct bbr_opts_stats, name)/sizeof(uint64_t))], (amm)) +#define BBR_OPTS_INC(name) BBR_OPTS_ADD(name, 1) +#endif + +#define BBR_NUM_LOSS_RATES 3 +#define BBR_NUM_BW_RATES 3 + +#define BBR_RECOVERY_LOWRTT 1 +#define BBR_RECOVERY_MEDRTT 2 +#define BBR_RECOVERY_HIGHRTT 3 +#define BBR_RECOVERY_EXTREMERTT 4 + + +struct bbr_control { + /*******************************/ + /* Cache line 2 from bbr start */ + /*******************************/ + struct bbr_head 
rc_map; /* List of all segments Lock(a) */ + struct bbr_head rc_tmap; /* List in transmit order Lock(a) */ + struct bbr_sendmap *rc_resend; /* something we have been asked to + * resend */ + uint32_t rc_last_delay_val; /* How much we expect to delay Lock(a) */ + uint32_t rc_bbr_hptsi_gain:16, /* Current hptsi gain Lock(a) */ + rc_hpts_flags:16; /* flags on whats on the pacer wheel */ + + uint32_t rc_delivered; /* BRR delivered amount Lock(a) */ + uint32_t rc_hptsi_agg_delay; /* How much time are we behind */ + + uint32_t rc_flight_at_input; + uint32_t rc_lost_bytes; /* Total bytes currently marked lost */ + /*******************************/ + /* Cache line 3 from bbr start */ + /*******************************/ + struct time_filter rc_delrate; + /*******************************/ + /* Cache line 4 from bbr start */ + /*******************************/ + struct bbr_head rc_free; /* List of Free map entries Lock(a) */ + struct bbr_sendmap *rc_tlp_send; /* something we have been + * asked to resend */ + uint32_t rc_del_time; + uint32_t rc_target_at_state; /* Target for a state */ + + uint16_t rc_free_cnt; /* Number of free entries on the rc_free list + * Lock(a) */ + uint16_t rc_startup_pg; + + uint32_t cur_rtt; /* Last RTT from ack */ + + + uint32_t rc_went_idle_time; /* Used for persits to see if its + * probe-rtt qualified */ + uint32_t rc_pace_max_segs:17, /* How much in any single TSO we send Lock(a) */ + rc_pace_min_segs:15; /* The minimum single segment size before we enter persists */ + + uint32_t rc_rtt_shrinks; /* Time of last rtt shrinkage Lock(a) */ + uint32_t r_app_limited_until; + uint32_t rc_timer_exp; /* If a timer ticks of expiry */ + uint32_t rc_rcv_epoch_start; /* Start time of the Epoch Lock(a) */ + + /*******************************/ + /* Cache line 5 from bbr start */ + /*******************************/ + + uint32_t rc_lost_at_pktepoch; /* what the lost value was at the last + * pkt-epoch */ + uint32_t r_measurement_count; /* count of measurement applied lock(a) */ + + + uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ + uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ + uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ + + struct bbr_sendmap *rc_sacklast; /* sack remembered place + * Lock(a) */ + struct bbr_sendmap *rc_next; /* remembered place where we next + * retransmit at Lock(a) */ + + uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */ + uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */ + + uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ + uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */ + /*- --- + * used only inital and close + */ + uint32_t rc_high_rwnd; /* Highest rwnd seen */ + uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */ + + uint32_t rc_last_rtt; /* Last valid measured RTT that ack'd data */ + uint32_t bbr_cross_over; + + /*******************************/ + /* Cache line 6 from bbr start */ + /*******************************/ + struct sack_filter bbr_sf; + + /*******************************/ + /* Cache line 7 from bbr start */ + /*******************************/ + struct time_filter_small rc_rttprop; + uint32_t last_inbound_ts; /* Peers last timestamp */ + + uint32_t rc_inc_tcp_oh: 1, + rc_inc_ip_oh: 1, + rc_inc_enet_oh:1, + rc_incr_tmrs:1, + restrict_growth:28; + uint32_t rc_lt_epoch_use; /* When we started lt-bw use Lock(a) */ + + uint32_t rc_recovery_start; /* Time we start recovery Lock(a) */ + uint32_t rc_lt_del; /* Delivered at lt bw sampling start Lock(a) */ + 
+ uint64_t rc_bbr_cur_del_rate; /* Current measured delivery rate + * Lock(a) */ + + /*******************************/ + /* Cache line 8 from bbr start */ + /*******************************/ + uint32_t rc_cwnd_on_ent; /* On entry to recovery the cwnd + * Lock(a) */ + uint32_t rc_agg_early; /* aggregate amount early */ + + uint32_t rc_rcvtime; /* When we last received data Lock(a) */ + uint32_t rc_pkt_epoch_del; /* seq num that we need for RTT epoch */ + + uint32_t rc_pkt_epoch; /* Epoch based on packet RTTs */ + uint32_t rc_pkt_epoch_time; /* Time we started the pkt epoch */ + + uint32_t rc_pkt_epoch_rtt; /* RTT using the packet epoch */ + uint32_t rc_rtt_epoch; /* Current RTT epoch, it ticks every rttProp + * Lock(a) */ + uint32_t lowest_rtt; + uint32_t bbr_smallest_srtt_this_state; + + uint32_t rc_lt_epoch; /* LT epoch start of bw_sampling */ + uint32_t rc_lost_at_startup; + + uint32_t rc_bbr_state_atflight; + uint32_t rc_bbr_last_startup_epoch; /* Last startup epoch where we + * increased 20% */ + uint32_t rc_bbr_enters_probertt; /* Timestamp we entered + * probertt Lock(a) */ + uint32_t rc_lt_time; /* Time of lt sampling start Lock(a) */ + + /*******************************/ + /* Cache line 9 from bbr start */ + /*******************************/ + uint64_t rc_lt_bw; /* LT bw calculated Lock(a) */ + uint64_t rc_bbr_lastbtlbw; /* For startup, what was last btlbw I + * saw to check the 20% gain Lock(a) */ + + + uint32_t rc_bbr_cwnd_gain; /* Current cwnd gain Lock(a) */ + uint32_t rc_pkt_epoch_loss_rate; /* pkt-epoch loss rate */ + + uint32_t rc_saved_cwnd; /* Saved cwnd during Probe-rtt drain Lock(a) */ + uint32_t substate_pe; + + uint32_t rc_lost; /* Number of bytes lost Lock(a) */ + uint32_t rc_exta_time_gd; /* How much extra time we got in d/g */ + + uint32_t rc_lt_lost; /* Number of lt bytes lost at sampling start + * Lock(a) */ + uint32_t rc_bbr_state_time; + + uint32_t rc_min_to; /* Socket option value Lock(a) */ + uint32_t rc_initial_hptsi_bw; /* Our initial startup bw Lock(a) */ + + uint32_t bbr_lost_at_state; /* Temp counter debug lost value as we + * enter a state */ + /*******************************/ + /* Cache line 10 from bbr start */ + /*******************************/ + uint32_t rc_level_state_extra; + uint32_t rc_red_cwnd_pe; + const struct tcp_hwrate_limit_table *crte; + uint64_t red_bw; + + uint32_t rc_probertt_int; + uint32_t rc_probertt_srttchktim; /* Time we last did a srtt + * check */ + uint32_t gain_epoch; /* Epoch we should be out of gain */ + uint32_t rc_min_rto_ms; + + uint32_t rc_reorder_fade; /* Socket option value Lock(a) */ + uint32_t last_startup_measure; + + int32_t bbr_hptsi_per_second; + int32_t bbr_hptsi_segments_delay_tar; + + int32_t bbr_hptsi_segments_max; + uint32_t bbr_rttprobe_gain_val; + /*******************************/ + /* Cache line 11 from bbr start */ + /*******************************/ + uint32_t cur_rtt_send_time; /* Time we sent our rtt measured packet */ + uint32_t bbr_peer_tsratio; /* Our calculated ts ratio to multply */ + uint32_t bbr_ts_check_tstmp; /* When we filled it the TS that came on the ack */ + uint32_t bbr_ts_check_our_cts; /* When we filled it the cts of the send */ + uint32_t rc_tlp_rxt_last_time; + uint32_t bbr_smallest_srtt_state2; + uint32_t bbr_hdwr_cnt_noset_snt; /* count of hw pacing sends during delay */ + uint32_t startup_last_srtt; + uint32_t rc_ack_hdwr_delay; + uint32_t highest_hdwr_delay; /* Largest delay we have seen from hardware */ + uint32_t non_gain_extra; + uint32_t recovery_lr; /* The sum of the loss 
rate from the pe's during recovery */ + uint32_t last_in_probertt; + uint32_t flightsize_at_drain; /* In draining what was the last marked flight size */ + uint32_t rc_pe_of_prtt; /* PE we went into probe-rtt */ + uint32_t ts_in; /* ts that went with the last rtt */ + + uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent + * rc_last_tlp_seq Lock(a) */ + uint16_t rc_drain_pg; + uint32_t rc_num_maps_alloced; /* num send map entries allocated */ + uint32_t rc_num_split_allocs; /* num split map entries allocated */ + uint16_t rc_num_small_maps_alloced; /* Number of sack blocks + * allocated */ + uint16_t bbr_hptsi_bytes_min; + + uint16_t bbr_hptsi_segments_floor; + uint16_t bbr_utter_max; + uint16_t bbr_google_discount; + +}; + + +struct socket; +struct tcp_bbr { + /* First cache line 0x00 */ + int32_t(*r_substate) (struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, struct tcpopt *, + int32_t, int32_t, uint32_t, int32_t, int32_t); /* Lock(a) */ + struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ + struct inpcb *rc_inp; /* The inpcb Lock(a) */ + struct timeval rc_tv; + uint32_t rc_pacer_started; /* Time we started the pacer */ + uint16_t no_pacing_until:8, /* No pacing until N packet epochs */ + ts_can_raise:1,/* TS b/w calculations can raise the bw higher */ + skip_gain:1, /* Skip the gain cycle (hardware pacing) */ + gain_is_limited:1, /* With hardware pacing we are limiting gain */ + output_error_seen:1, + oerror_cnt:4, + hw_pacing_set:1; /* long enough has passed for us to start pacing */ + uint16_t xxx_r_ack_count; /* During recovery count of ack's received + * that added data since output */ + uint16_t bbr_segs_rcvd; /* In Segment count since we sent a ack */ + + uint8_t bbr_timer_src:4, /* Used for debugging Lock(a) */ + bbr_use_rack_cheat:1, /* Use the rack cheat */ + bbr_init_win_cheat:1, /* Send full IW for TSO */ + bbr_attempt_hdwr_pace:1,/* Try to do hardware pacing */ + bbr_hdrw_pacing:1; /* Hardware pacing is available */ + uint8_t bbr_hdw_pace_ena:1, /* Does the connection allow hardware pacing to be attempted */ + bbr_prev_in_rec:1, /* We were previously in recovery */ + pkt_conservation:1, + use_policer_detection:1, + xxx_bbr_hdw_pace_idx:4; /* If hardware pacing is on, index to slot in pace tbl */ + uint16_t r_wanted_output:1, + rtt_valid:1, + rc_timer_first:1, + rc_output_starts_timer:1, + rc_resends_use_tso:1, + rc_all_timers_stopped:1, + rc_loss_exit:1, + rc_ack_was_delayed:1, + rc_lt_is_sampling:1, + rc_filled_pipe:1, + rc_tlp_new_data:1, + rc_hit_state_1:1, + rc_ts_valid:1, + rc_prtt_set_ts:1, + rc_is_pkt_epoch_now:1, + rc_has_collapsed:1; + + uint8_t r_state:4, /* Current bbr state Lock(a) */ + r_agg_early_set:1, /* Did we get called early */ + r_init_rtt:1, + r_use_policer:1, /* For google mode only */ + r_recovery_bw:1; + uint8_t r_timer_override:1, /* pacer override Lock(a) 0/1 */ + rc_in_persist:1, + rc_lt_use_bw:1, + rc_allow_data_af_clo:1, + rc_tlp_rtx_out:1, /* A TLP is in flight */ + rc_tlp_in_progress:1, /* a TLP timer is running needed? 
*/ + rc_use_idle_restart:1; /* Do we restart fast after idle (persist or applim) */ + uint8_t rc_bbr_state:3, /* What is the major BBR state */ + rc_bbr_substate:3, /* For probeBW state */ + r_is_v6:1, + rc_past_init_win:1; + uint8_t rc_last_options; + uint8_t rc_tlp_threshold; /* Socket option value Lock(a) */ + uint8_t rc_max_rto_sec; + uint8_t rc_cwnd_limited:1, /* We are cwnd limited */ + rc_tmr_stopped:7; /* What timers have been stopped */ + uint8_t rc_use_google:1, + rc_use_ts_limit:1, + rc_ts_data_set:1, /* We have filled a set point to determine */ + rc_ts_clock_set:1, /* We have determined the ts type */ + rc_ts_cant_be_used:1, /* We determined we can't use ts values */ + rc_ack_is_cumack:1, + rc_no_pacing:1, + alloc_limit_reported:1; + uint8_t rc_init_win; + /* Cache line 2 0x40 */ + struct bbr_control r_ctl; +#ifdef _KERNEL +} __aligned(CACHE_LINE_SIZE); +#else +}; +#endif + +#endif diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h index 235951999e64..a19fc8969d9f 100644 --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2016 Netflix, Inc. + * Copyright (c) 2016-9 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -28,39 +28,39 @@ #ifndef _NETINET_TCP_RACK_H_ #define _NETINET_TCP_RACK_H_ -#define RACK_ACKED 0x0001/* The remote endpoint acked this */ -#define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order */ -#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */ -#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ -#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ -#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */ -#define RACK_HAS_FIN 0x0040/* segment is sent with fin */ -#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ - +#define RACK_ACKED 0x0001/* The remote endpoint acked this */ +#define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order - not used */ +#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc - not used */ +#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ +#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ +#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */ +#define RACK_HAS_FIN 0x0040/* segment is sent with fin */ +#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ +#define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */ #define RACK_NUM_OF_RETRANS 3 #define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */ struct rack_sendmap { - TAILQ_ENTRY(rack_sendmap) r_next; /* seq number arrayed next */ - TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ - uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; uint32_t r_start; /* Sequence number of the segment */ uint32_t r_end; /* End seq, this is 1 beyond actually */ + TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ + RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time * sent */ - uint8_t r_flags; /* Flags as defined above */ - uint8_t r_sndcnt; /* Retran count, not limited by - * RACK_NUM_OF_RETRANS */ + uint16_t r_flags; /* Flags as defined above */ + uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; + uint8_t r_dupack; /* Dup ack 
count */ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ uint8_t r_limit_type; /* is this entry counted against a limit? */ - uint8_t r_resv[2]; + uint8_t r_resv[49]; }; -#define RACK_LIMIT_TYPE_SPLIT 1 +RB_HEAD(rack_rb_tree_head, rack_sendmap); TAILQ_HEAD(rack_head, rack_sendmap); +#define RACK_LIMIT_TYPE_SPLIT 1 /* * We use the rate sample structure to @@ -136,6 +136,8 @@ struct rack_opts_stats { uint64_t rack_no_timer_in_hpts; uint64_t tcp_rack_min_pace_seg; uint64_t tcp_rack_min_pace; + uint64_t tcp_rack_cheat; + uint64_t tcp_rack_no_sack; }; #define TLP_USE_ID 1 /* Internet draft behavior */ @@ -188,15 +190,19 @@ extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; * b) Locked by the hpts-mutex * */ +#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */ struct rack_control { /* Second cache line 0x40 from tcp_rack */ - struct rack_head rc_map;/* List of all segments Lock(a) */ + struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */ struct rack_head rc_tmap; /* List in transmit order Lock(a) */ struct rack_sendmap *rc_tlpsend; /* Remembered place for * tlp_sending Lock(a) */ struct rack_sendmap *rc_resend; /* something we have been asked to * resend */ + struct timeval rc_last_time_decay; /* SAD time decay happened here */ + uint32_t input_pkt; + uint32_t saved_input_pkt; uint32_t rc_hpts_flags; uint32_t rc_timer_exp; /* If a timer ticks of expiry */ uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */ @@ -244,22 +250,32 @@ struct rack_control { * have allocated */ uint32_t rc_rcvtime; /* When we last received data */ uint32_t rc_num_split_allocs; /* num split map entries allocated */ + uint32_t rc_last_output_to; uint32_t rc_went_idle_time; struct rack_sendmap *rc_sacklast; /* sack remembered place * Lock(a) */ - struct rack_sendmap *rc_next; /* remembered place where we next - * retransmit at Lock(a) */ struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for * cache line alignment * Lock(a) */ + struct timeval rc_last_ack; /* Cache line split 0x100 */ struct sack_filter rack_sf; /* Cache line split 0x140 */ /* Flags for various things */ + uint32_t rc_pace_max_segs; + uint32_t rc_pace_min_segs; + uint32_t rc_high_rwnd; + uint32_t ack_count; + uint32_t sack_count; + uint32_t sack_noextra_move; + uint32_t sack_moved_extra; struct rack_rtt_sample rack_rs; + uint32_t rc_tlp_rxt_last_time; + uint32_t rc_saved_cwnd; + uint32_t rc_gp_history[RACK_GP_HIST]; uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ @@ -270,9 +286,11 @@ struct rack_control { uint8_t rc_early_recovery; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ uint8_t rc_min_to; /* Socket option value Lock(a) */ - uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */ uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */ uint8_t rc_rate_sample_method; + uint8_t rc_gp_hist_idx: 7, + rc_gp_hist_filled: 1; + }; #ifdef _KERNEL @@ -305,16 +323,22 @@ struct tcp_rack { rc_last_pto_set : 1, /* XXX not used */ rc_tlp_in_progress : 1, rc_always_pace : 1, /* Socket option value Lock(a) */ - rc_timer_up : 1; /* The rack timer is up flag Lock(a) */ - uint8_t r_idle_reduce_largest : 1, - r_enforce_min_pace : 2, - r_min_pace_seg_thresh : 5; + tlp_timer_up : 1; /* The tlp timer is up flag Lock(a) */ + uint8_t r_enforce_min_pace : 2, + rc_has_collapsed : 1, + r_rep_attack : 1, + r_rep_reverse : 
1, + r_xxx_min_pace_seg_thresh : 3; uint8_t rack_tlp_threshold_use; uint8_t rc_allow_data_af_clo: 1, delayed_ack : 1, + set_pacing_done_a_iw : 1, + use_rack_cheat : 1, alloc_limit_reported : 1, - rc_avail : 5; - uint8_t r_resv[2]; /* Fill to cache line boundary */ + sack_attack_disable : 1, + do_detection : 1, + rc_avail : 1; + uint16_t rack_per_of_gp; /* Cache line 2 0x40 */ struct rack_control r_ctl; } __aligned(CACHE_LINE_SIZE); diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index ca351c5d4e83..c20e02d258fe 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -439,18 +439,18 @@ void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs); */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \ - M_TSTMP_HPREC|M_PROTOFLAGS) + M_TSTMP_HPREC|M_TSTMP_LRO|M_PROTOFLAGS) /* * Mbuf flag description for use with printf(9) %b identifier. */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ - "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC" + "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC\15M_TSTMP_LRO" #define M_FLAG_PROTOBITS \ - "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ - "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ - "\27M_PROTO11" + "\16M_PROTO1\17M_PROTO2\20M_PROTO3\21M_PROTO4" \ + "\22M_PROTO5\23M_PROTO6\24M_PROTO7\25M_PROTO8\26M_PROTO9" \ + "\27M_PROTO10\28M_PROTO11" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* @@ -1496,7 +1496,7 @@ mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts) { KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m)); - KASSERT((m->m_flags & M_TSTMP) != 0, ("mbuf %p no M_TSTMP", m)); + KASSERT((m->m_flags & (M_TSTMP|M_TSTMP_LRO)) != 0, ("mbuf %p no M_TSTMP or M_TSTMP_LRO", m)); ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000; } diff --git a/sys/sys/tim_filter.h b/sys/sys/tim_filter.h new file mode 100644 index 000000000000..a131a5d251b1 --- /dev/null +++ b/sys/sys/tim_filter.h @@ -0,0 +1,134 @@ +#ifndef __tim_filter_h__ +#define __tim_filter_h__ +/*- + * Copyright (c) 2016-9 Netflix, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ +/* + * Author: Randall Stewart + */ + +#include +#include +/* + * Do not change the size unless you know what you are + * doing, the current size of 5 is designed around + * the cache-line size for an amd64 processor. Other processors + * may need other sizes. + */ +#define NUM_FILTER_ENTRIES 3 + +struct filter_entry { + uint64_t value; /* Value */ + uint32_t time_up; /* Time updated */ +} __packed ; + +struct filter_entry_small { + uint32_t value; /* Value */ + uint32_t time_up; /* Time updated */ +}; + +struct time_filter { + uint32_t cur_time_limit; + struct filter_entry entries[NUM_FILTER_ENTRIES]; +#ifdef _KERNEL +} __aligned(CACHE_LINE_SIZE); +#else +}; +#endif +struct time_filter_small { + uint32_t cur_time_limit; + struct filter_entry_small entries[NUM_FILTER_ENTRIES]; +}; + +/* + * To conserve on space there is a code duplication here (this + * is where polymophism would be nice in the kernel). Everything + * is duplicated to have a filter with a value of uint32_t instead + * of a uint64_t. This saves 20 bytes and the structure size + * drops to 44 from 64. The bad part about this is you end + * up with two sets of functions. The xxx_small() access + * the uint32_t value's where the xxx() the uint64_t values. + * This forces the user to keep straight which type of structure + * they allocated and which call they need to make. crossing + * over calls will create either invalid memory references or + * very bad results :) + */ + +#define FILTER_TYPE_MIN 1 +#define FILTER_TYPE_MAX 2 + +#ifdef _KERNEL +int setup_time_filter(struct time_filter *tf, int fil_type, uint32_t time_len); +void reset_time(struct time_filter *tf, uint32_t time_len); +void forward_filter_clock(struct time_filter *tf, uint32_t ticks_forward); +void tick_filter_clock(struct time_filter *tf, uint32_t now); +uint32_t apply_filter_min(struct time_filter *tf, uint64_t value, uint32_t now); +uint32_t apply_filter_max(struct time_filter *tf, uint64_t value, uint32_t now); +void filter_reduce_by(struct time_filter *tf, uint64_t reduce_by, uint32_t now); +void filter_increase_by(struct time_filter *tf, uint64_t incr_by, uint32_t now); +static uint64_t inline +get_filter_value(struct time_filter *tf) +{ + return(tf->entries[0].value); +} + +static uint32_t inline +get_cur_timelim(struct time_filter *tf) +{ + return(tf->cur_time_limit); +} + + +int setup_time_filter_small(struct time_filter_small *tf, + int fil_type, uint32_t time_len); +void reset_time_small(struct time_filter_small *tf, uint32_t time_len); +void forward_filter_clock_small(struct time_filter_small *tf, + uint32_t ticks_forward); +void tick_filter_clock_small(struct time_filter_small *tf, uint32_t now); +uint32_t apply_filter_min_small(struct time_filter_small *tf, + uint32_t value, uint32_t now); +uint32_t apply_filter_max_small(struct time_filter_small *tf, + uint32_t value, uint32_t now); +void filter_reduce_by_small(struct time_filter_small *tf, + uint32_t reduce_by, uint32_t now); +void filter_increase_by_small(struct time_filter_small *tf, + uint32_t incr_by, uint32_t now); +static uint64_t inline +get_filter_value_small(struct time_filter_small *tf) +{ + return(tf->entries[0].value); +} + +static uint32_t inline +get_cur_timelim_small(struct time_filter_small *tf) +{ + return(tf->cur_time_limit); +} + + +#endif +#endif From c9205e35008e9275adb1803a10a7306c4f1c44a9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 24 Sep 2019 20:01:20 +0000 Subject: [PATCH 040/106] Fix/improve interrupt threads scheduling. 
Doing some tests with very high interrupt rates I've noticed that one of the conditions I added in r232207 to make interrupt threads in most cases run on the local CPU never worked as expected (it worked only if the previous time the thread was executed on some other CPU, which is quite the opposite of the intent). It caused additional CPU usage to run a full CPU search and could schedule interrupt threads to some other CPU. This patch removes that code and instead reuses the existing non-interrupt code path with some tweaks for the interrupt case: - On SMT systems, if the current thread is idle, don't look at the other hardware threads. Even if they are busy, it may take more time to do a full search and bounce the interrupt thread to another core than to execute it locally, even while sharing CPU resources. It is the other threads that should migrate, not the bound interrupts. - Try hard to keep interrupt threads within the LLC of their original CPU. This reduces scheduling cost and presumably improves cache and memory locality. On a test system with 72 threads doing 2.2M IOPS to NVMe this saves a few percent of CPU time while adding a few percent to IOPS. MFC after: 1 month Sponsored by: iXsystems, Inc. --- sys/kern/sched_ule.c | 70 ++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 6a5780325b85..9d73a59c74fd 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1251,7 +1251,7 @@ sched_pickcpu(struct thread *td, int flags) struct td_sched *ts; struct tdq *tdq; cpuset_t mask; - int cpu, pri, self; + int cpu, pri, self, intr; self = PCPU_GET(cpuid); ts = td_get_sched(td); @@ -1268,16 +1268,12 @@ sched_pickcpu(struct thread *td, int flags) * Prefer to run interrupt threads on the processors that generate * the interrupt. */ - pri = td->td_priority; if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && - curthread->td_intr_nesting_level && ts->ts_cpu != self) { - SCHED_STAT_INC(pickcpu_intrbind); + curthread->td_intr_nesting_level) { ts->ts_cpu = self; - if (TDQ_SELF()->tdq_lowpri > pri) { - SCHED_STAT_INC(pickcpu_affinity); - return (ts->ts_cpu); - } - } + intr = 1; + } else + intr = 0; /* * If the thread can run on the last cpu and the affinity has not * expired and it is idle, run it there. */ @@ -1287,7 +1283,7 @@ sched_pickcpu(struct thread *td, int flags) if (THREAD_CAN_SCHED(td, ts->ts_cpu) && tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { - if (cg->cg_flags & CG_FLAG_THREAD) { + if (!intr && cg->cg_flags & CG_FLAG_THREAD) { CPUSET_FOREACH(cpu, cg->cg_mask) { if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; @@ -1301,32 +1297,55 @@ sched_pickcpu(struct thread *td, int flags) } /* * Search for the last level cache CPU group in the tree. - * Skip caches with expired affinity time and SMT groups. - * Affinity to higher level caches will be handled less aggressively. + * Skip SMT, identical groups and caches with expired affinity. + * Interrupt threads affinity is explicit and never expires. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; - if (!SCHED_AFFINITY(ts, cg->cg_level)) + if (cg->cg_children == 1 || cg->cg_count == 1) + continue; + if (cg->cg_level == CG_SHARE_NONE || + (!intr && !SCHED_AFFINITY(ts, cg->cg_level))) continue; ccg = cg; } - if (ccg != NULL) - cg = ccg; + /* Found LLC shared by all CPUs, so do a global search. */ + if (ccg == cpu_top) + ccg = NULL; cpu = -1; - /* Search the group for the less loaded idle CPU we can run now. 
*/ mask = td->td_cpuset->cs_mask; - if (cg != NULL && cg != cpu_top && - CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0) - cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE), + pri = td->td_priority; + /* + * Try hard to keep interrupts within found LLC. Search the LLC for + * the least loaded CPU we can run now. For NUMA systems it should + * be within target domain, and it also reduces scheduling overhead. + */ + if (ccg != NULL && intr) { + cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu); + if (cpu >= 0) + SCHED_STAT_INC(pickcpu_intrbind); + } else + /* Search the LLC for the least loaded idle CPU we can run now. */ + if (ccg != NULL) { + cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu); - /* Search globally for the less loaded CPU we can run now. */ - if (cpu == -1) + if (cpu >= 0) + SCHED_STAT_INC(pickcpu_affinity); + } + /* Search globally for the least loaded CPU we can run now. */ + if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu); - /* Search globally for the less loaded CPU. */ - if (cpu == -1) + if (cpu >= 0) + SCHED_STAT_INC(pickcpu_lowest); + } + /* Search globally for the least loaded CPU. */ + if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu); - KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu.")); + if (cpu >= 0) + SCHED_STAT_INC(pickcpu_lowest); + } + KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. @@ -1337,8 +1356,7 @@ sched_pickcpu(struct thread *td, int flags) TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; - } else - SCHED_STAT_INC(pickcpu_lowest); + } if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); From 2f1cc984db31f1d61db95246872d0c3fa731a791 Mon Sep 17 00:00:00 2001 From: Randall Stewart Date: Tue, 24 Sep 2019 20:04:31 +0000 Subject: [PATCH 041/106] Fix the ifdefs in tcp_ratelimit.h. They were reversed, so that instead of the functions only being inside _KERNEL, with the absence of RATELIMIT giving us the NULL/error-returning interfaces, we ended up with non-kernel builds getting the error path. Oops. --- sys/netinet/tcp_ratelimit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/netinet/tcp_ratelimit.h b/sys/netinet/tcp_ratelimit.h index 49e407c03928..ebd4c4204704 100644 --- a/sys/netinet/tcp_ratelimit.h +++ b/sys/netinet/tcp_ratelimit.h @@ -87,8 +87,8 @@ CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set); #define RS_PACING_LT 0x0008 /* Less than requested rate */ #define RS_PACING_SUB_OK 0x0010 /* If a rate can't be found get the * next best rate (highest or lowest). */ -#ifdef RATELIMIT #ifdef _KERNEL +#ifdef _KERNEL +#ifdef RATELIMIT #define DETAILED_RATELIMIT_SYSCTL 1 /* * Undefine this if you don't want * detailed rates to appear in From da99b33b17e8cb02b93e2aff7e30cd8f06c85b4d Mon Sep 17 00:00:00 2001 From: Randall Stewart Date: Tue, 24 Sep 2019 20:11:55 +0000 Subject: [PATCH 042/106] don't call in_ratelimit detach when RATELIMIT is not compiled into the kernel. 
--- sys/netinet/tcp_stacks/bbr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index dccb2894ea68..8ba00c46f187 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -14784,10 +14784,12 @@ bbr_set_sockopt(struct socket *so, struct sockopt *sopt, bbr->bbr_attempt_hdwr_pace = 0; } else { bbr->bbr_hdw_pace_ena = 0; +#ifdef RATELIMIT if (bbr->bbr_hdrw_pacing) { bbr->bbr_hdrw_pacing = 0; in_pcbdetach_txrtlmt(bbr->rc_inp); } +#endif } break; From ac7bd23a7ae7a9d28533a1e639b7f43653322344 Mon Sep 17 00:00:00 2001 From: Randall Stewart Date: Tue, 24 Sep 2019 20:36:43 +0000 Subject: [PATCH 043/106] lets put (void) in a couple of functions to keep older platforms that are stuck with gcc happy (ppc). The changes are needed in both bbr and rack. Obtained from: Michael Tuexen (mtuexen@) --- sys/netinet/tcp_stacks/bbr.c | 4 ++-- sys/netinet/tcp_stacks/rack.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index 8ba00c46f187..a6f121c77e05 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -1174,7 +1174,7 @@ sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS) } static void -bbr_init_sysctls() +bbr_init_sysctls(void) { struct sysctl_oid *bbr_probertt; struct sysctl_oid *bbr_hptsi; @@ -1875,7 +1875,7 @@ bbr_progress_timeout_check(struct tcp_bbr *bbr) } static void -bbr_counter_destroy() +bbr_counter_destroy(void) { COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE); COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE); diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index f4a17e4dfc4b..e3f05a954282 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -514,7 +514,7 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) static void -rack_init_sysctls() +rack_init_sysctls(void) { struct sysctl_oid *rack_counters; struct sysctl_oid *rack_attack; @@ -1512,7 +1512,7 @@ rack_log_sad(struct tcp_rack *rack, int event) #endif static void -rack_counter_destroy() +rack_counter_destroy(void) { counter_u64_free(rack_badfr); counter_u64_free(rack_badfr_bytes); From 8cfda118cbdadb3f1529dbc5e95722f0006ceb4e Mon Sep 17 00:00:00 2001 From: Michael Gmelin Date: Tue, 24 Sep 2019 20:49:33 +0000 Subject: [PATCH 044/106] freebsd-update: Add `updatesready' and `showconfig' commands `freebsd-update updatesready' can be used to check if there are any pending fetched updates that can be installed. `freebsd-update showconfig' writes freebsd-update's configuration to stdout. This also changes the exit code of `freebsd-update install' to 2 in case there are no updates pending to be installed and there wasn't a fetch phase in the same invocation. This allows scripts to tell apart these error conditions without breaking existing jail managers. See freebsd-update(8) for details. 
PR: 240757, 240177, 229346 Reviewed by: manpages (bcr), secteam (emaste), yuripv Differential Revision: https://reviews.freebsd.org/D21473 --- usr.sbin/freebsd-update/freebsd-update.8 | 13 +++++++++- usr.sbin/freebsd-update/freebsd-update.sh | 29 +++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/usr.sbin/freebsd-update/freebsd-update.8 b/usr.sbin/freebsd-update/freebsd-update.8 index 7a4350f5451a..0f8cbfda8f6e 100644 --- a/usr.sbin/freebsd-update/freebsd-update.8 +++ b/usr.sbin/freebsd-update/freebsd-update.8 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd June 14, 2017 +.Dd September 24, 2019 .Dt FREEBSD-UPDATE 8 .Os .Sh NAME @@ -155,13 +155,24 @@ Note that this command may require up to 500 MB of space in depending on which components of the .Fx base system are installed. +.It Cm updatesready +Check if there are fetched updates ready to install. +Returns exit code 2 if there are no updates to install. .It Cm install Install the most recently fetched updates or upgrade. +Returns exit code 2 if there are no updates to install +and the +.Cm fetch +command wasn't passed as an earlier argument in the same +invocation. .It Cm rollback Uninstall the most recently installed updates. .It Cm IDS Compare the system against a "known good" index of the installed release. +.It Cm showconfig +Show configuration options after parsing conffile and command +line options. .El .Sh TIPS .Bl -bullet diff --git a/usr.sbin/freebsd-update/freebsd-update.sh b/usr.sbin/freebsd-update/freebsd-update.sh index 53a95eef1fa7..2845ab3c902b 100644 --- a/usr.sbin/freebsd-update/freebsd-update.sh +++ b/usr.sbin/freebsd-update/freebsd-update.sh @@ -62,9 +62,11 @@ Commands: cron -- Sleep rand(3600) seconds, fetch updates, and send an email if updates were found upgrade -- Fetch upgrades to FreeBSD version specified via -r option + updatesready -- Check if there are fetched updates ready to install install -- Install downloaded updates or upgrades rollback -- Uninstall most recently installed updates IDS -- Compare the system against an index of "known good" files + showconfig -- Show configuration EOF exit 0 } @@ -503,7 +505,8 @@ parse_cmdline () { ;; # Commands - cron | fetch | upgrade | install | rollback | IDS) + cron | fetch | upgrade | updatesready | install | rollback |\ + IDS | showconfig) COMMANDS="${COMMANDS} $1" ;; @@ -827,7 +830,7 @@ install_check_params () { echo "No updates are available to install." if [ $ISFETCHED -eq 0 ]; then echo "Run '$0 fetch' first." - exit 1 + exit 2 fi exit 0 fi @@ -3333,6 +3336,21 @@ cmd_upgrade () { upgrade_run || exit 1 } +# Check if there are fetched updates ready to install +cmd_updatesready () { + # Construct a unique name from ${BASEDIR} + BDHASH=`echo ${BASEDIR} | sha256 -q` + + # Check that we have updates ready to install + if ! [ -L ${BDHASH}-install ]; then + echo "No updates are available to install." + exit 2 + fi + + echo "There are updates available to install." + echo "Run '$0 install' to proceed." +} + # Install downloaded updates. cmd_install () { install_check_params @@ -3351,6 +3369,13 @@ cmd_IDS () { IDS_run || exit 1 } +# Output configuration. +cmd_showconfig () { + for X in ${CONFIGOPTIONS}; do + echo $X=$(eval echo \$${X}) + done +} + #### Entry point # Make sure we find utilities from the base system From ec3ecd047114b8d6dcc5d97d858da7099cb9e71e Mon Sep 17 00:00:00 2001 From: Jung-uk Kim Date: Tue, 24 Sep 2019 21:41:19 +0000 Subject: [PATCH 045/106] Fix white spaces. 
--- sys/conf/files | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/conf/files b/sys/conf/files index 4f8272ca5908..7eaa38aa6f50 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3808,7 +3808,7 @@ kern/subr_epoch.c standard kern/subr_eventhandler.c standard kern/subr_fattime.c standard kern/subr_firmware.c optional firmware -kern/subr_filter.c standard +kern/subr_filter.c standard kern/subr_gtaskqueue.c standard kern/subr_hash.c standard kern/subr_hints.c standard @@ -4279,8 +4279,8 @@ netinet/tcp_log_buf.c optional tcp_blackbox inet | tcp_blackbox inet6 netinet/tcp_lro.c optional inet | inet6 netinet/tcp_output.c optional inet | inet6 netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6 -netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6 -netinet/tcp_ratelimit.c optional ratelimit inet | ratelimit inet6 +netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6 +netinet/tcp_ratelimit.c optional ratelimit inet | ratelimit inet6 netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \ compile-with "${NORMAL_C} ${NO_WNONNULL}" netinet/tcp_reass.c optional inet | inet6 From b662b41e62ce9bd9b2b61388f9087abc0c415402 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Tue, 24 Sep 2019 23:38:10 +0000 Subject: [PATCH 046/106] Replace all mtx_lock()/mtx_unlock() on the iod lock with macros. Since the NFS node mutex needs to change to an sx lock so it can be held when vnode_pager_setsize() is called and the iod lock is held when the NFS node lock is acquired, the iod mutex will need to be changed to an sx lock as well. To simplify the future commit that changes both the NFS node lock and iod lock to sx locks, this commit replaces all mtx_lock()/mtx_unlock() calls on the iod lock with macros. There is no semantic change as a result of this commit. I don't know when the future commit will happen and be MFC'd, so I have set the MFC on this commit to one week so that it can be MFC'd at the same time. Suggested by: kib MFC after: 1 week --- sys/fs/nfs/nfsport.h | 2 ++ sys/fs/nfsclient/nfs_clbio.c | 10 +++++----- sys/fs/nfsclient/nfs_clnfsiod.c | 28 ++++++++++++++-------------- sys/fs/nfsclient/nfs_clsubs.c | 4 ++-- sys/fs/nfsclient/nfs_clvfsops.c | 4 ++-- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/sys/fs/nfs/nfsport.h b/sys/fs/nfs/nfsport.h index 221ee1cc6f09..49577dab0eb0 100644 --- a/sys/fs/nfs/nfsport.h +++ b/sys/fs/nfs/nfsport.h @@ -690,6 +690,8 @@ void nfsrvd_rcv(struct socket *, void *, int); #define NFSUNLOCKNODE(n) mtx_unlock(&((n)->n_mtx)) #define NFSLOCKMNT(m) mtx_lock(&((m)->nm_mtx)) #define NFSUNLOCKMNT(m) mtx_unlock(&((m)->nm_mtx)) +#define NFSLOCKIOD() mtx_lock(&ncl_iod_mutex) +#define NFSUNLOCKIOD() mtx_unlock(&ncl_iod_mutex) #define NFSLOCKREQUEST(r) mtx_lock(&((r)->r_mtx)) #define NFSUNLOCKREQUEST(r) mtx_unlock(&((r)->r_mtx)) #define NFSLOCKSOCKREQ(r) mtx_lock(&((r)->nr_mtx)) diff --git a/sys/fs/nfsclient/nfs_clbio.c b/sys/fs/nfsclient/nfs_clbio.c index dddbfb963125..07fec833b2d5 100644 --- a/sys/fs/nfsclient/nfs_clbio.c +++ b/sys/fs/nfsclient/nfs_clbio.c @@ -1410,11 +1410,11 @@ ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thr * To avoid this deadlock, don't allow the async nfsiod threads to * perform Readdirplus RPCs. 
*/ - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && (nmp->nm_bufqiods > ncl_numasync / 2)) || (bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) { - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); return(EIO); } again: @@ -1481,7 +1481,7 @@ ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thr if (error) { error2 = newnfs_sigintr(nmp, td); if (error2) { - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); return (error2); } if (slpflag == PCATCH) { @@ -1522,11 +1522,11 @@ ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thr VTONFS(bp->b_vp)->n_directio_asyncwr++; NFSUNLOCKNODE(VTONFS(bp->b_vp)); } - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); return (0); } - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); /* * All the iods are busy on other mounts, so return EIO to diff --git a/sys/fs/nfsclient/nfs_clnfsiod.c b/sys/fs/nfsclient/nfs_clnfsiod.c index 036e5d01d83d..d43b960eca75 100644 --- a/sys/fs/nfsclient/nfs_clnfsiod.c +++ b/sys/fs/nfsclient/nfs_clnfsiod.c @@ -106,7 +106,7 @@ sysctl_iodmin(SYSCTL_HANDLER_ARGS) error = sysctl_handle_int(oidp, &newmin, 0, req); if (error || (req->newptr == NULL)) return (error); - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); if (newmin > ncl_iodmax) { error = EINVAL; goto out; @@ -121,7 +121,7 @@ sysctl_iodmin(SYSCTL_HANDLER_ARGS) for (i = nfs_iodmin - ncl_numasync; i > 0; i--) nfs_nfsiodnew_sync(); out: - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); return (0); } SYSCTL_PROC(_vfs_nfs, OID_AUTO, iodmin, CTLTYPE_UINT | CTLFLAG_RW, 0, @@ -140,7 +140,7 @@ sysctl_iodmax(SYSCTL_HANDLER_ARGS) return (error); if (newmax > NFS_MAXASYNCDAEMON) return (EINVAL); - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); ncl_iodmax = newmax; if (ncl_numasync <= ncl_iodmax) goto out; @@ -157,7 +157,7 @@ sysctl_iodmax(SYSCTL_HANDLER_ARGS) iod--; } out: - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); return (0); } SYSCTL_PROC(_vfs_nfs, OID_AUTO, iodmax, CTLTYPE_UINT | CTLFLAG_RW, 0, @@ -178,10 +178,10 @@ nfs_nfsiodnew_sync(void) } if (i == ncl_iodmax) return (0); - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); error = kproc_create(nfssvc_iod, nfs_asyncdaemon + i, NULL, RFHIGHPID, 0, "newnfs %d", i); - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); if (error == 0) { ncl_numasync++; ncl_iodwant[i] = NFSIOD_AVAILABLE; @@ -194,12 +194,12 @@ void ncl_nfsiodnew_tq(__unused void *arg, int pending) { - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); while (pending > 0) { pending--; nfs_nfsiodnew_sync(); } - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); } void @@ -217,7 +217,7 @@ nfsiod_setup(void *dummy) TUNABLE_INT_FETCH("vfs.nfs.iodmin", &nfs_iodmin); nfscl_init(); - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); /* Silently limit the start number of nfsiod's */ if (nfs_iodmin > NFS_MAXASYNCDAEMON) nfs_iodmin = NFS_MAXASYNCDAEMON; @@ -227,7 +227,7 @@ nfsiod_setup(void *dummy) if (error == -1) panic("nfsiod_setup: nfs_nfsiodnew failed"); } - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); } SYSINIT(newnfsiod, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, nfsiod_setup, NULL); @@ -248,7 +248,7 @@ nfssvc_iod(void *instance) int myiod, timo; int error = 0; - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); myiod = (int *)instance - nfs_asyncdaemon; /* * Main loop @@ -291,7 +291,7 @@ nfssvc_iod(void *instance) nmp->nm_bufqwant = 0; wakeup(&nmp->nm_bufq); } - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); if (bp->b_flags & B_DIRECT) { KASSERT((bp->b_iocmd == BIO_WRITE), ("nfscvs_iod: BIO_WRITE not set")); 
(void)ncl_doio_directwrite(bp); @@ -303,7 +303,7 @@ nfssvc_iod(void *instance) (void) ncl_doio(bp->b_vp, bp, bp->b_wcred, NULL, 0); } - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); /* * Make sure the nmp hasn't been dismounted as soon as * ncl_doio() completes for the last buffer. @@ -335,7 +335,7 @@ nfssvc_iod(void *instance) /* Someone may be waiting for the last nfsiod to terminate. */ if (--ncl_numasync == 0) wakeup(&ncl_numasync); - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); if ((error == 0) || (error == EWOULDBLOCK)) kproc_exit(0); /* Abnormal termination */ diff --git a/sys/fs/nfsclient/nfs_clsubs.c b/sys/fs/nfsclient/nfs_clsubs.c index 7691d7b5c469..8f0ebadb6de7 100644 --- a/sys/fs/nfsclient/nfs_clsubs.c +++ b/sys/fs/nfsclient/nfs_clsubs.c @@ -102,7 +102,7 @@ ncl_uninit(struct vfsconf *vfsp) * Tell all nfsiod processes to exit. Clear ncl_iodmax, and wakeup * any sleeping nfsiods so they check ncl_iodmax and exit. */ - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); ncl_iodmax = 0; for (i = 0; i < ncl_numasync; i++) if (ncl_iodwant[i] == NFSIOD_AVAILABLE) @@ -110,7 +110,7 @@ ncl_uninit(struct vfsconf *vfsp) /* The last nfsiod to exit will wake us up when ncl_numasync hits 0 */ while (ncl_numasync) msleep(&ncl_numasync, &ncl_iod_mutex, PWAIT, "ioddie", 0); - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); ncl_nhuninit(); return (0); #else diff --git a/sys/fs/nfsclient/nfs_clvfsops.c b/sys/fs/nfsclient/nfs_clvfsops.c index d3587e89c552..e0cc1f079235 100644 --- a/sys/fs/nfsclient/nfs_clvfsops.c +++ b/sys/fs/nfsclient/nfs_clvfsops.c @@ -1713,13 +1713,13 @@ nfs_unmount(struct mount *mp, int mntflags) mtx_unlock(&nmp->nm_mtx); } /* Make sure no nfsiods are assigned to this mount. */ - mtx_lock(&ncl_iod_mutex); + NFSLOCKIOD(); for (i = 0; i < NFS_MAXASYNCDAEMON; i++) if (ncl_iodmount[i] == nmp) { ncl_iodwant[i] = NFSIOD_AVAILABLE; ncl_iodmount[i] = NULL; } - mtx_unlock(&ncl_iod_mutex); + NFSUNLOCKIOD(); /* * We can now set mnt_data to NULL and wait for From 087d0e0b5e75b5d4bbe23fcd0e6f74be3c904fd3 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Wed, 25 Sep 2019 00:24:57 +0000 Subject: [PATCH 047/106] After my comnd changes, the number of threads and size weren't set. In addition, the flags are optional, but were made to be mandatory. Set these things, as well as santiy check the specified size. 
Submitted by: Stefan Rink PR: 240798 --- sbin/nvmecontrol/perftest.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sbin/nvmecontrol/perftest.c b/sbin/nvmecontrol/perftest.c index 907acbbf49e5..74a4bd18f75f 100644 --- a/sbin/nvmecontrol/perftest.c +++ b/sbin/nvmecontrol/perftest.c @@ -143,9 +143,9 @@ perftest(const struct cmd *f, int argc, char *argv[]) if (arg_parse(argc, argv, f)) return; - if (opt.flags == NULL || opt.op == NULL) + if (opt.op == NULL) arg_help(argc, argv, f); - if (strcmp(opt.flags, "refthread") == 0) + if (opt.flags != NULL && strcmp(opt.flags, "refthread") == 0) io_test.flags |= NVME_TEST_FLAG_REFTHREAD; if (opt.intr != NULL) { if (strcmp(opt.intr, "bio") == 0 || @@ -163,6 +163,7 @@ perftest(const struct cmd *f, int argc, char *argv[]) fprintf(stderr, "Bad number of threads %d\n", opt.threads); arg_help(argc, argv, f); } + io_test.num_threads = opt.threads; if (strcasecmp(opt.op, "read") == 0) io_test.opc = NVME_OPC_READ; else if (strcasecmp(opt.op, "write") == 0) @@ -176,6 +177,11 @@ perftest(const struct cmd *f, int argc, char *argv[]) arg_help(argc, argv, f); } io_test.time = opt.time; + if (opt.size < 0) { + fprintf(stderr, "Invalid size.\n"); + arg_help(argc, argv, f); + } + io_test.size = opt.size; open_dev(opt.dev, &fd, 1, 1); if (ioctl(fd, ioctl_cmd, &io_test) < 0) err(1, "ioctl NVME_IO_TEST failed"); From 0cfa351ec5ca2833a9bcb3683cf1cb5473cff31f Mon Sep 17 00:00:00 2001 From: Justin Hibbits Date: Wed, 25 Sep 2019 01:23:08 +0000 Subject: [PATCH 048/106] powerpc: Allocate DPCPU block from domain-local memory This should improve NUMA scalability a little, by binding to the CPU's NUMA domain. This matches what's done on amd64. --- sys/powerpc/powerpc/mp_machdep.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/sys/powerpc/powerpc/mp_machdep.c b/sys/powerpc/powerpc/mp_machdep.c index 3fb22fcd54f5..cff771d7af19 100644 --- a/sys/powerpc/powerpc/mp_machdep.c +++ b/sys/powerpc/powerpc/mp_machdep.c @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -149,7 +150,7 @@ cpu_mp_start(void) { struct cpuref bsp, cpu; struct pcpu *pc; - int error; + int domain, error; error = platform_smp_get_bsp(&bsp); KASSERT(error == 0, ("Don't know BSP")); @@ -166,12 +167,18 @@ cpu_mp_start(void) cpu.cr_cpuid); goto next; } + + if (vm_ndomains > 1) + domain = cpu.cr_domain; + else + domain = 0; + if (cpu.cr_cpuid != bsp.cr_cpuid) { void *dpcpu; pc = &__pcpu[cpu.cr_cpuid]; - dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | - M_ZERO); + dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain), + DPCPU_SIZE, M_WAITOK | M_ZERO); pcpu_init(pc, cpu.cr_cpuid, sizeof(*pc)); dpcpu_init(dpcpu, cpu.cr_cpuid); } else { @@ -179,13 +186,9 @@ cpu_mp_start(void) pc->pc_cpuid = bsp.cr_cpuid; pc->pc_bsp = 1; } + pc->pc_domain = domain; pc->pc_hwref = cpu.cr_hwref; - if (vm_ndomains > 1) - pc->pc_domain = cpu.cr_domain; - else - pc->pc_domain = 0; - CPU_SET(pc->pc_cpuid, &cpuset_domain[pc->pc_domain]); KASSERT(pc->pc_domain < MAXMEMDOM, ("bad domain value %d\n", pc->pc_domain)); From e44ed9d3d4fb0901547ee06b437e99979c5de7cc Mon Sep 17 00:00:00 2001 From: Justin Hibbits Date: Wed, 25 Sep 2019 01:39:58 +0000 Subject: [PATCH 049/106] powerpc/atomic: Follow recommendations on atomic primitive comparisons Both IBM and Freescale programming examples presume the cmpset operands will favor equal, and pessimize the non-equal case instead. 
Do the same for atomic_cmpset_* and atomic_fcmpset_*. This slightly pessimizes the failure case, in favor of the success case. MFC after: 3 weeks --- sys/powerpc/include/atomic.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sys/powerpc/include/atomic.h b/sys/powerpc/include/atomic.h index d379a7be8e89..776c2a15aee6 100644 --- a/sys/powerpc/include/atomic.h +++ b/sys/powerpc/include/atomic.h @@ -568,7 +568,7 @@ atomic_cmpset_int(volatile u_int* p, u_int cmpval, u_int newval) __asm __volatile ( "1:\tlwarx %0, 0, %2\n\t" /* load old value */ "cmplw %3, %0\n\t" /* compare */ - "bne 2f\n\t" /* exit if not equal */ + "bne- 2f\n\t" /* exit if not equal */ "stwcx. %4, 0, %2\n\t" /* attempt to store */ "bne- 1b\n\t" /* spin if failed */ "li %0, 1\n\t" /* success - retval = 1 */ @@ -592,12 +592,12 @@ atomic_cmpset_long(volatile u_long* p, u_long cmpval, u_long newval) #ifdef __powerpc64__ "1:\tldarx %0, 0, %2\n\t" /* load old value */ "cmpld %3, %0\n\t" /* compare */ - "bne 2f\n\t" /* exit if not equal */ + "bne- 2f\n\t" /* exit if not equal */ "stdcx. %4, 0, %2\n\t" /* attempt to store */ #else "1:\tlwarx %0, 0, %2\n\t" /* load old value */ "cmplw %3, %0\n\t" /* compare */ - "bne 2f\n\t" /* exit if not equal */ + "bne- 2f\n\t" /* exit if not equal */ "stwcx. %4, 0, %2\n\t" /* attempt to store */ #endif "bne- 1b\n\t" /* spin if failed */ @@ -684,7 +684,7 @@ atomic_fcmpset_int(volatile u_int *p, u_int *cmpval, u_int newval) __asm __volatile ( "lwarx %0, 0, %3\n\t" /* load old value */ "cmplw %4, %0\n\t" /* compare */ - "bne 1f\n\t" /* exit if not equal */ + "bne- 1f\n\t" /* exit if not equal */ "stwcx. %5, 0, %3\n\t" /* attempt to store */ "bne- 1f\n\t" /* exit if failed */ "li %0, 1\n\t" /* success - retval = 1 */ @@ -709,12 +709,12 @@ atomic_fcmpset_long(volatile u_long *p, u_long *cmpval, u_long newval) #ifdef __powerpc64__ "ldarx %0, 0, %3\n\t" /* load old value */ "cmpld %4, %0\n\t" /* compare */ - "bne 1f\n\t" /* exit if not equal */ + "bne- 1f\n\t" /* exit if not equal */ "stdcx. %5, 0, %3\n\t" /* attempt to store */ #else "lwarx %0, 0, %3\n\t" /* load old value */ "cmplw %4, %0\n\t" /* compare */ - "bne 1f\n\t" /* exit if not equal */ + "bne- 1f\n\t" /* exit if not equal */ "stwcx. %5, 0, %3\n\t" /* attempt to store */ #endif "bne- 1f\n\t" /* exit if failed */ From 5b80de237b53fd14e3926dd58fc0b78be983ed84 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 02:37:40 +0000 Subject: [PATCH 050/106] cron: add log suppression and mail suppression for successful runs This commit adds two new extensions to crontab, ported from OpenBSD: - -n: suppress mail on succesful run - -q: suppress logging of command execution The -q option appears decades old, but -n is relatively new. The original proposal by Job Snijder can be found here [1], and gives very convincing reasons for inclusion in base. This patch is a nearly identical port of OpenBSD cron for -q and -n features. It is written to follow existing conventions and style of the existing codebase. 
Example usage: # should only send email, but won't show up in log * * * * * -q date # should not send email * * * * * -n date # should not send email or log * * * * * -n -q date # should send email because of ping failure * * * * * -n -q ping -c 1 5.5.5.5 [1]: https://marc.info/?l=openbsd-tech&m=152874866117948&w=2 PR: 237538 Submitted by: Naveen Nathan Reviewed by: bcr (manpages) MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D20046 --- usr.sbin/cron/cron/cron.h | 4 +- usr.sbin/cron/cron/do_command.c | 137 +++++++++++++++++++------------- usr.sbin/cron/cron/popen.c | 6 +- usr.sbin/cron/crontab/crontab.5 | 32 +++++++- usr.sbin/cron/lib/entry.c | 51 +++++++++++- 5 files changed, 171 insertions(+), 59 deletions(-) diff --git a/usr.sbin/cron/cron/cron.h b/usr.sbin/cron/cron/cron.h index bb4f5287daea..f0f9e88d6b59 100644 --- a/usr.sbin/cron/cron/cron.h +++ b/usr.sbin/cron/cron/cron.h @@ -191,6 +191,8 @@ typedef struct _entry { #define NOT_UNTIL 0x10 #define SEC_RES 0x20 #define INTERVAL 0x40 +#define DONT_LOG 0x80 +#define MAIL_WHEN_ERR 0x100 time_t lastrun; } entry; @@ -257,7 +259,7 @@ user *load_user(int, struct passwd *, char *), entry *load_entry(FILE *, void (*)(char *), struct passwd *, char **); -FILE *cron_popen(char *, char *, entry *); +FILE *cron_popen(char *, char *, entry *, PID_T *); /* in the C tradition, we only create diff --git a/usr.sbin/cron/cron/do_command.c b/usr.sbin/cron/cron/do_command.c index 13d14e442a27..3810e5290285 100644 --- a/usr.sbin/cron/cron/do_command.c +++ b/usr.sbin/cron/cron/do_command.c @@ -41,6 +41,7 @@ static const char rcsid[] = static void child_process(entry *, user *), do_univ(user *); +static WAIT_T wait_on_child(PID_T, const char *); void do_command(e, u) @@ -94,7 +95,10 @@ child_process(e, u) int stdin_pipe[2], stdout_pipe[2]; register char *input_data; char *usernm, *mailto, *mailfrom; - int children = 0; + PID_T jobpid, stdinjob, mailpid; + register FILE *mail; + register int bytes = 1; + int status = 0; # if defined(LOGIN_CAP) struct passwd *pwd; login_cap_t *lc; @@ -216,7 +220,7 @@ child_process(e, u) /* fork again, this time so we can exec the user's command. */ - switch (vfork()) { + switch (jobpid = vfork()) { case -1: log_it("CRON",getpid(),"error","can't vfork"); exit(ERROR_EXIT); @@ -237,7 +241,7 @@ child_process(e, u) * the actual user command shell was going to get and the * PID is part of the log message. */ - /*local*/{ + if ((e->flags & DONT_LOG) == 0) { char *x = mkprints((u_char *)e->cmd, strlen(e->cmd)); log_it(usernm, getpid(), "CMD", x); @@ -359,8 +363,6 @@ child_process(e, u) break; } - children++; - /* middle process, child of original cron, parent of process running * the user's command. */ @@ -384,7 +386,7 @@ child_process(e, u) * we would block here. thus we must fork again. */ - if (*input_data && fork() == 0) { + if (*input_data && (stdinjob = fork()) == 0) { register FILE *out = fdopen(stdin_pipe[WRITE_PIPE], "w"); register int need_newline = FALSE; register int escaped = FALSE; @@ -440,8 +442,6 @@ child_process(e, u) */ close(stdin_pipe[WRITE_PIPE]); - children++; - /* * read output from the grandchild. it's stderr has been redirected to * it's stdout, which has been redirected to our pipe. 
if there is any @@ -462,10 +462,6 @@ child_process(e, u) ch = getc(in); if (ch != EOF) { - register FILE *mail; - register int bytes = 1; - int status = 0; - Debug(DPROC|DEXT, ("[%d] got data (%x:%c) from grandchild\n", getpid(), ch, ch)) @@ -500,7 +496,7 @@ child_process(e, u) hostname[sizeof(hostname) - 1] = '\0'; (void) snprintf(mailcmd, sizeof(mailcmd), MAILARGS, MAILCMD); - if (!(mail = cron_popen(mailcmd, "w", e))) { + if (!(mail = cron_popen(mailcmd, "w", e, &mailpid))) { warn("%s", MAILCMD); (void) _exit(ERROR_EXIT); } @@ -538,28 +534,56 @@ child_process(e, u) if (mailto) putc(ch, mail); } + } + /*if data from grandchild*/ - /* only close pipe if we opened it -- i.e., we're - * mailing... + Debug(DPROC, ("[%d] got EOF from grandchild\n", getpid())) + + /* also closes stdout_pipe[READ_PIPE] */ + fclose(in); + } + + /* wait for children to die. + */ + if (jobpid > 0) { + WAIT_T waiter; + + waiter = wait_on_child(jobpid, "grandchild command job"); + + /* If everything went well, and -n was set, _and_ we have mail, + * we won't be mailing... so shoot the messenger! + */ + if (WIFEXITED(waiter) && WEXITSTATUS(waiter) == 0 + && (e->flags & MAIL_WHEN_ERR) == MAIL_WHEN_ERR + && mailto) { + Debug(DPROC, ("[%d] %s executed successfully, mail suppressed\n", + getpid(), "grandchild command job")) + kill(mailpid, SIGKILL); + (void)fclose(mail); + mailto = NULL; + } + + + /* only close pipe if we opened it -- i.e., we're + * mailing... + */ + + if (mailto) { + Debug(DPROC, ("[%d] closing pipe to mail\n", + getpid())) + /* Note: the pclose will probably see + * the termination of the grandchild + * in addition to the mail process, since + * it (the grandchild) is likely to exit + * after closing its stdout. */ - - if (mailto) { - Debug(DPROC, ("[%d] closing pipe to mail\n", - getpid())) - /* Note: the pclose will probably see - * the termination of the grandchild - * in addition to the mail process, since - * it (the grandchild) is likely to exit - * after closing its stdout. - */ - status = cron_pclose(mail); - } + status = cron_pclose(mail); /* if there was output and we could not mail it, * log the facts so the poor user can figure out * what's going on. */ - if (mailto && status) { + if (status) { char buf[MAX_TEMPSTR]; snprintf(buf, sizeof(buf), @@ -568,35 +592,38 @@ child_process(e, u) status); log_it(usernm, getpid(), "MAIL", buf); } - - } /*if data from grandchild*/ - - Debug(DPROC, ("[%d] got EOF from grandchild\n", getpid())) - - fclose(in); /* also closes stdout_pipe[READ_PIPE] */ - } - - /* wait for children to die. 
- */ - for (; children > 0; children--) - { - WAIT_T waiter; - PID_T pid; - - Debug(DPROC, ("[%d] waiting for grandchild #%d to finish\n", - getpid(), children)) - pid = wait(&waiter); - if (pid < OK) { - Debug(DPROC, ("[%d] no more grandchildren--mail written?\n", - getpid())) - break; } - Debug(DPROC, ("[%d] grandchild #%d finished, status=%04x", - getpid(), pid, WEXITSTATUS(waiter))) - if (WIFSIGNALED(waiter) && WCOREDUMP(waiter)) - Debug(DPROC, (", dumped core")) - Debug(DPROC, ("\n")) } + + if (*input_data && stdinjob > 0) + wait_on_child(stdinjob, "grandchild stdinjob"); +} + +static WAIT_T +wait_on_child(PID_T childpid, const char *name) { + WAIT_T waiter; + PID_T pid; + + Debug(DPROC, ("[%d] waiting for %s (%d) to finish\n", + getpid(), name, childpid)) + +#ifdef POSIX + while ((pid = waitpid(childpid, &waiter, 0)) < 0 && errno == EINTR) +#else + while ((pid = wait4(childpid, &waiter, 0, NULL)) < 0 && errno == EINTR) +#endif + ; + + if (pid < OK) + return waiter; + + Debug(DPROC, ("[%d] %s (%d) finished, status=%04x", + getpid(), name, pid, WEXITSTATUS(waiter))) + if (WIFSIGNALED(waiter) && WCOREDUMP(waiter)) + Debug(DPROC, (", dumped core")) + Debug(DPROC, ("\n")) + + return waiter; } diff --git a/usr.sbin/cron/cron/popen.c b/usr.sbin/cron/cron/popen.c index 01e62bf2bab2..73e6e28d748a 100644 --- a/usr.sbin/cron/cron/popen.c +++ b/usr.sbin/cron/cron/popen.c @@ -55,9 +55,10 @@ static PID_T *pids; static int fds; FILE * -cron_popen(program, type, e) +cron_popen(program, type, e, pidptr) char *program, *type; entry *e; + PID_T *pidptr; { register char *cp; FILE *iop; @@ -218,6 +219,9 @@ cron_popen(program, type, e) free((char *)argv[argc]); } #endif + + *pidptr = pid; + return(iop); } diff --git a/usr.sbin/cron/crontab/crontab.5 b/usr.sbin/cron/crontab/crontab.5 index 8988574f3745..9943adfaf356 100644 --- a/usr.sbin/cron/crontab/crontab.5 +++ b/usr.sbin/cron/crontab/crontab.5 @@ -17,7 +17,7 @@ .\" .\" $FreeBSD$ .\" -.Dd April 19, 2019 +.Dd September 24, 2019 .Dt CRONTAB 5 .Os .Sh NAME @@ -199,6 +199,8 @@ lists of names are not allowed. .Pp The ``sixth'' field (the rest of the line) specifies the command to be run. +One or more command options may precede the command to modify processing +behavior. The entire command portion of the line, up to a newline or % character, will be executed by .Pa /bin/sh @@ -211,6 +213,22 @@ Percent-signs (%) in the command, unless escaped with backslash after the first % will be sent to the command as standard input. .Pp +The following command options can be supplied: +.Bl -tag -width Ds +.It Fl n +No mail is sent after a successful run. +The execution output will only be mailed if the command exits with a non-zero +exit code. +The +.Fl n +option is an attempt to cure potentially copious volumes of mail coming from +.Xr cron 8 . +.It Fl q +Execution will not be logged. +.El +.sp +Duplicate options are not allowed. +.Pp Note: The day of a command's execution can be specified by two fields \(em day of month, and day of week. If both fields are @@ -271,6 +289,10 @@ MAILTO=paul 5 4 * * sun echo "run at 5 after 4 every sunday" # run at 5 minutes intervals, no matter how long it takes @300 svnlite up /usr/src +# run every minute, suppress logging +* * * * * -q date +# run every minute, only send mail if ping fails +* * * * * -n ping -c 1 freebsd.org .Ed .Sh SEE ALSO .Xr crontab 1 , @@ -314,6 +336,14 @@ All of the .Sq @ directives that can appear in place of the first five fields are extensions. +.Pp +Command processing can be modified using command options. 
+The +.Sq -q +option suppresses logging. +The +.Sq -n +option does not mail on successful run. .Sh AUTHORS .An Paul Vixie Aq Mt paul@vix.com .Sh BUGS diff --git a/usr.sbin/cron/lib/entry.c b/usr.sbin/cron/lib/entry.c index a8ec3ae8b568..8c579b8f511d 100644 --- a/usr.sbin/cron/lib/entry.c +++ b/usr.sbin/cron/lib/entry.c @@ -35,7 +35,8 @@ static const char rcsid[] = typedef enum ecode { e_none, e_minute, e_hour, e_dom, e_month, e_dow, - e_cmd, e_timespec, e_username, e_group, e_mem + e_cmd, e_timespec, e_username, e_group, e_option, + e_mem #ifdef LOGIN_CAP , e_class #endif @@ -58,6 +59,7 @@ static char *ecodes[] = "bad time specifier", "bad username", "bad group name", + "bad option", "out of memory", #ifdef LOGIN_CAP "bad class name", @@ -429,6 +431,53 @@ load_entry(file, error_func, pw, envp) } #endif + Debug(DPARS, ("load_entry()...checking for command options\n")) + + ch = get_char(file); + + while (ch == '-') { + Debug(DPARS|DEXT, ("load_entry()...expecting option\n")) + switch (ch = get_char(file)) { + case 'n': + Debug(DPARS|DEXT, ("load_entry()...got MAIL_WHEN_ERR ('n') option\n")) + /* only allow the user to set the option once */ + if ((e->flags & MAIL_WHEN_ERR) == MAIL_WHEN_ERR) { + Debug(DPARS|DEXT, ("load_entry()...duplicate MAIL_WHEN_ERR ('n') option\n")) + ecode = e_option; + goto eof; + } + e->flags |= MAIL_WHEN_ERR; + break; + case 'q': + Debug(DPARS|DEXT, ("load_entry()...got DONT_LOG ('q') option\n")) + /* only allow the user to set the option once */ + if ((e->flags & DONT_LOG) == DONT_LOG) { + Debug(DPARS|DEXT, ("load_entry()...duplicate DONT_LOG ('q') option\n")) + ecode = e_option; + goto eof; + } + e->flags |= DONT_LOG; + break; + default: + Debug(DPARS|DEXT, ("load_entry()...invalid option '%c'\n", ch)) + ecode = e_option; + goto eof; + } + ch = get_char(file); + if (ch!='\t' && ch!=' ') { + ecode = e_option; + goto eof; + } + + Skip_Blanks(ch, file) + if (ch == EOF || ch == '\n') { + ecode = e_cmd; + goto eof; + } + } + + unget_char(ch, file); + Debug(DPARS, ("load_entry()...about to parse command\n")) /* Everything up to the next \n or EOF is part of the command... From 233ab015c0d735089bd6863f8d0a0fd25b62c2a6 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Wed, 25 Sep 2019 07:09:25 +0000 Subject: [PATCH 051/106] loader: add teken.fg_color and teken.bg_color variables Add settable variables to control teken default color attributes. The supported colors are 0-7 or basic color names: black, red, green, brown, blue, magenta, cyan, white. The current implementation does add some duplication which will be addressed later. 
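Example usage (illustrative only; the variable names are the ones added by this
change, and the numeric values assume the name-to-number mapping follows the
order listed above, i.e. black=0 ... white=7):

# at the loader OK prompt
set teken.fg_color=white
set teken.bg_color=blue

# or the equivalent numeric form in loader.conf(5)
teken.fg_color="7"
teken.bg_color="4"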
--- stand/efi/libefi/efi_console.c | 92 ++++++++++++++++++++++++++++ stand/i386/libi386/vidconsole.c | 105 ++++++++++++++++++++++++++++++-- 2 files changed, 193 insertions(+), 4 deletions(-) diff --git a/stand/efi/libefi/efi_console.c b/stand/efi/libefi/efi_console.c index 6c8522111c80..746d57f9cb68 100644 --- a/stand/efi/libefi/efi_console.c +++ b/stand/efi/libefi/efi_console.c @@ -343,6 +343,91 @@ efi_cons_probe(struct console *cp) cp->c_flags |= C_PRESENTIN | C_PRESENTOUT; } +static bool +color_name_to_teken(const char *name, int *val) +{ + if (strcasecmp(name, "black") == 0) { + *val = TC_BLACK; + return (true); + } + if (strcasecmp(name, "red") == 0) { + *val = TC_RED; + return (true); + } + if (strcasecmp(name, "green") == 0) { + *val = TC_GREEN; + return (true); + } + if (strcasecmp(name, "brown") == 0) { + *val = TC_BROWN; + return (true); + } + if (strcasecmp(name, "blue") == 0) { + *val = TC_BLUE; + return (true); + } + if (strcasecmp(name, "magenta") == 0) { + *val = TC_MAGENTA; + return (true); + } + if (strcasecmp(name, "cyan") == 0) { + *val = TC_CYAN; + return (true); + } + if (strcasecmp(name, "white") == 0) { + *val = TC_WHITE; + return (true); + } + return (false); +} + +static int +efi_set_colors(struct env_var *ev, int flags, const void *value) +{ + int val = 0; + char buf[2]; + const void *evalue; + const teken_attr_t *ap; + teken_attr_t a; + + if (value == NULL) + return (CMD_OK); + + if (color_name_to_teken(value, &val)) { + snprintf(buf, sizeof (buf), "%d", val); + evalue = buf; + } else { + char *end; + + errno = 0; + val = (int)strtol(value, &end, 0); + if (errno != 0 || *end != '\0') { + printf("Allowed values are either ansi color name or " + "number from range [0-7].\n"); + return (CMD_OK); + } + evalue = value; + } + + ap = teken_get_defattr(&teken); + a = *ap; + if (strcmp(ev->ev_name, "teken.fg_color") == 0) { + /* is it already set? */ + if (ap->ta_fgcolor == val) + return (CMD_OK); + a.ta_fgcolor = val; + } + if (strcmp(ev->ev_name, "teken.bg_color") == 0) { + /* is it already set? 
*/ + if (ap->ta_bgcolor == val) + return (CMD_OK); + a.ta_bgcolor = val; + } + env_setenv(ev->ev_name, flags | EV_NOHOOK, evalue, NULL, NULL); + teken_set_defattr(&teken, &a); + return (CMD_OK); +} + bool efi_cons_update_mode(void) { @@ -374,6 +459,13 @@ efi_cons_update_mode(void) teken_set_winsize(&teken, &tp); a = teken_get_defattr(&teken); + snprintf(env, sizeof(env), "%d", a->ta_fgcolor); + env_setenv("teken.fg_color", EV_VOLATILE, env, efi_set_colors, + env_nounset); + snprintf(env, sizeof(env), "%d", a->ta_bgcolor); + env_setenv("teken.bg_color", EV_VOLATILE, env, efi_set_colors, + env_nounset); + for (int row = 0; row < rows; row++) for (int col = 0; col < cols; col++) { buffer[col + row * tp.tp_col].c = ' '; diff --git a/stand/i386/libi386/vidconsole.c b/stand/i386/libi386/vidconsole.c index e947d274e89a..45ea873bf592 100644 --- a/stand/i386/libi386/vidconsole.c +++ b/stand/i386/libi386/vidconsole.c @@ -569,6 +569,96 @@ vidc_probe(struct console *cp) cp->c_flags |= C_PRESENTOUT; } +static bool +color_name_to_teken(const char *name, int *val) +{ + if (strcasecmp(name, "black") == 0) { + *val = TC_BLACK; + return (true); + } + if (strcasecmp(name, "red") == 0) { + *val = TC_RED; + return (true); + } + if (strcasecmp(name, "green") == 0) { + *val = TC_GREEN; + return (true); + } + if (strcasecmp(name, "brown") == 0) { + *val = TC_BROWN; + return (true); + } + if (strcasecmp(name, "blue") == 0) { + *val = TC_BLUE; + return (true); + } + if (strcasecmp(name, "magenta") == 0) { + *val = TC_MAGENTA; + return (true); + } + if (strcasecmp(name, "cyan") == 0) { + *val = TC_CYAN; + return (true); + } + if (strcasecmp(name, "white") == 0) { + *val = TC_WHITE; + return (true); + } + return (false); +} + +static int +vidc_set_colors(struct env_var *ev, int flags, const void *value) +{ + int val = 0; + char buf[2]; + const void *evalue; + const teken_attr_t *ap; + teken_attr_t a; + + if (value == NULL) + return (CMD_OK); + + if (color_name_to_teken(value, &val)) { + snprintf(buf, sizeof (buf), "%d", val); + evalue = buf; + } else { + char *end; + + errno = 0; + val = (int)strtol(value, &end, 0); + if (errno != 0 || *end != '\0') { + printf("Allowed values are either ansi color name or " + "number from range [0-7].\n"); + return (CMD_OK); + } + evalue = value; + } + + ap = teken_get_defattr(&teken); + a = *ap; + if (strcmp(ev->ev_name, "teken.fg_color") == 0) { + /* is it already set? */ + if (ap->ta_fgcolor == val) + return (CMD_OK); + a.ta_fgcolor = val; + } + if (strcmp(ev->ev_name, "teken.bg_color") == 0) { + /* is it already set? 
*/ + if (ap->ta_bgcolor == val) + return (CMD_OK); + a.ta_bgcolor = val; + } + + /* Improve visibility */ + if (a.ta_bgcolor == TC_WHITE) + a.ta_bgcolor |= TC_LIGHT; + + env_setenv(ev->ev_name, flags | EV_NOHOOK, evalue, NULL, NULL); + teken_set_defattr(&teken, &a); + return (CMD_OK); +} + static int vidc_init(int arg) { @@ -603,15 +693,22 @@ vidc_init(int arg) if (buffer == NULL) return (1); - teken_init(&teken, &tf, NULL); - teken_set_winsize(&teken, &tp); - a = teken_get_defattr(&teken); - snprintf(env, sizeof (env), "%u", tp.tp_row); setenv("LINES", env, 1); snprintf(env, sizeof (env), "%u", tp.tp_col); setenv("COLUMNS", env, 1); + teken_init(&teken, &tf, NULL); + teken_set_winsize(&teken, &tp); + a = teken_get_defattr(&teken); + + snprintf(env, sizeof(env), "%d", a->ta_fgcolor); + env_setenv("teken.fg_color", EV_VOLATILE, env, vidc_set_colors, + env_nounset); + snprintf(env, sizeof(env), "%d", a->ta_bgcolor); + env_setenv("teken.bg_color", EV_VOLATILE, env, vidc_set_colors, + env_nounset); + for (int row = 0; row < tp.tp_row; row++) for (int col = 0; col < tp.tp_col; col++) { buffer[col + row * tp.tp_col].c = ' '; From 8074c5cc861f4072dc019ac06fe17ba8313b56ad Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Wed, 25 Sep 2019 07:36:35 +0000 Subject: [PATCH 052/106] loader: fix indentation in efi_console and vidconsole Remove extra tab. Reported by: yuripv --- stand/efi/libefi/efi_console.c | 2 +- stand/i386/libi386/vidconsole.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/stand/efi/libefi/efi_console.c b/stand/efi/libefi/efi_console.c index 746d57f9cb68..26e051fe73ad 100644 --- a/stand/efi/libefi/efi_console.c +++ b/stand/efi/libefi/efi_console.c @@ -374,7 +374,7 @@ color_name_to_teken(const char *name, int *val) *val = TC_CYAN; return (true); } - if (strcasecmp(name, "white") == 0) { + if (strcasecmp(name, "white") == 0) { *val = TC_WHITE; return (true); } diff --git a/stand/i386/libi386/vidconsole.c b/stand/i386/libi386/vidconsole.c index 45ea873bf592..5025168dc661 100644 --- a/stand/i386/libi386/vidconsole.c +++ b/stand/i386/libi386/vidconsole.c @@ -600,7 +600,7 @@ color_name_to_teken(const char *name, int *val) *val = TC_CYAN; return (true); } - if (strcasecmp(name, "white") == 0) { + if (strcasecmp(name, "white") == 0) { *val = TC_WHITE; return (true); } From a35a97ae12bf013bd06fe6b08ceecbd019f7aedc Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Wed, 25 Sep 2019 07:51:30 +0000 Subject: [PATCH 053/106] Size is unsigned, so remove the test entirely. The kernel won't crash if you have a bad value and I'd rather not have nvmecontrol know the internal details about how the nvme driver limits the transfer size. --- sbin/nvmecontrol/perftest.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sbin/nvmecontrol/perftest.c b/sbin/nvmecontrol/perftest.c index 74a4bd18f75f..cc3beece5534 100644 --- a/sbin/nvmecontrol/perftest.c +++ b/sbin/nvmecontrol/perftest.c @@ -177,10 +177,6 @@ perftest(const struct cmd *f, int argc, char *argv[]) arg_help(argc, argv, f); } io_test.time = opt.time; - if (opt.size < 0) { - fprintf(stderr, "Invalid size.\n"); - arg_help(argc, argv, f); - } io_test.size = opt.size; open_dev(opt.dev, &fd, 1, 1); if (ioctl(fd, ioctl_cmd, &io_test) < 0) From bb3dfc6ae90f67c1900f616ac7948a1112cfae32 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Sep 2019 11:58:54 +0000 Subject: [PATCH 054/106] Fix wrong assertion in r352658. 
MFC after: 1 month --- sys/kern/sched_ule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 9d73a59c74fd..5568facbc80f 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1345,7 +1345,7 @@ sched_pickcpu(struct thread *td, int flags) if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } - KASSERT(cpu < 0, ("sched_pickcpu: Failed to find a cpu.")); + KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. From 53e73a5935f0f4d6c5afd5f4f53428ae0b6a9dd6 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 13:04:34 +0000 Subject: [PATCH 055/106] RELNOTES: Document r352668 (crontab -n and -q options) Suggested by: bapt --- RELNOTES | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/RELNOTES b/RELNOTES index c92d92bd9c15..f5aba43277c5 100644 --- a/RELNOTES +++ b/RELNOTES @@ -10,6 +10,11 @@ newline. Entries should be separated by a newline. Changes to this file should not be MFCed. +r352668: + cron(8) now supports the -n (suppress mail on succesful run) and -q + (suppress logging of command execution) options in the crontab format. + See the crontab(5) manpage for details. + r352304: ntpd is no longer by default locked in memory. rlimit memlock 32 or rlimit memlock 0 can be used to restore this behaviour. From 3001e0c942d803c83e77e593a34cd3c5a12284ac Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Wed, 25 Sep 2019 13:21:07 +0000 Subject: [PATCH 056/106] kernel: terminal_init() should check for teken colors from kenv Check for teken.fg_color and teken.bg_color and prepare the color attributes accordingly. When white background is used, make it light to improve visibility. When black background is used, make kernel messages light. --- sys/kern/subr_terminal.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/sys/kern/subr_terminal.c b/sys/kern/subr_terminal.c index dbb94f57ec36..be52d0618dfc 100644 --- a/sys/kern/subr_terminal.c +++ b/sys/kern/subr_terminal.c @@ -124,13 +124,13 @@ static teken_funcs_t terminal_drawmethods = { }; /* Kernel message formatting. 
*/ -static const teken_attr_t kernel_message = { +static teken_attr_t kernel_message = { .ta_fgcolor = TCHAR_FGCOLOR(TERMINAL_KERN_ATTR), .ta_bgcolor = TCHAR_BGCOLOR(TERMINAL_KERN_ATTR), .ta_format = TCHAR_FORMAT(TERMINAL_KERN_ATTR) }; -static const teken_attr_t default_message = { +static teken_attr_t default_message = { .ta_fgcolor = TCHAR_FGCOLOR(TERMINAL_NORM_ATTR), .ta_bgcolor = TCHAR_BGCOLOR(TERMINAL_NORM_ATTR), .ta_format = TCHAR_FORMAT(TERMINAL_NORM_ATTR) @@ -168,10 +168,33 @@ static const teken_attr_t default_message = { static void terminal_init(struct terminal *tm) { + int fg, bg; if (tm->tm_flags & TF_CONS) mtx_init(&tm->tm_mtx, "trmlck", NULL, MTX_SPIN); + teken_init(&tm->tm_emulator, &terminal_drawmethods, tm); + + TUNABLE_INT_FETCH("teken.fg_color", &fg); + TUNABLE_INT_FETCH("teken.bg_color", &bg); + + if (fg != -1) { + default_message.ta_fgcolor = fg; + kernel_message.ta_fgcolor = fg; + } + if (bg != -1) { + default_message.ta_bgcolor = bg; + kernel_message.ta_bgcolor = bg; + } + + if (default_message.ta_bgcolor == TC_WHITE) { + default_message.ta_bgcolor |= TC_LIGHT; + kernel_message.ta_bgcolor |= TC_LIGHT; + } + + if (default_message.ta_bgcolor == TC_BLACK && + default_message.ta_fgcolor < TC_NCOLORS) + kernel_message.ta_fgcolor |= TC_LIGHT; teken_set_defattr(&tm->tm_emulator, &default_message); } From df1bc27a0c392944ef3d506116092f70812edbdc Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Wed, 25 Sep 2019 13:24:31 +0000 Subject: [PATCH 057/106] vt: use colors from terminal emulator Instead of hardcoded colors, use terminal state. This also means, we need to record the pointer to terminal state with vtbuf. --- sys/dev/vt/hw/fb/vt_fb.c | 13 +++++++++++-- sys/dev/vt/vt.h | 1 + sys/dev/vt/vt_buf.c | 19 +++++++++++++++---- sys/dev/vt/vt_core.c | 23 ++++++++++++----------- sys/dev/vt/vt_cpulogos.c | 7 +++++-- 5 files changed, 44 insertions(+), 19 deletions(-) diff --git a/sys/dev/vt/hw/fb/vt_fb.c b/sys/dev/vt/hw/fb/vt_fb.c index 091188c5e2da..7bd717b77031 100644 --- a/sys/dev/vt/hw/fb/vt_fb.c +++ b/sys/dev/vt/hw/fb/vt_fb.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -453,7 +454,8 @@ vt_fb_init(struct vt_device *vd) { struct fb_info *info; u_int margin; - int err; + int bg, err; + term_color_t c; info = vd->vd_softc; vd->vd_height = MIN(VT_FB_MAX_HEIGHT, info->fb_height); @@ -477,8 +479,15 @@ vt_fb_init(struct vt_device *vd) info->fb_cmsize = 16; } + c = TC_BLACK; + TUNABLE_INT_FETCH("teken.bg_color", &bg); + if (bg != -1) { + if (bg == TC_WHITE) + bg |= TC_LIGHT; + c = bg; + } /* Clear the screen. */ - vd->vd_driver->vd_blank(vd, TC_BLACK); + vd->vd_driver->vd_blank(vd, c); /* Wakeup screen. KMS need this. */ vt_fb_postswitch(vd); diff --git a/sys/dev/vt/vt.h b/sys/dev/vt/vt.h index 712da20b7634..da308cd035c4 100644 --- a/sys/dev/vt/vt.h +++ b/sys/dev/vt/vt.h @@ -192,6 +192,7 @@ void vt_suspend(struct vt_device *vd); struct vt_buf { struct mtx vb_lock; /* Buffer lock. */ + struct terminal *vb_terminal; term_pos_t vb_scr_size; /* (b) Screen dimensions. */ int vb_flags; /* (b) Flags. */ #define VBF_CURSOR 0x1 /* Cursor visible. 
*/ diff --git a/sys/dev/vt/vt_buf.c b/sys/dev/vt/vt_buf.c index be5d23b5d54d..6512ca21071e 100644 --- a/sys/dev/vt/vt_buf.c +++ b/sys/dev/vt/vt_buf.c @@ -420,6 +420,8 @@ void vtbuf_init_early(struct vt_buf *vb) { term_rect_t rect; + const teken_attr_t *a; + term_char_t c; vb->vb_flags |= VBF_CURSOR; vb->vb_roffset = 0; @@ -433,7 +435,11 @@ vtbuf_init_early(struct vt_buf *vb) rect.tr_begin.tp_row = rect.tr_begin.tp_col = 0; rect.tr_end.tp_col = vb->vb_scr_size.tp_col; rect.tr_end.tp_row = vb->vb_history_size; - vtbuf_do_fill(vb, &rect, VTBUF_SPACE_CHAR(TERMINAL_NORM_ATTR)); + + a = teken_get_curattr(&vb->vb_terminal->tm_emulator); + c = TCOLOR_FG((term_char_t)a->ta_fgcolor) | + TCOLOR_BG((term_char_t)a->ta_bgcolor); + vtbuf_do_fill(vb, &rect, VTBUF_SPACE_CHAR(c)); vtbuf_make_undirty(vb); if ((vb->vb_flags & VBF_MTX_INIT) == 0) { mtx_init(&vb->vb_lock, "vtbuf", NULL, MTX_SPIN); @@ -478,6 +484,11 @@ vtbuf_grow(struct vt_buf *vb, const term_pos_t *p, unsigned int history_size) unsigned int w, h, c, r, old_history_size; size_t bufsize, rowssize; int history_full; + const teken_attr_t *a; + term_char_t ch; + + a = teken_get_curattr(&vb->vb_terminal->tm_emulator); + ch = TCOLOR_FG(a->ta_fgcolor) | TCOLOR_BG(a->ta_bgcolor); history_size = MAX(history_size, p->tp_row); @@ -544,7 +555,7 @@ vtbuf_grow(struct vt_buf *vb, const term_pos_t *p, unsigned int history_size) * background color. */ for (c = MIN(p->tp_col, w); c < p->tp_col; c++) { - row[c] = VTBUF_SPACE_CHAR(TERMINAL_NORM_ATTR); + row[c] = VTBUF_SPACE_CHAR(ch); } } @@ -552,7 +563,7 @@ vtbuf_grow(struct vt_buf *vb, const term_pos_t *p, unsigned int history_size) for (r = old_history_size; r < history_size; r++) { row = rows[r]; for (c = MIN(p->tp_col, w); c < p->tp_col; c++) { - row[c] = VTBUF_SPACE_CHAR(TERMINAL_NORM_ATTR); + row[c] = VTBUF_SPACE_CHAR(ch); } } @@ -601,7 +612,7 @@ vtbuf_grow(struct vt_buf *vb, const term_pos_t *p, unsigned int history_size) * background color. */ for (c = MIN(p->tp_col, w); c < p->tp_col; c++) { - row[c] = VTBUF_SPACE_CHAR(TERMINAL_NORM_ATTR); + row[c] = VTBUF_SPACE_CHAR(ch); } } diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c index 87bcc3a61a1e..4d517bf77715 100644 --- a/sys/dev/vt/vt_core.c +++ b/sys/dev/vt/vt_core.c @@ -1241,7 +1241,7 @@ vt_mark_mouse_position_as_dirty(struct vt_device *vd, int locked) static void vt_set_border(struct vt_device *vd, const term_rect_t *area, - const term_color_t c) + term_color_t c) { vd_drawrect_t *drawrect = vd->vd_driver->vd_drawrect; @@ -1334,9 +1334,12 @@ vt_flush(struct vt_device *vd) /* Force a full redraw when the screen contents might be invalid. 
*/ if (vd->vd_flags & (VDF_INVALID | VDF_SUSPENDED)) { + const teken_attr_t *a; + vd->vd_flags &= ~VDF_INVALID; - vt_set_border(vd, &vw->vw_draw_area, TC_BLACK); + a = teken_get_curattr(&vw->vw_terminal->tm_emulator); + vt_set_border(vd, &vw->vw_draw_area, a->ta_bgcolor); vt_termrect(vd, vf, &tarea); if (vd->vd_driver->vd_invalidate_text) vd->vd_driver->vd_invalidate_text(vd, &tarea); @@ -1440,8 +1443,7 @@ vtterm_cnprobe(struct terminal *tm, struct consdev *cp) struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; struct winsize wsz; - term_attr_t attr; - term_char_t c; + const term_attr_t *a; if (!vty_enabled(VTY_VT)) return; @@ -1494,14 +1496,12 @@ vtterm_cnprobe(struct terminal *tm, struct consdev *cp) if (vd->vd_width != 0 && vd->vd_height != 0) vt_termsize(vd, vw->vw_font, &vw->vw_buf.vb_scr_size); + /* We need to access terminal attributes from vtbuf */ + vw->vw_buf.vb_terminal = tm; vtbuf_init_early(&vw->vw_buf); vt_winsize(vd, vw->vw_font, &wsz); - c = (boothowto & RB_MUTE) == 0 ? TERMINAL_KERN_ATTR : - TERMINAL_NORM_ATTR; - attr.ta_format = TCHAR_FORMAT(c); - attr.ta_fgcolor = TCHAR_FGCOLOR(c); - attr.ta_bgcolor = TCHAR_BGCOLOR(c); - terminal_set_winsize_blank(tm, &wsz, 1, &attr); + a = teken_get_curattr(&tm->tm_emulator); + terminal_set_winsize_blank(tm, &wsz, 1, a); if (vtdbest != NULL) { #ifdef DEV_SPLASH @@ -2691,9 +2691,10 @@ vt_allocate_window(struct vt_device *vd, unsigned int window) vt_termsize(vd, vw->vw_font, &size); vt_winsize(vd, vw->vw_font, &wsz); + tm = vw->vw_terminal = terminal_alloc(&vt_termclass, vw); + vw->vw_buf.vb_terminal = tm; /* must be set before vtbuf_init() */ vtbuf_init(&vw->vw_buf, &size); - tm = vw->vw_terminal = terminal_alloc(&vt_termclass, vw); terminal_set_winsize(tm, &wsz); vd->vd_windows[window] = vw; callout_init(&vw->vw_proc_dead_timer, 0); diff --git a/sys/dev/vt/vt_cpulogos.c b/sys/dev/vt/vt_cpulogos.c index df976c82a4da..8400f013fdb0 100644 --- a/sys/dev/vt/vt_cpulogos.c +++ b/sys/dev/vt/vt_cpulogos.c @@ -121,6 +121,8 @@ vtterm_draw_cpu_logos(struct vt_device *vd) { unsigned int ncpu, i; vt_axis_t left; + struct terminal *tm = vd->vd_curwindow->vw_terminal; + const teken_attr_t *a; if (vt_splash_ncpu) ncpu = vt_splash_ncpu; @@ -130,15 +132,16 @@ vtterm_draw_cpu_logos(struct vt_device *vd) ncpu = 1; } + a = teken_get_curattr(&tm->tm_emulator); if (vd->vd_driver->vd_drawrect) vd->vd_driver->vd_drawrect(vd, 0, 0, vd->vd_width, - vt_logo_sprite_height, 1, TC_BLACK); + vt_logo_sprite_height, 1, a->ta_bgcolor); /* * Blank is okay because we only ever draw beasties on full screen * refreshes. */ else if (vd->vd_driver->vd_blank) - vd->vd_driver->vd_blank(vd, TC_BLACK); + vd->vd_driver->vd_blank(vd, a->ta_bgcolor); ncpu = MIN(ncpu, vd->vd_width / vt_logo_sprite_width); for (i = 0, left = 0; i < ncpu; left += vt_logo_sprite_width, i++) From a9d0e0071c254f4df991f2b56bcc10ce6a32a7cf Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 25 Sep 2019 13:36:56 +0000 Subject: [PATCH 058/106] x86: Fall back to leaf 0x16 if TSC frequency is obtained by CPUID and leaf 0x15 is not functional. This should improve automatic TSC frequency determination on Skylake/Kabylake/... families, where 0x15 exists but does not provide all necessary information. SDM contains relatively strong wording against such uses of 0x16, but Intel does not give us any other way to obtain the frequency. Linux did the same in the commit 604dc9170f2435d27da5039a3efd757dceadc684. 
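For illustration (hypothetical register values, not measured on real hardware):
if leaf 0x15 returns ECX=0, i.e. no core crystal clock frequency is enumerated,
the existing calculation cannot be used; falling back to leaf 0x16 with
EAX=3000 (base frequency in MHz) gives tsc_freq = 3000 * 1000000 = 3.0 GHz.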
Based on submission by: Neel Chauhan PR: 240475 Reviewed by: markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D21777 --- sys/x86/x86/tsc.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c index 9ecdcce9bcf3..0fbb2e8d65f2 100644 --- a/sys/x86/x86/tsc.c +++ b/sys/x86/x86/tsc.c @@ -134,7 +134,11 @@ tsc_freq_vmware(void) /* * Calculate TSC frequency using information from the CPUID leaf 0x15 - * 'Time Stamp Counter and Nominal Core Crystal Clock'. It should be + * 'Time Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15 + * is not functional, as it is on Skylake/Kabylake, try 0x16 'Processor + * Frequency Information'. Leaf 0x16 is described in the SDM as + * informational only, but if 0x15 did not work, and TSC calibration + * is disabled, it is the best we can get at all. It should still be * an improvement over the parsing of the CPU model name in * tsc_freq_intel(), when available. */ @@ -146,10 +150,20 @@ tsc_freq_cpuid(void) if (cpu_high < 0x15) return (false); do_cpuid(0x15, regs); - if (regs[0] == 0 || regs[1] == 0 || regs[2] == 0) + if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) { + tsc_freq = (uint64_t)regs[2] * regs[1] / regs[0]; + return (true); + } + + if (cpu_high < 0x16) return (false); - tsc_freq = (uint64_t)regs[2] * regs[1] / regs[0]; - return (true); + do_cpuid(0x16, regs); + if (regs[0] != 0) { + tsc_freq = (uint64_t)regs[0] * 1000000; + return (true); + } + + return (false); } static void From b119329d81bfa520fc6f317cb21c007ee92390f7 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Wed, 25 Sep 2019 16:11:35 +0000 Subject: [PATCH 059/106] Complete the removal of the "wire_count" field from struct vm_page. Convert all remaining references to that field to "ref_count" and update comments accordingly. No functional change intended. 
Reviewed by: alc, kib Sponsored by: Intel, Netflix Differential Revision: https://reviews.freebsd.org/D21768 --- sys/amd64/amd64/efirt_machdep.c | 2 +- sys/amd64/amd64/pmap.c | 102 ++++++++++++++++---------------- sys/arm/arm/pmap-v6.c | 42 ++++++------- sys/arm64/arm64/efirt_machdep.c | 2 +- sys/arm64/arm64/pmap.c | 64 ++++++++++---------- sys/dev/agp/agp_i810.c | 2 +- sys/i386/i386/pmap.c | 46 +++++++------- sys/mips/mips/pmap.c | 26 ++++---- sys/powerpc/booke/pmap.c | 22 +++---- sys/riscv/riscv/pmap.c | 50 ++++++++-------- sys/vm/vm_page.h | 8 +-- sys/x86/iommu/intel_idpgtbl.c | 20 +++---- 12 files changed, 193 insertions(+), 193 deletions(-) diff --git a/sys/amd64/amd64/efirt_machdep.c b/sys/amd64/amd64/efirt_machdep.c index f0e15a935d2a..fc4835f85ab1 100644 --- a/sys/amd64/amd64/efirt_machdep.c +++ b/sys/amd64/amd64/efirt_machdep.c @@ -74,7 +74,7 @@ efi_destroy_1t1_map(void) if (obj_1t1_pt != NULL) { VM_OBJECT_RLOCK(obj_1t1_pt); TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq) - m->wire_count = VPRC_OBJREF; + m->ref_count = VPRC_OBJREF; vm_wire_sub(obj_1t1_pt->resident_page_count); VM_OBJECT_RUNLOCK(obj_1t1_pt); vm_object_deallocate(obj_1t1_pt); diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 793702174e6a..1424dee34f5e 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1856,7 +1856,7 @@ pmap_init(void) ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); - mpte->wire_count = 1; + mpte->ref_count = 1; /* * Collect the page table pages that were replaced by a 2MB @@ -3285,8 +3285,8 @@ pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) } /* - * Decrements a page table page's wire count, which is used to record the - * number of valid page table entries within the page. If the wire count + * Decrements a page table page's reference count, which is used to record the + * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ @@ -3294,8 +3294,8 @@ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { - --m->wire_count; - if (m->wire_count == 0) { + --m->ref_count; + if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else @@ -3355,7 +3355,7 @@ _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) /* * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. + * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, @@ -3615,7 +3615,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); - pdppg->wire_count++; + pdppg->ref_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); @@ -3660,7 +3660,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); - pdpg->wire_count++; + pdpg->ref_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); @@ -3689,7 +3689,7 @@ pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. 
*/ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); - pdpg->wire_count++; + pdpg->ref_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); @@ -3740,7 +3740,7 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); - m->wire_count++; + m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been @@ -4205,7 +4205,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ - m_pc->wire_count = 1; + m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); @@ -4785,7 +4785,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, } if (!in_kernel) { - mpte->wire_count = NPTEPG; + mpte->ref_count = NPTEPG; pmap_resident_count_inc(pmap, 1); } } @@ -4946,9 +4946,9 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap_resident_count_dec(pmap, 1); - KASSERT(mpte->wire_count == NPTEPG, - ("pmap_remove_pde: pte page wire count error")); - mpte->wire_count = 0; + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_pde: pte page ref count error")); + mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } @@ -5709,7 +5709,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); - mpte->wire_count++; + mpte->ref_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* @@ -5751,8 +5751,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, * Remove the extra PT page reference. */ if (mpte != NULL) { - mpte->wire_count--; - KASSERT(mpte->wire_count > 0, + mpte->ref_count--; + KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } @@ -5873,7 +5873,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, * If both the page table page and the reservation are fully * populated, then attempt promotion. 
*/ - if ((mpte == NULL || mpte->wire_count == NPTEPG) && + if ((mpte == NULL || mpte->ref_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) @@ -5975,10 +5975,10 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, pde = &pde[pmap_pde_index(va)]; oldpde = *pde; if ((oldpde & PG_V) != 0) { - KASSERT(pdpg->wire_count > 1, - ("pmap_enter_pde: pdpg's wire count is too low")); + KASSERT(pdpg->ref_count > 1, + ("pmap_enter_pde: pdpg's reference count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { - pdpg->wire_count--; + pdpg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); @@ -6152,7 +6152,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { - mpte->wire_count++; + mpte->ref_count++; } else { /* * Get the page directory entry @@ -6169,7 +6169,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); - mpte->wire_count++; + mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock @@ -6188,7 +6188,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, } if (*pte) { if (mpte != NULL) { - mpte->wire_count--; + mpte->ref_count--; mpte = NULL; } return (mpte); @@ -6334,8 +6334,8 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ - pdpg->wire_count--; - KASSERT(pdpg->wire_count > 0, + pdpg->ref_count--; + KASSERT(pdpg->ref_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } @@ -6525,13 +6525,13 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else - dst_pdpg->wire_count--; + dst_pdpg->ref_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); - KASSERT(srcmpte->wire_count > 0, + KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) @@ -6553,7 +6553,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, KASSERT(dstmpte->pindex == pmap_pde_pindex(addr), ("dstmpte pindex/addr mismatch")); - dstmpte->wire_count++; + dstmpte->ref_count++; } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; @@ -6586,7 +6586,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, goto out; } /* Have we copied all of the valid mappings? 
*/ - if (dstmpte->wire_count >= srcmpte->wire_count) + if (dstmpte->ref_count >= srcmpte->ref_count) break; } } @@ -6996,9 +6996,9 @@ pmap_remove_pages(pmap_t pmap) KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); - KASSERT(mpte->wire_count == NPTEPG, - ("pmap_remove_pages: pte page wire count error")); - mpte->wire_count = 0; + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_pages: pte page reference count error")); + mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { @@ -8728,7 +8728,7 @@ pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); - if ((mpte == NULL || mpte->wire_count == NPTEPG) && + if ((mpte == NULL || mpte->ref_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { @@ -8919,12 +8919,12 @@ pmap_quick_remove_page(vm_offset_t addr) /* * Pdp pages from the large map are managed differently from either * kernel or user page table pages. They are permanently allocated at - * initialization time, and their wire count is permanently set to + * initialization time, and their reference count is permanently set to * zero. The pml4 entries pointing to those pages are copied into * each allocated pmap. * * In contrast, pd and pt pages are managed like user page table - * pages. They are dynamically allocated, and their wire count + * pages. They are dynamically allocated, and their reference count * represents the number of valid entries within the page. */ static vm_page_t @@ -9011,7 +9011,7 @@ pmap_large_map_pte(vm_offset_t va) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; - PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++; + PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & PG_FRAME; @@ -9131,7 +9131,7 @@ pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> - wire_count++; + ref_count++; inc = NBPDR; } else { pte = pmap_large_map_pte(va); @@ -9140,7 +9140,7 @@ pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, FALSE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> - wire_count++; + ref_count++; inc = PAGE_SIZE; } } @@ -9209,8 +9209,8 @@ pmap_large_unmap(void *svaa, vm_size_t len) pde_store(pde, 0); inc = NBPDR; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); - m->wire_count--; - if (m->wire_count == 0) { + m->ref_count--; + if (m->ref_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } @@ -9223,13 +9223,13 @@ pmap_large_unmap(void *svaa, vm_size_t len) pte_clear(pte); inc = PAGE_SIZE; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); - m->wire_count--; - if (m->wire_count == 0) { + m->ref_count--; + if (m->ref_count == 0) { *pde = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); - m->wire_count--; - if (m->wire_count == 0) { + m->ref_count--; + if (m->ref_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } @@ -9436,7 +9436,7 @@ static bool pmap_pti_free_page(vm_page_t m) { - KASSERT(m->wire_count > 0, ("page %p not wired", m)); + KASSERT(m->ref_count > 0, ("page %p not referenced", m)); if (!vm_page_unwire_noq(m)) return (false); vm_page_free_zero(m); @@ -9530,7 
+9530,7 @@ pmap_pti_wire_pte(void *pte) VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); - m->wire_count++; + m->ref_count++; } static void @@ -9540,8 +9540,8 @@ pmap_pti_unwire_pde(void *pde, bool only_ref) VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); - MPASS(m->wire_count > 0); - MPASS(only_ref || m->wire_count > 1); + MPASS(m->ref_count > 0); + MPASS(only_ref || m->ref_count > 1); pmap_pti_free_page(m); } @@ -9553,7 +9553,7 @@ pmap_pti_unwire_pte(void *pte, vm_offset_t va) VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); - MPASS(m->wire_count > 0); + MPASS(m->ref_count > 0); if (pmap_pti_free_page(m)) { pde = pmap_pti_pde(va); MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 1d82ebf48cb2..27619957c057 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -2365,7 +2365,7 @@ pmap_release(pmap_t pmap) * untouched, so the table (strictly speaking a page which holds it) * is never freed if promoted. * - * If a page m->wire_count == 1 then no valid mappings exist in any L2 page + * If a page m->ref_count == 1 then no valid mappings exist in any L2 page * table in the page and the page itself is only mapped in PT2TAB. */ @@ -2376,7 +2376,7 @@ pt2_wirecount_init(vm_page_t m) /* * Note: A page m is allocated with VM_ALLOC_WIRED flag and - * m->wire_count should be already set correctly. + * m->ref_count should be already set correctly. * So, there is no need to set it again herein. */ for (i = 0; i < NPT2_IN_PG; i++) @@ -2396,10 +2396,10 @@ pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) */ KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), ("%s: PT2 is overflowing ...", __func__)); - KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), + KASSERT(m->ref_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowing ...", __func__)); - m->wire_count++; + m->ref_count++; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; } @@ -2409,10 +2409,10 @@ pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, ("%s: PT2 is underflowing ...", __func__)); - KASSERT(m->wire_count > 1, + KASSERT(m->ref_count > 1, ("%s: PT2PG is underflowing ...", __func__)); - m->wire_count--; + m->ref_count--; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; } @@ -2422,16 +2422,16 @@ pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) KASSERT(count <= NPTE2_IN_PT2, ("%s: invalid count %u", __func__, count)); - KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], - ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count, + KASSERT(m->ref_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], + ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->ref_count, m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); - m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; - m->wire_count += count; + m->ref_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; + m->ref_count += count; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; - KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), - ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count)); + KASSERT(m->ref_count <= (NPTE2_IN_PG + 1), + ("%s: PT2PG is overflowed (%u) ...", __func__, m->ref_count)); } static __inline uint32_t @@ -2460,7 +2460,7 @@ static __inline boolean_t pt2pg_is_empty(vm_page_t m) { - return (m->wire_count == 1); + return (m->ref_count == 1); } /* @@ -2634,7 +2634,7 @@ pmap_unwire_pt2pg(pmap_t 
pmap, vm_offset_t va, vm_page_t m) (void)pt2tab_load_clear(pte2p); pmap_tlb_flush(pmap, pt2map_pt2pg(va)); - m->wire_count = 0; + m->ref_count = 0; pmap->pm_stats.resident_count--; /* @@ -2683,8 +2683,8 @@ pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); - KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx), - ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count, + KASSERT(m->ref_count > pt2_wirecount_get(m, pte1_idx), + ("%s: bad pt2 wire count %u > %u", __func__, m->ref_count, pt2_wirecount_get(m, pte1_idx))); /* @@ -2949,7 +2949,7 @@ pmap_pv_reclaim(pmap_t locked_pmap) m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ - m_pc->wire_count = 1; + m_pc->ref_count = 1; vm_wire_add(1); } vm_page_free_pages_toq(&free, false); @@ -6707,7 +6707,7 @@ pmap_pid_dump(int pid) m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pa: 0x%x, w: %d, " "f: 0x%x", va, pa, - m->wire_count, m->flags); + m->ref_count, m->flags); npte2++; index++; if (index >= 2) { @@ -6818,7 +6818,7 @@ dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); if (m != NULL) { printf(" v:%d w:%d f:0x%04X\n", m->valid, - m->wire_count, m->flags); + m->ref_count, m->flags); } else { printf("\n"); } @@ -6892,7 +6892,7 @@ DB_SHOW_COMMAND(pmap, pmap_pmap_print) dump_link_ok = FALSE; } else if (m != NULL) - printf(" w:%d w2:%u", m->wire_count, + printf(" w:%d w2:%u", m->ref_count, pt2_wirecount_get(m, pte1_index(va))); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO"); @@ -6928,7 +6928,7 @@ dump_pt2tab(pmap_t pmap) pte2_class(pte2), !!(pte2 & PTE2_S), m); if (m != NULL) printf(" , w: %d, f: 0x%04X pidx: %lld", - m->wire_count, m->flags, m->pindex); + m->ref_count, m->flags, m->pindex); printf("\n"); } } diff --git a/sys/arm64/arm64/efirt_machdep.c b/sys/arm64/arm64/efirt_machdep.c index 7d9f8fbe31f5..46be93039115 100644 --- a/sys/arm64/arm64/efirt_machdep.c +++ b/sys/arm64/arm64/efirt_machdep.c @@ -74,7 +74,7 @@ efi_destroy_1t1_map(void) if (obj_1t1_pt != NULL) { VM_OBJECT_RLOCK(obj_1t1_pt); TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq) - m->wire_count = VPRC_OBJREF; + m->ref_count = VPRC_OBJREF; vm_wire_sub(obj_1t1_pt->resident_page_count); VM_OBJECT_RUNLOCK(obj_1t1_pt); vm_object_deallocate(obj_1t1_pt); diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index d9753073e17b..49a632cedac1 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -1348,8 +1348,8 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, } /* - * Decrements a page table page's wire count, which is used to record the - * number of valid page table entries within the page. If the wire count + * Decrements a page table page's reference count, which is used to record the + * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. 
*/ @@ -1357,8 +1357,8 @@ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { - --m->wire_count; - if (m->wire_count == 0) { + --m->ref_count; + if (m->ref_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else @@ -1423,7 +1423,7 @@ _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) /* * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. + * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, @@ -1554,7 +1554,7 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); - l1pg->wire_count++; + l1pg->ref_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); @@ -1595,7 +1595,7 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); - l2pg->wire_count++; + l2pg->ref_count++; } } @@ -1621,7 +1621,7 @@ pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); - l2pg->wire_count++; + l2pg->ref_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; @@ -1679,7 +1679,7 @@ pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); - m->wire_count++; + m->ref_count++; return (m); } break; @@ -2044,7 +2044,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ - m_pc->wire_count = 1; + m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); @@ -2482,9 +2482,9 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); - KASSERT(ml3->wire_count == NL3PG, - ("pmap_remove_l2: l3 page wire count error")); - ml3->wire_count = 0; + KASSERT(ml3->ref_count == NL3PG, + ("pmap_remove_l2: l3 page ref count error")); + ml3->ref_count = 0; pmap_add_delayed_free_list(ml3, free, FALSE); } } @@ -3229,7 +3229,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, l3 = pmap_l2_to_l3(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); - mpte->wire_count++; + mpte->ref_count++; } goto havel3; } else if (pde != NULL && lvl == 1) { @@ -3240,7 +3240,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); - mpte->wire_count++; + mpte->ref_count++; } goto havel3; } @@ -3291,8 +3291,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, * Remove the extra PT page reference. 
*/ if (mpte != NULL) { - mpte->wire_count--; - KASSERT(mpte->wire_count > 0, + mpte->ref_count--; + KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } @@ -3417,7 +3417,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, } #if VM_NRESERVLEVEL > 0 - if ((mpte == NULL || mpte->wire_count == NL3PG) && + if ((mpte == NULL || mpte->ref_count == NL3PG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { @@ -3494,10 +3494,10 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((old_l2 = pmap_load(l2)) != 0) { - KASSERT(l2pg->wire_count > 1, - ("pmap_enter_l2: l2pg's wire count is too low")); + KASSERT(l2pg->ref_count > 1, + ("pmap_enter_l2: l2pg's ref count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { - l2pg->wire_count--; + l2pg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); @@ -3671,7 +3671,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { - mpte->wire_count++; + mpte->ref_count++; } else { /* * Get the l2 entry @@ -3693,7 +3693,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); - mpte->wire_count++; + mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock @@ -3722,7 +3722,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, */ if (pmap_load(l3) != 0) { if (mpte != NULL) { - mpte->wire_count--; + mpte->ref_count--; mpte = NULL; } return (mpte); @@ -3952,14 +3952,14 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); } else - dst_l2pg->wire_count--; + dst_l2pg->ref_count--; continue; } KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_copy: invalid L2 entry")); srcptepaddr &= ~ATTR_MASK; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); - KASSERT(srcmpte->wire_count > 0, + KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; @@ -3978,7 +3978,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), ("dstmpte pindex/addr mismatch")); - dstmpte->wire_count++; + dstmpte->ref_count++; } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, NULL)) == NULL) goto out; @@ -4017,7 +4017,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, goto out; } /* Have we copied all of the valid mappings? 
*/ - if (dstmpte->wire_count >= srcmpte->wire_count) + if (dstmpte->ref_count >= srcmpte->ref_count) break; } } @@ -4382,9 +4382,9 @@ pmap_remove_pages(pmap_t pmap) KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); - KASSERT(ml3->wire_count == NL3PG, - ("pmap_remove_pages: l3 page wire count error")); - ml3->wire_count = 0; + KASSERT(ml3->ref_count == NL3PG, + ("pmap_remove_pages: l3 page ref count error")); + ml3->ref_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); } @@ -5521,7 +5521,7 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, } if (va < VM_MAXUSER_ADDRESS) { - ml3->wire_count = NL3PG; + ml3->ref_count = NL3PG; pmap_resident_count_inc(pmap, 1); } } diff --git a/sys/dev/agp/agp_i810.c b/sys/dev/agp/agp_i810.c index 501f78ca0a32..d38244e71e0a 100644 --- a/sys/dev/agp/agp_i810.c +++ b/sys/dev/agp/agp_i810.c @@ -1954,7 +1954,7 @@ agp_intel_gtt_insert_pages(device_t dev, u_int first_entry, u_int num_entries, sc = device_get_softc(dev); for (i = 0; i < num_entries; i++) { MPASS(pages[i]->valid == VM_PAGE_BITS_ALL); - MPASS(pages[i]->wire_count > 0); + MPASS(pages[i]->ref_count > 0); sc->match->driver->install_gtt_pte(dev, first_entry + i, VM_PAGE_TO_PHYS(pages[i]), flags); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 949cfe6a121a..59434be0c07b 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -992,7 +992,7 @@ __CONCAT(PMTYPE, init)(void) ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + ptoa(i); - mpte->wire_count = 1; + mpte->ref_count = 1; /* * Collect the page table pages that were replaced by a 2/4MB @@ -1952,8 +1952,8 @@ pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) } /* - * Decrements a page table page's wire count, which is used to record the - * number of valid page table entries within the page. If the wire count + * Decrements a page table page's reference count, which is used to record the + * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ @@ -1961,8 +1961,8 @@ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { - --m->wire_count; - if (m->wire_count == 0) { + --m->ref_count; + if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else @@ -1992,7 +1992,7 @@ _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) /* * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. + * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) @@ -2171,7 +2171,7 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); - m->wire_count++; + m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has @@ -2438,7 +2438,7 @@ pmap_pv_reclaim(pmap_t locked_pmap) m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. 
*/ - m_pc->wire_count = 1; + m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); @@ -2797,7 +2797,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) return (FALSE); } if (pmap != kernel_pmap) { - mpte->wire_count = NPTEPG; + mpte->ref_count = NPTEPG; pmap->pm_stats.resident_count++; } } @@ -2993,9 +2993,9 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap->pm_stats.resident_count--; - KASSERT(mpte->wire_count == NPTEPG, - ("pmap_remove_pde: pte page wire count error")); - mpte->wire_count = 0; + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_pde: pte page ref count error")); + mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } @@ -3731,8 +3731,8 @@ __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, * Remove the extra PT page reference. */ if (mpte != NULL) { - mpte->wire_count--; - KASSERT(mpte->wire_count > 0, + mpte->ref_count--; + KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } @@ -3853,7 +3853,7 @@ __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, * If both the page table page and the reservation are fully * populated, then attempt promotion. */ - if ((mpte == NULL || mpte->wire_count == NPTEPG) && + if ((mpte == NULL || mpte->ref_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); @@ -4076,7 +4076,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { - mpte->wire_count++; + mpte->ref_count++; } else { /* * Get the page directory entry @@ -4091,7 +4091,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); - mpte->wire_count++; + mpte->ref_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); @@ -4107,7 +4107,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, pte = pmap_pte_quick(pmap, va); if (*pte) { if (mpte != NULL) { - mpte->wire_count--; + mpte->ref_count--; mpte = NULL; } sched_unpin(); @@ -4402,7 +4402,7 @@ __CONCAT(PMTYPE, copy)(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); - KASSERT(srcmpte->wire_count > 0, + KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) @@ -4442,7 +4442,7 @@ __CONCAT(PMTYPE, copy)(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, } goto out; } - if (dstmpte->wire_count >= srcmpte->wire_count) + if (dstmpte->ref_count >= srcmpte->ref_count) break; } addr += PAGE_SIZE; @@ -4829,9 +4829,9 @@ __CONCAT(PMTYPE, remove_pages)(pmap_t pmap) KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap->pm_stats.resident_count--; - KASSERT(mpte->wire_count == NPTEPG, - ("pmap_remove_pages: pte page wire count error")); - mpte->wire_count = 0; + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_pages: pte page ref count error")); + mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index 10487ac18738..46b4be8e83df 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -983,8 +983,8 @@ pmap_qremove(vm_offset_t va, int count) 
***************************************************/ /* - * Decrements a page table page's wire count, which is used to record the - * number of valid page table entries within the page. If the wire count + * Decrements a page table page's reference count, which is used to record the + * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ @@ -992,8 +992,8 @@ static PMAP_INLINE boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { - --m->wire_count; - if (m->wire_count == 0) { + --m->ref_count; + if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m); return (TRUE); } else @@ -1043,7 +1043,7 @@ _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) /* * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. + * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t pde) @@ -1194,7 +1194,7 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags) } } else { pg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pdep)); - pg->wire_count++; + pg->ref_count++; } /* Next level entry */ pde = (pd_entry_t *)*pdep; @@ -1230,7 +1230,7 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) */ if (pde != NULL && *pde != NULL) { m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pde)); - m->wire_count++; + m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been @@ -2124,7 +2124,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, * Remove extra pte reference */ if (mpte) - mpte->wire_count--; + mpte->ref_count--; if (pte_test(&origpte, PTE_MANAGED)) { m->md.pv_flags |= PV_TABLE_REF; @@ -2165,8 +2165,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, pmap_invalidate_page(pmap, va); origpte = 0; if (mpte != NULL) { - mpte->wire_count--; - KASSERT(mpte->wire_count > 0, + mpte->ref_count--; + KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: %p", (void *)va)); } @@ -2276,7 +2276,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { - mpte->wire_count++; + mpte->ref_count++; } else { /* * Get the page directory entry @@ -2290,7 +2290,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, if (pde && *pde != 0) { mpte = PHYS_TO_VM_PAGE( MIPS_DIRECT_TO_PHYS(*pde)); - mpte->wire_count++; + mpte->ref_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); @@ -2305,7 +2305,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, pte = pmap_pte(pmap, va); if (pte_test(pte, PTE_V)) { if (mpte != NULL) { - mpte->wire_count--; + mpte->ref_count--; mpte = NULL; } return (mpte); diff --git a/sys/powerpc/booke/pmap.c b/sys/powerpc/booke/pmap.c index 140b1367325f..1d141f75d0e9 100644 --- a/sys/powerpc/booke/pmap.c +++ b/sys/powerpc/booke/pmap.c @@ -663,8 +663,8 @@ pdir_unhold(mmu_t mmu, pmap_t pmap, u_int pp2d_idx) /* * Free pdir page if there are no dir entries in this pdir. 
*/ - m->wire_count--; - if (m->wire_count == 0) { + m->ref_count--; + if (m->ref_count == 0) { pdir_free(mmu, pmap, pp2d_idx, m); return (1); } @@ -686,7 +686,7 @@ pdir_hold(mmu_t mmu, pmap_t pmap, pte_t ** pdir) KASSERT((pdir != NULL), ("pdir_hold: null pdir")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pdir)); - m->wire_count++; + m->ref_count++; } /* Allocate page table. */ @@ -765,11 +765,11 @@ ptbl_unhold(mmu_t mmu, pmap_t pmap, vm_offset_t va) /* * Free ptbl pages if there are no pte entries in this ptbl. - * wire_count has the same value for all ptbl pages, so check the + * ref_count has the same value for all ptbl pages, so check the * last page. */ - m->wire_count--; - if (m->wire_count == 0) { + m->ref_count--; + if (m->ref_count == 0) { ptbl_free(mmu, pmap, pdir, pdir_idx, m); pdir_unhold(mmu, pmap, pp2d_idx); return (1); @@ -795,7 +795,7 @@ ptbl_hold(mmu_t mmu, pmap_t pmap, pte_t ** pdir, unsigned int pdir_idx) KASSERT((ptbl != NULL), ("ptbl_hold: null ptbl")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t) ptbl)); - m->wire_count++; + m->ref_count++; } #else @@ -1010,15 +1010,15 @@ ptbl_unhold(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx) pa = pte_vatopa(mmu, kernel_pmap, (vm_offset_t)ptbl + (i * PAGE_SIZE)); m = PHYS_TO_VM_PAGE(pa); - m->wire_count--; + m->ref_count--; } /* * Free ptbl pages if there are no pte etries in this ptbl. - * wire_count has the same value for all ptbl pages, so check the last + * ref_count has the same value for all ptbl pages, so check the last * page. */ - if (m->wire_count == 0) { + if (m->ref_count == 0) { ptbl_free(mmu, pmap, pdir_idx); //debugf("ptbl_unhold: e (freed ptbl)\n"); @@ -1056,7 +1056,7 @@ ptbl_hold(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx) pa = pte_vatopa(mmu, kernel_pmap, (vm_offset_t)ptbl + (i * PAGE_SIZE)); m = PHYS_TO_VM_PAGE(pa); - m->wire_count++; + m->ref_count++; } } #endif diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c index bbda832ff885..b2bc6ea02ffd 100644 --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -1127,8 +1127,8 @@ pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) } /* - * Decrements a page table page's wire count, which is used to record the - * number of valid page table entries within the page. If the wire count + * Decrements a page table page's reference count, which is used to record the + * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ @@ -1136,8 +1136,8 @@ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { - --m->wire_count; - if (m->wire_count == 0) { + --m->ref_count; + if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else { @@ -1184,7 +1184,7 @@ _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) /* * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. + * conditionally free the page, and manage the reference count. 
*/ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, @@ -1327,7 +1327,7 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) } else { phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); - pdpg->wire_count++; + pdpg->ref_count++; } phys = PTE_TO_PHYS(pmap_load(l1)); @@ -1357,7 +1357,7 @@ pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); - l2pg->wire_count++; + l2pg->ref_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; @@ -1393,7 +1393,7 @@ pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); m = PHYS_TO_VM_PAGE(phys); - m->wire_count++; + m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been @@ -2068,9 +2068,9 @@ pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); - KASSERT(ml3->wire_count == Ln_ENTRIES, - ("pmap_remove_l2: l3 page wire count error")); - ml3->wire_count = 1; + KASSERT(ml3->ref_count == Ln_ENTRIES, + ("pmap_remove_l2: l3 page ref count error")); + ml3->ref_count = 1; vm_page_unwire_noq(ml3); pmap_add_delayed_free_list(ml3, free, FALSE); } @@ -2487,7 +2487,7 @@ pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, return (false); } if (va < VM_MAXUSER_ADDRESS) { - mpte->wire_count = Ln_ENTRIES; + mpte->ref_count = Ln_ENTRIES; pmap_resident_count_inc(pmap, 1); } } @@ -2695,7 +2695,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, l3 = pmap_l2_to_l3(l2, va); if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); - mpte->wire_count++; + mpte->ref_count++; } } else if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; @@ -2775,8 +2775,8 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, * Remove the extra PT page reference. 
*/ if (mpte != NULL) { - mpte->wire_count--; - KASSERT(mpte->wire_count > 0, + mpte->ref_count--; + KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } @@ -2878,7 +2878,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, } #if VM_NRESERVLEVEL > 0 - if (mpte != NULL && mpte->wire_count == Ln_ENTRIES && + if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) @@ -2955,10 +2955,10 @@ pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((oldl2 = pmap_load(l2)) != 0) { - KASSERT(l2pg->wire_count > 1, - ("pmap_enter_l2: l2pg's wire count is too low")); + KASSERT(l2pg->ref_count > 1, + ("pmap_enter_l2: l2pg's ref count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { - l2pg->wire_count--; + l2pg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); @@ -3133,7 +3133,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { - mpte->wire_count++; + mpte->ref_count++; } else { /* * Get the l2 entry @@ -3149,7 +3149,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); mpte = PHYS_TO_VM_PAGE(phys); - mpte->wire_count++; + mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock @@ -3170,7 +3170,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, panic("pmap_enter_quick_locked: No l3"); if (pmap_load(l3) != 0) { if (mpte != NULL) { - mpte->wire_count--; + mpte->ref_count--; mpte = NULL; } return (mpte); @@ -3564,9 +3564,9 @@ pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); - KASSERT(mpte->wire_count == Ln_ENTRIES, - ("pmap_remove_pages: pte page wire count error")); - mpte->wire_count = 0; + KASSERT(mpte->ref_count == Ln_ENTRIES, + ("pmap_remove_pages: pte page ref count error")); + mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } else { diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 36f8237ab407..c210367dc30a 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -101,6 +101,9 @@ * annotated below with two of these locks, then holding either lock is * sufficient for read access, but both locks are required for write * access. An annotation of (C) indicates that the field is immutable. + * An annotation of (A) indicates that modifications to the field must + * be atomic. Accesses to such fields may require additional + * synchronization depending on the context. * * In contrast, the synchronization of accesses to the page's * dirty field is machine dependent (M). 
In the @@ -207,10 +210,7 @@ struct vm_page { vm_pindex_t pindex; /* offset into object (O,P) */ vm_paddr_t phys_addr; /* physical address of page (C) */ struct md_page md; /* machine dependent stuff */ - union { - u_int wire_count; - u_int ref_count; /* page references */ - }; + u_int ref_count; /* page references (A) */ volatile u_int busy_lock; /* busy owners lock */ uint16_t flags; /* page PG_* flags (P) */ uint8_t order; /* index of the buddy queue (F) */ diff --git a/sys/x86/iommu/intel_idpgtbl.c b/sys/x86/iommu/intel_idpgtbl.c index bdd380f9d2a0..3c8e27e1b946 100644 --- a/sys/x86/iommu/intel_idpgtbl.c +++ b/sys/x86/iommu/intel_idpgtbl.c @@ -392,7 +392,7 @@ domain_pgtbl_map_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl, * pte write and clean while the lock is * dropped. */ - m->wire_count++; + m->ref_count++; sfp = NULL; ptep = domain_pgtbl_map_pte(domain, base, lvl - 1, @@ -400,7 +400,7 @@ domain_pgtbl_map_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl, if (ptep == NULL) { KASSERT(m->pindex != 0, ("loosing root page %p", domain)); - m->wire_count--; + m->ref_count--; dmar_pgfree(domain->pgtbl_obj, m->pindex, flags); return (NULL); @@ -408,8 +408,8 @@ domain_pgtbl_map_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl, dmar_pte_store(&ptep->pte, DMAR_PTE_R | DMAR_PTE_W | VM_PAGE_TO_PHYS(m)); dmar_flush_pte_to_ram(domain->dmar, ptep); - sf_buf_page(sfp)->wire_count += 1; - m->wire_count--; + sf_buf_page(sfp)->ref_count += 1; + m->ref_count--; dmar_unmap_pgtbl(sfp); /* Only executed once. */ goto retry; @@ -489,7 +489,7 @@ domain_map_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base, dmar_pte_store(&pte->pte, VM_PAGE_TO_PHYS(ma[pi]) | pflags | (superpage ? DMAR_PTE_SP : 0)); dmar_flush_pte_to_ram(domain->dmar, pte); - sf_buf_page(sf)->wire_count += 1; + sf_buf_page(sf)->ref_count += 1; } if (sf != NULL) dmar_unmap_pgtbl(sf); @@ -587,8 +587,8 @@ domain_unmap_clear_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl, dmar_unmap_pgtbl(*sf); *sf = NULL; } - m->wire_count--; - if (m->wire_count != 0) + m->ref_count--; + if (m->ref_count != 0) return; KASSERT(lvl != 0, ("lost reference (lvl) on root pg domain %p base %jx lvl %d", @@ -701,7 +701,7 @@ domain_alloc_pgtbl(struct dmar_domain *domain) m = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_WAITOK | DMAR_PGF_ZERO | DMAR_PGF_OBJL); /* No implicit free of the top level page table page. */ - m->wire_count = 1; + m->ref_count = 1; DMAR_DOMAIN_PGUNLOCK(domain); DMAR_LOCK(domain->dmar); domain->flags |= DMAR_DOMAIN_PGTBL_INITED; @@ -731,10 +731,10 @@ domain_free_pgtbl(struct dmar_domain *domain) return; } - /* Obliterate wire_counts */ + /* Obliterate ref_counts */ VM_OBJECT_ASSERT_WLOCKED(obj); for (m = vm_page_lookup(obj, 0); m != NULL; m = vm_page_next(m)) - m->wire_count = 0; + m->ref_count = 0; VM_OBJECT_WUNLOCK(obj); vm_object_deallocate(obj); } From ef36db58da955412e1afc9c4bb3fee74645c6475 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 25 Sep 2019 16:49:22 +0000 Subject: [PATCH 060/106] remove obsolete i386 MD memchr implementation bde reports (in a reply to r351700 commit mail): This uses scasb, which was last optimal on the 8086, or perhaps the original i386. On freefall, it is several times slower than the naive translation of the naive C code. 
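For context, a minimal sketch of the kind of naive byte-scanning memchr() the report alludes to is shown below. The function name naive_memchr and the body are illustrative only; removing the assembly file presumably leaves libc using its machine-independent C implementation, which this sketch approximates rather than reproduces.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch only: scan one byte at a time until the converted
 * character c is found or len bytes have been examined.
 */
static void *
naive_memchr(const void *b, int c, size_t len)
{
	const unsigned char *p = b;
	unsigned char ch = (unsigned char)c;

	while (len-- != 0) {
		if (*p == ch)
			return ((void *)(uintptr_t)p);
		p++;
	}
	return (NULL);
}

A plain forward loop like this leaves the compiler free to optimize, which is consistent with the report above that the scasb-based version is several times slower on current hardware.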
Reported by: bde Reviewed by: kib, markj Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D21785 --- lib/libc/i386/string/Makefile.inc | 1 - lib/libc/i386/string/memchr.S | 61 ------------------------------- 2 files changed, 62 deletions(-) delete mode 100644 lib/libc/i386/string/memchr.S diff --git a/lib/libc/i386/string/Makefile.inc b/lib/libc/i386/string/Makefile.inc index 17170eeb3859..a8c50a285cbd 100644 --- a/lib/libc/i386/string/Makefile.inc +++ b/lib/libc/i386/string/Makefile.inc @@ -5,7 +5,6 @@ MDSRCS+= \ bcopy.S \ bzero.S \ ffs.S \ - memchr.S \ memcmp.S \ memcpy.S \ memmove.S \ diff --git a/lib/libc/i386/string/memchr.S b/lib/libc/i386/string/memchr.S deleted file mode 100644 index 3bd4d9c3a4d6..000000000000 --- a/lib/libc/i386/string/memchr.S +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 1993 Winning Strategies, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Winning Strategies, Inc. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -/* - * memchr (b, c, len) - * locates the first occurrence of c in string b. - * - * Written by: - * J.T. Conklin (jtc@wimsey.com), Winning Strategies, Inc. - */ - -ENTRY(memchr) - pushl %edi - movl 8(%esp),%edi /* string address */ - movl 12(%esp),%eax /* set character to search for */ - movl 16(%esp),%ecx /* set length of search */ - testl %esp,%esp /* clear Z flag, for len == 0 */ - cld /* set search forward */ - repne /* search! */ - scasb - jnz L1 /* scan failed, return null */ - leal -1(%edi),%eax /* adjust result of scan */ - popl %edi - ret - .align 2,0x90 -L1: xorl %eax,%eax - popl %edi - ret -END(memchr) - - .section .note.GNU-stack,"",%progbits From 2b93f779d22332da86a5398e59b57a5e27b9ffb6 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Wed, 25 Sep 2019 17:08:35 +0000 Subject: [PATCH 061/106] Add some counters for per-VM page events. For now, just count batched page queue state operations. 
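For illustration only (this program is not part of the change), the two counters described below can be read from userland with sysctlbyname(3); the sysctl names are taken from the SYSCTL_COUNTER_U64() entries in the diff, everything else is a hedged sketch.

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ops, nops;
	size_t len;

	/* Read the two counter_u64 sysctls added by this change. */
	len = sizeof(ops);
	if (sysctlbyname("vm.stats.page.queue_ops", &ops, &len, NULL, 0) != 0)
		err(1, "vm.stats.page.queue_ops");
	len = sizeof(nops);
	if (sysctlbyname("vm.stats.page.queue_nops", &nops, &len, NULL, 0) != 0)
		err(1, "vm.stats.page.queue_nops");
	printf("queue_ops %ju queue_nops %ju\n", (uintmax_t)ops, (uintmax_t)nops);
	return (0);
}

The same values should also be visible with "sysctl vm.stats.page" from the command line.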
vm.stats.page.queue_ops counts the number of batch entries that successfully completed, while queue_nops counts entries that had no effect, which occurs when the queue operation had been completed before the batch entry was processed.
Reviewed by: alc, kib MFC after: 1 week Sponsored by: Intel, Netflix Differential Revision: https://reviews.freebsd.org/D21782
--- sys/vm/vm_page.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index bbfc5a7a334c..e375c1b98c85 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c
@@ -73,11 +73,12 @@ __FBSDID("$FreeBSD$"); #include #include -#include +#include #include #include #include #include +#include #include #include #include
@@ -130,6 +131,28 @@ static int vm_min_waiters; static int vm_severe_waiters; static int vm_pageproc_waiters; +static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD, 0, + "VM page statistics"); + +static counter_u64_t queue_ops = EARLY_COUNTER; +SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops, + CTLFLAG_RD, &queue_ops, + "Number of batched queue operations"); + +static counter_u64_t queue_nops = EARLY_COUNTER; +SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops, + CTLFLAG_RD, &queue_nops, + "Number of batched queue operations with no effects"); + +static void +counter_startup(void) +{ + + queue_ops = counter_u64_alloc(M_WAITOK); + queue_nops = counter_u64_alloc(M_WAITOK); +} +SYSINIT(page_counters, SI_SUB_CPU, SI_ORDER_ANY, counter_startup, NULL); + /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions.
@@ -3117,6 +3140,7 @@ vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m) if (__predict_true((qflags & PGA_ENQUEUED) != 0)) vm_pagequeue_remove(pq, m); vm_page_dequeue_complete(m); + counter_u64_add(queue_ops, 1); } else if ((qflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) { if ((qflags & PGA_ENQUEUED) != 0) TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
@@ -3141,6 +3165,9 @@ vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m) vm_page_aflag_clear(m, qflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)); + counter_u64_add(queue_ops, 1); + } else { + counter_u64_add(queue_nops, 1); } }
From 38325e2ab8f0c7823e31df13bfc52e38e7f6d616 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 17:14:43 +0000 Subject: [PATCH 062/106] bsdgrep(1): various fixes of empty pattern/exit code/-c behavior
When an empty pattern is encountered in the pattern list, I had previously broken bsdgrep to count that as a "match all" and ignore any other patterns in the list. This commit rectifies that mistake, among others:
- The -v flag semantics were not quite right; lines matched should have been counted differently based on whether the -v flag was set or not. procline now definitively returns whether it's matched or not, and interpreting that result has been kicked up a level.
- Empty patterns with the -x flag were broken similarly to empty patterns with the -w flag. The former is a whole-line match and should be more strict, only matching blank lines. No -x and no -w will match the empty string at the beginning of each line.
- The exit code with -L was broken, w.r.t. modern grep. Modern grep will exit(0) if any file that didn't match was output, so our interpretation was simply backwards. The new interpretation makes sense to me.
Tests updated and added to try and catch some of this.
This misbehavior was found by autoconf while fixing ports found in PR 229925 expecting either a more sane or a more GNU-like sed. MFC after: 1 week --- contrib/netbsd-tests/usr.bin/grep/t_grep.sh | 64 ++++++++++++++++++++- usr.bin/grep/grep.c | 27 ++++----- usr.bin/grep/util.c | 37 ++++++++---- 3 files changed, 98 insertions(+), 30 deletions(-) diff --git a/contrib/netbsd-tests/usr.bin/grep/t_grep.sh b/contrib/netbsd-tests/usr.bin/grep/t_grep.sh index d6d0d46f4ac3..c5c47e992378 100755 --- a/contrib/netbsd-tests/usr.bin/grep/t_grep.sh +++ b/contrib/netbsd-tests/usr.bin/grep/t_grep.sh @@ -413,6 +413,60 @@ wflag_emptypat_body() atf_check -o file:test4 grep -w -e "" test4 } +atf_test_case xflag_emptypat +xflag_emptypat_body() +{ + printf "" > test1 + printf "\n" > test2 + printf "qaz" > test3 + printf " qaz\n" > test4 + + # -x is whole-line, more strict than -w. + atf_check -s exit:1 -o empty grep -x -e "" test1 + + atf_check -o file:test2 grep -x -e "" test2 + + atf_check -s exit:1 -o empty grep -x -e "" test3 + + atf_check -s exit:1 -o empty grep -x -e "" test4 + + total=$(wc -l /COPYRIGHT | sed 's/[^0-9]//g') + + # Simple checks that grep -x with an empty pattern isn't matching every + # line. The exact counts aren't important, as long as they don't + # match the total line count and as long as they don't match each other. + atf_check -o save:xpositive.count grep -Fxc '' /COPYRIGHT + atf_check -o save:xnegative.count grep -Fvxc '' /COPYRIGHT + + atf_check -o not-inline:"${total}" cat xpositive.count + atf_check -o not-inline:"${total}" cat xnegative.count + + atf_check -o not-file:xnegative.count cat xpositive.count +} + +atf_test_case xflag_emptypat_plus +xflag_emptypat_plus_body() +{ + printf "foo\n\nbar\n\nbaz\n" > target + printf "foo\n \nbar\n \nbaz\n" > target_spacelines + printf "foo\nbar\nbaz\n" > matches + printf " \n \n" > spacelines + + printf "foo\n\nbar\n\nbaz\n" > patlist1 + printf "foo\n\nba\n\nbaz\n" > patlist2 + + sed -e '/bar/d' target > matches_not2 + + # Normal handling first + atf_check -o file:target grep -Fxf patlist1 target + atf_check -o file:matches grep -Fxf patlist1 target_spacelines + atf_check -o file:matches_not2 grep -Fxf patlist2 target + + # -v handling + atf_check -s exit:1 -o empty grep -Fvxf patlist1 target + atf_check -o file:spacelines grep -Fxvf patlist1 target_spacelines +} + atf_test_case excessive_matches excessive_matches_head() { @@ -551,6 +605,12 @@ grep_nomatch_flags_head() grep_nomatch_flags_body() { + grep_type + + if [ $? 
-eq $GREP_TYPE_GNU_FREEBSD ]; then + atf_expect_fail "this test does not pass with GNU grep in base" + fi + printf "A\nB\nC\n" > test1 atf_check -o inline:"1\n" grep -c -C 1 -e "B" test1 @@ -563,7 +623,7 @@ grep_nomatch_flags_body() atf_check -o inline:"test1\n" grep -l -A 1 -e "B" test1 atf_check -o inline:"test1\n" grep -l -C 1 -e "B" test1 - atf_check -s exit:1 -o inline:"test1\n" grep -L -e "D" test1 + atf_check -o inline:"test1\n" grep -L -e "D" test1 atf_check -o empty grep -q -e "B" test1 atf_check -o empty grep -q -B 1 -e "B" test1 @@ -777,6 +837,8 @@ atf_init_test_cases() atf_add_test_case egrep_empty_invalid atf_add_test_case zerolen atf_add_test_case wflag_emptypat + atf_add_test_case xflag_emptypat + atf_add_test_case xflag_emptypat_plus atf_add_test_case excessive_matches atf_add_test_case wv_combo_break atf_add_test_case fgrep_sanity diff --git a/usr.bin/grep/grep.c b/usr.bin/grep/grep.c index 20911c294343..731e46bb112e 100644 --- a/usr.bin/grep/grep.c +++ b/usr.bin/grep/grep.c @@ -218,20 +218,9 @@ static void add_pattern(char *pat, size_t len) { - /* Do not add further pattern is we already match everything */ - if (matchall) - return; - /* Check if we can do a shortcut */ if (len == 0) { matchall = true; - for (unsigned int i = 0; i < patterns; i++) { - free(pattern[i].pat); - } - pattern = grep_realloc(pattern, sizeof(struct pat)); - pattern[0].pat = NULL; - pattern[0].len = 0; - patterns = 1; return; } /* Increase size if necessary */ @@ -654,7 +643,7 @@ main(int argc, char *argv[]) aargv += optind; /* Empty pattern file matches nothing */ - if (!needpattern && (patterns == 0)) + if (!needpattern && (patterns == 0) && !matchall) exit(1); /* Fail if we don't have any pattern */ @@ -701,11 +690,10 @@ main(int argc, char *argv[]) r_pattern = grep_calloc(patterns, sizeof(*r_pattern)); - /* Don't process any patterns if we have a blank one */ #ifdef WITH_INTERNAL_NOSPEC - if (!matchall && grepbehave != GREP_FIXED) { + if (grepbehave != GREP_FIXED) { #else - if (!matchall) { + { #endif /* Check if cheating is allowed (always is for fgrep). */ for (i = 0; i < patterns; ++i) { @@ -737,7 +725,12 @@ main(int argc, char *argv[]) matched = true; } - /* Find out the correct return value according to the - results and the command line option. */ + if (Lflag) + matched = !matched; + + /* + * Calculate the correct return value according to the + * results and the command line option. + */ exit(matched ? (file_err ? (qflag ? 0 : 2) : 0) : (file_err ? 2 : 1)); } diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c index 07d9b40cbdcd..33afe4d6b030 100644 --- a/usr.bin/grep/util.c +++ b/usr.bin/grep/util.c @@ -210,7 +210,7 @@ procmatch_match(struct mprintc *mc, struct parsec *pc) while (pc->matchidx >= MAX_MATCHES) { /* Reset matchidx and try again */ pc->matchidx = 0; - if (procline(pc)) + if (procline(pc) == !vflag) printline(pc, ':'); else break; @@ -355,7 +355,7 @@ procfile(const char *fn) return (0); } - line_matched = procline(&pc); + line_matched = procline(&pc) == !vflag; if (line_matched) ++lines; @@ -469,17 +469,32 @@ procline(struct parsec *pc) matchidx = pc->matchidx; - /* Special case: empty pattern with -w flag, check first character */ - if (matchall && wflag) { + /* + * With matchall (empty pattern), we can try to take some shortcuts. + * Emtpy patterns trivially match every line except in the -w and -x + * cases. For -w (whole-word) cases, we only match if the first + * character isn't a word-character. For -x (whole-line) cases, we only + * match if the line is empty. 
+ */ + if (matchall) { if (pc->ln.len == 0) return (true); - wend = L' '; - if (sscanf(&pc->ln.dat[0], "%lc", &wend) != 1 || iswword(wend)) - return (false); - else + if (wflag) { + wend = L' '; + if (sscanf(&pc->ln.dat[0], "%lc", &wend) == 1 && + !iswword(wend)) + return (true); + } else if (!xflag) return (true); - } else if (matchall) - return (true); + + /* + * If we don't have any other patterns, we really don't match. + * If we do have other patterns, we must fall through and check + * them. + */ + if (patterns == 0) + return (false); + } matched = false; st = pc->lnstart; @@ -609,8 +624,6 @@ procline(struct parsec *pc) /* Reflect the new matchidx in the context */ pc->matchidx = matchidx; - if (vflag) - matched = !matched; return matched; } From 85c5f3cb57d600c18ebd4f85ee1bc05ce0fea1a9 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 17:29:45 +0000 Subject: [PATCH 063/106] Add COMPAT12 support to makesyscalls.sh Reviewed by: kib, imp, brooks (all without syscalls.master edits) Differential Revision: https://reviews.freebsd.org/D21366 --- sys/compat/freebsd32/syscalls.master | 3 ++- sys/kern/makesyscalls.sh | 35 +++++++++++++++++++++++++--- sys/kern/syscalls.master | 3 ++- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index 2eba45c7ccd6..6810f40302b6 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -13,7 +13,7 @@ ; case where the event exists, but we don't want auditing, the ; event should be #defined to AUE_NULL in audit_kevents.h. ; type one of STD, OBSOL, UNIMPL, COMPAT, COMPAT4, COMPAT6, -; COMPAT7, COMPAT11, NODEF, NOARGS, NOPROTO, NOSTD +; COMPAT7, COMPAT11, COMPAT12, NODEF, NOARGS, NOPROTO, NOSTD ; The COMPAT* options may be combined with one or more NO* ; options separated by '|' with no spaces (e.g. 
COMPAT|NOARGS) ; name pseudo-prototype of syscall routine @@ -31,6 +31,7 @@ ; COMPAT7 included on COMPAT_FREEBSD7 #ifdef (FreeBSD 7 compat) ; COMPAT10 included on COMPAT_FREEBSD10 #ifdef (FreeBSD 10 compat) ; COMPAT11 included on COMPAT_FREEBSD11 #ifdef (FreeBSD 11 compat) +; COMPAT12 included on COMPAT_FREEBSD12 #ifdef (FreeBSD 12 compat) ; OBSOL obsolete, not included in system, only specifies name ; UNIMPL not implemented, placeholder only ; NOSTD implemented but as a lkm that can be statically diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh index 522948fe6f1b..0e5772b72f81 100644 --- a/sys/kern/makesyscalls.sh +++ b/sys/kern/makesyscalls.sh @@ -11,6 +11,7 @@ compat6=COMPAT_FREEBSD6 compat7=COMPAT_FREEBSD7 compat10=COMPAT_FREEBSD10 compat11=COMPAT_FREEBSD11 +compat12=COMPAT_FREEBSD12 # output files: sysnames="syscalls.c" @@ -39,6 +40,8 @@ syscompat10="sysent.compat10.$$" syscompat10dcl="sysent.compat10dcl.$$" syscompat11="sysent.compat11.$$" syscompat11dcl="sysent.compat11dcl.$$" +syscompat12="sysent.compat12.$$" +syscompat12dcl="sysent.compat12dcl.$$" sysent="sysent.switch.$$" sysinc="sysinc.switch.$$" sysarg="sysarg.switch.$$" @@ -47,9 +50,9 @@ systracetmp="systrace.$$" systraceret="systraceret.$$" capabilities_conf="capabilities.conf" -trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $syscompat10 $syscompat10dcl $syscompat11 $syscompat11dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp $systraceret" 0 +trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $syscompat10 $syscompat10dcl $syscompat11 $syscompat11dcl $syscompat12 $syscompat12dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp $systraceret" 0 -touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $syscompat10 $syscompat10dcl $syscompat11 $syscompat11dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp $systraceret +touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $syscompat10 $syscompat10dcl $syscompat11 $syscompat11dcl $syscompat12 $syscompat12dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp $systraceret case $# in 0) echo "usage: $0 input-file " 1>&2 @@ -118,6 +121,8 @@ sed -e ' syscompat10dcl = \"$syscompat10dcl\" syscompat11 = \"$syscompat11\" syscompat11dcl = \"$syscompat11dcl\" + syscompat12 = \"$syscompat12\" + syscompat12dcl = \"$syscompat12dcl\" sysent = \"$sysent\" syssw = \"$syssw\" sysinc = \"$sysinc\" @@ -134,6 +139,7 @@ sed -e ' compat7 = \"$compat7\" compat10 = \"$compat10\" compat11 = \"$compat11\" + compat12 = \"$compat12\" syscallprefix = \"$syscallprefix\" switchname = \"$switchname\" namesname = \"$namesname\" @@ -188,6 +194,7 @@ sed -e ' printf "\n#ifdef %s\n\n", compat7 > syscompat7 printf "\n#ifdef %s\n\n", compat10 > syscompat10 printf "\n#ifdef %s\n\n", compat11 > syscompat11 + printf "\n#ifdef %s\n\n", compat12 > syscompat12 printf "/*\n * System call names.\n *\n" > sysnames printf " * DO NOT EDIT-- this file is automatically " generated ".\n" > sysnames @@ -239,6 +246,7 @@ sed -e ' print > syscompat7 print > syscompat10 print > syscompat11 + print > syscompat12 print > sysnames print > systrace print > systracetmp @@ -256,6 +264,7 @@ sed -e ' print > syscompat7 print > syscompat10 print > syscompat11 + print > syscompat12 print > sysnames print > systrace print 
> systracetmp @@ -273,6 +282,7 @@ sed -e ' print > syscompat7 print > syscompat10 print > syscompat11 + print > syscompat12 print > sysnames print > systrace print > systracetmp @@ -409,6 +419,8 @@ sed -e ' argalias = "freebsd10_" argalias if (flag("COMPAT11")) argalias = "freebsd11_" argalias + if (flag("COMPAT12")) + argalias = "freebsd12_" argalias } f++ @@ -569,7 +581,8 @@ sed -e ' next } type("COMPAT") || type("COMPAT4") || type("COMPAT6") || \ - type("COMPAT7") || type("COMPAT10") || type("COMPAT11") { + type("COMPAT7") || type("COMPAT10") || type("COMPAT11") || \ + type("COMPAT12") { if (flag("COMPAT")) { ncompat++ out = syscompat @@ -612,6 +625,13 @@ sed -e ' wrap = "compat11" prefix = "freebsd11_" descr = "freebsd11" + } else if (flag("COMPAT12")) { + ncompat12++ + out = syscompat12 + outdcl = syscompat12dcl + wrap = "compat12" + prefix = "freebsd12_" + descr = "freebsd12" } parseline() if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \ @@ -734,6 +754,13 @@ sed -e ' printf "#define compat11(n, name) 0, (sy_call_t *)nosys\n" > sysinc printf "#endif\n" > sysinc } + if (ncompat12 != 0) { + printf "\n#ifdef %s\n", compat12 > sysinc + printf "#define compat12(n, name) n, (sy_call_t *)__CONCAT(freebsd12_,name)\n" > sysinc + printf "#else\n" > sysinc + printf "#define compat12(n, name) 0, (sy_call_t *)nosys\n" > sysinc + printf "#endif\n" > sysinc + } printf("\n#endif /* %s */\n\n", compat) > syscompatdcl printf("\n#endif /* %s */\n\n", compat4) > syscompat4dcl @@ -741,6 +768,7 @@ sed -e ' printf("\n#endif /* %s */\n\n", compat7) > syscompat7dcl printf("\n#endif /* %s */\n\n", compat10) > syscompat10dcl printf("\n#endif /* %s */\n\n", compat11) > syscompat11dcl + printf("\n#endif /* %s */\n\n", compat12) > syscompat12dcl printf("\n#undef PAD_\n") > sysprotoend printf("#undef PADL_\n") > sysprotoend @@ -765,6 +793,7 @@ cat $sysarg $sysdcl \ $syscompat7 $syscompat7dcl \ $syscompat10 $syscompat10dcl \ $syscompat11 $syscompat11dcl \ + $syscompat12 $syscompat12dcl \ $sysaue $sysprotoend > $sysproto cat $systracetmp >> $systrace cat $systraceret >> $systrace diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 70ac18a5737b..97435517204c 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -12,7 +12,7 @@ ; case where the event exists, but we don't want auditing, the ; event should be #defined to AUE_NULL in audit_kevents.h. ; type one of STD, OBSOL, UNIMPL, COMPAT, COMPAT4, COMPAT6, -; COMPAT7, COMPAT11, NODEF, NOARGS, NOPROTO, NOSTD +; COMPAT7, COMPAT11, COMPAT12, NODEF, NOARGS, NOPROTO, NOSTD ; The COMPAT* options may be combined with one or more NO* ; options separated by '|' with no spaces (e.g. 
COMPAT|NOARGS) ; name pseudo-prototype of syscall routine @@ -30,6 +30,7 @@ ; COMPAT7 included on COMPAT_FREEBSD7 #ifdef (FreeBSD 7 compat) ; COMPAT10 included on COMPAT_FREEBSD10 #ifdef (FreeBSD 10 compat) ; COMPAT11 included on COMPAT_FREEBSD11 #ifdef (FreeBSD 11 compat) +; COMPAT12 included on COMPAT_FREEBSD12 #ifdef (FreeBSD 12 compat) ; OBSOL obsolete, not included in system, only specifies name ; UNIMPL not implemented, placeholder only ; NOSTD implemented but as a lkm that can be statically From d19f028e336bfcb0e6508dc1d76c46646028349f Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 17:30:28 +0000 Subject: [PATCH 064/106] sysent: regenerate after r352693 --- sys/amd64/linux/linux_proto.h | 7 +++++++ sys/amd64/linux32/linux32_proto.h | 7 +++++++ sys/arm64/linux/linux_proto.h | 7 +++++++ sys/compat/freebsd32/freebsd32_proto.h | 24 ++++++++++++++++++++++++ sys/i386/linux/linux_proto.h | 7 +++++++ sys/sys/sysproto.h | 6 ++++++ 6 files changed, 58 insertions(+) diff --git a/sys/amd64/linux/linux_proto.h b/sys/amd64/linux/linux_proto.h index 199229c42760..de89ce271c5e 100644 --- a/sys/amd64/linux/linux_proto.h +++ b/sys/amd64/linux/linux_proto.h @@ -1556,6 +1556,13 @@ int linux_io_uring_register(struct thread *, struct linux_io_uring_register_args #endif /* COMPAT_FREEBSD11 */ + +#ifdef COMPAT_FREEBSD12 + +#define nosys linux_nosys + +#endif /* COMPAT_FREEBSD12 */ + #define LINUX_SYS_AUE_linux_open AUE_OPEN_RWTC #define LINUX_SYS_AUE_linux_newstat AUE_STAT #define LINUX_SYS_AUE_linux_newfstat AUE_FSTAT diff --git a/sys/amd64/linux32/linux32_proto.h b/sys/amd64/linux32/linux32_proto.h index c23215d30fbb..21ad0cb51e07 100644 --- a/sys/amd64/linux32/linux32_proto.h +++ b/sys/amd64/linux32/linux32_proto.h @@ -1896,6 +1896,13 @@ int linux_io_uring_register(struct thread *, struct linux_io_uring_register_args #endif /* COMPAT_FREEBSD11 */ + +#ifdef COMPAT_FREEBSD12 + +#define nosys linux_nosys + +#endif /* COMPAT_FREEBSD12 */ + #define LINUX32_SYS_AUE_linux_exit AUE_EXIT #define LINUX32_SYS_AUE_linux_fork AUE_FORK #define LINUX32_SYS_AUE_linux_open AUE_OPEN_RWTC diff --git a/sys/arm64/linux/linux_proto.h b/sys/arm64/linux/linux_proto.h index d927f4421020..c8d4fc353335 100644 --- a/sys/arm64/linux/linux_proto.h +++ b/sys/arm64/linux/linux_proto.h @@ -1320,6 +1320,13 @@ int linux_pkey_free(struct thread *, struct linux_pkey_free_args *); #endif /* COMPAT_FREEBSD11 */ + +#ifdef COMPAT_FREEBSD12 + +#define nosys linux_nosys + +#endif /* COMPAT_FREEBSD12 */ + #define LINUX_SYS_AUE_linux_setxattr AUE_NULL #define LINUX_SYS_AUE_linux_lsetxattr AUE_NULL #define LINUX_SYS_AUE_linux_fsetxattr AUE_NULL diff --git a/sys/compat/freebsd32/freebsd32_proto.h b/sys/compat/freebsd32/freebsd32_proto.h index 95c5b8ece9f9..172fb79d4614 100644 --- a/sys/compat/freebsd32/freebsd32_proto.h +++ b/sys/compat/freebsd32/freebsd32_proto.h @@ -1266,6 +1266,30 @@ int freebsd11_freebsd32_fstatat(struct thread *, struct freebsd11_freebsd32_fsta #endif /* COMPAT_FREEBSD11 */ + +#ifdef COMPAT_FREEBSD12 + +#if !defined(PAD64_REQUIRED) && !defined(__amd64__) +#define PAD64_REQUIRED +#endif +#ifdef PAD64_REQUIRED +#else +#endif +#ifdef PAD64_REQUIRED +#else +#endif +#ifdef PAD64_REQUIRED +#else +#endif +#ifdef PAD64_REQUIRED +#else +#endif +#ifdef PAD64_REQUIRED +#else +#endif + +#endif /* COMPAT_FREEBSD12 */ + #define FREEBSD32_SYS_AUE_freebsd32_wait4 AUE_WAIT4 #define FREEBSD32_SYS_AUE_freebsd4_freebsd32_getfsstat AUE_GETFSSTAT #define FREEBSD32_SYS_AUE_ofreebsd32_lseek AUE_LSEEK diff --git 
a/sys/i386/linux/linux_proto.h b/sys/i386/linux/linux_proto.h index 574bbb4ff818..710b27353c3e 100644 --- a/sys/i386/linux/linux_proto.h +++ b/sys/i386/linux/linux_proto.h @@ -1916,6 +1916,13 @@ int linux_io_uring_register(struct thread *, struct linux_io_uring_register_args #endif /* COMPAT_FREEBSD11 */ + +#ifdef COMPAT_FREEBSD12 + +#define nosys linux_nosys + +#endif /* COMPAT_FREEBSD12 */ + #define LINUX_SYS_AUE_linux_exit AUE_EXIT #define LINUX_SYS_AUE_linux_fork AUE_FORK #define LINUX_SYS_AUE_linux_open AUE_OPEN_RWTC diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index aabe238ddcda..5de3ce619204 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -2643,6 +2643,12 @@ int freebsd11_mknodat(struct thread *, struct freebsd11_mknodat_args *); #endif /* COMPAT_FREEBSD11 */ + +#ifdef COMPAT_FREEBSD12 + + +#endif /* COMPAT_FREEBSD12 */ + #define SYS_AUE_syscall AUE_NULL #define SYS_AUE_exit AUE_EXIT #define SYS_AUE_fork AUE_FORK From af755d3e485918c68ca0548687fd59e200823046 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 17:32:43 +0000 Subject: [PATCH 065/106] [1/3] Add mostly Linux-compatible file sealing support File sealing applies protections against certain actions (currently: write, growth, shrink) at the inode level. New fileops are added to accommodate seals - EINVAL is returned by fcntl(2) if they are not implemented. Reviewed by: markj, kib Differential Revision: https://reviews.freebsd.org/D21391 --- sys/kern/kern_descrip.c | 21 ++++++- sys/kern/uipc_shm.c | 128 +++++++++++++++++++++++++++++++++------- sys/sys/fcntl.h | 10 +++- sys/sys/file.h | 22 +++++++ sys/sys/mman.h | 2 + 5 files changed, 161 insertions(+), 22 deletions(-) diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 80376910155b..2c1606bd4020 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -489,7 +489,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) struct filedescent *fde; struct proc *p; struct vnode *vp; - int error, flg, tmp; + int error, flg, seals, tmp; uint64_t bsize; off_t foffset; @@ -756,6 +756,25 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) fdrop(fp, td); break; + case F_ADD_SEALS: + error = fget_unlocked(fdp, fd, &cap_no_rights, &fp, NULL); + if (error != 0) + break; + error = fo_add_seals(fp, arg); + fdrop(fp, td); + break; + + case F_GET_SEALS: + error = fget_unlocked(fdp, fd, &cap_no_rights, &fp, NULL); + if (error != 0) + break; + if (fo_get_seals(fp, &seals) == 0) + td->td_retval[0] = seals; + else + error = EINVAL; + fdrop(fp, td); + break; + case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 415104ec8e8d..feb4a92d1d97 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -119,6 +119,8 @@ static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); +static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, + void *rl_cookie); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; @@ -131,6 +133,8 @@ static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; +static fo_get_seals_t shm_get_seals; +static fo_add_seals_t shm_add_seals; /* File descriptor operations. 
*/ struct fileops shm_ops = { @@ -148,6 +152,8 @@ struct fileops shm_ops = { .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, + .fo_get_seals = shm_get_seals, + .fo_add_seals = shm_add_seals, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; @@ -316,8 +322,10 @@ shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } - - error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); + if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) + error = EPERM; + else + error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); @@ -412,8 +420,8 @@ shm_close(struct file *fp, struct thread *td) return (0); } -int -shm_dotruncate(struct shmfd *shmfd, off_t length) +static int +shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) { vm_object_t object; vm_page_t m; @@ -423,23 +431,23 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; - VM_OBJECT_WLOCK(object); - if (length == shmfd->shm_size) { - VM_OBJECT_WUNLOCK(object); + VM_OBJECT_ASSERT_WLOCKED(object); + rangelock_cookie_assert(rl_cookie, RA_WLOCKED); + if (length == shmfd->shm_size) return (0); - } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { + if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) + return (EPERM); + /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ - if (shmfd->shm_kmappings > 0) { - VM_OBJECT_WUNLOCK(object); + if (shmfd->shm_kmappings > 0) return (EBUSY); - } /* * Zero the truncated part of the last page. @@ -499,12 +507,13 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { + if ((shmfd->shm_seals & F_SEAL_GROW) != 0) + return (EPERM); + /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); - if (!swap_reserve_by_cred(delta, object->cred)) { - VM_OBJECT_WUNLOCK(object); + if (!swap_reserve_by_cred(delta, object->cred)) return (ENOMEM); - } object->charge += delta; } shmfd->shm_size = length; @@ -513,10 +522,24 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; - VM_OBJECT_WUNLOCK(object); return (0); } +int +shm_dotruncate(struct shmfd *shmfd, off_t length) +{ + void *rl_cookie; + int error; + + rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + VM_OBJECT_WLOCK(shmfd->shm_object); + error = shm_dotruncate_locked(shmfd, length, rl_cookie); + VM_OBJECT_WUNLOCK(shmfd->shm_object); + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (error); +} + /* * shmfd object management including creation and reference counting * routines. @@ -878,10 +901,13 @@ shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t maxprot; int error; bool writecnt; + void *rl_cookie; shmfd = fp->f_data; maxprot = VM_PROT_NONE; + rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, objsize, + &shmfd->shm_mtx); /* FREAD should always be set. 
*/ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; @@ -890,9 +916,16 @@ shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, writecnt = (flags & MAP_SHARED) != 0 && (prot & VM_PROT_WRITE) != 0; + if (writecnt && (shmfd->shm_seals & F_SEAL_WRITE) != 0) { + error = EPERM; + goto out; + } + /* Don't permit shared writable mappings on read-only descriptors. */ - if (writecnt && (maxprot & VM_PROT_WRITE) == 0) - return (EACCES); + if (writecnt && (maxprot & VM_PROT_WRITE) == 0) { + error = EACCES; + goto out; + } maxprot &= cap_maxprot; /* See comment in vn_mmap(). */ @@ -900,13 +933,15 @@ shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, #ifdef _LP64 objsize > OFF_MAX || #endif - foff < 0 || foff > OFF_MAX - objsize) - return (EINVAL); + foff < 0 || foff > OFF_MAX - objsize) { + error = EINVAL; + goto out; + } #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) - return (error); + goto out; #endif mtx_lock(&shm_timestamp_lock); @@ -924,6 +959,8 @@ shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, objsize); vm_object_deallocate(shmfd->shm_object); } +out: + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } @@ -1131,6 +1168,57 @@ shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, return (res); } +static int +shm_add_seals(struct file *fp, int seals) +{ + struct shmfd *shmfd; + void *rl_cookie; + vm_ooffset_t writemappings; + int error, nseals; + + error = 0; + shmfd = fp->f_data; + rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + + /* Even already-set seals should result in EPERM. */ + if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) { + error = EPERM; + goto out; + } + nseals = seals & ~shmfd->shm_seals; + if ((nseals & F_SEAL_WRITE) != 0) { + /* + * The rangelock above prevents writable mappings from being + * added after we've started applying seals. The RLOCK here + * is to avoid torn reads on ILP32 arches as unmapping/reducing + * writemappings will be done without a rangelock. + */ + VM_OBJECT_RLOCK(shmfd->shm_object); + writemappings = shmfd->shm_object->un_pager.swp.writemappings; + VM_OBJECT_RUNLOCK(shmfd->shm_object); + /* kmappings are also writable */ + if (writemappings > 0) { + error = EBUSY; + goto out; + } + } + shmfd->shm_seals |= nseals; +out: + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (error); +} + +static int +shm_get_seals(struct file *fp, int *seals) +{ + struct shmfd *shmfd; + + shmfd = fp->f_data; + *seals = shmfd->shm_seals; + return (0); +} + static int sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) { diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h index b2e7f94199d0..5359a317c4e5 100644 --- a/sys/sys/fcntl.h +++ b/sys/sys/fcntl.h @@ -248,7 +248,15 @@ typedef __pid_t pid_t; #endif #if __BSD_VISIBLE #define F_DUP2FD_CLOEXEC 18 /* Like F_DUP2FD, but FD_CLOEXEC is set */ -#endif +#define F_ADD_SEALS 19 +#define F_GET_SEALS 20 + +/* Seals (F_ADD_SEALS, F_GET_SEALS). 
*/ +#define F_SEAL_SEAL 0x0001 /* Prevent adding sealings */ +#define F_SEAL_SHRINK 0x0002 /* May not shrink */ +#define F_SEAL_GROW 0x0004 /* May not grow */ +#define F_SEAL_WRITE 0x0008 /* May not write */ +#endif /* __BSD_VISIBLE */ /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ diff --git a/sys/sys/file.h b/sys/sys/file.h index 876300a0f3d8..3b00b0809836 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -123,6 +123,8 @@ typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td); typedef int fo_aio_queue_t(struct file *fp, struct kaiocb *job); +typedef int fo_add_seals_t(struct file *fp, int flags); +typedef int fo_get_seals_t(struct file *fp, int *flags); typedef int fo_flags_t; struct fileops { @@ -141,6 +143,8 @@ struct fileops { fo_fill_kinfo_t *fo_fill_kinfo; fo_mmap_t *fo_mmap; fo_aio_queue_t *fo_aio_queue; + fo_add_seals_t *fo_add_seals; + fo_get_seals_t *fo_get_seals; fo_flags_t fo_flags; /* DFLAG_* below */ }; @@ -426,6 +430,24 @@ fo_aio_queue(struct file *fp, struct kaiocb *job) return ((*fp->f_ops->fo_aio_queue)(fp, job)); } +static __inline int +fo_add_seals(struct file *fp, int seals) +{ + + if (fp->f_ops->fo_add_seals == NULL) + return (EINVAL); + return ((*fp->f_ops->fo_add_seals)(fp, seals)); +} + +static __inline int +fo_get_seals(struct file *fp, int *seals) +{ + + if (fp->f_ops->fo_get_seals == NULL) + return (EINVAL); + return ((*fp->f_ops->fo_get_seals)(fp, seals)); +} + #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ diff --git a/sys/sys/mman.h b/sys/sys/mman.h index 1b1b4bcc2cb9..ae95b9fee791 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -238,6 +238,8 @@ struct shmfd { struct rangelock shm_rl; struct mtx shm_mtx; + + int shm_seals; }; #endif From f17221ee7ada17711094996bcc03f383d96740cc Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 17:33:12 +0000 Subject: [PATCH 066/106] Update fcntl(2) after r352695 --- lib/libc/sys/fcntl.2 | 63 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/lib/libc/sys/fcntl.2 b/lib/libc/sys/fcntl.2 index 35bb6166cee1..189b505f4332 100644 --- a/lib/libc/sys/fcntl.2 +++ b/lib/libc/sys/fcntl.2 @@ -28,7 +28,7 @@ .\" @(#)fcntl.2 8.2 (Berkeley) 1/12/94 .\" $FreeBSD$ .\" -.Dd Nov 15, 2018 +.Dd September 4, 2019 .Dt FCNTL 2 .Os .Sh NAME @@ -180,6 +180,11 @@ is non-zero. A zero value in .Fa arg turns off read ahead. +.It Dv F_ADD_SEALS +Add seals to the file as described below, if the underlying filesystem supports +seals. +.It Dv F_GET_SEALS +Get seals associated with the file, if the underlying filesystem supports seals. .El .Pp The flags for the @@ -217,6 +222,37 @@ when I/O is possible, e.g., upon availability of data to be read. .El .Pp +The seals that may be applied with +.Dv F_ADD_SEALS +are as follows: +.Bl -tag -width F_SEAL_SHRINK +.It Dv F_SEAL_SEAL +Prevent any further seals from being applied to the file. +.It Dv F_SEAL_SHRINK +Prevent the file from being shrunk with +.Xr ftruncate 2 . +.It Dv F_SEAL_GROW +Prevent the file from being enlarged with +.Xr ftruncate 2 . +.It Dv F_SEAL_WRITE +Prevent any further +.Xr write 2 +calls to the file. +Any writes in progress will finish before +.Fn fcntl +returns. +If any writeable mappings exist, F_ADD_SEALS will fail and return +.Dv EBUSY . +.El +.Pp +Seals are on a per-inode basis and require support by the underlying filesystem. 
+If the underlying filesystem does not support seals, +.Dv F_ADD_SEALS +and +.Dv F_GET_SEALS +will fail and return +.Dv EINVAL . +.Pp Several commands are available for doing advisory file locking; they all operate on the following structure: .Bd -literal @@ -528,6 +564,14 @@ is an exclusive lock and .Fa fd is not a valid file descriptor open for writing. +.It Bq Er EBUSY +The argument +.Fa cmd +is +.Dv F_ADD_SEALS , +attempting to set +.Dv F_SEAL_WRITE , +and writeable mappings of the file exist. .It Bq Er EDEADLK The argument .Fa cmd @@ -565,6 +609,14 @@ points is not valid. .Pp The argument .Fa cmd +is +.Dv F_ADD_SEALS +or +.Dv F_GET_SEALS , +and the underlying filesystem does not support sealing. +.Pp +The argument +.Fa cmd is invalid. .It Bq Er EMFILE The argument @@ -624,6 +676,15 @@ is and the process ID or process group given as an argument is in a different session than the caller. +.Pp +The +.Fa cmd +argument +is +.Dv F_ADD_SEALS +and the +.Dv F_SEAL_SEAL +seal has already been set. .It Bq Er ESRCH The .Fa cmd From 0cd95859c823f2a783a4f12cea4bc889061be242 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 17:35:03 +0000 Subject: [PATCH 067/106] [2/3] Add an initial seal argument to kern_shm_open() Now that flags may be set on posixshm, add an argument to kern_shm_open() for the initial seals. To maintain past behavior where callers of shm_open(2) are guaranteed to not have any seals applied to the fd they're given, apply F_SEAL_SEAL for existing callers of kern_shm_open. A special flag could be opened later for shm_open(2) to indicate that sealing should be allowed. We currently restrict initial seals to F_SEAL_SEAL. We cannot error out if F_SEAL_SEAL is re-applied, as this would easily break shm_open() twice to a shmfd that already existed. A note's been added about the assumptions we've made here as a hint towards anyone wanting to allow other seals to be applied at creation. 
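For illustration, a minimal userland sketch of the guarantee described above, using the F_GET_SEALS/F_ADD_SEALS fcntl(2) commands introduced earlier in this series: a descriptor obtained through the traditional shm_open(2) path comes back with F_SEAL_SEAL already applied, so any later attempt to seal it is rejected.

#include <sys/mman.h>
#include <fcntl.h>
#include <assert.h>

int
main(void)
{
	int fd, seals;

	fd = shm_open(SHM_ANON, O_RDWR, 0600);
	assert(fd >= 0);

	/* shm_open(2) objects are created with F_SEAL_SEAL already set. */
	seals = fcntl(fd, F_GET_SEALS);
	assert(seals == F_SEAL_SEAL);

	/* Consequently, adding any further seal fails (EPERM). */
	assert(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == -1);
	return (0);
}
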
Reviewed by: kib, markj Differential Revision: https://reviews.freebsd.org/D21392 --- sys/compat/cloudabi/cloudabi_fd.c | 3 +- sys/kern/uipc_shm.c | 63 ++++++++++++++++++++++++++++--- sys/sys/syscallsubr.h | 2 +- 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/sys/compat/cloudabi/cloudabi_fd.c b/sys/compat/cloudabi/cloudabi_fd.c index 4e3f94f08f70..02dd357effda 100644 --- a/sys/compat/cloudabi/cloudabi_fd.c +++ b/sys/compat/cloudabi/cloudabi_fd.c @@ -28,6 +28,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -95,7 +96,7 @@ cloudabi_sys_fd_create1(struct thread *td, cap_rights_init(&fcaps.fc_rights, CAP_FSTAT, CAP_FTRUNCATE, CAP_MMAP_RWX); return (kern_shm_open(td, SHM_ANON, O_RDWR | O_CLOEXEC, 0, - &fcaps)); + &fcaps, F_SEAL_SEAL)); default: return (EINVAL); } diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index feb4a92d1d97..2674fdfedba9 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -701,13 +701,14 @@ shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, - struct filecaps *fcaps) + struct filecaps *fcaps, int initial_seals) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; const char *pr_path; + void *rl_cookie; size_t pr_pathlen; Fnv32_t fnv; mode_t cmode; @@ -730,6 +731,17 @@ kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); + /* + * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. + * If the decision is made later to allow additional seals, care must be + * taken below to ensure that the seals are properly set if the shmfd + * already existed -- this currently assumes that only F_SEAL_SEAL can + * be set and doesn't take further precautions to ensure the validity of + * the seals being added with respect to current mappings. + */ + if ((initial_seals & ~F_SEAL_SEAL) != 0) + return (EINVAL); + fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; @@ -753,6 +765,7 @@ kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); + shmfd->shm_seals = initial_seals; } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; @@ -789,6 +802,7 @@ kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); + shmfd->shm_seals = initial_seals; shm_insert(path, fnv, shmfd); #ifdef MAC } @@ -798,12 +812,39 @@ kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, error = ENOENT; } } else { + rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + + /* + * kern_shm_open() likely shouldn't ever error out on + * trying to set a seal that already exists, unlike + * F_ADD_SEALS. This would break terribly as + * shm_open(2) actually sets F_SEAL_SEAL to maintain + * historical behavior where the underlying file could + * not be sealed. + */ + initial_seals &= ~shmfd->shm_seals; + /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); - if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + + /* + * initial_seals can't set additional seals if we've + * already been set F_SEAL_SEAL. If F_SEAL_SEAL is set, + * then we've already removed that one from + * initial_seals. 
This is currently redundant as we + * only allow setting F_SEAL_SEAL at creation time, but + * it's cheap to check and decreases the effort required + * to allow additional seals. + */ + if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 && + initial_seals != 0) + error = EPERM; + else if ((flags & (O_CREAT | O_EXCL)) == + (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC @@ -823,15 +864,27 @@ kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { + VM_OBJECT_WLOCK(shmfd->shm_object); #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif - shm_dotruncate(shmfd, 0); + error = shm_dotruncate_locked(shmfd, 0, + rl_cookie); + VM_OBJECT_WUNLOCK(shmfd->shm_object); } - if (error == 0) + if (error == 0) { + /* + * Currently we only allow F_SEAL_SEAL to be + * set initially. As noted above, this would + * need to be reworked should that change. + */ + shmfd->shm_seals |= initial_seals; shm_hold(shmfd); + } + rangelock_unlock(&shmfd->shm_rl, rl_cookie, + &shmfd->shm_mtx); } sx_xunlock(&shm_dict_lock); @@ -856,7 +909,7 @@ sys_shm_open(struct thread *td, struct shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, uap->mode, - NULL)); + NULL, F_SEAL_SEAL)); } int diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index beb6e503ebc7..3bc441151e5c 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -250,7 +250,7 @@ int kern_setsockopt(struct thread *td, int s, int level, int name, int kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp); int kern_shm_open(struct thread *td, const char *userpath, int flags, - mode_t mode, struct filecaps *fcaps); + mode_t mode, struct filecaps *fcaps, int initial_seals); int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg); int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, From 8df0d09bdbf6e02a0669fb051e9d5c418d7d6c40 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 25 Sep 2019 17:35:34 +0000 Subject: [PATCH 068/106] In r340411, libufs.so's major number was bumped to 7, but an entry in ObsoleteFiles.inc was not added. Retroactively fix that. --- ObsoleteFiles.inc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ObsoleteFiles.inc b/ObsoleteFiles.inc index fa5a466f57ed..432ce0ab8240 100644 --- a/ObsoleteFiles.inc +++ b/ObsoleteFiles.inc @@ -668,6 +668,9 @@ OLD_DIRS+=usr/lib/clang/6.0.1/lib OLD_DIRS+=usr/lib/clang/6.0.1 # 20181116: Rename test file. OLD_FILES+=usr/tests/sys/netinet/reuseport_lb +# 20181113: libufs version bumped to 7. +OLD_LIBS+=lib/libufs.so.6 +OLD_LIBS+=usr/lib32/libufs.so.6 # 20181112: Cleanup old libcap_dns. OLD_LIBS+=lib/casper/libcap_dns.so.1 OLD_LIBS+=usr/lib32/libcap_dns.so.1 From e11365676b89695bbec90db2f9f9a5f8bda87121 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 25 Sep 2019 17:52:59 +0000 Subject: [PATCH 069/106] In suite.test.mk, test if ${DESTDIR} exists before attempting to run chflags -R on it, otherwise the command will error out. (Note that adding -f to the chflags invocation does not help, unlike with rm.) MFC after: 3 days --- share/mk/suite.test.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/mk/suite.test.mk b/share/mk/suite.test.mk index d91d3df757d8..69cd02d684de 100644 --- a/share/mk/suite.test.mk +++ b/share/mk/suite.test.mk @@ -120,7 +120,7 @@ beforecheck: # etc. 
aftercheck: @cd ${.CURDIR} && ${MAKE} clean - @chflags -R 0 "${DESTDIR}" + @test ! -e ${DESTDIR} || chflags -R 0 "${DESTDIR}" @rm -Rf "${DESTDIR}" .endif From 20f7057685cdb96761f1448f4827606e4a6f4406 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 17:59:15 +0000 Subject: [PATCH 070/106] Add a shm_open2 syscall to support upcoming memfd_create shm_open2 allows a little more flexibility than the original shm_open. shm_open2 doesn't enforce CLOEXEC on its callers, and it has a separate shmflag argument that can be expanded later. Currently the only shmflag is to allow file sealing on the returned fd. shm_open and memfd_create will both be implemented in libc to use this new syscall. __FreeBSD_version is bumped to indicate the presence. Reviewed by: kib, markj Differential Revision: https://reviews.freebsd.org/D21393 --- sys/compat/freebsd32/syscalls.master | 3 +++ sys/kern/capabilities.conf | 1 + sys/kern/syscalls.master | 9 ++++++++ sys/kern/uipc_shm.c | 33 ++++++++++++++++++++++++++++ sys/sys/mman.h | 6 +++++ sys/sys/param.h | 2 +- sys/sys/syscallsubr.h | 2 ++ 7 files changed, 55 insertions(+), 1 deletion(-) diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index 6810f40302b6..0141f48fdb3c 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -1154,5 +1154,8 @@ 570 AUE_SYSCTL STD { int freebsd32___sysctlbyname(const char *name, \ size_t namelen, void *old, uint32_t *oldlenp, \ void *new, size_t newlen); } +571 AUE_SHMOPEN NOPROTO { int shm_open2( \ + const char *path, int flags, mode_t mode, \ + int shmflags, const char *name); } ; vim: syntax=off diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf index 243953d11c5c..897c64145514 100644 --- a/sys/kern/capabilities.conf +++ b/sys/kern/capabilities.conf @@ -655,6 +655,7 @@ setuid ## shm_open(2) is scoped so as to allow only access to new anonymous objects. ## shm_open +shm_open2 ## ## Allow I/O-related file descriptors, subject to capability rights. diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 97435517204c..4aa117d9b797 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3195,6 +3195,15 @@ _In_reads_bytes_opt_(newlen) void *new, size_t newlen); } +571 AUE_SHMOPEN STD { + int shm_open2( + _In_z_ const char *path, + int flags, + mode_t mode, + int shmflags, + _In_z_ const char *name + ); + } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 2674fdfedba9..4c21da4bc38a 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -1316,3 +1316,36 @@ SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE, NULL, 0, sysctl_posix_shm_list, "", "POSIX SHM list"); + +int +kern_shm_open2(struct thread *td, const char *path, int flags, mode_t mode, + int shmflags, const char *name __unused) +{ + int initial_seals; + + if ((shmflags & ~SHM_ALLOW_SEALING) != 0) + return (EINVAL); + + initial_seals = F_SEAL_SEAL; + if ((shmflags & SHM_ALLOW_SEALING) != 0) + initial_seals &= ~F_SEAL_SEAL; + return (kern_shm_open(td, path, flags, 0, NULL, initial_seals)); +} + +/* + * This version of the shm_open() interface leaves CLOEXEC behavior up to the + * caller, and libc will enforce it for the traditional shm_open() call. This + * allows other consumers, like memfd_create(), to opt-in for CLOEXEC. 
This + * interface also includes a 'name' argument that is currently unused, but could + * potentially be exported later via some interface for debugging purposes. + * From the kernel's perspective, it is optional. Individual consumers like + * memfd_create() may require it in order to be compatible with other systems + * implementing the same function. + */ +int +sys_shm_open2(struct thread *td, struct shm_open2_args *uap) +{ + + return (kern_shm_open2(td, uap->path, uap->flags, uap->mode, + uap->shmflags, uap->name)); +} diff --git a/sys/sys/mman.h b/sys/sys/mman.h index ae95b9fee791..16ac44c4ba37 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -176,6 +176,12 @@ * Anonymous object constant for shm_open(). */ #define SHM_ANON ((char *)1) + +/* + * shmflags for shm_open2() + */ +#define SHM_ALLOW_SEALING 0x00000001 + #endif /* __BSD_VISIBLE */ /* diff --git a/sys/sys/param.h b/sys/sys/param.h index 22fb94934e83..77dec5a34280 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -60,7 +60,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1300047 /* Master, propagated to newvers */ +#define __FreeBSD_version 1300048 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index 3bc441151e5c..64989924948b 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -251,6 +251,8 @@ int kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp); int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps, int initial_seals); +int kern_shm_open2(struct thread *td, const char *path, int flags, + mode_t mode, int shmflags, const char *name); int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg); int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, From 460211e730cdcf5fdf7aa0bb944b89c8b4c219b2 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 17:59:58 +0000 Subject: [PATCH 071/106] sysent: regenerate after r352700 --- sys/compat/freebsd32/freebsd32_syscall.h | 3 +- sys/compat/freebsd32/freebsd32_syscalls.c | 1 + sys/compat/freebsd32/freebsd32_sysent.c | 1 + .../freebsd32/freebsd32_systrace_args.c | 38 +++++++++++++++++++ sys/kern/init_sysent.c | 1 + sys/kern/syscalls.c | 1 + sys/kern/systrace_args.c | 38 +++++++++++++++++++ sys/sys/syscall.h | 3 +- sys/sys/syscall.mk | 3 +- sys/sys/sysproto.h | 9 +++++ 10 files changed, 95 insertions(+), 3 deletions(-) diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index eb9f9d306af9..b79832ddfe34 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -497,4 +497,5 @@ #define FREEBSD32_SYS_funlinkat 568 #define FREEBSD32_SYS_copy_file_range 569 #define FREEBSD32_SYS_freebsd32___sysctlbyname 570 -#define FREEBSD32_SYS_MAXSYSCALL 571 +#define FREEBSD32_SYS_shm_open2 571 +#define FREEBSD32_SYS_MAXSYSCALL 572 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index 8b94310a48df..a1a45b3988c7 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -607,4 +607,5 @@ const char *freebsd32_syscallnames[] = { "funlinkat", /* 568 = funlinkat */ "copy_file_range", /* 569 = copy_file_range */ "freebsd32___sysctlbyname", /* 570 = freebsd32___sysctlbyname */ + "shm_open2", /* 571 = shm_open2 */ }; diff --git 
a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index fa6808682f1e..c87f9e4654fd 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -654,4 +654,5 @@ struct sysent freebsd32_sysent[] = { { AS(funlinkat_args), (sy_call_t *)sys_funlinkat, AUE_UNLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 568 = funlinkat */ { AS(copy_file_range_args), (sy_call_t *)sys_copy_file_range, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 569 = copy_file_range */ { AS(freebsd32___sysctlbyname_args), (sy_call_t *)freebsd32___sysctlbyname, AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 570 = freebsd32___sysctlbyname */ + { AS(shm_open2_args), (sy_call_t *)sys_shm_open2, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 571 = shm_open2 */ }; diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c index eb4f56b826b9..d3655038c737 100644 --- a/sys/compat/freebsd32/freebsd32_systrace_args.c +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c @@ -3344,6 +3344,17 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 6; break; } + /* shm_open2 */ + case 571: { + struct shm_open2_args *p = params; + uarg[0] = (intptr_t) p->path; /* const char * */ + iarg[1] = p->flags; /* int */ + iarg[2] = p->mode; /* mode_t */ + iarg[3] = p->shmflags; /* int */ + uarg[4] = (intptr_t) p->name; /* const char * */ + *n_args = 5; + break; + } default: *n_args = 0; break; @@ -9008,6 +9019,28 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* shm_open2 */ + case 571: + switch(ndx) { + case 0: + p = "userland const char *"; + break; + case 1: + p = "int"; + break; + case 2: + p = "mode_t"; + break; + case 3: + p = "int"; + break; + case 4: + p = "userland const char *"; + break; + default: + break; + }; + break; default: break; }; @@ -10892,6 +10925,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* shm_open2 */ + case 571: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 34546b605e24..53b969739785 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -620,4 +620,5 @@ struct sysent sysent[] = { { AS(funlinkat_args), (sy_call_t *)sys_funlinkat, AUE_UNLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 568 = funlinkat */ { AS(copy_file_range_args), (sy_call_t *)sys_copy_file_range, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 569 = copy_file_range */ { AS(__sysctlbyname_args), (sy_call_t *)sys___sysctlbyname, AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 570 = __sysctlbyname */ + { AS(shm_open2_args), (sy_call_t *)sys_shm_open2, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 571 = shm_open2 */ }; diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 498298e45dba..d9b361da2f31 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -577,4 +577,5 @@ const char *syscallnames[] = { "funlinkat", /* 568 = funlinkat */ "copy_file_range", /* 569 = copy_file_range */ "__sysctlbyname", /* 570 = __sysctlbyname */ + "shm_open2", /* 571 = shm_open2 */ }; diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 232e464896ef..0ea576919c24 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3336,6 +3336,17 @@ systrace_args(int sysnum, void 
*params, uint64_t *uarg, int *n_args) *n_args = 6; break; } + /* shm_open2 */ + case 571: { + struct shm_open2_args *p = params; + uarg[0] = (intptr_t) p->path; /* const char * */ + iarg[1] = p->flags; /* int */ + iarg[2] = p->mode; /* mode_t */ + iarg[3] = p->shmflags; /* int */ + uarg[4] = (intptr_t) p->name; /* const char * */ + *n_args = 5; + break; + } default: *n_args = 0; break; @@ -8913,6 +8924,28 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* shm_open2 */ + case 571: + switch(ndx) { + case 0: + p = "userland const char *"; + break; + case 1: + p = "int"; + break; + case 2: + p = "mode_t"; + break; + case 3: + p = "int"; + break; + case 4: + p = "userland const char *"; + break; + default: + break; + }; + break; default: break; }; @@ -10824,6 +10857,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* shm_open2 */ + case 571: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index b263285ace5e..d1a0c5967e3d 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -506,4 +506,5 @@ #define SYS_funlinkat 568 #define SYS_copy_file_range 569 #define SYS___sysctlbyname 570 -#define SYS_MAXSYSCALL 571 +#define SYS_shm_open2 571 +#define SYS_MAXSYSCALL 572 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk index d19a410170b2..b85e9cea73c0 100644 --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -411,4 +411,5 @@ MIASM = \ fhreadlink.o \ funlinkat.o \ copy_file_range.o \ - __sysctlbyname.o + __sysctlbyname.o \ + shm_open2.o diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index 5de3ce619204..b0f92e750ef0 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -1812,6 +1812,13 @@ struct __sysctlbyname_args { char new_l_[PADL_(void *)]; void * new; char new_r_[PADR_(void *)]; char newlen_l_[PADL_(size_t)]; size_t newlen; char newlen_r_[PADR_(size_t)]; }; +struct shm_open2_args { + char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char mode_l_[PADL_(mode_t)]; mode_t mode; char mode_r_[PADR_(mode_t)]; + char shmflags_l_[PADL_(int)]; int shmflags; char shmflags_r_[PADR_(int)]; + char name_l_[PADL_(const char *)]; const char * name; char name_r_[PADR_(const char *)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_sys_exit(struct thread *, struct sys_exit_args *); int sys_fork(struct thread *, struct fork_args *); @@ -2199,6 +2206,7 @@ int sys_fhreadlink(struct thread *, struct fhreadlink_args *); int sys_funlinkat(struct thread *, struct funlinkat_args *); int sys_copy_file_range(struct thread *, struct copy_file_range_args *); int sys___sysctlbyname(struct thread *, struct __sysctlbyname_args *); +int sys_shm_open2(struct thread *, struct shm_open2_args *); #ifdef COMPAT_43 @@ -3114,6 +3122,7 @@ int freebsd11_mknodat(struct thread *, struct freebsd11_mknodat_args *); #define SYS_AUE_funlinkat AUE_UNLINKAT #define SYS_AUE_copy_file_range AUE_NULL #define SYS_AUE___sysctlbyname AUE_SYSCTL +#define SYS_AUE_shm_open2 AUE_SHMOPEN #undef PAD_ #undef PADL_ From e7dd6e9402cae324c2190a70081854c3c8a8feb9 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 25 Sep 2019 18:03:15 +0000 Subject: [PATCH 072/106] Enhance the 'ps' command so that it prints a line per proc and a line per thread, so that instead of repeating the same info for all threads in proc, it would print 
thread specific info. Also includes thread number that would match 'info threads' info and can be used as argument for thread swithcing with 'thread' command. --- tools/debugscripts/gdbinit.kernel | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/debugscripts/gdbinit.kernel b/tools/debugscripts/gdbinit.kernel index 91d85c22a57c..274eb73d743b 100644 --- a/tools/debugscripts/gdbinit.kernel +++ b/tools/debugscripts/gdbinit.kernel @@ -199,28 +199,29 @@ define ps set $nproc = nprocs set $aproc = allproc.lh_first set $proc = allproc.lh_first - printf " pid proc uid ppid pgrp flag stat comm wchan\n" + set $tid = 1 + printf "pid/ID ppid/tid uid pgrp flag st comm/name proc/thread\n" while (--$nproc >= 0) set $pptr = $proc.p_pptr if ($pptr == 0) set $pptr = $proc end if ($proc.p_state) + printf " %5d %6d %4d %5d %8x %2d %-10s %p\n", \ + $proc.p_pid, $pptr->p_pid, \ + $proc.p_ucred->cr_ruid, \ + $proc.p_pgrp->pg_id, $proc.p_flag, $proc.p_state, \ + &$proc.p_comm[0], $aproc set $thread = $proc->p_threads.tqh_first while ($thread) - printf "%5d %08x %4d %5d %5d %06x %d %-10s ", \ - $proc.p_pid, $aproc, \ - $proc.p_ucred->cr_ruid, $pptr->p_pid, \ - $proc.p_pgrp->pg_id, $proc.p_flag, $proc.p_state, \ - &$proc.p_comm[0] - if ($thread.td_wchan) - if ($thread.td_wmesg) - printf "%s ", $thread.td_wmesg - end - printf "%x", $thread.td_wchan + printf "(%5d) %6d %-10s %p", \ + $tid, $thread->td_tid, $thread->td_name, $thread + if ($thread.td_wmesg) + printf " %s", $thread.td_wmesg end - printf "\n" + printf "\n" set $thread = $thread->td_plist.tqe_next + set $tid = $tid + 1 end end set $aproc = $proc.p_list.le_next From 3e25d1fb6168c7d905230e2effbf7adb20935cd4 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 18:03:18 +0000 Subject: [PATCH 073/106] Add linux-compatible memfd_create memfd_create is effectively a SHM_ANON shm_open(2) mapping with optional CLOEXEC and file sealing support. This is used by some mesa parts, some linux libs, and qemu can also take advantage of it and uses the sealing to prevent resizing the region. This reimplements shm_open in terms of shm_open2(2) at the same time. shm_open(2) will be moved to COMPAT12 shortly. 
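As a rough usage sketch of the interface added below (flag and seal names as defined in this patch; the object name is arbitrary and used only for debugging), a consumer such as qemu could size a region once and then seal it against resizing:

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
	int fd;

	fd = memfd_create("guest-ram", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	if (fd == -1)
		err(1, "memfd_create");
	/* New objects start at size zero; set the size explicitly. */
	if (ftruncate(fd, 4096) == -1)
		err(1, "ftruncate");
	/* Freeze the size: later grow/shrink attempts now fail with EPERM. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) == -1)
		err(1, "F_ADD_SEALS");
	return (0);
}
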
Reviewed by: markj, kib Differential Revision: https://reviews.freebsd.org/D21393 --- Makefile.inc1 | 3 +- lib/libc/include/libc_private.h | 1 + lib/libc/sys/Makefile.inc | 4 +- lib/libc/sys/Symbol.map | 1 + lib/libc/sys/shm_open.2 | 97 ++++++++++- lib/libc/sys/shm_open.c | 113 +++++++++++++ sys/sys/mman.h | 27 ++++ tests/sys/kern/Makefile | 1 + tests/sys/kern/memfd_test.c | 276 ++++++++++++++++++++++++++++++++ 9 files changed, 517 insertions(+), 6 deletions(-) create mode 100644 lib/libc/sys/shm_open.c create mode 100644 tests/sys/kern/memfd_test.c diff --git a/Makefile.inc1 b/Makefile.inc1 index 17a7569e232c..afa3c0703854 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -947,7 +947,8 @@ _cleanobj_fast_depend_hack: .PHONY # Syscall stubs rewritten in C and obsolete MD assembly implementations # Date SVN Rev Syscalls # 20180604 r334626 brk sbrk -.for f in brk sbrk +# 20190916 r35XXXX shm_open +.for f in brk sbrk shm_open @if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \ egrep -qw '${f}\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \ echo "Removing stale dependencies for ${f} syscall wrappers"; \ diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h index 529ae6b17c41..654baeaaae55 100644 --- a/lib/libc/include/libc_private.h +++ b/lib/libc/include/libc_private.h @@ -391,6 +391,7 @@ __pid_t __sys_wait6(enum idtype, __id_t, int *, int, struct __wrusage *, struct __siginfo *); __ssize_t __sys_write(int, const void *, __size_t); __ssize_t __sys_writev(int, const struct iovec *, int); +int __sys_shm_open2(const char *, int, __mode_t, int, const char *); int __libc_sigaction(int, const struct sigaction *, struct sigaction *) __hidden; diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc index 15c3b004d0ff..9f3ec84b517c 100644 --- a/lib/libc/sys/Makefile.inc +++ b/lib/libc/sys/Makefile.inc @@ -46,6 +46,7 @@ PSEUDO+= _getdirentries.o SRCS+= brk.c SRCS+= pipe.c +SRCS+= shm_open.c SRCS+= vadvise.c SRCS+= compat-stub.c @@ -475,7 +476,8 @@ MLINKS+=setuid.2 setegid.2 \ setuid.2 seteuid.2 \ setuid.2 setgid.2 MLINKS+=shmat.2 shmdt.2 -MLINKS+=shm_open.2 shm_unlink.2 +MLINKS+=shm_open.2 memfd_create.3 \ + shm_open.2 shm_unlink.2 MLINKS+=sigwaitinfo.2 sigtimedwait.2 MLINKS+=stat.2 fstat.2 \ stat.2 fstatat.2 \ diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map index a0bd5a214b81..711ddaa6ea9f 100644 --- a/lib/libc/sys/Symbol.map +++ b/lib/libc/sys/Symbol.map @@ -409,6 +409,7 @@ FBSD_1.6 { fhreadlink; getfhat; funlinkat; + memfd_create; }; FBSDprivate_1.0 { diff --git a/lib/libc/sys/shm_open.2 b/lib/libc/sys/shm_open.2 index 0855c076faf5..f089f177e1d6 100644 --- a/lib/libc/sys/shm_open.2 +++ b/lib/libc/sys/shm_open.2 @@ -28,11 +28,11 @@ .\" .\" $FreeBSD$ .\" -.Dd January 20, 2017 +.Dd September 24, 2019 .Dt SHM_OPEN 2 .Os .Sh NAME -.Nm shm_open , shm_unlink +.Nm memfd_create , shm_open , shm_unlink .Nd "shared memory object operations" .Sh LIBRARY .Lb libc @@ -41,6 +41,8 @@ .In sys/mman.h .In fcntl.h .Ft int +.Fn memfd_create "const char *name" "unsigned int flags" +.Ft int .Fn shm_open "const char *path" "int flags" "mode_t mode" .Ft int .Fn shm_unlink "const char *path" @@ -139,14 +141,64 @@ The .Fn shm_unlink system call removes a shared memory object named .Fa path . +.Pp +The +.Fn memfd_create +function creates an anonymous shared memory object, identical to that created +by +.Fn shm_open +when +.Dv SHM_ANON +is specified. +Newly created objects start off with a size of zero. +The size of the new object must be adjusted via +.Xr ftruncate 2 . 
+.Pp +The +.Fa name +argument must not be +.Dv NULL , +but it may be an empty string. +The length of the +.Fa name +argument may not exceed +.Dv NAME_MAX +minus six characters for the prefix +.Dq memfd: , +which will be prepended. +The +.Fa name +argument is intended solely for debugging purposes and will never be used by the +kernel to identify a memfd. +Names are therefore not required to be unique. +.Pp +The following +.Fa flags +may be specified to +.Fn memfd_create : +.Bl -tag -width MFD_ALLOW_SEALING +.It Dv MFD_CLOEXEC +Set +.Dv FD_CLOEXEC +on the resulting file descriptor. +.It Dv MFD_ALLOW_SEALING +Allow adding seals to the resulting file descriptor using the +.Dv F_ADD_SEALS +.Xr fcntl 2 +command. +.It Dv MFD_HUGETLB +This flag is currently unsupported. +.El .Sh RETURN VALUES If successful, +.Fn memfd_create +and .Fn shm_open -returns a non-negative integer, +both return a non-negative integer, and .Fn shm_unlink returns zero. -Both functions return -1 on failure, and set +All three functions return -1 on failure, and set .Va errno to indicate the error. .Sh COMPATIBILITY @@ -220,6 +272,33 @@ This example fails without the call to errx(EX_IOERR, "%s: pwrite length mismatch", __func__); .Ed .Sh ERRORS +.Fn memfd_create +fails with these error codes for these conditions: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa name +argument was NULL. +.It Bq Er EINVAL +The +.Fa name +argument was too long. +.Pp +An invalid or unsupported flag was included in +.Fa flags . +.It Bq Er EMFILE +The process has already reached its limit for open file descriptors. +.It Bq Er ENFILE +The system file table is full. +.It Bq Er ENOSYS +In +.Fa memfd_create , +.Dv MFD_HUGETLB +was specified in +.Fa flags , +and this system does not support forced hugetlb mappings. +.El +.Pp .Fn shm_open fails with these error codes for these conditions: .Bl -tag -width Er @@ -290,6 +369,11 @@ requires write permission to the shared memory object. .Xr sendfile 2 .Sh STANDARDS The +.Fn memfd_create +function is expected to be compatible with the Linux system call of the same +name. +.Pp +The .Fn shm_open and .Fn shm_unlink @@ -297,6 +381,11 @@ functions are believed to conform to .St -p1003.1b-93 . .Sh HISTORY The +.Fn memfd_create +function appeared in +.Fx 13.0 . +.Pp +The .Fn shm_open and .Fn shm_unlink diff --git a/lib/libc/sys/shm_open.c b/lib/libc/sys/shm_open.c new file mode 100644 index 000000000000..e6281f87bbb5 --- /dev/null +++ b/lib/libc/sys/shm_open.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2019 Kyle Evans + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "libc_private.h" + +__weak_reference(shm_open, _shm_open); +__weak_reference(shm_open, __sys_shm_open); + +#ifndef SYS_freebsd12_shm_open +#define SYS_freebsd12_shm_open SYS_shm_open +#endif + +#define SHM_OPEN2_OSREL 1300048 + +#define MEMFD_NAME_PREFIX "memfd:" + +int +shm_open(const char *path, int flags, mode_t mode) +{ + + if (__getosreldate() >= SHM_OPEN2_OSREL) + return (__sys_shm_open2(path, flags | O_CLOEXEC, mode, 0, + NULL)); + + /* + * Fallback to shm_open(2) on older kernels. The kernel will enforce + * O_CLOEXEC in this interface, unlike the newer shm_open2 which does + * not enforce it. The newer interface allows memfd_create(), for + * instance, to not have CLOEXEC on the returned fd. + */ + return (syscall(SYS_freebsd12_shm_open, path, flags, mode)); +} + +/* + * The path argument is passed to the kernel, but the kernel doesn't currently + * do anything with it. Linux exposes it in linprocfs for debugging purposes + * only, but our kernel currently will not do the same. + */ +int +memfd_create(const char *name, unsigned int flags) +{ + char memfd_name[NAME_MAX + 1]; + size_t namelen; + int oflags, shmflags; + + if (name == NULL) + return (EBADF); + namelen = strlen(name); + if (namelen + sizeof(MEMFD_NAME_PREFIX) - 1 > NAME_MAX) + return (EINVAL); + if ((flags & ~(MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | + MFD_HUGE_MASK)) != 0) + return (EINVAL); + /* HUGETLB set with no size specified. */ + if ((flags & MFD_HUGETLB) != 0 && (flags & MFD_HUGE_MASK) == 0) + return (EINVAL); + /* Size specified but no HUGETLB. */ + if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) + return (EINVAL); + /* We don't actually support HUGETLB. */ + if ((flags & MFD_HUGETLB) != 0) + return (ENOSYS); + + /* We've already validated that we're sufficiently sized. */ + snprintf(memfd_name, NAME_MAX + 1, "%s%s", MEMFD_NAME_PREFIX, name); + oflags = O_RDWR; + shmflags = 0; + if ((flags & MFD_CLOEXEC) != 0) + oflags |= O_CLOEXEC; + if ((flags & MFD_ALLOW_SEALING) != 0) + shmflags |= SHM_ALLOW_SEALING; + return (__sys_shm_open2(SHM_ANON, oflags, 0, shmflags, memfd_name)); +} diff --git a/sys/sys/mman.h b/sys/sys/mman.h index 16ac44c4ba37..a5c66f3596ea 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -182,6 +182,30 @@ */ #define SHM_ALLOW_SEALING 0x00000001 +/* + * Flags for memfd_create(). 
+ */ +#define MFD_ALLOW_SEALING 0x00000001 +#define MFD_CLOEXEC 0x00000002 + +/* UNSUPPORTED */ +#define MFD_HUGETLB 0x00000004 + +#define MFD_HUGE_MASK 0xFC000000 +#define MFD_HUGE_SHIFT 26 +#define MFD_HUGE_64KB (16 << MFD_HUGE_SHIFT) +#define MFD_HUGE_512KB (19 << MFD_HUGE_SHIFT) +#define MFD_HUGE_1MB (20 << MFD_HUGE_SHIFT) +#define MFD_HUGE_2MB (21 << MFD_HUGE_SHIFT) +#define MFD_HUGE_8MB (23 << MFD_HUGE_SHIFT) +#define MFD_HUGE_16MB (24 << MFD_HUGE_SHIFT) +#define MFD_HUGE_32MB (25 << MFD_HUGE_SHIFT) +#define MFD_HUGE_256MB (28 << MFD_HUGE_SHIFT) +#define MFD_HUGE_512MB (29 << MFD_HUGE_SHIFT) +#define MFD_HUGE_1GB (30 << MFD_HUGE_SHIFT) +#define MFD_HUGE_2GB (31 << MFD_HUGE_SHIFT) +#define MFD_HUGE_16GB (34 << MFD_HUGE_SHIFT) + #endif /* __BSD_VISIBLE */ /* @@ -291,6 +315,9 @@ int munlockall(void); int shm_open(const char *, int, mode_t); int shm_unlink(const char *); #endif +#if __BSD_VISIBLE +int memfd_create(const char *, unsigned int); +#endif __END_DECLS #endif /* !_KERNEL */ diff --git a/tests/sys/kern/Makefile b/tests/sys/kern/Makefile index 62a84d47807e..3aee85c005eb 100644 --- a/tests/sys/kern/Makefile +++ b/tests/sys/kern/Makefile @@ -9,6 +9,7 @@ TESTSDIR= ${TESTSBASE}/sys/kern ATF_TESTS_C+= kern_copyin ATF_TESTS_C+= kern_descrip_test ATF_TESTS_C+= kill_zombie +ATF_TESTS_C+= memfd_test ATF_TESTS_C+= ptrace_test TEST_METADATA.ptrace_test+= timeout="15" ATF_TESTS_C+= reaper diff --git a/tests/sys/kern/memfd_test.c b/tests/sys/kern/memfd_test.c new file mode 100644 index 000000000000..199b4d110c2f --- /dev/null +++ b/tests/sys/kern/memfd_test.c @@ -0,0 +1,276 @@ +/*- + * Copyright (c) 2019 Kyle Evans + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +ATF_TC_WITHOUT_HEAD(basic); +ATF_TC_BODY(basic, tc) +{ + int fd; + char buf[8]; + + ATF_REQUIRE((fd = memfd_create("...", 0)) != -1); + + /* File size should be initially 0 */ + ATF_REQUIRE(write(fd, buf, sizeof(buf)) == 0); + + /* ftruncate(2) must succeed without seals */ + ATF_REQUIRE(ftruncate(fd, sizeof(buf) - 1) == 0); + + ATF_REQUIRE(write(fd, buf, sizeof(buf)) == sizeof(buf) - 1); + + close(fd); +} + +ATF_TC_WITHOUT_HEAD(cloexec); +ATF_TC_BODY(cloexec, tc) +{ + int fd_nocl, fd_cl; + + ATF_REQUIRE((fd_nocl = memfd_create("...", 0)) != -1); + ATF_REQUIRE((fd_cl = memfd_create("...", MFD_CLOEXEC)) != -1); + + ATF_REQUIRE((fcntl(fd_nocl, F_GETFD) & FD_CLOEXEC) == 0); + ATF_REQUIRE((fcntl(fd_cl, F_GETFD) & FD_CLOEXEC) != 0); + + close(fd_nocl); + close(fd_cl); +} + +ATF_TC_WITHOUT_HEAD(disallowed_sealing); +ATF_TC_BODY(disallowed_sealing, tc) +{ + int fd; + + ATF_REQUIRE((fd = memfd_create("...", 0)) != -1); + ATF_REQUIRE(fcntl(fd, F_GET_SEALS) == F_SEAL_SEAL); + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == -1); + ATF_REQUIRE(errno == EPERM); + + close(fd); +} + +#define BUF_SIZE 1024 + +ATF_TC_WITHOUT_HEAD(write_seal); +ATF_TC_BODY(write_seal, tc) +{ + int fd; + char *addr, buf[BUF_SIZE]; + + ATF_REQUIRE((fd = memfd_create("...", MFD_ALLOW_SEALING)) != -1); + ATF_REQUIRE(ftruncate(fd, BUF_SIZE) == 0); + + /* Write once, then we'll seal it and try again */ + ATF_REQUIRE(write(fd, buf, BUF_SIZE) == BUF_SIZE); + ATF_REQUIRE(lseek(fd, 0, SEEK_SET) == 0); + + addr = mmap(0, BUF_SIZE, (PROT_READ | PROT_WRITE), MAP_PRIVATE, fd, 0); + ATF_REQUIRE(addr != MAP_FAILED); + ATF_REQUIRE(munmap(addr, BUF_SIZE) == 0); + + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == 0); + + ATF_REQUIRE(write(fd, buf, BUF_SIZE) == -1); + ATF_REQUIRE(errno == EPERM); + + ATF_REQUIRE(mmap(0, BUF_SIZE, (PROT_READ | PROT_WRITE), MAP_SHARED, + fd, 0) == MAP_FAILED); + ATF_REQUIRE(errno == EPERM); + + close(fd); +} + +ATF_TC_WITHOUT_HEAD(mmap_write_seal); +ATF_TC_BODY(mmap_write_seal, tc) +{ + int fd; + char *addr, *paddr, *raddr; + + ATF_REQUIRE((fd = memfd_create("...", MFD_ALLOW_SEALING)) != -1); + ATF_REQUIRE(ftruncate(fd, BUF_SIZE) == 0); + + /* Map it, both shared and privately */ + addr = mmap(0, BUF_SIZE, (PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0); + ATF_REQUIRE(addr != MAP_FAILED); + paddr = mmap(0, BUF_SIZE, (PROT_READ | PROT_WRITE), MAP_PRIVATE, fd, 0); + ATF_REQUIRE(paddr != MAP_FAILED); + raddr = mmap(0, BUF_SIZE, PROT_READ, MAP_SHARED, fd, 0); + ATF_REQUIRE(raddr != MAP_FAILED); + + /* Now try to seal it before unmapping */ + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == -1); + ATF_REQUIRE(errno == EBUSY); + + ATF_REQUIRE(munmap(addr, BUF_SIZE) == 0); + + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) == 0); + + ATF_REQUIRE(munmap(paddr, BUF_SIZE) == 0); + ATF_REQUIRE(munmap(raddr, BUF_SIZE) == 0); + ATF_REQUIRE(mmap(0, BUF_SIZE, (PROT_READ | PROT_WRITE), MAP_SHARED, + fd, 0) == MAP_FAILED); + ATF_REQUIRE(errno == EPERM); + paddr = mmap(0, BUF_SIZE, (PROT_READ | PROT_WRITE), MAP_PRIVATE, fd, 0); + ATF_REQUIRE(paddr != MAP_FAILED); + raddr = mmap(0, BUF_SIZE, PROT_READ, MAP_SHARED, fd, 0); + ATF_REQUIRE(raddr != MAP_FAILED); + ATF_REQUIRE(munmap(raddr, BUF_SIZE) == 0); + ATF_REQUIRE(munmap(paddr, BUF_SIZE) == 0); + + close(fd); +} + +static int +memfd_truncate_test(int initial_size, int dest_size, int seals) +{ + int err, fd; + + ATF_REQUIRE((fd = memfd_create("...", MFD_ALLOW_SEALING)) != 
-1); + ATF_REQUIRE(ftruncate(fd, initial_size) == 0); + + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, seals) == 0); + + err = ftruncate(fd, dest_size); + if (err != 0) + err = errno; + close(fd); + return (err); +} + +ATF_TC_WITHOUT_HEAD(truncate_seals); +ATF_TC_BODY(truncate_seals, tc) +{ + + ATF_REQUIRE(memfd_truncate_test(4, 8, F_SEAL_GROW) == EPERM); + ATF_REQUIRE(memfd_truncate_test(8, 4, F_SEAL_SHRINK) == EPERM); + ATF_REQUIRE(memfd_truncate_test(8, 4, F_SEAL_GROW) == 0); + ATF_REQUIRE(memfd_truncate_test(4, 8, F_SEAL_SHRINK) == 0); + + ATF_REQUIRE(memfd_truncate_test(4, 8, F_SEAL_GROW | F_SEAL_SHRINK) == + EPERM); + ATF_REQUIRE(memfd_truncate_test(8, 4, F_SEAL_GROW | F_SEAL_SHRINK) == + EPERM); + ATF_REQUIRE(memfd_truncate_test(4, 4, F_SEAL_GROW | F_SEAL_SHRINK) == + 0); +} + +ATF_TC_WITHOUT_HEAD(get_seals); +ATF_TC_BODY(get_seals, tc) +{ + int fd; + int seals; + + ATF_REQUIRE((fd = memfd_create("...", MFD_ALLOW_SEALING)) != -1); + ATF_REQUIRE(fcntl(fd, F_GET_SEALS) == 0); + + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE | F_SEAL_GROW) == 0); + seals = fcntl(fd, F_GET_SEALS); + ATF_REQUIRE(seals == (F_SEAL_WRITE | F_SEAL_GROW)); + + close(fd); +} + +ATF_TC_WITHOUT_HEAD(dup_seals); +ATF_TC_BODY(dup_seals, tc) +{ + char buf[8]; + int fd, fdx; + int seals; + + ATF_REQUIRE((fd = memfd_create("...", MFD_ALLOW_SEALING)) != -1); + ATF_REQUIRE((fdx = dup(fd)) != -1); + ATF_REQUIRE(fcntl(fd, F_GET_SEALS) == 0); + + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE | F_SEAL_GROW) == 0); + seals = fcntl(fd, F_GET_SEALS); + ATF_REQUIRE(seals == (F_SEAL_WRITE | F_SEAL_GROW)); + + seals = fcntl(fdx, F_GET_SEALS); + ATF_REQUIRE(seals == (F_SEAL_WRITE | F_SEAL_GROW)); + + /* Make sure the seal's actually being applied at the inode level */ + ATF_REQUIRE(write(fdx, buf, sizeof(buf)) == -1); + ATF_REQUIRE(errno == EPERM); + + ATF_REQUIRE(mmap(0, BUF_SIZE, (PROT_READ | PROT_WRITE), MAP_SHARED, + fdx, 0) == MAP_FAILED); + ATF_REQUIRE(errno == EPERM); + + close(fd); + close(fdx); +} + +ATF_TC_WITHOUT_HEAD(immutable_seals); +ATF_TC_BODY(immutable_seals, tc) +{ + int fd; + + ATF_REQUIRE((fd = memfd_create("...", MFD_ALLOW_SEALING)) != -1); + + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_SEAL) == 0); + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_GROW) == -1); + ATF_REQUIRE_MSG(errno == EPERM, + "Added unique grow seal after restricting seals"); + + close(fd); + + /* + * Also check that adding a seal that already exists really doesn't + * do anything once we're sealed. 
+ */ + ATF_REQUIRE((fd = memfd_create("...", MFD_ALLOW_SEALING)) != -1); + + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SEAL) == 0); + ATF_REQUIRE(fcntl(fd, F_ADD_SEALS, F_SEAL_GROW) == -1); + ATF_REQUIRE_MSG(errno == EPERM, + "Added duplicate grow seal after restricting seals"); + close(fd); +} + + +ATF_TP_ADD_TCS(tp) +{ + + ATF_TP_ADD_TC(tp, basic); + ATF_TP_ADD_TC(tp, cloexec); + ATF_TP_ADD_TC(tp, disallowed_sealing); + ATF_TP_ADD_TC(tp, write_seal); + ATF_TP_ADD_TC(tp, mmap_write_seal); + ATF_TP_ADD_TC(tp, truncate_seals); + ATF_TP_ADD_TC(tp, get_seals); + ATF_TP_ADD_TC(tp, dup_seals); + ATF_TP_ADD_TC(tp, immutable_seals); + return (atf_no_error()); +} From 9ccd2fde4c93ff78c069cddd88f4ba555a8d7ea3 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 18:04:09 +0000 Subject: [PATCH 074/106] Adjust Makefile.inc1 syscall sub commit --- Makefile.inc1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.inc1 b/Makefile.inc1 index afa3c0703854..3ca58b38e5ff 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -947,7 +947,7 @@ _cleanobj_fast_depend_hack: .PHONY # Syscall stubs rewritten in C and obsolete MD assembly implementations # Date SVN Rev Syscalls # 20180604 r334626 brk sbrk -# 20190916 r35XXXX shm_open +# 20190916 r352703 shm_open .for f in brk sbrk shm_open @if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \ egrep -qw '${f}\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \ From 234879a7e30ea98cc3cdf74e2b4b21c69b305223 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 18:06:48 +0000 Subject: [PATCH 075/106] Mark shm_open(2) as COMPAT12, succeeded by shm_open2 Implementation and regenerated files will follow. --- sys/compat/freebsd32/syscalls.master | 4 ++-- sys/kern/syscalls.master | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index 0141f48fdb3c..7f4cc6f1044e 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -910,8 +910,8 @@ uint32_t length1, uint32_t length2); } #endif 481 AUE_THR_KILL2 NOPROTO { int thr_kill2(pid_t pid, long id, int sig); } -482 AUE_SHMOPEN NOPROTO { int shm_open(const char *path, int flags, \ - mode_t mode); } +482 AUE_SHMOPEN COMPAT12|NOPROTO { int shm_open(const char *path, + int flags, mode_t mode); } 483 AUE_SHMUNLINK NOPROTO { int shm_unlink(const char *path); } 484 AUE_NULL NOPROTO { int cpuset(cpusetid_t *setid); } #ifdef PAD64_REQUIRED diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 4aa117d9b797..cd9a825e926c 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -2579,7 +2579,7 @@ int sig ); } -482 AUE_SHMOPEN STD { +482 AUE_SHMOPEN COMPAT12 { int shm_open( _In_z_ const char *path, int flags, From a9ac5e142408d3228694aea197d9f77e643ab015 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 18:09:19 +0000 Subject: [PATCH 076/106] sysent: regenerate after r352705 This also implements it, fixes kdump, and removes no longer needed bits from lib/libc/sys/shm_open.c for the interim. 
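The regenerated tables below route the retired syscall number through a new compat12() macro. The in-kernel shim itself is not shown in full here, but it presumably just preserves the pre-shm_open2 behavior of sys_shm_open() from the earlier patch, roughly:

int
freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap)
{

	/*
	 * Assumed shape of the compat entry point: keep the historical
	 * behavior of forcing O_CLOEXEC and disallowing seals, exactly as
	 * sys_shm_open() did before this series.
	 */
	return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC,
	    uap->mode, NULL, F_SEAL_SEAL));
}
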
--- lib/libc/sys/shm_open.c | 4 --- sys/compat/freebsd32/freebsd32_syscall.h | 2 +- sys/compat/freebsd32/freebsd32_syscalls.c | 2 +- sys/compat/freebsd32/freebsd32_sysent.c | 8 ++++- .../freebsd32/freebsd32_systrace_args.c | 30 ------------------- sys/kern/init_sysent.c | 8 ++++- sys/kern/syscalls.c | 2 +- sys/kern/systrace_args.c | 30 ------------------- sys/kern/uipc_shm.c | 4 ++- sys/sys/syscall.h | 2 +- sys/sys/syscall.mk | 2 +- sys/sys/sysproto.h | 14 ++++----- usr.bin/kdump/kdump.c | 4 ++- 13 files changed, 32 insertions(+), 80 deletions(-) diff --git a/lib/libc/sys/shm_open.c b/lib/libc/sys/shm_open.c index e6281f87bbb5..a7846474f092 100644 --- a/lib/libc/sys/shm_open.c +++ b/lib/libc/sys/shm_open.c @@ -46,10 +46,6 @@ __FBSDID("$FreeBSD$"); __weak_reference(shm_open, _shm_open); __weak_reference(shm_open, __sys_shm_open); -#ifndef SYS_freebsd12_shm_open -#define SYS_freebsd12_shm_open SYS_shm_open -#endif - #define SHM_OPEN2_OSREL 1300048 #define MEMFD_NAME_PREFIX "memfd:" diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index b79832ddfe34..c535b03a05d8 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -404,7 +404,7 @@ #define FREEBSD32_SYS_freebsd32_truncate 479 #define FREEBSD32_SYS_freebsd32_ftruncate 480 #define FREEBSD32_SYS_thr_kill2 481 -#define FREEBSD32_SYS_shm_open 482 +#define FREEBSD32_SYS_freebsd12_shm_open 482 #define FREEBSD32_SYS_shm_unlink 483 #define FREEBSD32_SYS_cpuset 484 #define FREEBSD32_SYS_freebsd32_cpuset_setid 485 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index a1a45b3988c7..1e42d2aaf0ce 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -500,7 +500,7 @@ const char *freebsd32_syscallnames[] = { "freebsd32_ftruncate", /* 480 = freebsd32_ftruncate */ #endif "thr_kill2", /* 481 = thr_kill2 */ - "shm_open", /* 482 = shm_open */ + "compat12.shm_open", /* 482 = freebsd12 shm_open */ "shm_unlink", /* 483 = shm_unlink */ "cpuset", /* 484 = cpuset */ #ifdef PAD64_REQUIRED diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index c87f9e4654fd..c64cd81c1f3c 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -51,6 +51,12 @@ #define compat11(n, name) 0, (sy_call_t *)nosys #endif +#ifdef COMPAT_FREEBSD12 +#define compat12(n, name) n, (sy_call_t *)__CONCAT(freebsd12_,name) +#else +#define compat12(n, name) 0, (sy_call_t *)nosys +#endif + /* The casts are bogus but will do for now. 
*/ struct sysent freebsd32_sysent[] = { #if !defined(PAD64_REQUIRED) && !defined(__amd64__) @@ -547,7 +553,7 @@ struct sysent freebsd32_sysent[] = { { AS(freebsd32_ftruncate_args), (sy_call_t *)freebsd32_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 480 = freebsd32_ftruncate */ #endif { AS(thr_kill2_args), (sy_call_t *)sys_thr_kill2, AUE_THR_KILL2, NULL, 0, 0, 0, SY_THR_STATIC }, /* 481 = thr_kill2 */ - { AS(shm_open_args), (sy_call_t *)sys_shm_open, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 482 = shm_open */ + { compat12(AS(freebsd12_shm_open_args),shm_open), AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 482 = freebsd12 shm_open */ { AS(shm_unlink_args), (sy_call_t *)sys_shm_unlink, AUE_SHMUNLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 483 = shm_unlink */ { AS(cpuset_args), (sy_call_t *)sys_cpuset, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 484 = cpuset */ #ifdef PAD64_REQUIRED diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c index d3655038c737..45ed055967e2 100644 --- a/sys/compat/freebsd32/freebsd32_systrace_args.c +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c @@ -2491,15 +2491,6 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 3; break; } - /* shm_open */ - case 482: { - struct shm_open_args *p = params; - uarg[0] = (intptr_t) p->path; /* const char * */ - iarg[1] = p->flags; /* int */ - iarg[2] = p->mode; /* mode_t */ - *n_args = 3; - break; - } /* shm_unlink */ case 483: { struct shm_unlink_args *p = params; @@ -7453,22 +7444,6 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; - /* shm_open */ - case 482: - switch(ndx) { - case 0: - p = "userland const char *"; - break; - case 1: - p = "int"; - break; - case 2: - p = "mode_t"; - break; - default: - break; - }; - break; /* shm_unlink */ case 483: switch(ndx) { @@ -10476,11 +10451,6 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; - /* shm_open */ - case 482: - if (ndx == 0 || ndx == 1) - p = "int"; - break; /* shm_unlink */ case 483: if (ndx == 0 || ndx == 1) diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 53b969739785..4727e8189254 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -47,6 +47,12 @@ #define compat11(n, name) 0, (sy_call_t *)nosys #endif +#ifdef COMPAT_FREEBSD12 +#define compat12(n, name) n, (sy_call_t *)__CONCAT(freebsd12_,name) +#else +#define compat12(n, name) 0, (sy_call_t *)nosys +#endif + /* The casts are bogus but will do for now. 
*/ struct sysent sysent[] = { { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 0 = syscall */ @@ -531,7 +537,7 @@ struct sysent sysent[] = { { AS(truncate_args), (sy_call_t *)sys_truncate, AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 479 = truncate */ { AS(ftruncate_args), (sy_call_t *)sys_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 480 = ftruncate */ { AS(thr_kill2_args), (sy_call_t *)sys_thr_kill2, AUE_THR_KILL2, NULL, 0, 0, 0, SY_THR_STATIC }, /* 481 = thr_kill2 */ - { AS(shm_open_args), (sy_call_t *)sys_shm_open, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 482 = shm_open */ + { compat12(AS(freebsd12_shm_open_args),shm_open), AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 482 = freebsd12 shm_open */ { AS(shm_unlink_args), (sy_call_t *)sys_shm_unlink, AUE_SHMUNLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 483 = shm_unlink */ { AS(cpuset_args), (sy_call_t *)sys_cpuset, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 484 = cpuset */ { AS(cpuset_setid_args), (sy_call_t *)sys_cpuset_setid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 485 = cpuset_setid */ diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index d9b361da2f31..1e5a69ce6a68 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -488,7 +488,7 @@ const char *syscallnames[] = { "truncate", /* 479 = truncate */ "ftruncate", /* 480 = ftruncate */ "thr_kill2", /* 481 = thr_kill2 */ - "shm_open", /* 482 = shm_open */ + "compat12.shm_open", /* 482 = freebsd12 shm_open */ "shm_unlink", /* 483 = shm_unlink */ "cpuset", /* 484 = cpuset */ "cpuset_setid", /* 485 = cpuset_setid */ diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 0ea576919c24..3816389eb3d0 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -2575,15 +2575,6 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 3; break; } - /* shm_open */ - case 482: { - struct shm_open_args *p = params; - uarg[0] = (intptr_t) p->path; /* const char * */ - iarg[1] = p->flags; /* int */ - iarg[2] = p->mode; /* mode_t */ - *n_args = 3; - break; - } /* shm_unlink */ case 483: { struct shm_unlink_args *p = params; @@ -7555,22 +7546,6 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; - /* shm_open */ - case 482: - switch(ndx) { - case 0: - p = "userland const char *"; - break; - case 1: - p = "int"; - break; - case 2: - p = "mode_t"; - break; - default: - break; - }; - break; /* shm_unlink */ case 483: switch(ndx) { @@ -10445,11 +10420,6 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; - /* shm_open */ - case 482: - if (ndx == 0 || ndx == 1) - p = "int"; - break; /* shm_unlink */ case 483: if (ndx == 0 || ndx == 1) diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 4c21da4bc38a..f25f47201b9a 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -904,13 +904,15 @@ kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, } /* System calls. 
*/ +#ifdef COMPAT_FREEBSD12 int -sys_shm_open(struct thread *td, struct shm_open_args *uap) +freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, uap->mode, NULL, F_SEAL_SEAL)); } +#endif int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index d1a0c5967e3d..57b764ec9085 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -418,7 +418,7 @@ #define SYS_truncate 479 #define SYS_ftruncate 480 #define SYS_thr_kill2 481 -#define SYS_shm_open 482 +#define SYS_freebsd12_shm_open 482 #define SYS_shm_unlink 483 #define SYS_cpuset 484 #define SYS_cpuset_setid 485 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk index b85e9cea73c0..e813af1e7bcf 100644 --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -327,7 +327,7 @@ MIASM = \ truncate.o \ ftruncate.o \ thr_kill2.o \ - shm_open.o \ + freebsd12_shm_open.o \ shm_unlink.o \ cpuset.o \ cpuset_setid.o \ diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index b0f92e750ef0..0b9364830128 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -1381,11 +1381,6 @@ struct thr_kill2_args { char id_l_[PADL_(long)]; long id; char id_r_[PADR_(long)]; char sig_l_[PADL_(int)]; int sig; char sig_r_[PADR_(int)]; }; -struct shm_open_args { - char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; - char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; - char mode_l_[PADL_(mode_t)]; mode_t mode; char mode_r_[PADR_(mode_t)]; -}; struct shm_unlink_args { char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; }; @@ -2123,7 +2118,6 @@ int sys_lseek(struct thread *, struct lseek_args *); int sys_truncate(struct thread *, struct truncate_args *); int sys_ftruncate(struct thread *, struct ftruncate_args *); int sys_thr_kill2(struct thread *, struct thr_kill2_args *); -int sys_shm_open(struct thread *, struct shm_open_args *); int sys_shm_unlink(struct thread *, struct shm_unlink_args *); int sys_cpuset(struct thread *, struct cpuset_args *); int sys_cpuset_setid(struct thread *, struct cpuset_setid_args *); @@ -2654,6 +2648,12 @@ int freebsd11_mknodat(struct thread *, struct freebsd11_mknodat_args *); #ifdef COMPAT_FREEBSD12 +struct freebsd12_shm_open_args { + char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; + char mode_l_[PADL_(mode_t)]; mode_t mode; char mode_r_[PADR_(mode_t)]; +}; +int freebsd12_shm_open(struct thread *, struct freebsd12_shm_open_args *); #endif /* COMPAT_FREEBSD12 */ @@ -3037,7 +3037,7 @@ int freebsd11_mknodat(struct thread *, struct freebsd11_mknodat_args *); #define SYS_AUE_truncate AUE_TRUNCATE #define SYS_AUE_ftruncate AUE_FTRUNCATE #define SYS_AUE_thr_kill2 AUE_THR_KILL2 -#define SYS_AUE_shm_open AUE_SHMOPEN +#define SYS_AUE_freebsd12_shm_open AUE_SHMOPEN #define SYS_AUE_shm_unlink AUE_SHMUNLINK #define SYS_AUE_cpuset AUE_NULL #define SYS_AUE_cpuset_setid AUE_NULL diff --git a/usr.bin/kdump/kdump.c b/usr.bin/kdump/kdump.c index 25b2508f862f..3319d2f06607 100644 --- a/usr.bin/kdump/kdump.c +++ b/usr.bin/kdump/kdump.c @@ -1246,7 +1246,8 @@ ktrsyscall(struct ktr_syscall *ktr, u_int sv_flags) ip++; narg--; break; - case SYS_shm_open: +#ifdef SYS_freebsd12_shm_open + case SYS_freebsd12_shm_open: print_number(ip, narg, c); putchar(','); print_mask_arg(sysdecode_open_flags, ip[0]); @@ -1255,6 +1256,7 @@ 
ktrsyscall(struct ktr_syscall *ktr, u_int sv_flags) ip += 2; narg -= 2; break; +#endif case SYS_minherit: print_number(ip, narg, c); print_number(ip, narg, c); From dd902d015a202e4cabb8fdadd307ec9a3dc38251 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 25 Sep 2019 18:26:31 +0000 Subject: [PATCH 077/106] Add debugging facility EPOCH_TRACE that checks that epochs entered are properly nested and warns about recursive entrances. Unlike with locks, there is nothing fundamentally wrong with such use, the intent of tracer is to help to review complex epoch-protected code paths, and we mean the network stack here. Reviewed by: hselasky Sponsored by: Netflix Pull Request: https://reviews.freebsd.org/D21610 --- sys/conf/options | 2 + sys/kern/kern_thread.c | 3 + sys/kern/subr_epoch.c | 134 +++++++++++++++++++++++++++++++++++------ sys/kern/subr_stack.c | 2 +- sys/net/if.c | 4 +- sys/sys/epoch.h | 35 +++++++---- sys/sys/proc.h | 1 + sys/sys/stack.h | 4 ++ 8 files changed, 152 insertions(+), 33 deletions(-) diff --git a/sys/conf/options b/sys/conf/options index 6957a2d236ef..875a3bc461e0 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -712,6 +712,8 @@ WITNESS_SKIPSPIN opt_witness.h WITNESS_COUNT opt_witness.h OPENSOLARIS_WITNESS opt_global.h +EPOCH_TRACE opt_epoch.h + # options for ACPI support ACPI_DEBUG opt_acpi.h ACPI_MAX_TASKS opt_acpi.h diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 378bdb88d3a6..cf5beb4acfbc 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -668,6 +668,7 @@ thread_link(struct thread *td, struct proc *p) LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); + SLIST_INIT(&td->td_epochs); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); @@ -684,6 +685,8 @@ thread_unlink(struct thread *td) struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); + MPASS(SLIST_EMPTY(&td->td_epochs)); + TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c index aec3a70c27d2..c76697b91608 100644 --- a/sys/kern/subr_epoch.c +++ b/sys/kern/subr_epoch.c @@ -30,7 +30,6 @@ __FBSDID("$FreeBSD$"); #include -#include #include #include #include @@ -47,6 +46,11 @@ __FBSDID("$FreeBSD$"); #include #include #include +#ifdef EPOCH_TRACE +#include +#include +#include +#endif #include #include #include @@ -80,6 +84,7 @@ struct epoch { struct sx e_drain_sx; struct mtx e_drain_mtx; volatile int e_drain_count; + const char *e_name; }; /* arbitrary --- needs benchmarking */ @@ -134,6 +139,103 @@ __read_mostly epoch_t global_epoch_preempt; static void epoch_call_task(void *context __unused); static uma_zone_t pcpu_zone_record; +#ifdef EPOCH_TRACE +struct stackentry { + RB_ENTRY(stackentry) se_node; + struct stack se_stack; +}; + +static int +stackentry_compare(struct stackentry *a, struct stackentry *b) +{ + + if (a->se_stack.depth > b->se_stack.depth) + return (1); + if (a->se_stack.depth < b->se_stack.depth) + return (-1); + for (int i = 0; i < a->se_stack.depth; i++) { + if (a->se_stack.pcs[i] > b->se_stack.pcs[i]) + return (1); + if (a->se_stack.pcs[i] < b->se_stack.pcs[i]) + return (-1); + } + + return (0); +} + +RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks); +RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare); + +static struct mtx epoch_stacks_lock; +MTX_SYSINIT(epochstacks, &epoch_stacks_lock, 
"epoch_stacks", MTX_DEF); + +static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2); +static inline void +epoch_trace_report(const char *fmt, ...) +{ + va_list ap; + struct stackentry se, *new; + + stack_zero(&se.se_stack); /* XXX: is it really needed? */ + stack_save(&se.se_stack); + + /* Tree is never reduced - go lockless. */ + if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL) + return; + + new = malloc(sizeof(*new), M_STACK, M_NOWAIT); + if (new != NULL) { + bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack)); + + mtx_lock(&epoch_stacks_lock); + new = RB_INSERT(stacktree, &epoch_stacks, new); + mtx_unlock(&epoch_stacks_lock); + if (new != NULL) + free(new, M_STACK); + } + + va_start(ap, fmt); + (void)vprintf(fmt, ap); + va_end(ap); + stack_print_ddb(&se.se_stack); +} + +static inline void +epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et, + const char *file, int line) +{ + epoch_tracker_t iet; + + SLIST_FOREACH(iet, &td->td_epochs, et_tlink) + if (iet->et_epoch == epoch) + epoch_trace_report("Recursively entering epoch %s " + "previously entered at %s:%d\n", + epoch->e_name, iet->et_file, iet->et_line); + et->et_epoch = epoch; + et->et_file = file; + et->et_line = line; + SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink); +} + +static inline void +epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et, + const char *file, int line) +{ + + if (SLIST_FIRST(&td->td_epochs) != et) { + epoch_trace_report("Exiting epoch %s in a not nested order. " + "Most recently entered %s at %s:%d\n", + epoch->e_name, + SLIST_FIRST(&td->td_epochs)->et_epoch->e_name, + SLIST_FIRST(&td->td_epochs)->et_file, + SLIST_FIRST(&td->td_epochs)->et_line); + /* This will panic if et is not anywhere on td_epochs. */ + SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink); + } else + SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink); +} +#endif /* EPOCH_TRACE */ + static void epoch_init(void *arg __unused) { @@ -156,9 +258,10 @@ epoch_init(void *arg __unused) DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL, "epoch call task"); } + SLIST_INIT(&thread0.td_epochs); inited = 1; - global_epoch = epoch_alloc(0); - global_epoch_preempt = epoch_alloc(EPOCH_PREEMPT); + global_epoch = epoch_alloc("Global", 0); + global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT); } SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_FIRST, epoch_init, NULL); @@ -198,7 +301,7 @@ epoch_adjust_prio(struct thread *td, u_char prio) } epoch_t -epoch_alloc(int flags) +epoch_alloc(const char *name, int flags) { epoch_t epoch; @@ -210,6 +313,7 @@ epoch_alloc(int flags) MPASS(epoch_count < MAX_EPOCHS - 2); epoch->e_flags = flags; epoch->e_idx = epoch_count; + epoch->e_name = name; sx_init(&epoch->e_drain_sx, "epoch-drain-sx"); mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF); allepochs[epoch_count++] = epoch; @@ -243,7 +347,7 @@ epoch_currecord(epoch_t epoch) } while (0) void -epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et) +_epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE) { struct epoch_record *er; struct thread *td; @@ -251,16 +355,14 @@ epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et) MPASS(cold || epoch != NULL); INIT_CHECK(epoch); MPASS(epoch->e_flags & EPOCH_PREEMPT); -#ifdef EPOCH_TRACKER_DEBUG - et->et_magic_pre = EPOCH_MAGIC0; - et->et_magic_post = EPOCH_MAGIC1; -#endif td = curthread; +#ifdef EPOCH_TRACE + epoch_trace_enter(td, epoch, et, file, line); +#endif et->et_td = td; td->td_epochnest++; critical_enter(); 
sched_pin(); - td->td_pre_epoch_prio = td->td_priority; er = epoch_currecord(epoch); TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link); @@ -277,7 +379,6 @@ epoch_enter(epoch_t epoch) MPASS(cold || epoch != NULL); INIT_CHECK(epoch); td = curthread; - td->td_epochnest++; critical_enter(); er = epoch_currecord(epoch); @@ -285,7 +386,7 @@ epoch_enter(epoch_t epoch) } void -epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et) +_epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE) { struct epoch_record *er; struct thread *td; @@ -300,12 +401,6 @@ epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et) MPASS(epoch->e_flags & EPOCH_PREEMPT); MPASS(et != NULL); MPASS(et->et_td == td); -#ifdef EPOCH_TRACKER_DEBUG - MPASS(et->et_magic_pre == EPOCH_MAGIC0); - MPASS(et->et_magic_post == EPOCH_MAGIC1); - et->et_magic_pre = 0; - et->et_magic_post = 0; -#endif #ifdef INVARIANTS et->et_td = (void*)0xDEADBEEF; #endif @@ -315,6 +410,9 @@ epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et) if (__predict_false(td->td_pre_epoch_prio != td->td_priority)) epoch_adjust_prio(td, td->td_pre_epoch_prio); critical_exit(); +#ifdef EPOCH_TRACE + epoch_trace_exit(td, epoch, et, file, line); +#endif } void diff --git a/sys/kern/subr_stack.c b/sys/kern/subr_stack.c index 0429a5fb2434..0ec11e85e9a5 100644 --- a/sys/kern/subr_stack.c +++ b/sys/kern/subr_stack.c @@ -45,7 +45,7 @@ __FBSDID("$FreeBSD$"); FEATURE(stack, "Support for capturing kernel stack"); -static MALLOC_DEFINE(M_STACK, "stack", "Stack Traces"); +MALLOC_DEFINE(M_STACK, "stack", "Stack Traces"); static int stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen, long *offset, int flags); diff --git a/sys/net/if.c b/sys/net/if.c index 8de83b2d7bc5..7e9f800593bc 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -940,8 +940,8 @@ static void if_epochalloc(void *dummy __unused) { - net_epoch_preempt = epoch_alloc(EPOCH_PREEMPT); - net_epoch = epoch_alloc(0); + net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT); + net_epoch = epoch_alloc("Net", 0); } SYSINIT(ifepochalloc, SI_SUB_TASKQ + 1, SI_ORDER_ANY, if_epochalloc, NULL); diff --git a/sys/sys/epoch.h b/sys/sys/epoch.h index 3078a53fedbf..4d7172dd91a9 100644 --- a/sys/sys/epoch.h +++ b/sys/sys/epoch.h @@ -41,6 +41,8 @@ typedef struct epoch_context *epoch_context_t; #include #include +#include "opt_epoch.h" + struct epoch; typedef struct epoch *epoch_t; @@ -51,21 +53,19 @@ extern epoch_t global_epoch; extern epoch_t global_epoch_preempt; struct epoch_tracker { -#ifdef EPOCH_TRACKER_DEBUG -#define EPOCH_MAGIC0 0xFADECAFEF00DD00D -#define EPOCH_MAGIC1 0xBADDBABEDEEDFEED - uint64_t et_magic_pre; -#endif TAILQ_ENTRY(epoch_tracker) et_link; struct thread *et_td; ck_epoch_section_t et_section; -#ifdef EPOCH_TRACKER_DEBUG - uint64_t et_magic_post; +#ifdef EPOCH_TRACE + struct epoch *et_epoch; + SLIST_ENTRY(epoch_tracker) et_tlink; + const char *et_file; + int et_line; #endif } __aligned(sizeof(void *)); typedef struct epoch_tracker *epoch_tracker_t; -epoch_t epoch_alloc(int flags); +epoch_t epoch_alloc(const char *name, int flags); void epoch_free(epoch_t epoch); void epoch_wait(epoch_t epoch); void epoch_wait_preempt(epoch_t epoch); @@ -75,11 +75,22 @@ int in_epoch(epoch_t epoch); int in_epoch_verbose(epoch_t epoch, int dump_onfail); DPCPU_DECLARE(int, epoch_cb_count); DPCPU_DECLARE(struct grouptask, epoch_cb_task); -#define EPOCH_MAGIC0 0xFADECAFEF00DD00D -#define EPOCH_MAGIC1 0xBADDBABEDEEDFEED -void epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et); -void epoch_exit_preempt(epoch_t 
epoch, epoch_tracker_t et); +#ifdef EPOCH_TRACE +#define EPOCH_FILE_LINE , const char *file, int line +#else +#define EPOCH_FILE_LINE +#endif + +void _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE); +void _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE); +#ifdef EPOCH_TRACE +#define epoch_enter_preempt(epoch, et) _epoch_enter_preempt(epoch, et, __FILE__, __LINE__) +#define epoch_exit_preempt(epoch, et) _epoch_exit_preempt(epoch, et, __FILE__, __LINE__) +#else +#define epoch_enter_preempt(epoch, et) _epoch_enter_preempt(epoch, et) +#define epoch_exit_preempt(epoch, et) _epoch_exit_preempt(epoch, et) +#endif void epoch_enter(epoch_t epoch); void epoch_exit(epoch_t epoch); diff --git a/sys/sys/proc.h b/sys/sys/proc.h index ae605a1266dd..8edca8ad889b 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -367,6 +367,7 @@ struct thread { void *td_lkpi_task; /* LinuxKPI task struct pointer */ struct epoch_tracker *td_et; /* (k) compat KPI spare tracker */ int td_pmcpend; + SLIST_HEAD(, epoch_tracker) td_epochs; }; struct thread0_storage { diff --git a/sys/sys/stack.h b/sys/sys/stack.h index 3b4eff8ab117..03a7dbfb3c97 100644 --- a/sys/sys/stack.h +++ b/sys/sys/stack.h @@ -33,6 +33,10 @@ #include +#ifdef _SYS_MALLOC_H_ +MALLOC_DECLARE(M_STACK); +#endif + struct sbuf; /* MI Routines. */ From 618d66a56ff129568cee790b29ae5ec07ee3a561 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 18:48:05 +0000 Subject: [PATCH 078/106] compat/freebsd32: restore style after r352705 (no functional change) The escaped newlines haven't been necessary since r339624, but this file has not been reformatted. Restore the style. --- sys/compat/freebsd32/syscalls.master | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index 7f4cc6f1044e..b212e72652fa 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -910,8 +910,8 @@ uint32_t length1, uint32_t length2); } #endif 481 AUE_THR_KILL2 NOPROTO { int thr_kill2(pid_t pid, long id, int sig); } -482 AUE_SHMOPEN COMPAT12|NOPROTO { int shm_open(const char *path, - int flags, mode_t mode); } +482 AUE_SHMOPEN COMPAT12|NOPROTO { int shm_open( \ + const char *path, int flags, mode_t mode); } 483 AUE_SHMUNLINK NOPROTO { int shm_unlink(const char *path); } 484 AUE_NULL NOPROTO { int cpuset(cpusetid_t *setid); } #ifdef PAD64_REQUIRED From 5763a8cf06a297c067aa7af13973f50ab44c2a4d Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 25 Sep 2019 18:50:57 +0000 Subject: [PATCH 079/106] Do not left-shift a negative number (inducing undefined behavior in C/C++) in exp(3), expf(3), expm1(3) and expm1f(3) during intermediate computations that compute the IEEE-754 bit pattern for |2**k| for integer |k|. The implementations of exp(3), expf(3), expm1(3) and expm1f(3) need to compute IEEE-754 bit patterns for 2**k in certain places. (k is an integer and 2**k is exactly representable in IEEE-754.) Currently they do things like 0x3FF0'0000+(k<<20), which is to say they take the bit pattern representing 1 and then add directly to the exponent field to get the desired power of two. This is fine when k is non-negative. But when k<0 (and certain classes of input trigger this), this left-shifts a negative number -- an operation with undefined behavior in C and C++. 
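A compact, self-contained sketch of the well-defined construction (described in prose in the next paragraph, and applied to libm in the hunks below; this is an illustration, not the libm code, which operates on the high 32-bit word via INSERT_WORDS/SET_FLOAT_WORD):

    #include <stdint.h>
    #include <string.h>

    /* Return 2**k as a double for k in the normal-exponent range
     * (-1022 <= k <= 1023); sketch only, no range checking. */
    static double
    two_to_k(int k)
    {
            uint64_t bits;
            double d;

            /*
             * Exponent field = bias (0x3ff) + k, shifted into bits
             * 62..52.  The old idiom 0x3ff00000 + (k << 20) shifts k
             * itself, which is undefined behavior in C for negative k.
             */
            bits = (uint64_t)(0x3ff + k) << 52;
            (void)memcpy(&d, &bits, sizeof(d));
            return (d);
    }
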
The desired semantics can be achieved by instead adding the possibly-negative k to the IEEE-754 exponent bias to get the desired exponent field, _then_ shifting that into its proper overall position. (Note that in case of s_expm1.c and s_expm1f.c, there are SET_HIGH_WORD and SET_FLOAT_WORD uses further down in each of these files that perform shift operations involving k, but by these points k's range has been restricted to 2 < k <= 56, and the shift operations under those circumstances can't do anything that would be UB.) Submitted by: Jeff Walden, https://github.com/jswalden Obtained from: https://github.com/freebsd/freebsd/pull/411 Obtained from: https://github.com/freebsd/freebsd/pull/412 MFC after: 3 days --- lib/msun/src/e_exp.c | 4 ++-- lib/msun/src/e_expf.c | 4 ++-- lib/msun/src/s_expm1.c | 2 +- lib/msun/src/s_expm1f.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/msun/src/e_exp.c b/lib/msun/src/e_exp.c index 94c9769da38d..dd04d8e839d5 100644 --- a/lib/msun/src/e_exp.c +++ b/lib/msun/src/e_exp.c @@ -145,9 +145,9 @@ __ieee754_exp(double x) /* default IEEE double exp */ /* x is now in primary range */ t = x*x; if(k >= -1021) - INSERT_WORDS(twopk,0x3ff00000+(k<<20), 0); + INSERT_WORDS(twopk,((u_int32_t)(0x3ff+k))<<20, 0); else - INSERT_WORDS(twopk,0x3ff00000+((k+1000)<<20), 0); + INSERT_WORDS(twopk,((u_int32_t)(0x3ff+(k+1000)))<<20, 0); c = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))); if(k==0) return one-((x*c)/(c-2.0)-x); else y = one-((lo-(x*c)/(2.0-c))-hi); diff --git a/lib/msun/src/e_expf.c b/lib/msun/src/e_expf.c index b1fe2c5371ac..4903d55c5ab3 100644 --- a/lib/msun/src/e_expf.c +++ b/lib/msun/src/e_expf.c @@ -83,9 +83,9 @@ __ieee754_expf(float x) /* x is now in primary range */ t = x*x; if(k >= -125) - SET_FLOAT_WORD(twopk,0x3f800000+(k<<23)); + SET_FLOAT_WORD(twopk,((u_int32_t)(0x7f+k))<<23); else - SET_FLOAT_WORD(twopk,0x3f800000+((k+100)<<23)); + SET_FLOAT_WORD(twopk,((u_int32_t)(0x7f+(k+100)))<<23); c = x - t*(P1+t*P2); if(k==0) return one-((x*c)/(c-(float)2.0)-x); else y = one-((lo-(x*c)/((float)2.0-c))-hi); diff --git a/lib/msun/src/s_expm1.c b/lib/msun/src/s_expm1.c index 37998a3871aa..844f10360fe6 100644 --- a/lib/msun/src/s_expm1.c +++ b/lib/msun/src/s_expm1.c @@ -188,7 +188,7 @@ expm1(double x) e = hxs*((r1-t)/(6.0 - x*t)); if(k==0) return x - (x*e-hxs); /* c is 0 */ else { - INSERT_WORDS(twopk,0x3ff00000+(k<<20),0); /* 2^k */ + INSERT_WORDS(twopk,((u_int32_t)(0x3ff+k))<<20,0); /* 2^k */ e = (x*(e-c)-c); e -= hxs; if(k== -1) return 0.5*(x-e)-0.5; diff --git a/lib/msun/src/s_expm1f.c b/lib/msun/src/s_expm1f.c index c0a39340fc04..b47daac2eb35 100644 --- a/lib/msun/src/s_expm1f.c +++ b/lib/msun/src/s_expm1f.c @@ -94,7 +94,7 @@ expm1f(float x) e = hxs*((r1-t)/((float)6.0 - x*t)); if(k==0) return x - (x*e-hxs); /* c is 0 */ else { - SET_FLOAT_WORD(twopk,0x3f800000+(k<<23)); /* 2^k */ + SET_FLOAT_WORD(twopk,((u_int32_t)(0x7f+k))<<23); /* 2^k */ e = (x*(e-c)-c); e -= hxs; if(k== -1) return (float)0.5*(x-e)-(float)0.5; From 079c5b9ed809af6dec664accc70119a4ef989420 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 19:20:41 +0000 Subject: [PATCH 080/106] rfork(2): add RFSPAWN flag When RFSPAWN is passed, rfork exhibits vfork(2) semantics but also resets signal handlers in the child during creation to avoid a point of corruption of parent state from the child. This flag will be used by posix_spawn(3) to handle potential signal issues. 
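An illustrative sketch, not part of the change: on architectures that keep the return address in a register, a caller can use the new flag directly, with the child confined to exec or _exit exactly as with vfork(2); on x86 the return address lives on the stack, so the posix_spawn(3) change later in this series routes the call through rfork_thread(3) instead. The program path below is purely an example.

    #include <sys/types.h>
    #include <unistd.h>

    /* Spawn /bin/true with vfork semantics plus signal handlers
     * reset to default in the child; error handling kept minimal. */
    static pid_t
    spawn_true(void)
    {
            pid_t pid;

            pid = rfork(RFSPAWN);
            if (pid == 0) {
                    /* Child: async-signal-safe work only, then exec. */
                    (void)execl("/bin/true", "true", (char *)NULL);
                    _exit(127);
            }
            return (pid);   /* -1 on error, child's pid in the parent. */
    }
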
Reviewed by: jilles, kib Differential Revision: https://reviews.freebsd.org/D19058 --- lib/libc/sys/rfork.2 | 14 ++++++++++++-- sys/kern/kern_fork.c | 15 ++++++++++++++- sys/kern/kern_sig.c | 24 ++++++++++++++++++------ sys/sys/proc.h | 2 ++ sys/sys/signalvar.h | 1 + sys/sys/unistd.h | 9 ++++++--- 6 files changed, 53 insertions(+), 12 deletions(-) diff --git a/lib/libc/sys/rfork.2 b/lib/libc/sys/rfork.2 index 222bac425496..fe3e6e3a75e5 100644 --- a/lib/libc/sys/rfork.2 +++ b/lib/libc/sys/rfork.2 @@ -5,7 +5,7 @@ .\" .\" $FreeBSD$ .\" -.Dd July 12, 2011 +.Dd September 25, 2019 .Dt RFORK 2 .Os .Sh NAME @@ -34,7 +34,9 @@ and open files. The .Fa flags argument -is the logical OR of some subset of: +is either +.Dv RFSPAWN +or the logical OR of some subset of: .Bl -tag -width ".Dv RFLINUXTHPN" .It Dv RFPROC If set a new process is created; otherwise changes affect the @@ -105,6 +107,14 @@ open until either they are explicitly closed or all processes sharing the table exit. .Pp If +.Dv RFSPAWN +is passed, +.Nm +will use +.Xr vfork 2 +semantics but reset all signal actions in the child to default. +.Pp +If .Dv RFPROC is set, the value returned in the parent process diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 7fc5750a1f07..b397dee1aaa4 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -170,10 +170,18 @@ sys_rfork(struct thread *td, struct rfork_args *uap) /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); + /* RFSPAWN must not appear with others */ + if ((uap->flags & RFSPAWN) != 0 && uap->flags != RFSPAWN) + return (EINVAL); AUDIT_ARG_FFLAGS(uap->flags); bzero(&fr, sizeof(fr)); - fr.fr_flags = uap->flags; + if ((uap->flags & RFSPAWN) != 0) { + fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; + fr.fr_flags2 = FR2_DROPSIG_CAUGHT; + } else { + fr.fr_flags = uap->flags; + } fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { @@ -471,6 +479,11 @@ do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread * } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; + if ((fr->fr_flags2 & FR2_DROPSIG_CAUGHT) != 0) { + mtx_lock(&p2->p_sigacts->ps_mtx); + sig_drop_caught(p2); + mtx_unlock(&p2->p_sigacts->ps_mtx); + } } if (fr->fr_flags & RFTSIGZMB) diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 067de64f808e..ec7030d29ca0 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -986,12 +986,7 @@ execsigs(struct proc *p) PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); - while (SIGNOTEMPTY(ps->ps_sigcatch)) { - sig = sig_ffs(&ps->ps_sigcatch); - sigdflt(ps, sig); - if ((sigprop(sig) & SIGPROP_IGNORE) != 0) - sigqueue_delete_proc(p, sig); - } + sig_drop_caught(p); /* * As CloudABI processes cannot modify signal handlers, fully @@ -3857,3 +3852,20 @@ sigacts_shared(struct sigacts *ps) return (ps->ps_refcnt > 1); } + +void +sig_drop_caught(struct proc *p) +{ + int sig; + struct sigacts *ps; + + ps = p->p_sigacts; + PROC_LOCK_ASSERT(p, MA_OWNED); + mtx_assert(&ps->ps_mtx, MA_OWNED); + while (SIGNOTEMPTY(ps->ps_sigcatch)) { + sig = sig_ffs(&ps->ps_sigcatch); + sigdflt(ps, sig); + if ((sigprop(sig) & SIGPROP_IGNORE) != 0) + sigqueue_delete_proc(p, sig); + } +} diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 8edca8ad889b..36424d51bc78 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -1006,6 +1006,8 @@ struct fork_req { int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; + int fr_flags2; +#define FR2_DROPSIG_CAUGHT 0x00001 /* Drop caught 
non-DFL signals */ }; /* diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h index c6c7d083cefb..ed82ff123209 100644 --- a/sys/sys/signalvar.h +++ b/sys/sys/signalvar.h @@ -381,6 +381,7 @@ void sigacts_copy(struct sigacts *dest, struct sigacts *src); void sigacts_free(struct sigacts *ps); struct sigacts *sigacts_hold(struct sigacts *ps); int sigacts_shared(struct sigacts *ps); +void sig_drop_caught(struct proc *p); void sigexit(struct thread *td, int sig) __dead2; int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **); int sig_ffs(sigset_t *set); diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h index 668f8cf42651..3b3de3aa33bc 100644 --- a/sys/sys/unistd.h +++ b/sys/sys/unistd.h @@ -188,11 +188,14 @@ #define RFTSIGNUM(flags) (((flags) >> RFTSIGSHIFT) & RFTSIGMASK) #define RFTSIGFLAGS(signum) ((signum) << RFTSIGSHIFT) #define RFPROCDESC (1<<28) /* return a process descriptor */ -#define RFPPWAIT (1<<31) /* parent sleeps until child exits (vfork) */ +/* kernel: parent sleeps until child exits (vfork) */ +#define RFPPWAIT (1<<31) +/* user: vfork(2) semantics, clear signals */ +#define RFSPAWN (1U<<31) #define RFFLAGS (RFFDG | RFPROC | RFMEM | RFNOWAIT | RFCFDG | \ RFTHREAD | RFSIGSHARE | RFLINUXTHPN | RFSTOPPED | RFHIGHPID | RFTSIGZMB | \ - RFPROCDESC | RFPPWAIT) -#define RFKERNELONLY (RFSTOPPED | RFHIGHPID | RFPPWAIT | RFPROCDESC) + RFPROCDESC | RFSPAWN | RFPPWAIT) +#define RFKERNELONLY (RFSTOPPED | RFHIGHPID | RFPROCDESC) #endif /* __BSD_VISIBLE */ From c34a5f16faf5d6d108323eb35bc045e9bb22d1ff Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 19:22:03 +0000 Subject: [PATCH 081/106] posix_spawn(3): handle potential signal issues with vfork Described in [1], signal handlers running in a vfork child have opportunities to corrupt the parent's state. Address this by adding a new rfork(2) flag, RFSPAWN, that has vfork(2) semantics but also resets signal handlers in the child during creation. x86 uses rfork_thread(3) instead of a direct rfork(2) because rfork with RFMEM/RFSPAWN cannot work when the return address is stored on the stack -- further information about this problem is described under RFMEM in the rfork(2) man page. Addressing this has been identified as a prerequisite to using posix_spawn in subprocess on FreeBSD [2]. [1] https://ewontfix.com/7/ [2] https://bugs.python.org/issue35823 Reviewed by: jilles, kib Differential Revision: https://reviews.freebsd.org/D19058 --- lib/libc/gen/posix_spawn.c | 128 +++++++++++++++++++++++++++++-------- lib/libc/sys/rfork.2 | 3 + 2 files changed, 103 insertions(+), 28 deletions(-) diff --git a/lib/libc/gen/posix_spawn.c b/lib/libc/gen/posix_spawn.c index c8875b4de763..fb83bbebe85c 100644 --- a/lib/libc/gen/posix_spawn.c +++ b/lib/libc/gen/posix_spawn.c @@ -194,43 +194,115 @@ process_file_actions(const posix_spawn_file_actions_t fa) return (0); } +struct posix_spawn_args { + const char *path; + const posix_spawn_file_actions_t *fa; + const posix_spawnattr_t *sa; + char * const * argv; + char * const * envp; + int use_env_path; + int error; +}; + +#if defined(__i386__) || defined(__amd64__) +#define _RFORK_THREAD_STACK_SIZE 4096 +#endif + +static int +_posix_spawn_thr(void *data) +{ + struct posix_spawn_args *psa; + char * const *envp; + + psa = data; + if (psa->sa != NULL) { + psa->error = process_spawnattr(*psa->sa); + if (psa->error) + _exit(127); + } + if (psa->fa != NULL) { + psa->error = process_file_actions(*psa->fa); + if (psa->error) + _exit(127); + } + envp = psa->envp != NULL ? 
psa->envp : environ; + if (psa->use_env_path) + _execvpe(psa->path, psa->argv, envp); + else + _execve(psa->path, psa->argv, envp); + psa->error = errno; + + /* This is called in such a way that it must not exit. */ + _exit(127); +} + static int do_posix_spawn(pid_t *pid, const char *path, const posix_spawn_file_actions_t *fa, const posix_spawnattr_t *sa, char * const argv[], char * const envp[], int use_env_path) { + struct posix_spawn_args psa; pid_t p; - volatile int error = 0; +#ifdef _RFORK_THREAD_STACK_SIZE + char *stack; - p = vfork(); - switch (p) { - case -1: - return (errno); - case 0: - if (sa != NULL) { - error = process_spawnattr(*sa); - if (error) - _exit(127); - } - if (fa != NULL) { - error = process_file_actions(*fa); - if (error) - _exit(127); - } - if (use_env_path) - _execvpe(path, argv, envp != NULL ? envp : environ); - else - _execve(path, argv, envp != NULL ? envp : environ); - error = errno; - _exit(127); - default: - if (error != 0) - _waitpid(p, NULL, WNOHANG); - else if (pid != NULL) - *pid = p; - return (error); + stack = malloc(_RFORK_THREAD_STACK_SIZE); + if (stack == NULL) + return (ENOMEM); +#endif + psa.path = path; + psa.fa = fa; + psa.sa = sa; + psa.argv = argv; + psa.envp = envp; + psa.use_env_path = use_env_path; + psa.error = 0; + + /* + * Passing RFSPAWN to rfork(2) gives us effectively a vfork that drops + * non-ignored signal handlers. We'll fall back to the slightly less + * ideal vfork(2) if we get an EINVAL from rfork -- this should only + * happen with newer libc on older kernel that doesn't accept + * RFSPAWN. + */ +#ifdef _RFORK_THREAD_STACK_SIZE + /* + * x86 stores the return address on the stack, so rfork(2) cannot work + * as-is because the child would clobber the return address om the + * parent. Because of this, we must use rfork_thread instead while + * almost every other arch stores the return address in a register. + */ + p = rfork_thread(RFSPAWN, stack + _RFORK_THREAD_STACK_SIZE, + _posix_spawn_thr, &psa); + free(stack); +#else + p = rfork(RFSPAWN); + if (p == 0) + /* _posix_spawn_thr does not return */ + _posix_spawn_thr(&psa); +#endif + /* + * The above block should leave us in a state where we've either + * succeeded and we're ready to process the results, or we need to + * fallback to vfork() if the kernel didn't like RFSPAWN. + */ + + if (p == -1 && errno == EINVAL) { + p = vfork(); + if (p == 0) + /* _posix_spawn_thr does not return */ + _posix_spawn_thr(&psa); } + if (p == -1) + return (errno); + if (psa.error != 0) + /* Failed; ready to reap */ + _waitpid(p, NULL, WNOHANG); + else if (pid != NULL) + /* exec succeeded */ + *pid = p; + return (psa.error); } int diff --git a/lib/libc/sys/rfork.2 b/lib/libc/sys/rfork.2 index fe3e6e3a75e5..f7fc99d2174b 100644 --- a/lib/libc/sys/rfork.2 +++ b/lib/libc/sys/rfork.2 @@ -113,6 +113,9 @@ is passed, will use .Xr vfork 2 semantics but reset all signal actions in the child to default. +This flag is used by the +.Xr posix_spawn 3 +implementation in libc. .Pp If .Dv RFPROC From c55dc51c370470341a3009a468d94053480dc9b3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 25 Sep 2019 19:29:09 +0000 Subject: [PATCH 082/106] Microoptimize sched_pickcpu() after r352658. I've noticed that I missed intr check at one more SCHED_AFFINITY(), so instead of adding one more branching I prefer to remove few. Profiler shows the function CPU time reduction from 0.24% to 0.16%. MFC after: 1 month Sponsored by: iXsystems, Inc. 
--- sys/kern/sched_ule.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 5568facbc80f..eb602185a847 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1270,20 +1270,28 @@ sched_pickcpu(struct thread *td, int flags) */ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level) { + tdq = TDQ_SELF(); + if (tdq->tdq_lowpri >= PRI_MIN_IDLE) { + SCHED_STAT_INC(pickcpu_idle_affinity); + return (self); + } ts->ts_cpu = self; intr = 1; - } else + cg = tdq->tdq_cg; + goto llc; + } else { intr = 0; + tdq = TDQ_CPU(ts->ts_cpu); + cg = tdq->tdq_cg; + } /* * If the thread can run on the last cpu and the affinity has not * expired and it is idle, run it there. */ - tdq = TDQ_CPU(ts->ts_cpu); - cg = tdq->tdq_cg; if (THREAD_CAN_SCHED(td, ts->ts_cpu) && tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { - if (!intr && cg->cg_flags & CG_FLAG_THREAD) { + if (cg->cg_flags & CG_FLAG_THREAD) { CPUSET_FOREACH(cpu, cg->cg_mask) { if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; @@ -1295,6 +1303,7 @@ sched_pickcpu(struct thread *td, int flags) return (ts->ts_cpu); } } +llc: /* * Search for the last level cache CPU group in the tree. * Skip SMT, identical groups and caches with expired affinity. From bf7700e44f321f0281b271307b016acf2398f30f Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 25 Sep 2019 20:46:09 +0000 Subject: [PATCH 083/106] style(9): remove extraneous empty lines --- sys/dev/firewire/if_fwip.c | 1 - sys/net/if_ethersubr.c | 1 - sys/net/if_vlan.c | 1 - 3 files changed, 3 deletions(-) diff --git a/sys/dev/firewire/if_fwip.c b/sys/dev/firewire/if_fwip.c index ed2c0cfffab6..35a58c013707 100644 --- a/sys/dev/firewire/if_fwip.c +++ b/sys/dev/firewire/if_fwip.c @@ -717,7 +717,6 @@ fwip_stream_input(struct fw_xferq *xferq) uint16_t src; uint32_t *p; - fwip = (struct fwip_softc *)xferq->sc; ifp = fwip->fw_softc.fwip_ifp; diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index c7ed1503da76..991ee0882625 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -800,7 +800,6 @@ VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, static void ether_input(struct ifnet *ifp, struct mbuf *m) { - struct mbuf *mn; /* diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 63006bbd650e..76a0f29d63c3 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -247,7 +247,6 @@ static struct sx _VLAN_SX_ID; #define VLAN_XLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_XLOCKED) #define VLAN_SXLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_LOCKED) - /* * We also have a per-trunk mutex that should be acquired when changing * its state. 
From 736dcdb75a9c2b478911f2e20536a99f0308aaad Mon Sep 17 00:00:00 2001 From: Yuri Pankov Date: Wed, 25 Sep 2019 21:23:30 +0000 Subject: [PATCH 084/106] efibootmgr(8): fix markup and style issues - split synopsis into separate options that can't be used together - sort options - fix (style) issues reported by mandoc lint Reviewed by: imp Differential Revision: https://reviews.freebsd.org/D21710 --- usr.sbin/efibootmgr/efibootmgr.8 | 182 +++++++++++++++++++++---------- 1 file changed, 123 insertions(+), 59 deletions(-) diff --git a/usr.sbin/efibootmgr/efibootmgr.8 b/usr.sbin/efibootmgr/efibootmgr.8 index c19e8f18799a..1a7776b92697 100644 --- a/usr.sbin/efibootmgr/efibootmgr.8 +++ b/usr.sbin/efibootmgr/efibootmgr.8 @@ -24,20 +24,42 @@ .\" .\" $FreeBSD$ .\" -.Dd December 28, 2018 +.Dd September 24, 2019 .Dt EFIBOOTMGR 8 .Os .Sh NAME -.Nm efibootmgr +.Nm efibootmgr .Nd manipulate the EFI Boot Manager .Sh SYNOPSIS -.Op Fl aAnNB -.Op Fl b Ar bootnum -.Op Fl t Ar timeout -.Op Fl T -.Op Fl o Ar bootorder +.Nm .Op Fl v -.Op Fl c l Ar loader [ Fl k Ar kernel ] [ Fl L Ar label ] [ Fl -dry-run ] +.Nm +.Fl a +.Fl b Ar bootnum +.Nm +.Fl A +.Fl b Ar bootnum +.Nm +.Fl B +.Fl b Ar bootnum +.Nm +.Fl c +.Fl l Ar loader +.Op Fl aD +.Op Fl b Ar bootnum +.Op Fl k Ar kernel +.Op Fl L Ar label +.Nm +.Fl n +.Fl b Ar bootnum +.Nm +.Fl N +.Nm +.Fl o Ar bootorder +.Nm +.Fl t Ar timeout +.Nm +.Fl T .Sh "DESCRIPTION" .Nm manipulates how UEFI Boot Managers boot the system. @@ -50,13 +72,13 @@ The UEFI standard defines how hosts may control what is used to bootstrap the system. Each method is encapsulated within a persistent UEFI variable, stored by the UEFI BIOS of the form -.Va BootXXXX . +.Cm Boot Ns Em XXXX . These variables are numbered, describe where to load the bootstrap program from, and whether or not the method is active. The boot order of these methods is controlled by another variable -.Va BootOrder . -The currently booting method is communicated using -.Va BootCurrent . +.Cm BootOrder . +The currently booting method is communicated using +.Cm BootCurrent . A global timeout can also be set. .Pp .Nm @@ -64,91 +86,133 @@ requires that the kernel efirt module be loaded to get and set these non-volatile variables. .Pp The following options are available: -.Bl -tag -width 28m -.It Fl c Fl -create -Create a new Boot Variable -.It Fl l -loader Ar loader -The path to and name of the loader. -.It Fl k -kernel Ar kernel -The path to and name of the kernel. +.Bl -tag -width Ds +.It Fl a -activate +Activate the given +.Ar bootnum +boot entry, or the new entry when used with +.Fl c . +.It Fl A -deactivate +Deactivate the given +.Ar bootnum +boot entry. .It Fl b -bootnum Ar bootnum -When creating or modifying an entry, use bootnum as the index. +When creating or modifying an entry, use +.Ar bootnum +as the index. When creating a new entry, fail if it already exists. -.It Fl L -label Ar label -An optional description for the entry. +.It Fl B -delete +Delete the given +.Ar bootnum +boot entry. +.It Fl c -create +Create a new +.Cm Boot +variable. .It Fl D -dry-run Process but do not change any variables. -.It Fl B -delete -Delete the given bootnum boot entry. -.It Fl a -activate -Activate the given bootnum boot entry, or the new entry when used with -c. -.It Fl A -deactivate -Deactivate the given bootnum boot entry. +.It Fl k -kernel Ar kernel +The path to and name of the kernel. +.It Fl l -loader Ar loader +The path to and name of the loader. +.It Fl L -label Ar label +An optional description for the entry. 
.It Fl n -bootnext -Set bootnum boot entry as the BootNext variable. -.It Fl N -delete-bootnext -Delete the BootNext optional variable. +Set +.Ar bootnum +boot entry as the +.Cm BootNext +variable. +.It Fl N -delete-bootnext +Delete the +.Cm BootNext +optional variable. .It Fl o -bootorder Ar bootorder -Set BootOrder variable to the given comma delimited set of bootnums. -The numbers are in hex to match BootXXXX, but may omit leading zeros. +Set +.Cm BootOrder +variable to the given comma delimited set of +.Ar bootnum Ns s . +The numbers are in hex to match +.Cm Boot Ns Em XXXX , +but may omit leading zeros. .It Fl t -set-timeout Ar timeout Set the bootmenu timeout value. .It Fl T -del-timeout -Delete the BootTimeout variable. +Delete the +.Cm BootTimeout +variable. .It Fl v -verbose Display the device path of boot entries in the output. .El -.Pp .Sh Examples -.Pp -To display the current Boot related variables in the system: +To display the current +.Cm Boot +related variables in the system: .Pp .Dl efibootmgr [-v] .Pp -This will display the optional BootNext bootnum, BootCurrent, -or currently booted bootnum, followed by the optional Timeout value, any -BootOrder that may be set, followed finally by all currently defined Boot -variables, active or not. The verbose flag will augment this output with -the disk partition uuids, size/offset and device-path of the -variable. +This will display the optional +.Cm BootNext +bootnum, +.Cm BootCurrent , +or currently booted bootnum, followed by the optional +.Cm Timeout +value, any +.Cm BootOrder +that may be set, followed finally by all currently defined +.Cm Boot +variables, active or not. +The verbose flag will augment this output with the disk partition uuids, +size/offset and device-path of the variable. .Pp The .Nm -program can be used to create new EFI boot variables. To create a new -boot var pointing to an installation with its EFI partition mounted -under /mnt, the given loader and a label "FreeBSD-11": +program can be used to create new EFI boot variables. +To create a new boot var pointing to an installation with its EFI partition +mounted under +.Pa /mnt , +the given loader and a label +.Qq FreeBSD-11 : .Pp .Dl efibootmgr -c -l /mnt/EFI/freebsd/loader.efi -L FreeBSD-11 .Pp This will result in the next available bootnum being assigned to a -new UEFI boot variable, and given the label "FreeBSD-11" such as: +new UEFI boot variable, and given the label +.Qq FreeBSD-11 +such as: .Pp .Dl Boot0009 FreeBSD-11 .Pp -Note newly created boot entries are created inactive. The active state is denoted -by an '*' following the BootXXXX name in the output. They are also inserted -into the first position of current BootOrder variable if it exists. They -must first be set to active before being considered available to attempt booting from, else they -are ignored. +Note newly created boot entries are created inactive. +The active state is denoted by an '*' following the +.Cm Boot Ns Em XXXX +name in the output. +They are also inserted into the first position of current +.Cm BootOrder +variable if it exists. +They must first be set to active before being considered available to attempt +booting from, else they are ignored. .Pp .Dl efibootmgr -B -b 0009 .Pp -Will delete the given boot entry Boot0009 +Will delete the given boot entry Boot0009. 
.Pp To set a given newly created boot entry active use: .Pp .Dl efibootmgr -a -b 0009 .Pp -To set a given boot entry to be used as the BootNext variable, irrespective -of its active state, use: +To set a given boot entry to be used as the +.Cm BootNext +variable, irrespective of its active state, use: .Pp .Dl efibootmgr -n -b 0009 .Pp -To set the BootOrder for the next reboot use: +To set the +.Cm BootOrder +for the next reboot use: .Pp .Dl efibootmgr -o 0009,0003,... -.Pp .Sh SEE ALSO .Xr efivar 8 , -.Xr uefi 8 , -.Xr gpart 8 +.Xr gpart 8 , +.Xr uefi 8 From a631497fca48968137a0c9c95466510a2de1723f Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 25 Sep 2019 22:53:30 +0000 Subject: [PATCH 085/106] Add SPDX tags to recently added files Reported by: Pawel Biernacki --- lib/libc/sys/shm_open.c | 2 ++ tests/sys/kern/memfd_test.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/lib/libc/sys/shm_open.c b/lib/libc/sys/shm_open.c index a7846474f092..2c91eb2c8e1e 100644 --- a/lib/libc/sys/shm_open.c +++ b/lib/libc/sys/shm_open.c @@ -1,4 +1,6 @@ /* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2019 Kyle Evans * All rights reserved. * diff --git a/tests/sys/kern/memfd_test.c b/tests/sys/kern/memfd_test.c index 199b4d110c2f..6b48ab85bd60 100644 --- a/tests/sys/kern/memfd_test.c +++ b/tests/sys/kern/memfd_test.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2019 Kyle Evans * All rights reserved. * From 176dd236dce97171994fb94ede098ea347d5421a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 26 Sep 2019 00:35:06 +0000 Subject: [PATCH 086/106] Microoptimize sched_pickcpu() CPU affinity on SMT. Use of CPU_FFS() to implement CPUSET_FOREACH() allows to save up to ~0.5% of CPU time on 72-thread SMT system doing 80K IOPS to NVMe from one thread. MFC after: 1 month Sponsored by: iXsystems, Inc. --- sys/kern/sched_ule.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index eb602185a847..e33f12e40382 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -643,10 +643,6 @@ struct cpu_search { #define CPU_SEARCH_HIGHEST 0x2 #define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) -#define CPUSET_FOREACH(cpu, mask) \ - for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \ - if (CPU_ISSET(cpu, &mask)) - static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match); int __noinline cpu_search_lowest(const struct cpu_group *cg, @@ -1292,13 +1288,17 @@ sched_pickcpu(struct thread *td, int flags) tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { - CPUSET_FOREACH(cpu, cg->cg_mask) { - if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) + /* Check all SMT threads for being idle. */ + for (cpu = CPU_FFS(&cg->cg_mask) - 1; ; cpu++) { + if (CPU_ISSET(cpu, &cg->cg_mask) && + TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; + if (cpu >= mp_maxid) { + SCHED_STAT_INC(pickcpu_idle_affinity); + return (ts->ts_cpu); + } } - } else - cpu = INT_MAX; - if (cpu > mp_maxid) { + } else { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } From 8b868507496853e46a765e0add00a81233bc0f55 Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Thu, 26 Sep 2019 00:54:07 +0000 Subject: [PATCH 087/106] Cleanup of elf_machdep.c Fix some style(9) violations. 
This also changes the name of the machine-dependent sysctl kern.debug_kld to debug.kld_reloc, and changes its type from int to bool. This is acceptable since we are not currently concerned with preserving the RISC-V ABI. Reviewed by: markj, kp MFC after: 3 days Differential Revision: https://reviews.freebsd.org/D21772 --- sys/riscv/riscv/elf_machdep.c | 144 +++++++++++++--------------------- 1 file changed, 56 insertions(+), 88 deletions(-) diff --git a/sys/riscv/riscv/elf_machdep.c b/sys/riscv/riscv/elf_machdep.c index e626b01a39f0..7164f5b8a0b9 100644 --- a/sys/riscv/riscv/elf_machdep.c +++ b/sys/riscv/riscv/elf_machdep.c @@ -109,13 +109,11 @@ static Elf64_Brandinfo freebsd_brand_info = { }; SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_info); + (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_info); -static int debug_kld; -SYSCTL_INT(_kern, OID_AUTO, debug_kld, - CTLFLAG_RW, &debug_kld, 0, - "Activate debug prints in elf_reloc_internal()"); +static bool debug_kld; +SYSCTL_BOOL(_debug, OID_AUTO, kld_reloc, CTLFLAG_RW, &debug_kld, 0, + "Activate debug prints in elf_reloc_internal()"); struct type2str_ent { int type; @@ -274,7 +272,7 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, uint32_t before32_1; uint32_t before32; uint64_t before64; - uint32_t* insn32p; + uint32_t *insn32p; uint32_t imm20; int error; @@ -282,15 +280,15 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, case ELF_RELOC_RELA: rela = (const Elf_Rela *)data; where = (Elf_Addr *)(relocbase + rela->r_offset); - insn32p = (uint32_t*)where; + insn32p = (uint32_t *)where; addend = rela->r_addend; rtype = ELF_R_TYPE(rela->r_info); symidx = ELF_R_SYM(rela->r_info); break; default: printf("%s:%d unknown reloc type %d\n", - __FUNCTION__, __LINE__, type); - return -1; + __FUNCTION__, __LINE__, type); + return (-1); } switch (rtype) { @@ -301,43 +299,36 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, case R_RISCV_JUMP_SLOT: error = lookup(lf, symidx, 1, &addr); if (error != 0) - return -1; + return (-1); val = addr; before64 = *where; if (*where != val) *where = val; - if (debug_kld) - printf("%p %c %-24s %016lx -> %016lx\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before64, *where); + printf("%p %c %-24s %016lx -> %016lx\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before64, *where); break; case R_RISCV_RELATIVE: before64 = *where; - *where = elf_relocaddr(lf, relocbase + addend); - if (debug_kld) - printf("%p %c %-24s %016lx -> %016lx\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before64, *where); + printf("%p %c %-24s %016lx -> %016lx\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before64, *where); break; case R_RISCV_JAL: error = lookup(lf, symidx, 1, &addr); if (error != 0) - return -1; + return (-1); val = addr - (Elf_Addr)where; - if ((val <= -(1UL << 20) || (1UL << 20) <= val)) { + if (val <= -(1UL << 20) || (1UL << 20) <= val) { printf("kldload: huge offset against R_RISCV_JAL\n"); - return -1; + return (-1); } before32 = *insn32p; @@ -345,13 +336,10 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, *insn32p = insert_imm(*insn32p, val, 10, 1, 21); *insn32p = insert_imm(*insn32p, val, 11, 11, 20); *insn32p = insert_imm(*insn32p, val, 19, 12, 12); - if (debug_kld) - printf("%p %c %-24s %08x -> %08x\n", - where, - (local? 
'l': 'g'), - reloctype_to_str(rtype), - before32, *insn32p); + printf("%p %c %-24s %08x -> %08x\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before32, *insn32p); break; case R_RISCV_CALL: @@ -359,14 +347,15 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, * R_RISCV_CALL relocates 8-byte region that consists * of the sequence of AUIPC and JALR. */ - /* calculate and check the pc relative offset. */ + /* Calculate and check the pc relative offset. */ error = lookup(lf, symidx, 1, &addr); if (error != 0) - return -1; + return (-1); + val = addr - (Elf_Addr)where; - if ((val <= -(1UL << 32) || (1UL << 32) <= val)) { + if (val <= -(1UL << 32) || (1UL << 32) <= val) { printf("kldload: huge offset against R_RISCV_CALL\n"); - return -1; + return (-1); } /* Relocate AUIPC. */ @@ -377,112 +366,91 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, /* Relocate JALR. */ before32_1 = insn32p[1]; insn32p[1] = insert_imm(insn32p[1], val, 11, 0, 20); - if (debug_kld) - printf("%p %c %-24s %08x %08x -> %08x %08x\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before32, insn32p[0], - before32_1, insn32p[1]); + printf("%p %c %-24s %08x %08x -> %08x %08x\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before32, insn32p[0], before32_1, insn32p[1]); break; case R_RISCV_PCREL_HI20: val = addr - (Elf_Addr)where; - insn32p = (uint32_t*)where; + insn32p = (uint32_t *)where; before32 = *insn32p; imm20 = calc_hi20_imm(val); *insn32p = insert_imm(*insn32p, imm20, 31, 12, 12); - if (debug_kld) - printf("%p %c %-24s %08x -> %08x\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before32, *insn32p); + printf("%p %c %-24s %08x -> %08x\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before32, *insn32p); break; case R_RISCV_PCREL_LO12_I: val = addr - (Elf_Addr)where; - insn32p = (uint32_t*)where; + insn32p = (uint32_t *)where; before32 = *insn32p; *insn32p = insert_imm(*insn32p, addr, 11, 0, 20); - if (debug_kld) - printf("%p %c %-24s %08x -> %08x\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before32, *insn32p); + printf("%p %c %-24s %08x -> %08x\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before32, *insn32p); break; case R_RISCV_PCREL_LO12_S: val = addr - (Elf_Addr)where; - insn32p = (uint32_t*)where; + insn32p = (uint32_t *)where; before32 = *insn32p; *insn32p = insert_imm(*insn32p, addr, 11, 5, 25); *insn32p = insert_imm(*insn32p, addr, 4, 0, 7); if (debug_kld) - printf("%p %c %-24s %08x -> %08x\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before32, *insn32p); + printf("%p %c %-24s %08x -> %08x\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before32, *insn32p); break; case R_RISCV_HI20: error = lookup(lf, symidx, 1, &addr); if (error != 0) - return -1; + return (-1); - insn32p = (uint32_t*)where; + insn32p = (uint32_t *)where; before32 = *insn32p; imm20 = calc_hi20_imm(val); *insn32p = insert_imm(*insn32p, imm20, 31, 12, 12); - if (debug_kld) - printf("%p %c %-24s %08x -> %08x\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before32, *insn32p); + printf("%p %c %-24s %08x -> %08x\n", where, + (local ? 
'l' : 'g'), reloctype_to_str(rtype), + before32, *insn32p); break; case R_RISCV_LO12_I: error = lookup(lf, symidx, 1, &addr); if (error != 0) - return -1; + return (-1); val = addr; - insn32p = (uint32_t*)where; + insn32p = (uint32_t *)where; before32 = *insn32p; *insn32p = insert_imm(*insn32p, addr, 11, 0, 20); - if (debug_kld) - printf("%p %c %-24s %08x -> %08x\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before32, *insn32p); + printf("%p %c %-24s %08x -> %08x\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before32, *insn32p); break; case R_RISCV_LO12_S: error = lookup(lf, symidx, 1, &addr); if (error != 0) - return -1; + return (-1); val = addr; - insn32p = (uint32_t*)where; + insn32p = (uint32_t *)where; before32 = *insn32p; *insn32p = insert_imm(*insn32p, addr, 11, 5, 25); *insn32p = insert_imm(*insn32p, addr, 4, 0, 7); - if (debug_kld) - printf("%p %c %-24s %08x -> %08x\n", - where, - (local? 'l': 'g'), - reloctype_to_str(rtype), - before32, *insn32p); + printf("%p %c %-24s %08x -> %08x\n", where, + (local ? 'l' : 'g'), reloctype_to_str(rtype), + before32, *insn32p); break; default: From c81e8f9891a3bda0cd2734ba02cf53931d97a64a Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Thu, 26 Sep 2019 00:58:47 +0000 Subject: [PATCH 088/106] Fix some broken relocation handling In a few cases, the symbol lookup is missing before attempting to perform the relocation. While the relocation types affected are currently unused, this results in an uninitialized variable warning, that is escalated to an error when building with clang. Reviewed by: markj MFC after: 3 days Differential Revision: https://reviews.freebsd.org/D21773 --- sys/riscv/riscv/elf_machdep.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sys/riscv/riscv/elf_machdep.c b/sys/riscv/riscv/elf_machdep.c index 7164f5b8a0b9..09d8aba84023 100644 --- a/sys/riscv/riscv/elf_machdep.c +++ b/sys/riscv/riscv/elf_machdep.c @@ -373,6 +373,10 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, break; case R_RISCV_PCREL_HI20: + error = lookup(lf, symidx, 1, &addr); + if (error != 0) + return (-1); + val = addr - (Elf_Addr)where; insn32p = (uint32_t *)where; before32 = *insn32p; @@ -385,6 +389,10 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, break; case R_RISCV_PCREL_LO12_I: + error = lookup(lf, symidx, 1, &addr); + if (error != 0) + return (-1); + val = addr - (Elf_Addr)where; insn32p = (uint32_t *)where; before32 = *insn32p; @@ -396,6 +404,10 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, break; case R_RISCV_PCREL_LO12_S: + error = lookup(lf, symidx, 1, &addr); + if (error != 0) + return (-1); + val = addr - (Elf_Addr)where; insn32p = (uint32_t *)where; before32 = *insn32p; @@ -412,6 +424,7 @@ elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, if (error != 0) return (-1); + val = addr; insn32p = (uint32_t *)where; before32 = *insn32p; imm20 = calc_hi20_imm(val); From a297901e6c90d415ac49c88002dd4e02dc49e32b Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Thu, 26 Sep 2019 01:42:09 +0000 Subject: [PATCH 089/106] Update vendor/libarchive/dist to git 2f3033ca23f8c21160506c3c7ac8a0df0d3fde42 Relevant vendor changes: Issue #1237: Fix integer overflow in archive_read_support_filter_lz4.c PR #1249: Correct some typographical and grammatical errors. 
PR #1250: Minor corrections to the formatting of manual pages --- .cirrus.yml | 4 +- .github/workflows/ci.yml | 37 +++++++++++ cat/bsdcat.1 | 11 ++-- contrib/shar/shar.1 | 11 ++-- cpio/bsdcpio.1 | 3 +- libarchive/archive.h | 2 +- libarchive/archive_entry.3 | 7 +- libarchive/archive_entry_acl.3 | 59 ++++++++++------- libarchive/archive_entry_misc.3 | 5 +- libarchive/archive_entry_paths.3 | 10 +-- libarchive/archive_entry_perms.3 | 12 ++-- libarchive/archive_entry_stat.3 | 6 +- libarchive/archive_entry_time.3 | 6 +- libarchive/archive_read.3 | 6 +- libarchive/archive_read_add_passphrase.3 | 10 +-- libarchive/archive_read_data.3 | 4 +- libarchive/archive_read_disk.3 | 13 ++-- libarchive/archive_read_extract.3 | 2 +- libarchive/archive_read_filter.3 | 4 +- libarchive/archive_read_format.3 | 6 +- libarchive/archive_read_free.3 | 6 +- libarchive/archive_read_header.3 | 2 +- libarchive/archive_read_new.3 | 2 +- libarchive/archive_read_open.3 | 4 +- libarchive/archive_read_set_options.3 | 7 +- libarchive/archive_read_support_filter_gzip.c | 54 +++++++++------ libarchive/archive_read_support_filter_lz4.c | 10 +-- libarchive/archive_read_support_format_zip.c | 6 +- libarchive/archive_string.c | 6 +- libarchive/archive_util.3 | 16 ++--- libarchive/archive_write.3 | 6 +- libarchive/archive_write_blocksize.3 | 2 +- libarchive/archive_write_data.3 | 2 +- libarchive/archive_write_disk.3 | 5 +- libarchive/archive_write_disk_posix.c | 2 +- libarchive/archive_write_filter.3 | 4 +- libarchive/archive_write_finish_entry.3 | 2 +- libarchive/archive_write_format.3 | 4 +- libarchive/archive_write_free.3 | 4 +- libarchive/archive_write_header.3 | 2 +- libarchive/archive_write_new.3 | 2 +- libarchive/archive_write_open.3 | 4 +- libarchive/archive_write_set_format_iso9660.c | 6 +- libarchive/archive_write_set_format_mtree.c | 16 ++--- libarchive/archive_write_set_options.3 | 62 +++++++++++++---- libarchive/archive_write_set_passphrase.3 | 12 ++-- libarchive/libarchive_changes.3 | 5 +- libarchive/libarchive_internals.3 | 4 +- libarchive/mtree.5 | 7 +- libarchive/tar.5 | 9 +-- .../test_archive_write_add_filter_by_name.c | 4 +- ...t_archive_write_set_format_filter_by_ext.c | 2 +- libarchive/test/test_read_format_raw.c | 4 ++ libarchive/test/test_read_format_zip.c | 8 +-- ...d_format_zip_traditional_encryption_data.c | 4 +- libarchive/test/test_write_filter_zstd.c | 66 +++++++++++++++++-- tar/bsdtar.1 | 56 ++++++++++------ tar/test/test_option_n.c | 4 +- tar/test/test_option_xattrs.c | 2 +- test_utils/test_main.c | 2 +- 60 files changed, 407 insertions(+), 236 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.cirrus.yml b/.cirrus.yml index 33d6e473d9fa..d87e48983e8e 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -16,8 +16,8 @@ FreeBSD_task: prepare_script: - ./build/ci/cirrus_ci/ci.sh prepare configure_script: - - ./build/ci/build.sh -a autogen - - ./build/ci/build.sh -a configure + - env CFLAGS=-I/usr/local/include LDFLAGS=-L/usr/local/lib ./build/ci/build.sh -a autogen + - env CFLAGS=-I/usr/local/include LDFLAGS=-L/usr/local/lib ./build/ci/build.sh -a configure build_script: - ./build/ci/build.sh -a build test_script: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000000..11fa1b6b8464 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,37 @@ +name: Ubuntu + +on: [push, pull_request] + +jobs: + Build-and-test: + + runs-on: ubuntu-latest + + strategy: + matrix: + bs: [autotools, cmake] + + steps: + - uses: actions/checkout@master + - name: 
Install dependencies + run: sudo apt-get install -y build-essential cmake libssl-dev libacl1-dev libbz2-dev liblzma-dev libzip-dev liblz4-dev libzstd-dev lzop + - name: Autogen + run: ./build/ci/build.sh -a autogen + env: + BS: ${{ matrix.bs }} + - name: Configure + run: ./build/ci/build.sh -a configure + env: + BS: ${{ matrix.bs }} + - name: Build + run: ./build/ci/build.sh -a build + env: + BS: ${{ matrix.bs }} + - name: Test + run: ./build/ci/build.sh -a test + env: + BS: ${{ matrix.bs }} + - name: Install + run: ./build/ci/build.sh -a install + env: + BS: ${{ matrix.bs }} diff --git a/cat/bsdcat.1 b/cat/bsdcat.1 index 4f82b1e57c06..036623e4e4d6 100644 --- a/cat/bsdcat.1 +++ b/cat/bsdcat.1 @@ -34,16 +34,15 @@ .Nm .Op options .Op files -.Pp .Sh DESCRIPTION .Nm expands files to standard output. .Sh OPTIONS .Nm typically takes a filename as an argument or reads standard input when used in a -pipe. In both cases decompressed data it written to standard output. +pipe. +In both cases decompressed data it written to standard output. .Sh EXAMPLES -.Pp To decompress a file: .Pp .Dl bsdcat example.txt.gz > example.txt @@ -55,8 +54,8 @@ To decompress standard input in a pipe: Both examples achieve the same results - a decompressed file by redirecting output. .Sh SEE ALSO -.Xr uncompress 1 , -.Xr zcat 1 , .Xr bzcat 1 , +.Xr uncompress 1 , .Xr xzcat 1 , -.Xr libarchive-formats 5 , +.Xr zcat 1 , +.Xr libarchive-formats 5 diff --git a/contrib/shar/shar.1 b/contrib/shar/shar.1 index e3152f299ebe..31561978f05c 100644 --- a/contrib/shar/shar.1 +++ b/contrib/shar/shar.1 @@ -61,7 +61,8 @@ or The following options are available: .Bl -tag -width indent .It Fl b -Use an alternative binary format. Content of files will be uuencoded. +Use an alternative binary format. +Content of files will be uuencoded. This option should be used to archive binary files correctly. In this mode also file permissions will be stored to the archive. uudecode(1) is needed to extract archives created with this option. @@ -72,8 +73,8 @@ Redirect output to If .Ar file given on command line is a directory the entire subtree will be archived. -Symbolic links given on command line are followed. Other symbolic links will -be archived as such. +Symbolic links given on command line are followed. +Other symbolic links will be archived as such. .El .Sh EXAMPLES To create a shell archive of the program @@ -111,7 +112,9 @@ The command makes no provisions for hard links. .Pp Files containing magic characters or files without a newline ('\\n') as the -last character are not handled correctly with the default format. Use the -b +last character are not handled correctly with the default format. +Use the +.Fl b option for binary files. .Pp It is easy to insert trojan horses into diff --git a/cpio/bsdcpio.1 b/cpio/bsdcpio.1 index 786a717097ed..514c1a2c1937 100644 --- a/cpio/bsdcpio.1 +++ b/cpio/bsdcpio.1 @@ -75,7 +75,6 @@ Pass-through. Read a list of filenames from standard input and copy the files to the specified directory. .El -.Pp .Sh OPTIONS Unless specifically stated otherwise, options are applicable in all operating modes. @@ -385,10 +384,10 @@ For best compatibility, scripts should limit themselves to the standard syntax. 
.Sh SEE ALSO .Xr bzip2 1 , -.Xr tar 1 , .Xr gzip 1 , .Xr mt 1 , .Xr pax 1 , +.Xr tar 1 , .Xr libarchive 3 , .Xr cpio 5 , .Xr libarchive-formats 5 , diff --git a/libarchive/archive.h b/libarchive/archive.h index 7b196e7bb9ea..263d5ea354c9 100644 --- a/libarchive/archive.h +++ b/libarchive/archive.h @@ -52,7 +52,7 @@ */ #if defined(__BORLANDC__) && __BORLANDC__ >= 0x560 # include -#elif !defined(__WATCOMC__) && !defined(_MSC_VER) && !defined(__INTERIX) && !defined(__BORLANDC__) && !defined(_SCO_DS) && !defined(__osf__) +#elif !defined(__WATCOMC__) && !defined(_MSC_VER) && !defined(__INTERIX) && !defined(__BORLANDC__) && !defined(_SCO_DS) && !defined(__osf__) && !defined(__CLANG_INTTYPES_H) # include #endif diff --git a/libarchive/archive_entry.3 b/libarchive/archive_entry.3 index f75916c9e41b..2f62a4be233e 100644 --- a/libarchive/archive_entry.3 +++ b/libarchive/archive_entry.3 @@ -32,7 +32,7 @@ .Nm archive_entry_clear , .Nm archive_entry_clone , .Nm archive_entry_free , -.Nm archive_entry_new , +.Nm archive_entry_new .Nd functions for managing archive entry descriptions .Sh LIBRARY Streaming Archive Library (libarchive, -larchive) @@ -126,7 +126,6 @@ using the current locale. Similarly, if you store a wide string and then store a narrow string for the same data, the previously-set wide string will be discarded in favor of the new data. -.Pp .\" .Sh EXAMPLE .\" .Sh RETURN VALUES .\" .Sh ERRORS @@ -134,8 +133,8 @@ be discarded in favor of the new data. .Xr archive_entry_acl 3 , .Xr archive_entry_paths 3 , .Xr archive_entry_perms 3 , -.Xr archive_entry_time 3 -.Xr libarchive 3 , +.Xr archive_entry_time 3 , +.Xr libarchive 3 .Sh HISTORY The .Nm libarchive diff --git a/libarchive/archive_entry_acl.3 b/libarchive/archive_entry_acl.3 index 534dbfac6ef6..7dcc5854ce10 100644 --- a/libarchive/archive_entry_acl.3 +++ b/libarchive/archive_entry_acl.3 @@ -118,15 +118,16 @@ Streaming Archive Library (libarchive, -larchive) .Sh DESCRIPTION The .Dq Access Control Lists (ACLs) -extend the standard Unix perssion model. +extend the standard Unix permission model. The ACL interface of .Nm libarchive -supports both POSIX.1e and NFSv4 style ACLs. Use of ACLs is restricted by +supports both POSIX.1e and NFSv4 style ACLs. +Use of ACLs is restricted by various levels of ACL support in operating systems, file systems and archive formats. .Ss POSIX.1e Access Control Lists A POSIX.1e ACL consists of a number of independent entries. -Each entry specifies the permission set as bitmask of basic permissions. +Each entry specifies the permission set as a bitmask of basic permissions. Valid permissions in the .Fa permset are: @@ -147,13 +148,13 @@ The user specified by the name field. .It Dv ARCHIVE_ENTRY_ACL_USER_OBJ The owner of the file. .It Dv ARCHIVE_ENTRY_ACL_GROUP -The group specied by the name field. +The group specified by the name field. .It Dv ARCHIVE_ENTRY_ACL_GROUP_OBJ -The group who owns the file. +The group which owns the file. .It Dv ARCHIVE_ENTRY_ACL_MASK The maximum permissions to be obtained via group permissions. .It Dv ARCHIVE_ENTRY_ACL_OTHER -Any principal who is not file owner or a member of the owning group. +Any principal who is not the file owner or a member of the owning group. .El .Pp The principals @@ -164,12 +165,12 @@ and are equivalent to user, group and other in the classic Unix permission model and specify non-extended ACL entries. .Pp -All files with have an access ACL +All files have an access ACL .Pq Dv ARCHIVE_ENTRY_ACL_TYPE_ACCESS . 
This specifies the permissions required for access to the file itself. Directories have an additional ACL .Pq Dv ARCHIVE_ENTRY_ACL_TYPE_DEFAULT , -which controls the initial access ACL for newly created directory entries. +which controls the initial access ACL for newly-created directory entries. .Ss NFSv4 Access Control Lists A NFSv4 ACL consists of multiple individual entries called Access Control Entries (ACEs). @@ -197,11 +198,11 @@ The user specified by the name field. .It Dv ARCHIVE_ENTRY_ACL_USER_OBJ The owner of the file. .It Dv ARCHIVE_ENTRY_ACL_GROUP -The group specied by the name field. +The group specified by the name field. .It Dv ARCHIVE_ENTRY_ACL_GROUP_OBJ -The group who owns the file. +The group which owns the file. .It Dv ARCHIVE_ENTRY_ACL_EVERYONE -Any principal who is not file owner or a member of the owning group. +Any principal who is not the file owner or a member of the owning group. .El .Pp Entries with the @@ -216,9 +217,10 @@ integer. .Pp NFSv4 ACE permissions and flags are stored in the same .Fa permset -bitfield. Some permissions share the same constant and permission character but -have different effect on directories than on files. The following ACE -permissions are supported: +bitfield. +Some permissions share the same constant and permission character +but have different effect on directories than on files. +The following ACE permissions are supported: .Bl -tag -offset indent -compact -width ARCHIV .It Dv ARCHIVE_ENTRY_ACL_READ_DATA ( Sy r ) Read data (file). @@ -265,7 +267,8 @@ Inherit parent directory ACE to subdirectories. .It Dv ARCHIVE_ENTRY_ACL_ENTRY_INHERIT_ONLY ( Sy i ) Only inherit, do not apply the permission on the directory itself. .It Dv ARCHIVE_ENTRY_ACL_ENTRY_NO_PROPAGATE_INHERIT ( Sy n ) -Do not propagate inherit flags. Only first-level entries inherit ACLs. +Do not propagate inherit flags. +Only first-level entries inherit ACLs. .It Dv ARCHIVE_ENTRY_ACL_ENTRY_SUCCESSFUL_ACCESS ( Sy S ) Trigger alarm or audit on successful access. .It Dv ARCHIVE_ENTRY_ACL_ENTRY_FAILED_ACCESS ( Sy F ) @@ -279,8 +282,8 @@ and .Fn archive_entry_acl_add_entry_w add a single ACL entry. For the access ACL and non-extended principals, the classic Unix permissions -are updated. An archive entry cannot contain both POSIX.1e and NFSv4 ACL -entries. +are updated. +An archive entry cannot contain both POSIX.1e and NFSv4 ACL entries. .Pp .Fn archive_entry_acl_clear removes all ACL entries and resets the enumeration pointer. @@ -300,7 +303,8 @@ for POSIX.1e ACLs and .It Dv ARCHIVE_ENTRY_ACL_TYPE_AUDIT .It Dv ARCHIVE_ENTRY_ACL_TYPE_ALARM .El -for NFSv4 ACLs. For POSIX.1e ACLs if +for NFSv4 ACLs. +For POSIX.1e ACLs if .Dv ARCHIVE_ENTRY_ACL_TYPE_ACCESS is included and at least one extended ACL entry is found, the three non-extended ACLs are added. @@ -312,7 +316,8 @@ add new .Pq or merge with existing ACL entries from .Pq wide -text. The argument +text. +The argument .Fa type may take one of the following values: .Bl -tag -offset indent -compact -width "ARCHIVE_ENTRY_ACL_TYPE_DEFAULT" @@ -322,11 +327,13 @@ may take one of the following values: .El Supports all formats that can be created with .Fn archive_entry_acl_to_text -or respective +or respectively .Fn archive_entry_acl_to_text_w . -Existing ACL entries are preserved. To get a clean new ACL from text +Existing ACL entries are preserved. +To get a clean new ACL from text .Fn archive_entry_acl_clear -must be called first. Entries prefixed with +must be called first. 
+Entries prefixed with .Dq default: are treated as .Dv ARCHIVE_ENTRY_ACL_TYPE_DEFAULT @@ -354,7 +361,7 @@ prepare reading the list of ACL entries with .Fn archive_entry_acl_next or .Fn archive_entry_acl_next_w . -The function returns either 0, if no non-extended ACLs are found. +The function returns 0 if no non-extended ACLs are found. In this case, the access permissions should be obtained by .Xr archive_entry_mode 3 or set using @@ -367,7 +374,8 @@ and .Fn archive_entry_acl_to_text_w convert the ACL entries for the given type into a .Pq wide -string of ACL entries separated by newline. If the pointer +string of ACL entries separated by newline. +If the pointer .Fa len_p is not NULL, then the function shall return the length of the string .Pq not including the NULL terminator @@ -415,7 +423,8 @@ are prefixed with .Dq default: . .Pp .Fn archive_entry_acl_types -get ACL entry types contained in an archive entry's ACL. As POSIX.1e and NFSv4 +get ACL entry types contained in an archive entry's ACL. +As POSIX.1e and NFSv4 ACL entries cannot be mixed, this function is a very efficient way to detect if an ACL already contains POSIX.1e or NFSv4 ACL entries. .Sh RETURN VALUES diff --git a/libarchive/archive_entry_misc.3 b/libarchive/archive_entry_misc.3 index 9b1e3ea207d3..dfab7ddb559b 100644 --- a/libarchive/archive_entry_misc.3 +++ b/libarchive/archive_entry_misc.3 @@ -28,7 +28,7 @@ .Sh NAME .Nm archive_entry_symlink_type , .Nm archive_entry_set_symlink_type -.Nd miscellaneous functions for manipulating properties of archive_entry. +.Nd miscellaneous functions for manipulating properties of archive_entry .Sh LIBRARY Streaming Archive Library (libarchive, -larchive) .Sh SYNOPSIS @@ -42,7 +42,8 @@ The function .Fn archive_entry_symlink_type returns and the function .Fn archive_entry_set_symlink_type -sets the type of the symbolic link stored in an archive entry. These functions +sets the type of the symbolic link stored in an archive entry. +These functions have special meaning on operating systems that support multiple symbolic link types (e.g. Microsoft Windows). .Pp diff --git a/libarchive/archive_entry_paths.3 b/libarchive/archive_entry_paths.3 index f647212a98be..0f849c9ebb35 100644 --- a/libarchive/archive_entry_paths.3 +++ b/libarchive/archive_entry_paths.3 @@ -133,7 +133,7 @@ The accessor functions are named .Fn XXX_w . .It UTF-8 Unicode strings encoded as UTF-8. -This are convience functions to update both the multibyte and wide +These are convenience functions to update both the multibyte and wide character strings at the same time. .El .Pp @@ -141,13 +141,13 @@ The sourcepath is a pure filesystem concept and never stored in an archive directly. .Pp For that reason, it is only available as multibyte string. -The link path is a convience function for conditionally setting +The link path is a convenience function for conditionally setting hardlink or symlink destination. It doesn't have a corresponding get accessor function. .Pp .Fn archive_entry_set_XXX -is an alias for +is an alias for .Fn archive_entry_copy_XXX . .Sh SEE ALSO -.Xr archive_entry 3 -.Xr libarchive 3 , +.Xr archive_entry 3 , +.Xr libarchive 3 diff --git a/libarchive/archive_entry_perms.3 b/libarchive/archive_entry_perms.3 index aae3648bb210..0291b7b4988b 100644 --- a/libarchive/archive_entry_perms.3 +++ b/libarchive/archive_entry_perms.3 @@ -126,7 +126,7 @@ The corresponding functions and .Fn archive_entry_set_perm store the given user id, group id and permission in the entry. 
-The permission is also set as side effect of calling +The permission is also set as a side effect of calling .Fn archive_entry_set_mode . .Pp .Fn archive_entry_strmode @@ -143,12 +143,12 @@ The accessor functions are named .Fn XXX_w . .It UTF-8 Unicode strings encoded as UTF-8. -This are convience functions to update both the multibyte and wide +These are convenience functions to update both the multibyte and wide character strings at the same time. .El .Pp .Fn archive_entry_set_XXX -is an alias for +is an alias for .Fn archive_entry_copy_XXX . .Ss File Flags File flags are transparently converted between a bitmap @@ -182,7 +182,7 @@ The .Fn archive_entry_copy_fflags_text and .Fn archive_entry_copy_fflags_text_w -functions parse the provided text and sets the internal bitmap values. +functions parse the provided text and set the internal bitmap values. This is a platform-specific operation; names that are not meaningful on the current platform will be ignored. The function returns a pointer to the start of the first name that was not @@ -197,8 +197,8 @@ which stops parsing at the first unrecognized name.) .Xr archive_entry 3 , .Xr archive_entry_acl 3 , .Xr archive_read_disk 3 , -.Xr archive_write_disk 3 -.Xr libarchive 3 , +.Xr archive_write_disk 3 , +.Xr libarchive 3 .Sh BUGS The platform types .Vt uid_t diff --git a/libarchive/archive_entry_stat.3 b/libarchive/archive_entry_stat.3 index 26611e4c62e9..aa5c8e03f9f9 100644 --- a/libarchive/archive_entry_stat.3 +++ b/libarchive/archive_entry_stat.3 @@ -54,7 +54,7 @@ .Nm archive_entry_rdevmajor , .Nm archive_entry_set_rdevmajor , .Nm archive_entry_rdevminor , -.Nm archive_entry_set_rdevminor , +.Nm archive_entry_set_rdevminor .Nd accessor functions for manipulating archive entry descriptions .Sh LIBRARY Streaming Archive Library (libarchive, -larchive) @@ -267,8 +267,8 @@ platforms. Some archive formats use the combined form, while other formats use the split form. .Sh SEE ALSO +.Xr stat 2 , .Xr archive_entry_acl 3 , .Xr archive_entry_perms 3 , .Xr archive_entry_time 3 , -.Xr libarchive 3 , -.Xr stat 2 +.Xr libarchive 3 diff --git a/libarchive/archive_entry_time.3 b/libarchive/archive_entry_time.3 index 186452159370..d0563eaef43a 100644 --- a/libarchive/archive_entry_time.3 +++ b/libarchive/archive_entry_time.3 @@ -48,7 +48,7 @@ .Nm archive_entry_mtime_nsec , .Nm archive_entry_mtime_is_set , .Nm archive_entry_set_mtime , -.Nm archive_entry_unset_mtime , +.Nm archive_entry_unset_mtime .Nd functions for manipulating times in archive entry descriptions .Sh LIBRARY Streaming Archive Library (libarchive, -larchive) @@ -113,8 +113,8 @@ The current state can be queried using .Fn XXX_is_set . Unset time fields have a second and nanosecond field of 0. .Sh SEE ALSO -.Xr archive_entry 3 -.Xr libarchive 3 , +.Xr archive_entry 3 , +.Xr libarchive 3 .Sh HISTORY The .Nm libarchive diff --git a/libarchive/archive_read.3 b/libarchive/archive_read.3 index d37e7327cb5e..cbedd0a19129 100644 --- a/libarchive/archive_read.3 +++ b/libarchive/archive_read.3 @@ -155,7 +155,7 @@ to close the archive, then call .Fn archive_read_free to release all resources, including all memory allocated by the library. .\" -.Sh EXAMPLE +.Sh EXAMPLES The following illustrates basic usage of the library. 
In this example, the callback functions are simply wrappers around the standard @@ -217,16 +217,16 @@ myclose(struct archive *a, void *client_data) .\" .Sh ERRORS .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , -.Xr archive_read_new 3 , .Xr archive_read_data 3 , .Xr archive_read_extract 3 , .Xr archive_read_filter 3 , .Xr archive_read_format 3 , .Xr archive_read_header 3 , +.Xr archive_read_new 3 , .Xr archive_read_open 3 , .Xr archive_read_set_options 3 , .Xr archive_util 3 , +.Xr libarchive 3 , .Xr tar 5 .Sh HISTORY The diff --git a/libarchive/archive_read_add_passphrase.3 b/libarchive/archive_read_add_passphrase.3 index 8b242ea79b1e..ca60d4fc62f7 100644 --- a/libarchive/archive_read_add_passphrase.3 +++ b/libarchive/archive_read_add_passphrase.3 @@ -59,16 +59,16 @@ or empty, this function will do nothing and will be returned. Otherwise, .Cm ARCHIVE_OK -will be returned. +will be returned. .It Fn archive_read_set_passphrase_callback -Register callback function that will be invoked to get a passphrase -for decrption after trying all passphrases registered by the +Register a callback function that will be invoked to get a passphrase +for decryption after trying all the passphrases registered by the .Fn archive_read_add_passphrase function failed. .El .\" .Sh ERRORS .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_read 3 , -.Xr archive_read_set_options 3 +.Xr archive_read_set_options 3 , +.Xr libarchive 3 diff --git a/libarchive/archive_read_data.3 b/libarchive/archive_read_data.3 index c1bc15d7cc8c..78c0c9000419 100644 --- a/libarchive/archive_read_data.3 +++ b/libarchive/archive_read_data.3 @@ -28,7 +28,7 @@ .Dt ARCHIVE_READ_DATA 3 .Os .Sh NAME -.Nm archive_read_data +.Nm archive_read_data , .Nm archive_read_data_block , .Nm archive_read_data_skip , .Nm archive_read_data_into_fd @@ -118,7 +118,6 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_read 3 , .Xr archive_read_extract 3 , .Xr archive_read_filter 3 , @@ -127,4 +126,5 @@ functions. .Xr archive_read_open 3 , .Xr archive_read_set_options 3 , .Xr archive_util 3 , +.Xr libarchive 3 , .Xr tar 5 diff --git a/libarchive/archive_read_disk.3 b/libarchive/archive_read_disk.3 index 027f63cb630f..82d6a5c8562c 100644 --- a/libarchive/archive_read_disk.3 +++ b/libarchive/archive_read_disk.3 @@ -99,9 +99,10 @@ following values: .Bl -tag -compact -width "indent" .It Cm ARCHIVE_READDISK_HONOR_NODUMP Skip files and directories with the nodump file attribute (file flag) set. -By default, the nodump file atrribute is ignored. +By default, the nodump file attribute is ignored. .It Cm ARCHIVE_READDISK_MAC_COPYFILE -Mac OS X specific. Read metadata (ACLs and extended attributes) with +Mac OS X specific. +Read metadata (ACLs and extended attributes) with .Xr copyfile 3 . By default, metadata is read using .Xr copyfile 3 . @@ -120,7 +121,7 @@ or for more information on file attributes. .It Cm ARCHIVE_READDISK_NO_TRAVERSE_MOUNTS Do not traverse mount points. -By defaut, moint points are traversed. +By default, mount points are traversed. .It Cm ARCHIVE_READDISK_NO_XATTR Do not read extended file attributes (xattrs). By default, extended file attributes are read from disk. @@ -216,7 +217,7 @@ of some other operation. (For example, directory traversal libraries often provide this information.) .Pp Where necessary, user and group ids are converted to user and group names -using the currently registered lookup functions above. +using the currently-registered lookup functions above. 
This affects the file ownership fields and ACL values in the .Tn struct archive_entry object. @@ -226,7 +227,7 @@ More information about the object and the overall design of the library can be found in the .Xr libarchive 3 overview. -.Sh EXAMPLE +.Sh EXAMPLES The following illustrates basic usage of the library by showing how to use it to copy an item on disk into an archive. .Bd -literal -offset indent @@ -291,11 +292,11 @@ and functions. .\" .Sh SEE ALSO +.Xr tar 1 , .Xr archive_read 3 , .Xr archive_util 3 , .Xr archive_write 3 , .Xr archive_write_disk 3 , -.Xr tar 1 , .Xr libarchive 3 .Sh HISTORY The diff --git a/libarchive/archive_read_extract.3 b/libarchive/archive_read_extract.3 index 6ec0ced939b6..858f39742553 100644 --- a/libarchive/archive_read_extract.3 +++ b/libarchive/archive_read_extract.3 @@ -126,7 +126,6 @@ and functions. .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_read 3 , .Xr archive_read_data 3 , .Xr archive_read_filter 3 , @@ -134,4 +133,5 @@ functions. .Xr archive_read_open 3 , .Xr archive_read_set_options 3 , .Xr archive_util 3 , +.Xr libarchive 3 , .Xr tar 5 diff --git a/libarchive/archive_read_filter.3 b/libarchive/archive_read_filter.3 index ef0a70175332..1ba5fcbd6efd 100644 --- a/libarchive/archive_read_filter.3 +++ b/libarchive/archive_read_filter.3 @@ -147,8 +147,8 @@ and functions. .\" .Sh SEE ALSO -.Xr libarchive 3 , .Xr archive_read 3 , .Xr archive_read_data 3 , .Xr archive_read_format 3 , -.Xr archive_read_format 3 +.Xr archive_read_format 3 , +.Xr libarchive 3 diff --git a/libarchive/archive_read_format.3 b/libarchive/archive_read_format.3 index 91c5d2cfd4b6..f3804ce3796a 100644 --- a/libarchive/archive_read_format.3 +++ b/libarchive/archive_read_format.3 @@ -102,7 +102,7 @@ For example, .Fn archive_read_support_format_tar enables support for a variety of standard tar formats, old-style tar, ustar, pax interchange format, and many common variants. -.It Fn archive_read_support_format_all +.It Fn archive_read_support_format_all Enables support for all available formats except the .Dq raw format (see below). @@ -125,7 +125,7 @@ it is not possible to accurately determine a format for an empty file based purely on contents. So empty files are treated by libarchive as a distinct format. -.It Fn archive_read_support_format_raw +.It Fn archive_read_support_format_raw The .Dq raw format handler allows libarchive to be used to read arbitrary data. @@ -153,11 +153,11 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_read_data 3 , .Xr archive_read_filter 3 , .Xr archive_read_set_options 3 , .Xr archive_util 3 , +.Xr libarchive 3 , .Xr tar 5 .Sh BUGS Many traditional archiver programs treat diff --git a/libarchive/archive_read_free.3 b/libarchive/archive_read_free.3 index 5b218225ba0e..8371c3a0c60c 100644 --- a/libarchive/archive_read_free.3 +++ b/libarchive/archive_read_free.3 @@ -83,11 +83,11 @@ and functions. .\" .Sh SEE ALSO -.Xr libarchive 3 , -.Xr archive_read_new 3 , .Xr archive_read_data 3 , .Xr archive_read_filter 3 , .Xr archive_read_format 3 , +.Xr archive_read_new 3 , .Xr archive_read_open 3 , .Xr archive_read_set_options 3 , -.Xr archive_util 3 +.Xr archive_util 3 , +.Xr libarchive 3 diff --git a/libarchive/archive_read_header.3 b/libarchive/archive_read_header.3 index 480a666ca395..1e97f3a27507 100644 --- a/libarchive/archive_read_header.3 +++ b/libarchive/archive_read_header.3 @@ -79,7 +79,6 @@ functions. 
.\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_read 3 , .Xr archive_read_data 3 , .Xr archive_read_extract 3 , @@ -88,4 +87,5 @@ functions. .Xr archive_read_open 3 , .Xr archive_read_set_options 3 , .Xr archive_util 3 , +.Xr libarchive 3 , .Xr tar 5 diff --git a/libarchive/archive_read_new.3 b/libarchive/archive_read_new.3 index 0c9d1a7fbb27..8bb6b848b06a 100644 --- a/libarchive/archive_read_new.3 +++ b/libarchive/archive_read_new.3 @@ -50,10 +50,10 @@ object can be found in the overview manual page for .\" .Sh ERRORS .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_read_data 3 , .Xr archive_read_filter 3 , .Xr archive_read_format 3 , .Xr archive_read_set_options 3 , .Xr archive_util 3 , +.Xr libarchive 3 , .Xr tar 5 diff --git a/libarchive/archive_read_open.3 b/libarchive/archive_read_open.3 index 2278ebc330c3..f67677823bd0 100644 --- a/libarchive/archive_read_open.3 +++ b/libarchive/archive_read_open.3 @@ -205,7 +205,7 @@ On failure, the callback should invoke .Fn archive_set_error to register an error code and message and return -.Cm ARCHIVE_FATAL. +.Cm ARCHIVE_FATAL . .\" .Sh EXAMPLE .\" .Sh RETURN VALUES @@ -223,11 +223,11 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_read 3 , .Xr archive_read_data 3 , .Xr archive_read_filter 3 , .Xr archive_read_format 3 , .Xr archive_read_set_options 3 , .Xr archive_util 3 , +.Xr libarchive 3 , .Xr tar 5 diff --git a/libarchive/archive_read_set_options.3 b/libarchive/archive_read_set_options.3 index 1a251cefecd1..d23f028b0ce2 100644 --- a/libarchive/archive_read_set_options.3 +++ b/libarchive/archive_read_set_options.3 @@ -212,7 +212,8 @@ Use to disable. .It Cm read_concatenated_archives Ignore zeroed blocks in the archive, which occurs when multiple tar archives -have been concatenated together. Without this option, only the contents of +have been concatenated together. +Without this option, only the contents of the first concatenated archive would be read. .El .El @@ -226,6 +227,6 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , +.Xr archive_read 3 , .Xr archive_write_set_options 3 , -.Xr archive_read 3 +.Xr libarchive 3 diff --git a/libarchive/archive_read_support_filter_gzip.c b/libarchive/archive_read_support_filter_gzip.c index 458b6f729164..9fa9e2b0ddb8 100644 --- a/libarchive/archive_read_support_filter_gzip.c +++ b/libarchive/archive_read_support_filter_gzip.c @@ -131,12 +131,20 @@ archive_read_support_filter_gzip(struct archive *_a) */ static ssize_t peek_at_header(struct archive_read_filter *filter, int *pbits, - struct private_data *state) +#ifdef HAVE_ZLIB_H + struct private_data *state +#else + void *state +#endif + ) { const unsigned char *p; ssize_t avail, len; int bits = 0; int header_flags; +#ifndef HAVE_ZLIB_H + (void)state; /* UNUSED */ +#endif /* Start by looking at the first ten bytes of the header, which * is all fixed layout. */ @@ -153,8 +161,10 @@ peek_at_header(struct archive_read_filter *filter, int *pbits, bits += 3; header_flags = p[3]; /* Bytes 4-7 are mod time in little endian. */ +#ifdef HAVE_ZLIB_H if (state) state->mtime = archive_le32dec(p + 4); +#endif /* Byte 8 is deflate flags. */ /* XXXX TODO: return deflate flags back to consume_header for use in initializing the decompressor. */ @@ -171,7 +181,9 @@ peek_at_header(struct archive_read_filter *filter, int *pbits, /* Null-terminated optional filename. 
*/ if (header_flags & 8) { +#ifdef HAVE_ZLIB_H ssize_t file_start = len; +#endif do { ++len; if (avail < len) @@ -181,11 +193,13 @@ peek_at_header(struct archive_read_filter *filter, int *pbits, return (0); } while (p[len - 1] != 0); +#ifdef HAVE_ZLIB_H if (state) { /* Reset the name in case of repeat header reads. */ free(state->name); state->name = strdup((const char *)&p[file_start]); } +#endif } /* Null-terminated optional comment. */ @@ -236,24 +250,6 @@ gzip_bidder_bid(struct archive_read_filter_bidder *self, return (0); } -static int -gzip_read_header(struct archive_read_filter *self, struct archive_entry *entry) -{ - struct private_data *state; - - state = (struct private_data *)self->data; - - /* A mtime of 0 is considered invalid/missing. */ - if (state->mtime != 0) - archive_entry_set_mtime(entry, state->mtime, 0); - - /* If the name is available, extract it. */ - if (state->name) - archive_entry_set_pathname(entry, state->name); - - return (ARCHIVE_OK); -} - #ifndef HAVE_ZLIB_H /* @@ -277,6 +273,24 @@ gzip_bidder_init(struct archive_read_filter *self) #else +static int +gzip_read_header(struct archive_read_filter *self, struct archive_entry *entry) +{ + struct private_data *state; + + state = (struct private_data *)self->data; + + /* A mtime of 0 is considered invalid/missing. */ + if (state->mtime != 0) + archive_entry_set_mtime(entry, state->mtime, 0); + + /* If the name is available, extract it. */ + if (state->name) + archive_entry_set_pathname(entry, state->name); + + return (ARCHIVE_OK); +} + /* * Initialize the filter object. */ @@ -306,7 +320,9 @@ gzip_bidder_init(struct archive_read_filter *self) self->read = gzip_filter_read; self->skip = NULL; /* not supported */ self->close = gzip_filter_close; +#ifdef HAVE_ZLIB_H self->read_header = gzip_read_header; +#endif state->in_stream = 0; /* We're not actually within a stream yet. */ diff --git a/libarchive/archive_read_support_filter_lz4.c b/libarchive/archive_read_support_filter_lz4.c index 147f5027ff4d..43ee6c2b7266 100644 --- a/libarchive/archive_read_support_filter_lz4.c +++ b/libarchive/archive_read_support_filter_lz4.c @@ -460,7 +460,7 @@ lz4_filter_read_descriptor(struct archive_read_filter *self) __archive_read_filter_consume(self->upstream, descriptor_bytes); - /* Make sure we have an enough buffer for uncompressed data. */ + /* Make sure we have a large enough buffer for uncompressed data. */ if (lz4_allocate_out_block(self) != ARCHIVE_OK) return (ARCHIVE_FATAL); if (state->flags.stream_checksum) @@ -520,7 +520,7 @@ lz4_filter_read_data_block(struct archive_read_filter *self, const void **p) if (read_buf == NULL) goto truncated_error; - /* Optional process, checking a block sum. */ + /* Optional processing, checking a block sum. */ if (checksum_size) { unsigned int chsum = __archive_xxhash.XXH32( read_buf + 4, (int)compressed_size, 0); @@ -640,7 +640,7 @@ lz4_filter_read_default_stream(struct archive_read_filter *self, const void **p) if (ret == 0 && *p == NULL) state->stage = SELECT_STREAM; - /* Optional process, checking a stream sum. */ + /* Optional processing, checking a stream sum. 
*/ if (state->flags.stream_checksum) { if (state->stage == SELECT_STREAM) { unsigned int checksum; @@ -660,7 +660,7 @@ lz4_filter_read_default_stream(struct archive_read_filter *self, const void **p) if (checksum != checksum_stream) { archive_set_error(&self->archive->archive, ARCHIVE_ERRNO_MISC, - "lz4 stream cheksum error"); + "lz4 stream checksum error"); return (ARCHIVE_FATAL); } } else if (ret > 0) @@ -674,7 +674,7 @@ static ssize_t lz4_filter_read_legacy_stream(struct archive_read_filter *self, const void **p) { struct private_data *state = (struct private_data *)self->data; - int compressed; + uint32_t compressed; const char *read_buf; ssize_t ret; diff --git a/libarchive/archive_read_support_format_zip.c b/libarchive/archive_read_support_format_zip.c index ab21e222f5af..9934bf1504dc 100644 --- a/libarchive/archive_read_support_format_zip.c +++ b/libarchive/archive_read_support_format_zip.c @@ -487,7 +487,7 @@ process_extra(struct archive_read *a, struct archive_entry *entry, /* Some ZIP files may have trailing 0 bytes. Let's check they * are all 0 and ignore them instead of returning an error. * - * This is not techincally correct, but some ZIP files look + * This is not technically correct, but some ZIP files look * like this and other tools support those files - so let's * also support them. */ @@ -1053,7 +1053,7 @@ zip_read_local_file_header(struct archive_read *a, struct archive_entry *entry, /* Make sure that entries with a trailing '/' are marked as directories * even if the External File Attributes contains bogus values. If this - * is not a directory and there is no type, assume regularfile. */ + * is not a directory and there is no type, assume a regular file. */ if ((zip_entry->mode & AE_IFMT) != AE_IFDIR) { int has_slash; @@ -1104,7 +1104,7 @@ zip_read_local_file_header(struct archive_read *a, struct archive_entry *entry, } if (zip_entry->flags & LA_FROM_CENTRAL_DIRECTORY) { - /* If this came from the central dir, it's size info + /* If this came from the central dir, its size info * is definitive, so ignore the length-at-end flag. */ zip_entry->zip_flags &= ~ZIP_LENGTH_AT_END; /* If local header is missing a value, use the one from diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c index 76a1624043f8..979a418b6779 100644 --- a/libarchive/archive_string.c +++ b/libarchive/archive_string.c @@ -458,7 +458,7 @@ archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest, if (from_cp == CP_C_LOCALE) { /* - * "C" locale special process. + * "C" locale special processing. */ wchar_t *ws; const unsigned char *mp; @@ -680,7 +680,7 @@ archive_string_append_from_wcs_in_codepage(struct archive_string *as, if (to_cp == CP_C_LOCALE) { /* - * "C" locale special process. + * "C" locale special processing. */ const wchar_t *wp = ws; char *p; @@ -889,7 +889,7 @@ add_converter(struct archive_string_conv *sc, int (*converter) struct archive_string_conv *)) { if (sc == NULL || sc->nconverter >= 2) - __archive_errx(1, "Programing error"); + __archive_errx(1, "Programming error"); sc->converter[sc->nconverter++] = converter; } diff --git a/libarchive/archive_util.3 b/libarchive/archive_util.3 index 99ab842a28ca..d5d4e7dfd7d5 100644 --- a/libarchive/archive_util.3 +++ b/libarchive/archive_util.3 @@ -92,10 +92,10 @@ Clears any error information left over from a previous call. Not generally used in client code. .It Fn archive_compression Synonym for -.Fn archive_filter_code(a, 0) . +.Fn archive_filter_code a 0 . 
.It Fn archive_compression_name Synonym for -.Fn archive_filter_name(a, 0) . +.Fn archive_filter_name a 0 . .It Fn archive_copy_error Copies error information from one archive to another. .It Fn archive_errno @@ -142,13 +142,13 @@ filter 0 is the gunzip filter, filter 1 is the uudecode filter, and filter 2 is the pseudo-filter that wraps the archive read functions. In this case, requesting -.Fn archive_position(a, -1) +.Fn archive_position a -1 would be a synonym for -.Fn archive_position(a, 2) +.Fn archive_position a 2 which would return the number of bytes currently read from the archive, while -.Fn archive_position(a, 1) +.Fn archive_position a 1 would return the number of bytes after uudecoding, and -.Fn archive_position(a, 0) +.Fn archive_position a 0 would return the number of bytes after decompression. .It Fn archive_filter_name Returns a textual name identifying the indicated filter. @@ -170,9 +170,9 @@ A textual description of the format of the current entry. .It Fn archive_position Returns the number of bytes read from or written to the indicated filter. In particular, -.Fn archive_position(a, 0) +.Fn archive_position a 0 returns the number of bytes read or written by the format handler, while -.Fn archive_position(a, -1) +.Fn archive_position a -1 returns the number of bytes read or written to the archive. See .Fn archive_filter_count diff --git a/libarchive/archive_write.3 b/libarchive/archive_write.3 index c1164f5b5fdb..e7f7f1384ee8 100644 --- a/libarchive/archive_write.3 +++ b/libarchive/archive_write.3 @@ -118,7 +118,7 @@ After all entries have been written, use the .Fn archive_write_free function to release all resources. .\" -.Sh EXAMPLE +.Sh EXAMPLES The following sketch illustrates basic usage of the library. In this example, the callback functions are simply wrappers around the standard @@ -192,7 +192,7 @@ write_archive(const char *outname, const char **filename) if (archive_write_set_format_filter_by_ext(a, outname) != ARCHIVE_OK) { archive_write_add_filter_gzip(a); archive_write_set_format_ustar(a); - } + } archive_write_open(a, mydata, myopen, mywrite, myclose); while (*filename) { stat(*filename, &st); @@ -225,8 +225,8 @@ int main(int argc, const char **argv) .Ed .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_blocksize.3 b/libarchive/archive_write_blocksize.3 index afd84ea4d393..4973f9990566 100644 --- a/libarchive/archive_write_blocksize.3 +++ b/libarchive/archive_write_blocksize.3 @@ -107,8 +107,8 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_data.3 b/libarchive/archive_write_data.3 index 9c16cd9b4f70..bc208b45d53a 100644 --- a/libarchive/archive_write_data.3 +++ b/libarchive/archive_write_data.3 @@ -82,9 +82,9 @@ and consider any non-negative value as success. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write_finish_entry 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_disk.3 b/libarchive/archive_write_disk.3 index 949c9ef106f2..ff8e1a36a75c 100644 --- a/libarchive/archive_write_disk.3 +++ b/libarchive/archive_write_disk.3 @@ -113,7 +113,8 @@ or .Pq FreeBSD, Mac OS X for more information on file attributes. .It Cm ARCHIVE_EXTRACT_MAC_METADATA -Mac OS X specific. 
Restore metadata using +Mac OS X specific. +Restore metadata using .Xr copyfile 3 . By default, .Xr copyfile 3 @@ -264,9 +265,9 @@ and functions. .\" .Sh SEE ALSO +.Xr tar 1 , .Xr archive_read 3 , .Xr archive_write 3 , -.Xr tar 1 , .Xr libarchive 3 .Sh HISTORY The diff --git a/libarchive/archive_write_disk_posix.c b/libarchive/archive_write_disk_posix.c index 283f3e787678..6ae8a6a89bbf 100644 --- a/libarchive/archive_write_disk_posix.c +++ b/libarchive/archive_write_disk_posix.c @@ -431,7 +431,7 @@ la_opendirat(int fd, const char *path) { errno = ENOTSUP; return (-1); } else - return (open(fd, path, flags)); + return (open(path, flags)); #else return (openat(fd, path, flags)); #endif diff --git a/libarchive/archive_write_filter.3 b/libarchive/archive_write_filter.3 index d6fa07131a80..c83eb77b6a5e 100644 --- a/libarchive/archive_write_filter.3 +++ b/libarchive/archive_write_filter.3 @@ -43,7 +43,7 @@ .Nm archive_write_add_filter_program , .Nm archive_write_add_filter_uuencode , .Nm archive_write_add_filter_xz , -.Nm archive_write_add_filter_zstd , +.Nm archive_write_add_filter_zstd .Nd functions enabling output filters .Sh LIBRARY Streaming Archive Library (libarchive, -larchive) @@ -125,10 +125,10 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write 3 , .Xr archive_write_format 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_finish_entry.3 b/libarchive/archive_write_finish_entry.3 index dc1b94b82a5c..5797e16a6dbc 100644 --- a/libarchive/archive_write_finish_entry.3 +++ b/libarchive/archive_write_finish_entry.3 @@ -71,9 +71,9 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write_data 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_format.3 b/libarchive/archive_write_format.3 index aaafb0a8617c..47a740339622 100644 --- a/libarchive/archive_write_format.3 +++ b/libarchive/archive_write_format.3 @@ -52,7 +52,7 @@ .Nm archive_write_set_format_v7tar , .Nm archive_write_set_format_warc , .Nm archive_write_set_format_xar , -.Nm archive_write_set_format_zip , +.Nm archive_write_set_format_zip .Nd functions for creating archives .Sh LIBRARY Streaming Archive Library (libarchive, -larchive) @@ -166,9 +166,9 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr libarchive-formats 5 , .Xr mtree 5 , diff --git a/libarchive/archive_write_free.3 b/libarchive/archive_write_free.3 index 1b2d07131d8e..5210e2a633de 100644 --- a/libarchive/archive_write_free.3 +++ b/libarchive/archive_write_free.3 @@ -56,7 +56,7 @@ after calling this function, the only call that can succeed is to release the resources. This can be used to speed recovery when the archive creation must be aborted. -Note that the created archive is likely to be malformed in this case; +Note that the created archive is likely to be malformed in this case; .It Fn archive_write_close Complete the archive and invoke the close callback. .It Fn archive_write_finish @@ -89,8 +89,8 @@ functions. 
.\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_header.3 b/libarchive/archive_write_header.3 index 4de58f3b9f37..2217b1871bba 100644 --- a/libarchive/archive_write_header.3 +++ b/libarchive/archive_write_header.3 @@ -66,8 +66,8 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_new.3 b/libarchive/archive_write_new.3 index f05d269d3e8d..788cbb855985 100644 --- a/libarchive/archive_write_new.3 +++ b/libarchive/archive_write_new.3 @@ -50,9 +50,9 @@ object can be found in the overview manual page for .\" .Sh ERRORS .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_open.3 b/libarchive/archive_write_open.3 index 457873e61483..0129d10b7f2d 100644 --- a/libarchive/archive_write_open.3 +++ b/libarchive/archive_write_open.3 @@ -200,7 +200,7 @@ On failure, the callback should invoke .Fn archive_set_error to register an error code and message and return -.Cm ARCHIVE_FATAL. +.Cm ARCHIVE_FATAL . .Pp Note that if the client-provided write callback function returns a non-zero value, that error will be propagated back to the caller @@ -234,13 +234,13 @@ functions. .\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write 3 , .Xr archive_write_blocksize 3 , .Xr archive_write_filter 3 , .Xr archive_write_format 3 , .Xr archive_write_new 3 , .Xr archive_write_set_options 3 , +.Xr libarchive 3 , .Xr cpio 5 , .Xr mtree 5 , .Xr tar 5 diff --git a/libarchive/archive_write_set_format_iso9660.c b/libarchive/archive_write_set_format_iso9660.c index badc88bad0c9..cacbdde7dcb0 100644 --- a/libarchive/archive_write_set_format_iso9660.c +++ b/libarchive/archive_write_set_format_iso9660.c @@ -3650,7 +3650,7 @@ wb_consume(struct archive_write *a, size_t size) if (size > iso9660->wbuff_remaining || iso9660->wbuff_remaining == 0) { archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, - "Internal Programing error: iso9660:wb_consume()" + "Internal Programming error: iso9660:wb_consume()" " size=%jd, wbuff_remaining=%jd", (intmax_t)size, (intmax_t)iso9660->wbuff_remaining); return (ARCHIVE_FATAL); @@ -3671,7 +3671,7 @@ wb_set_offset(struct archive_write *a, int64_t off) if (iso9660->wbuff_type != WB_TO_TEMP) { archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, - "Internal Programing error: iso9660:wb_set_offset()"); + "Internal Programming error: iso9660:wb_set_offset()"); return (ARCHIVE_FATAL); } @@ -8128,7 +8128,7 @@ zisofs_write_to_temp(struct archive_write *a, const void *buff, size_t s) { (void)buff; /* UNUSED */ (void)s; /* UNUSED */ - archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, "Programing error"); + archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, "Programming error"); return (ARCHIVE_FATAL); } diff --git a/libarchive/archive_write_set_format_mtree.c b/libarchive/archive_write_set_format_mtree.c index 0f2431e6abe0..aa41e9acc22b 100644 --- a/libarchive/archive_write_set_format_mtree.c +++ b/libarchive/archive_write_set_format_mtree.c @@ -186,7 +186,7 @@ struct mtree_writer { #endif /* Keyword options */ int keys; -#define F_CKSUM 0x00000001 /* check sum */ +#define F_CKSUM 0x00000001 /* checksum */ #define F_DEV 0x00000002 /* device type */ #define F_DONE 0x00000004 /* directory done */ 
#define F_FLAGS 0x00000008 /* file flags */ @@ -371,7 +371,7 @@ mtree_quote(struct archive_string *s, const char *str) } /* - * Indent a line as mtree utility to be readable for people. + * Indent a line as the mtree utility does so it is readable for people. */ static void mtree_indent(struct mtree_writer *mtree) @@ -446,8 +446,8 @@ mtree_indent(struct mtree_writer *mtree) /* * Write /set keyword. - * Set most used value of uid,gid,mode and fflags, which are - * collected by attr_counter_set_collect() function. + * Set the most used value of uid, gid, mode and fflags, which are + * collected by the attr_counter_set_collect() function. */ static void write_global(struct mtree_writer *mtree) @@ -649,7 +649,7 @@ attr_counter_inc(struct attr_counter **top, struct attr_counter *ac, } /* - * Tabulate uid,gid,mode and fflags of a entry in order to be used for /set. + * Tabulate uid, gid, mode and fflags of a entry in order to be used for /set. */ static int attr_counter_set_collect(struct mtree_writer *mtree, struct mtree_entry *me) @@ -912,7 +912,7 @@ archive_write_mtree_header(struct archive_write *a, /* If the current file is a regular file, we have to * compute the sum of its content. - * Initialize a bunch of sum check context. */ + * Initialize a bunch of checksum context. */ if (mtree_entry->reg_info) sum_init(mtree); @@ -1265,7 +1265,7 @@ archive_write_mtree_free(struct archive_write *a) if (mtree == NULL) return (ARCHIVE_OK); - /* Make sure we dot not leave any entries. */ + /* Make sure we do not leave any entries. */ mtree_entry_register_free(mtree); archive_string_free(&mtree->cur_dirstr); archive_string_free(&mtree->ebuf); @@ -2024,7 +2024,7 @@ mtree_entry_tree_add(struct archive_write *a, struct mtree_entry **filep) if (file->parentdir.length == 0) { archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, - "Internal programing error " + "Internal programming error " "in generating canonical name for %s", file->pathname.s); return (ARCHIVE_FAILED); diff --git a/libarchive/archive_write_set_options.3 b/libarchive/archive_write_set_options.3 index aeb7a1848658..a9f70a664092 100644 --- a/libarchive/archive_write_set_options.3 +++ b/libarchive/archive_write_set_options.3 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd February 2, 2012 +.Dd July 27, 2019 .Dt ARCHIVE_WRITE_OPTIONS 3 .Os .Sh NAME @@ -70,7 +70,7 @@ specific write modules. .Fn archive_write_set_filter_option , .Fn archive_write_set_format_option .Xc -Specifies an option that will be passed to currently-registered +Specifies an option that will be passed to the currently-registered filters (including decompression filters) or format readers. .Pp If @@ -138,7 +138,7 @@ If either function returns .Cm ARCHIVE_FATAL will be returned immediately. -Otherwise, greater of the two values will be returned. +Otherwise, the greater of the two values will be returned. .\" .It Fn archive_write_set_options .Ar options @@ -203,22 +203,28 @@ These options are used to set standard ISO9660 metadata. .Bl -tag -compact -width indent .It Cm abstract-file Ns = Ns Ar filename The file with the specified name will be identified in the ISO9660 metadata -as holding the abstract for this volume. Default: none. +as holding the abstract for this volume. +Default: none. .It Cm application-id Ns = Ns Ar filename The file with the specified name will be identified in the ISO9660 metadata -as holding the application identifier for this volume. Default: none. +as holding the application identifier for this volume. +Default: none. 
.It Cm biblio-file Ns = Ns Ar filename The file with the specified name will be identified in the ISO9660 metadata -as holding the bibliography for this volume. Default: none. +as holding the bibliography for this volume. +Default: none. .It Cm copyright-file Ns = Ns Ar filename The file with the specified name will be identified in the ISO9660 metadata -as holding the copyright for this volume. Default: none. +as holding the copyright for this volume. +Default: none. .It Cm publisher Ns = Ns Ar filename The file with the specified name will be identified in the ISO9660 metadata -as holding the publisher information for this volume. Default: none. +as holding the publisher information for this volume. +Default: none. .It Cm volume-id Ns = Ns Ar string The specified string will be used as the Volume Identifier in the ISO9660 metadata. -It is limited to 32 bytes. Default: none. +It is limited to 32 bytes. +Default: none. .El .It Format iso9660 - boot support These options are used to make an ISO9660 image that can be directly @@ -266,7 +272,7 @@ If the boot image is exactly 1.2MB, 1.44MB, or 2.88MB, then the default is .Cm fd , otherwise the default is -.Cm no-emulation. +.Cm no-emulation . .El .It Format iso9660 - filename and size extensions Various extensions to the base ISO9660 format. @@ -290,7 +296,7 @@ This does not impact names stored in the Rockridge or Joliet extension area. Default: disabled. .It Cm allow-period If enabled, allows filenames to contain trailing period characters, in violation of the ISO9660 specification. -If disabled,trailing periods will be converted to underscore characters. +If disabled, trailing periods will be converted to underscore characters. This does not impact names stored in the Rockridge or Joliet extension area. Default: disabled. .It Cm allow-pvd-lowercase @@ -398,6 +404,27 @@ Specifies a filename that should not be compressed when using This option can be provided multiple times to suppress compression on many files. .El +.It Format 7zip +.Bl -tag -compact -width indent +.It Cm compression +The value is one of +.Dq store , +.Dq deflate , +.Dq bzip2 , +.Dq lzma1 , +.Dq lzma2 +or +.Dq ppmd +to indicate how the following entries should be compressed. +Note that this setting is ignored for directories, symbolic links, +and other special entries. +.It Cm compression-level +The value is interpreted as a decimal integer specifying the +compression level. +Values between 0 and 9 are supported. +The interpretation of the compression level depends on the chosen +compression method. +.El .It Format zip .Bl -tag -compact -width indent .It Cm compression @@ -408,6 +435,15 @@ or to indicate how the following entries should be compressed. Note that this setting is ignored for directories, symbolic links, and other special entries. +.It Cm compression-level +The value is interpreted as a decimal integer specifying the +compression level. +Values between 0 and 9 are supported. +A compression level of 0 switches the compression method to +.Dq store , +other values will enable +.Dq deflate +compression with the given level. .It Cm experimental This boolean option enables or disables experimental Zip features that may not be compatible with other Zip implementations. @@ -465,9 +501,9 @@ functions. 
.\" .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_read_set_options 3 , -.Xr archive_write 3 +.Xr archive_write 3 , +.Xr libarchive 3 .Sh HISTORY The .Nm libarchive diff --git a/libarchive/archive_write_set_passphrase.3 b/libarchive/archive_write_set_passphrase.3 index 2585595e331a..2db77034c76e 100644 --- a/libarchive/archive_write_set_passphrase.3 +++ b/libarchive/archive_write_set_passphrase.3 @@ -49,7 +49,7 @@ Streaming Archive Library (libarchive, -larchive) .Sh DESCRIPTION .Bl -tag -width indent .It Fn archive_write_set_passphrase -Set a passphrase for writing an encryption archive. +Set a passphrase for writing an encrypted archive. If .Ar passphrase is @@ -59,16 +59,16 @@ or empty, this function will do nothing and will be returned. Otherwise, .Cm ARCHIVE_OK -will be returned. +will be returned. .It Fn archive_write_set_passphrase_callback -Register callback function that will be invoked to get a passphrase -for encrption if the passphrase was not set by the +Register a callback function that will be invoked to get a passphrase +for encryption if the passphrase was not set by the .Fn archive_write_set_passphrase function. .El .\" .Sh ERRORS .Sh SEE ALSO .Xr tar 1 , -.Xr libarchive 3 , .Xr archive_write 3 , -.Xr archive_write_set_options 3 +.Xr archive_write_set_options 3 , +.Xr libarchive 3 diff --git a/libarchive/libarchive_changes.3 b/libarchive/libarchive_changes.3 index adc87febd71d..6bf8db038c73 100644 --- a/libarchive/libarchive_changes.3 +++ b/libarchive/libarchive_changes.3 @@ -35,7 +35,6 @@ This page describes user-visible changes in libarchive3, and lists public functions and other symbols changed, deprecated or removed in libarchive3, along with their replacements if any. -.Pp .\" .Ss Multiple Filters .\" @@ -330,13 +329,13 @@ or .Li 10240 .El .Sh SEE ALSO -.Xr libarchive 3 , .Xr archive_read 3 , .Xr archive_read_filter 3 , .Xr archive_read_format 3 , .Xr archive_read_set_options 3 , +.Xr archive_util 3 , .Xr archive_write 3 , .Xr archive_write_filter 3 , .Xr archive_write_format 3 , .Xr archive_write_set_options 3 , -.Xr archive_util 3 +.Xr libarchive 3 diff --git a/libarchive/libarchive_internals.3 b/libarchive/libarchive_internals.3 index 8275d66e68f1..d672f3e8a64d 100644 --- a/libarchive/libarchive_internals.3 +++ b/libarchive/libarchive_internals.3 @@ -350,8 +350,8 @@ as a dedicated ZIP program. .Xr archive_entry 3 , .Xr archive_read 3 , .Xr archive_write 3 , -.Xr archive_write_disk 3 -.Xr libarchive 3 , +.Xr archive_write_disk 3 , +.Xr libarchive 3 .Sh HISTORY The .Nm libarchive diff --git a/libarchive/mtree.5 b/libarchive/mtree.5 index e607e4a81977..8147796f3100 100644 --- a/libarchive/mtree.5 +++ b/libarchive/mtree.5 @@ -133,7 +133,6 @@ or .Sy char file types. The value must be one of the following forms: -.Pp .Bl -tag -width 4n .It Ar format , Ns Ar major , Ns Ar minor Ns Bo , Ns Ar subunit Bc A device with @@ -165,8 +164,8 @@ are recognized: .Sy solaris , .Sy sunos , .Sy svr3 , -.Sy svr4 , -and +.Sy svr4 , +and .Sy ultrix . .Pp See @@ -288,12 +287,10 @@ The file owner as a numeric value. .It Cm uname The file owner as a symbolic name. .El -.Pp .Sh SEE ALSO .Xr cksum 1 , .Xr find 1 , .Xr mtree 8 -.Sh BUGS .Sh HISTORY The .Nm diff --git a/libarchive/tar.5 b/libarchive/tar.5 index 30b837dc4133..34ad4f79315e 100644 --- a/libarchive/tar.5 +++ b/libarchive/tar.5 @@ -441,7 +441,7 @@ archives to store files much larger than the historic 8GB limit. Vendor-specific attributes used by Joerg Schilling's .Nm star implementation. 
-.It Cm SCHILY.acl.access , Cm SCHILY.acl.default, Cm SCHILY.acl.ace +.It Cm SCHILY.acl.access , Cm SCHILY.acl.default , Cm SCHILY.acl.ace Stores the access, default and NFSv4 ACLs as textual strings in a format that is an extension of the format specified by POSIX.1e draft 17. In particular, each user or group access specification can include @@ -456,7 +456,7 @@ The file flags. .It Cm SCHILY.realsize The full size of the file on disk. XXX explain? XXX -.It Cm SCHILY.dev, Cm SCHILY.ino , Cm SCHILY.nlinks +.It Cm SCHILY.dev , Cm SCHILY.ino , Cm SCHILY.nlinks The device number, inode number, and link count for the entry. In particular, note that a pax interchange format archive using Joerg Schilling's @@ -473,7 +473,7 @@ The time when the file was created. .Dq ctime attribute, which refers to the time when the file metadata was last changed.) -.It Cm LIBARCHIVE.xattr. Ns Ar namespace Ns . Ns Ar key +.It Cm LIBARCHIVE.xattr . Ns Ar namespace . Ns Ar key Libarchive stores POSIX.1e-style extended attributes using keys of this form. The @@ -890,7 +890,8 @@ GNU tar long pathname for the following header. .It Cm M GNU tar multivolume marker, indicating the file is a continuation of a file from the previous volume. .It Cm N -GNU tar long filename support. Deprecated. +GNU tar long filename support. +Deprecated. .It Cm S GNU tar sparse regular file. .It Cm V diff --git a/libarchive/test/test_archive_write_add_filter_by_name.c b/libarchive/test/test_archive_write_add_filter_by_name.c index d962af92311b..ba8c1d0472f6 100644 --- a/libarchive/test/test_archive_write_add_filter_by_name.c +++ b/libarchive/test/test_archive_write_add_filter_by_name.c @@ -48,7 +48,7 @@ test_filter_by_name(const char *filter_name, int filter_code, r = archive_write_add_filter_by_name(a, filter_name); if (r == ARCHIVE_WARN) { if (!can_filter_prog()) { - skipping("%s filter not suported on this platform", + skipping("%s filter not supported on this platform", filter_name); assertEqualInt(ARCHIVE_OK, archive_write_free(a)); free(buff); @@ -59,7 +59,7 @@ test_filter_by_name(const char *filter_name, int filter_code, "lzma compression not supported on this platform") == 0 || strcmp(archive_error_string(a), "xz compression not supported on this platform") == 0)) { - skipping("%s filter not suported on this platform", filter_name); + skipping("%s filter not supported on this platform", filter_name); assertEqualInt(ARCHIVE_OK, archive_write_free(a)); free(buff); return; diff --git a/libarchive/test/test_archive_write_set_format_filter_by_ext.c b/libarchive/test/test_archive_write_set_format_filter_by_ext.c index c073505f7c97..4fe18e18c2d1 100644 --- a/libarchive/test/test_archive_write_set_format_filter_by_ext.c +++ b/libarchive/test/test_archive_write_set_format_filter_by_ext.c @@ -61,7 +61,7 @@ test_format_filter_by_ext(const char *output_file, strcmp(archive_error_string(a), "xz compression not supported on this platform") == 0)) { const char *filter_name = archive_filter_name(a, 0); - skipping("%s filter not suported on this platform", filter_name); + skipping("%s filter not supported on this platform", filter_name); assertEqualInt(ARCHIVE_OK, archive_write_free(a)); free(buff); return; diff --git a/libarchive/test/test_read_format_raw.c b/libarchive/test/test_read_format_raw.c index 0dac8bfbab4a..3961723b48a1 100644 --- a/libarchive/test/test_read_format_raw.c +++ b/libarchive/test/test_read_format_raw.c @@ -36,7 +36,9 @@ DEFINE_TEST(test_read_format_raw) const char *reffile1 = "test_read_format_raw.data"; const char *reffile2 = 
"test_read_format_raw.data.Z"; const char *reffile3 = "test_read_format_raw.bufr"; +#ifdef HAVE_ZLIB_H const char *reffile4 = "test_read_format_raw.data.gz"; +#endif /* First, try pulling data out of an uninterpretable file. */ extract_reference_file(reffile1); @@ -119,6 +121,7 @@ DEFINE_TEST(test_read_format_raw) assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a)); assertEqualInt(ARCHIVE_OK, archive_read_free(a)); +#ifdef HAVE_ZLIB_H /* Fourth, try with gzip which has metadata. */ extract_reference_file(reffile4); assert((a = archive_read_new()) != NULL); @@ -144,4 +147,5 @@ DEFINE_TEST(test_read_format_raw) assertEqualIntA(a, ARCHIVE_EOF, archive_read_next_header(a, &ae)); assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a)); assertEqualInt(ARCHIVE_OK, archive_read_free(a)); +#endif } diff --git a/libarchive/test/test_read_format_zip.c b/libarchive/test/test_read_format_zip.c index b6e957a96cec..9afbfb6c5866 100644 --- a/libarchive/test/test_read_format_zip.c +++ b/libarchive/test/test_read_format_zip.c @@ -139,7 +139,7 @@ verify_basic(struct archive *a, int seek_checks) } else { assertEqualInt(ARCHIVE_FAILED, archive_read_data(a, buff, 19)); assertEqualString(archive_error_string(a), - "Unsupported ZIP compression method (deflation)"); + "Unsupported ZIP compression method (8: deflation)"); assert(archive_errno(a) != 0); } @@ -162,7 +162,7 @@ verify_basic(struct archive *a, int seek_checks) } else { assertEqualInt(ARCHIVE_FAILED, archive_read_data(a, buff, 19)); assertEqualString(archive_error_string(a), - "Unsupported ZIP compression method (deflation)"); + "Unsupported ZIP compression method (8: deflation)"); assert(archive_errno(a) != 0); } assertEqualInt(ARCHIVE_EOF, archive_read_next_header(a, &ae)); @@ -231,7 +231,7 @@ verify_info_zip_ux(struct archive *a, int seek_checks) } else { assertEqualInt(ARCHIVE_FAILED, archive_read_data(a, buff, 19)); assertEqualString(archive_error_string(a), - "Unsupported ZIP compression method (deflation)"); + "Unsupported ZIP compression method (8: deflation)"); assert(archive_errno(a) != 0); } assertEqualIntA(a, ARCHIVE_EOF, archive_read_next_header(a, &ae)); @@ -302,7 +302,7 @@ verify_extract_length_at_end(struct archive *a, int seek_checks) } else { assertEqualIntA(a, ARCHIVE_FAILED, archive_read_extract(a, ae, 0)); assertEqualString(archive_error_string(a), - "Unsupported ZIP compression method (deflation)"); + "Unsupported ZIP compression method (8: deflation)"); assert(archive_errno(a) != 0); } diff --git a/libarchive/test/test_read_format_zip_traditional_encryption_data.c b/libarchive/test/test_read_format_zip_traditional_encryption_data.c index 305261567ec8..20e55bbc6948 100644 --- a/libarchive/test/test_read_format_zip_traditional_encryption_data.c +++ b/libarchive/test/test_read_format_zip_traditional_encryption_data.c @@ -130,7 +130,7 @@ DEFINE_TEST(test_read_format_zip_traditional_encryption_data) assertEqualInt(ARCHIVE_FAILED, archive_read_data(a, buff, sizeof(buff))); assertEqualString(archive_error_string(a), - "Unsupported ZIP compression method (deflation)"); + "Unsupported ZIP compression method (8: deflation)"); assert(archive_errno(a) != 0); } @@ -148,7 +148,7 @@ DEFINE_TEST(test_read_format_zip_traditional_encryption_data) assertEqualInt(ARCHIVE_FAILED, archive_read_data(a, buff, sizeof(buff))); assertEqualString(archive_error_string(a), - "Unsupported ZIP compression method (deflation)"); + "Unsupported ZIP compression method (8: deflation)"); assert(archive_errno(a) != 0); } diff --git 
a/libarchive/test/test_write_filter_zstd.c b/libarchive/test/test_write_filter_zstd.c index da3c806671a4..ba1b6bfe716b 100644 --- a/libarchive/test/test_write_filter_zstd.c +++ b/libarchive/test/test_write_filter_zstd.c @@ -34,7 +34,7 @@ DEFINE_TEST(test_write_filter_zstd) char *buff, *data; size_t buffsize, datasize; char path[16]; - size_t used1, used2; + size_t used1, used2, used3; int i, r; buffsize = 2000000; @@ -125,7 +125,7 @@ DEFINE_TEST(test_write_filter_zstd) assertEqualIntA(a, ARCHIVE_OK, archive_write_set_filter_option(a, NULL, "compression-level", "9")); assertEqualIntA(a, ARCHIVE_OK, - archive_write_set_filter_option(a, NULL, "compression-level", "6")); + archive_write_set_filter_option(a, NULL, "compression-level", "7")); assertEqualIntA(a, ARCHIVE_OK, archive_write_open_memory(a, buff, buffsize, &used2)); for (i = 0; i < 100; i++) { sprintf(path, "file%03d", i); @@ -140,10 +140,6 @@ DEFINE_TEST(test_write_filter_zstd) assertEqualIntA(a, ARCHIVE_OK, archive_write_close(a)); assertEqualInt(ARCHIVE_OK, archive_write_free(a)); - failure("compression-level=6 wrote %d bytes, default wrote %d bytes", - (int)used2, (int)used1); - assert(used2 < used1); - assert((a = archive_read_new()) != NULL); assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a)); r = archive_read_support_filter_zstd(a); @@ -167,6 +163,64 @@ DEFINE_TEST(test_write_filter_zstd) } assertEqualInt(ARCHIVE_OK, archive_read_free(a)); + /* + * One more time at level 1 + */ + assert((a = archive_write_new()) != NULL); + assertEqualIntA(a, ARCHIVE_OK, archive_write_set_format_ustar(a)); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_bytes_per_block(a, 10)); + assertEqualIntA(a, ARCHIVE_OK, archive_write_add_filter_zstd(a)); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "compression-level", "1")); + assertEqualIntA(a, ARCHIVE_OK, archive_write_open_memory(a, buff, buffsize, &used3)); + assert((ae = archive_entry_new()) != NULL); + archive_entry_set_filetype(ae, AE_IFREG); + archive_entry_set_size(ae, datasize); + for (i = 0; i < 100; i++) { + sprintf(path, "file%03d", i); + archive_entry_copy_pathname(ae, path); + assertEqualIntA(a, ARCHIVE_OK, archive_write_header(a, ae)); + assertA(datasize == (size_t)archive_write_data(a, data, datasize)); + } + archive_entry_free(ae); + assertEqualIntA(a, ARCHIVE_OK, archive_write_close(a)); + assertEqualInt(ARCHIVE_OK, archive_write_free(a)); + + assert((a = archive_read_new()) != NULL); + assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a)); + r = archive_read_support_filter_zstd(a); + if (r == ARCHIVE_WARN) { + skipping("zstd reading not fully supported on this platform"); + } else { + assertEqualIntA(a, ARCHIVE_OK, + archive_read_support_filter_all(a)); + assertEqualIntA(a, ARCHIVE_OK, + archive_read_open_memory(a, buff, used3)); + for (i = 0; i < 100; i++) { + sprintf(path, "file%03d", i); + failure("Trying to read %s", path); + if (!assertEqualIntA(a, ARCHIVE_OK, + archive_read_next_header(a, &ae))) + break; + assertEqualString(path, archive_entry_pathname(ae)); + assertEqualInt((int)datasize, archive_entry_size(ae)); + } + assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a)); + } + assertEqualInt(ARCHIVE_OK, archive_read_free(a)); + + /* + * Check output sizes for various compression levels, expectation + * is that archive size for level=7 < default < level=1 + */ + failure("compression-level=7 wrote %d bytes, default wrote %d bytes", + (int)used2, (int)used1); + assert(used2 < used1); + failure("compression-level=1 
wrote %d bytes, default wrote %d bytes", + (int)used3, (int)used1); + assert(used1 < used3); + /* * Test various premature shutdown scenarios to make sure we * don't crash or leak memory. diff --git a/tar/bsdtar.1 b/tar/bsdtar.1 index 6d8d6d3d617a..04b56553ce02 100644 --- a/tar/bsdtar.1 +++ b/tar/bsdtar.1 @@ -167,12 +167,14 @@ if it is unknown suffix or no suffix, creates a new archive with restricted pax format and bzip2 compression. .It Fl Fl acls (c, r, u, x modes only) -Archive or extract POSIX.1e or NFSv4 ACLs. This is the reverse of +Archive or extract POSIX.1e or NFSv4 ACLs. +This is the reverse of .Fl Fl no-acls and the default behavior in c, r, and u modes (except on Mac OS X) or if .Nm -is run in x mode as root. On Mac OS X this option translates extended ACLs -to NFSv4 ACLs. To store extended ACLs the +is run in x mode as root. +On Mac OS X this option translates extended ACLs to NFSv4 ACLs. +To store extended ACLs the .Fl Fl mac-metadata option is preferred. .It Fl B , Fl Fl read-full-blocks @@ -390,10 +392,12 @@ Do not extract modification time. By default, the modification time is set to the time stored in the archive. .It Fl Fl mac-metadata (c, r, u and x mode only) -Mac OS X specific. Archive or extract extended ACLs and extended file +Mac OS X specific. +Archive or extract extended ACLs and extended file attributes using .Xr copyfile 3 -in AppleDouble format. This is the reverse of +in AppleDouble format. +This is the reverse of .Fl Fl no-mac-metadata . and the default behavior in c, r, and u modes or if .Nm @@ -439,24 +443,28 @@ option to .Xr find 1 . .It Fl Fl no-acls (c, r, u, x modes only) -Do not archive or extract POSIX.1e or NFSv4 ACLs. This is the reverse of +Do not archive or extract POSIX.1e or NFSv4 ACLs. +This is the reverse of .Fl Fl acls and the default behavior if .Nm is run as non-root in x mode (on Mac OS X as any user in c, r, u and x modes). .It Fl Fl no-fflags (c, r, u, x modes only) -Do not archive or extract file attributes or file flags. This is the reverse of +Do not archive or extract file attributes or file flags. +This is the reverse of .Fl Fl fflags and the default behavior if .Nm is run as non-root in x mode. .It Fl Fl no-mac-metadata (x mode only) -Mac OS X specific. Do not archive or extract ACLs and extended file attributes +Mac OS X specific. +Do not archive or extract ACLs and extended file attributes using .Xr copyfile 3 -in AppleDouble format. This is the reverse of +in AppleDouble format. +This is the reverse of .Fl Fl mac-metadata . and the default behavior if .Nm @@ -480,7 +488,8 @@ and the default behavior if is run as non-root. .It Fl Fl no-xattrs (c, r, u, x modes only) -Do not archive or extract extended file attributes. This is the reverse of +Do not archive or extract extended file attributes. +This is the reverse of .Fl Fl xattrs and the default behavior if .Nm @@ -577,7 +586,8 @@ to disable. .It Cm gzip:compression-level A decimal integer from 1 to 9 specifying the gzip compression level. .It Cm gzip:timestamp -Store timestamp. This is enabled by default, use +Store timestamp. +This is enabled by default, use .Cm !timestamp or .Cm gzip:!timestamp @@ -593,7 +603,8 @@ A decimal integer from 1 to 9 specifying the lrzip compression level. .It Cm lz4:compression-level A decimal integer from 1 to 9 specifying the lzop compression level. .It Cm lz4:stream-checksum -Enable stream checksum. This is by default, use +Enable stream checksum. +This is by default, use .Cm lz4:!stream-checksum to disable. 
.It Cm lz4:block-checksum @@ -646,9 +657,10 @@ Supported values are zipcrypt (traditional zip encryption), aes128 (WinZip AES-128 encryption) and aes256 (WinZip AES-256 encryption). .It Cm read_concatenated_archives Ignore zeroed blocks in the archive, which occurs when multiple tar archives -have been concatenated together. Without this option, only the contents of -the first concatenated archive would be read. This option is comparable to -the +have been concatenated together. +Without this option, only the contents of +the first concatenated archive would be read. +This option is comparable to the .Fl i , Fl Fl ignore-zeros option of GNU tar. .El @@ -670,11 +682,13 @@ This option suppresses these behaviors. Preserve file permissions. Attempt to restore the full permissions, including file modes, file attributes or file flags, extended file attributes and ACLs, if available, for each item -extracted from the archive. This is the reverse of +extracted from the archive. +This is the reverse of .Fl Fl no-same-permissions and the default if .Nm -is being run as root. It can be partially overridden by also specifying +is being run as root. +It can be partially overridden by also specifying .Fl Fl no-acls , .Fl Fl no-fflags , .Fl Fl no-mac-metadata @@ -845,7 +859,8 @@ See for more information about the handling of exclusions. .It Fl Fl xattrs (c, r, u, x modes only) -Archive or extract extended file attributes. This is the reverse of +Archive or extract extended file attributes. +This is the reverse of .Fl Fl no-xattrs and the default behavior in c, r, and u modes or if .Nm @@ -937,9 +952,9 @@ To examine the contents of an ISO 9660 cdrom image: To move file hierarchies, invoke .Nm as -.Dl Nm Fl cf Pa - Fl C Pa srcdir\ . | Nm Fl xpf Pa - Fl C Pa destdir +.Dl Nm Fl cf Pa - Fl C Pa srcdir \&. | Nm Fl xpf Pa - Fl C Pa destdir or more traditionally -.Dl cd srcdir \&; Nm Fl cf Pa -\ . | ( cd destdir \&; Nm Fl xpf Pa - ) +.Dl cd srcdir \&; Nm Fl cf Pa - \&. 
| ( cd destdir \&; Nm Fl xpf Pa - ) .Pp In create mode, the list of files and directories to be archived can also include directory change instructions of the form @@ -967,7 +982,6 @@ An input file in .Xr mtree 5 format can be used to create an output archive with arbitrary ownership, permissions, or names that differ from existing data on disk: -.Pp .Bd -literal -offset indent $ cat input.mtree #mtree diff --git a/tar/test/test_option_n.c b/tar/test/test_option_n.c index e474ac1d5754..f36658ef1656 100644 --- a/tar/test/test_option_n.c +++ b/tar/test/test_option_n.c @@ -120,7 +120,7 @@ DEFINE_TEST(test_option_n) "d2/file4\n", "test5.out"); - /* Test 6: -t without -n and non-existant directory selected */ + /* Test 6: -t without -n and non-existent directory selected */ assertEqualInt(0, systemf("%s -tf partial-archive.tar d2 >test6.out 2>test6.err", testprog)); @@ -128,7 +128,7 @@ DEFINE_TEST(test_option_n) assertTextFileContents("d2/file4\n", "test6.out"); - /* Test 7: -t with -n and non-existant directory selected */ + /* Test 7: -t with -n and non-existent directory selected */ status = systemf("%s -tnf partial-archive.tar d2 " ">test7.out 2>test7.err", testprog); assert(status); diff --git a/tar/test/test_option_xattrs.c b/tar/test/test_option_xattrs.c index bce8a94e4dd2..79dfff528303 100644 --- a/tar/test/test_option_xattrs.c +++ b/tar/test/test_option_xattrs.c @@ -28,7 +28,7 @@ __FBSDID("$FreeBSD$"); DEFINE_TEST(test_option_xattrs) { #if !ARCHIVE_XATTR_SUPPORT - skipping("Extended atributes are not supported on this platform"); + skipping("Extended attributes are not supported on this platform"); #else /* ARCHIVE_XATTR_SUPPORT */ const char *testattr = "user.libarchive.test"; diff --git a/test_utils/test_main.c b/test_utils/test_main.c index 59c835ba81fa..1b9af9a9c37b 100644 --- a/test_utils/test_main.c +++ b/test_utils/test_main.c @@ -298,7 +298,7 @@ my_CreateSymbolicLinkA(const char *linkname, const char *target, ret = (*f)(src, tgt, tmpflags); /* * Prior to Windows 10 the SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE - * is not undestood + * is not understood */ if (!ret) ret = (*f)(src, tgt, flags); From f7b69dd9868ea30ef64f42b33a5f6c9966338b56 Mon Sep 17 00:00:00 2001 From: Conrad Meyer Date: Thu, 26 Sep 2019 01:50:26 +0000 Subject: [PATCH 090/106] amd64: Expose vm.pmap.large_map_pml4_entries as a sysctl node It's nice to have sysctl nodes for tunables. Sponsored by: Dell EMC Isilon --- sys/amd64/amd64/pmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 1424dee34f5e..e7f875c325d1 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1965,6 +1965,11 @@ pmap_init(void) } } +SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, + CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, + "Maximum number of PML4 entries for use by large map (tunable). " + "Each entry corresponds to 512GB of address space."); + static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2MB page mapping counters"); From 407c48f0606011d6111e132e98116a52df05ebb1 Mon Sep 17 00:00:00 2001 From: Conrad Meyer Date: Thu, 26 Sep 2019 01:51:55 +0000 Subject: [PATCH 091/106] amd64 pmap: Clarify largemap bootverbose message units A PML4 covers 512 gigabytes, not gigabits. Use the typical B suffix for bytes. No functional change. 
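As an aside (this snippet is not part of either patch), the new node added above can be read from userland like any other sysctl. A minimal sketch, assuming the standard sysctlbyname(3) interface and using the 512 GB-per-entry figure stated in the sysctl description:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            unsigned int ents;
            size_t len = sizeof(ents);

            /* Read the tunable's value through the new read-only sysctl node. */
            if (sysctlbyname("vm.pmap.large_map_pml4_entries", &ents, &len,
                NULL, 0) == -1) {
                    perror("sysctlbyname");
                    return (1);
            }
            printf("large map: %u PML4 slots (%llu GB)\n", ents,
                (unsigned long long)ents * 512);
            return (0);
    }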
Sponsored by: Dell EMC Isilon --- sys/amd64/amd64/pmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index e7f875c325d1..a39f7a5415cd 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1947,7 +1947,7 @@ pmap_init(void) if (lm_ents > LMEPML4I - LMSPML4I + 1) lm_ents = LMEPML4I - LMSPML4I + 1; if (bootverbose) - printf("pmap: large map %u PML4 slots (%lu Gb)\n", + printf("pmap: large map %u PML4 slots (%lu GB)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, From ee7201a7256f5a5d817e2c67a4dc3a98b9700958 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Thu, 26 Sep 2019 02:54:45 +0000 Subject: [PATCH 092/106] Replace all mtx_assert() calls for n_mtx and ncl_iod_mutex with macros. To be consistent with replacing the mtx_lock()/mtx_unlock() calls on the NFS node mutex (n_mtx) and ncl_iod_mutex, this patch replaces all mtx_assert() calls on these mutexes with macros as well. This will simplify changing these locks to sx locks in a future commit. However, this change may be delayed indefinitely, since it appears there is a deadlock when vnode_pager_setsize() is called to shrink the size and the NFS node lock is held. There is no semantic change as a result of this commit. Suggested by: kib MFC after: 1 week --- sys/fs/nfs/nfsport.h | 2 ++ sys/fs/nfsclient/nfs_clnfsiod.c | 4 ++-- sys/fs/nfsclient/nfs_clnode.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sys/fs/nfs/nfsport.h b/sys/fs/nfs/nfsport.h index 49577dab0eb0..6987cc842e78 100644 --- a/sys/fs/nfs/nfsport.h +++ b/sys/fs/nfs/nfsport.h @@ -688,10 +688,12 @@ void nfsrvd_rcv(struct socket *, void *, int); #define NFSUNLOCKV4ROOTMUTEX() mtx_unlock(&nfs_v4root_mutex) #define NFSLOCKNODE(n) mtx_lock(&((n)->n_mtx)) #define NFSUNLOCKNODE(n) mtx_unlock(&((n)->n_mtx)) +#define NFSASSERTNODE(n) mtx_assert(&((n)->n_mtx), MA_OWNED) #define NFSLOCKMNT(m) mtx_lock(&((m)->nm_mtx)) #define NFSUNLOCKMNT(m) mtx_unlock(&((m)->nm_mtx)) #define NFSLOCKIOD() mtx_lock(&ncl_iod_mutex) #define NFSUNLOCKIOD() mtx_unlock(&ncl_iod_mutex) +#define NFSASSERTIOD() mtx_assert(&ncl_iod_mutex, MA_OWNED) #define NFSLOCKREQUEST(r) mtx_lock(&((r)->r_mtx)) #define NFSUNLOCKREQUEST(r) mtx_unlock(&((r)->r_mtx)) #define NFSLOCKSOCKREQ(r) mtx_lock(&((r)->nr_mtx)) diff --git a/sys/fs/nfsclient/nfs_clnfsiod.c b/sys/fs/nfsclient/nfs_clnfsiod.c index d43b960eca75..3f0fb777e37b 100644 --- a/sys/fs/nfsclient/nfs_clnfsiod.c +++ b/sys/fs/nfsclient/nfs_clnfsiod.c @@ -169,7 +169,7 @@ nfs_nfsiodnew_sync(void) { int error, i; - mtx_assert(&ncl_iod_mutex, MA_OWNED); + NFSASSERTIOD(); for (i = 0; i < ncl_iodmax; i++) { if (nfs_asyncdaemon[i] == 0) { nfs_asyncdaemon[i] = 1; @@ -206,7 +206,7 @@ void ncl_nfsiodnew(void) { - mtx_assert(&ncl_iod_mutex, MA_OWNED); + NFSASSERTIOD(); taskqueue_enqueue(taskqueue_thread, &ncl_nfsiodnew_task); } diff --git a/sys/fs/nfsclient/nfs_clnode.c b/sys/fs/nfsclient/nfs_clnode.c index e58797ac8be0..c9ea0acea06d 100644 --- a/sys/fs/nfsclient/nfs_clnode.c +++ b/sys/fs/nfsclient/nfs_clnode.c @@ -212,7 +212,7 @@ ncl_releasesillyrename(struct vnode *vp, struct thread *td) ASSERT_VOP_ELOCKED(vp, "releasesillyrename"); np = VTONFS(vp); - mtx_assert(&np->n_mtx, MA_OWNED); + NFSASSERTNODE(np); if (vp->v_type != VDIR) { sp = np->n_sillyrename; np->n_sillyrename = NULL; From d096bd7911d5c17ccfb3f8452f56b90611c38777 Mon Sep 17 00:00:00 2001 From: Cy Schubert Date: Thu, 26 Sep 2019 
03:09:42 +0000 Subject: [PATCH 093/106] ipf mistakenly regards UDP packets with a checksum of 0xffff as bad. Obtained from: NetBSD fil.c r1.30, NetBSD PR/54443 MFC after: 3 days --- sys/contrib/ipfilter/netinet/fil.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sys/contrib/ipfilter/netinet/fil.c b/sys/contrib/ipfilter/netinet/fil.c index edc4caf9697c..827b71dae18c 100644 --- a/sys/contrib/ipfilter/netinet/fil.c +++ b/sys/contrib/ipfilter/netinet/fil.c @@ -6730,8 +6730,11 @@ ipf_checkl4sum(fin) /*NOTREACHED*/ } - if (csump != NULL) + if (csump != NULL) { hdrsum = *csump; + if (fin->fin_p == IPPROTO_UDP && hdrsum == 0xffff) + hdrsum = 0x0000; + } if (dosum) { sum = fr_cksum(fin, fin->fin_ip, fin->fin_p, fin->fin_dp); From 4fcb8706125692b62e3e11aeb0a0234ab62703b5 Mon Sep 17 00:00:00 2001 From: Cy Schubert Date: Thu, 26 Sep 2019 03:09:45 +0000 Subject: [PATCH 094/106] Teach the ippool parser about address families. This is a precursor to implementing IPv6 support within ippool which requires reworking radix_ipf.c. MFC after: 1 month --- contrib/ipfilter/tools/ippool_y.y | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/contrib/ipfilter/tools/ippool_y.y b/contrib/ipfilter/tools/ippool_y.y index 2a9d8ee3b079..02ae5c22141a 100644 --- a/contrib/ipfilter/tools/ippool_y.y +++ b/contrib/ipfilter/tools/ippool_y.y @@ -309,11 +309,27 @@ range: addrmask { $$ = calloc(1, sizeof(*$$)); $$->ipn_info = 0; $$->ipn_addr = $1[0]; $$->ipn_mask = $1[1]; +#ifdef USE_INET6 + if (use_inet6) + $$->ipn_addr.adf_family = + AF_INET6; + else +#endif + $$->ipn_addr.adf_family = + AF_INET; } | '!' addrmask { $$ = calloc(1, sizeof(*$$)); $$->ipn_info = 1; $$->ipn_addr = $2[0]; $$->ipn_mask = $2[1]; +#ifdef USE_INET6 + if (use_inet6) + $$->ipn_addr.adf_family = + AF_INET6; + else +#endif + $$->ipn_addr.adf_family = + AF_INET; } | YY_STR { $$ = add_poolhosts($1); free($1); From 29f7096df916cebe29fc5459114254da5fbbafce Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Thu, 26 Sep 2019 07:14:54 +0000 Subject: [PATCH 095/106] vt: use proper return value check with TUNABLE_INT_FETCH TUNABLE_INT_FETCH is a macro around getenv_int(), so we get a return value of 0 on failure or 1 on success and can use it to decide which background color to use. --- sys/dev/vt/hw/fb/vt_fb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sys/dev/vt/hw/fb/vt_fb.c b/sys/dev/vt/hw/fb/vt_fb.c index 7bd717b77031..7691a3879913 100644 --- a/sys/dev/vt/hw/fb/vt_fb.c +++ b/sys/dev/vt/hw/fb/vt_fb.c @@ -480,8 +480,7 @@ vt_fb_init(struct vt_device *vd) } c = TC_BLACK; - TUNABLE_INT_FETCH("teken.bg_color", &bg); - if (bg != -1) { + if (TUNABLE_INT_FETCH("teken.bg_color", &bg) != 0) { if (bg == TC_WHITE) bg |= TC_LIGHT; c = bg; From 11fc80a09824d8e3acce9775aff1e3b7c8a1f305 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Thu, 26 Sep 2019 07:19:26 +0000 Subject: [PATCH 096/106] kernel terminal should initialize fg and bg variables before calling TUNABLE_INT_FETCH We have two ways to check whether a kenv variable exists - either we check the return value of TUNABLE_INT_FETCH, or we pre-initialize the variable and check whether its value changed. In terminal_init() it is more convenient to use pre-initialized variables. The problem was revealed by an older loader.efi, which did not set the teken.* variables.
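To make the two idioms described in these last two messages concrete, here is a minimal kernel-side sketch (illustrative only, not part of either patch); the tunable names come from the patches, and the only assumption is the 0/1 return convention stated above:

    int fg, bg;

    /* Idiom 1 (as in vt_fb_init): trust the return value. */
    if (TUNABLE_INT_FETCH("teken.bg_color", &bg) != 0) {
            /* The loader provided teken.bg_color; bg is valid. */
    }

    /* Idiom 2 (as in terminal_init): pre-initialize, then see if it changed. */
    fg = bg = -1;
    TUNABLE_INT_FETCH("teken.fg_color", &fg);
    TUNABLE_INT_FETCH("teken.bg_color", &bg);
    if (fg != -1 || bg != -1) {
            /* At least one teken.* variable was set by the loader. */
    }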
Reported by: tuexen --- sys/kern/subr_terminal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/kern/subr_terminal.c b/sys/kern/subr_terminal.c index be52d0618dfc..4d7665cf3377 100644 --- a/sys/kern/subr_terminal.c +++ b/sys/kern/subr_terminal.c @@ -175,6 +175,7 @@ terminal_init(struct terminal *tm) teken_init(&tm->tm_emulator, &terminal_drawmethods, tm); + fg = bg = -1; TUNABLE_INT_FETCH("teken.fg_color", &fg); TUNABLE_INT_FETCH("teken.bg_color", &bg); From 20bd59416dcacbd2b776fe49dfa193900f303287 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Thu, 26 Sep 2019 13:27:25 +0000 Subject: [PATCH 097/106] bspatch: add integer overflow checks Introduce a new add_off_t static function that exits with an error message if there's an overflow, otherwise returns their sum. Use this when adding values obtained from the input patch. Reviewed by: delphij, allanjude (earlier) MFC after: 1 week Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D7897 --- usr.bin/bsdiff/bspatch/bspatch.c | 35 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/usr.bin/bsdiff/bspatch/bspatch.c b/usr.bin/bsdiff/bspatch/bspatch.c index 688962576f4c..6c9656486394 100644 --- a/usr.bin/bsdiff/bspatch/bspatch.c +++ b/usr.bin/bsdiff/bspatch/bspatch.c @@ -61,6 +61,23 @@ exit_cleanup(void) warn("unlinkat"); } +static inline off_t +add_off_t(off_t a, off_t b) +{ + off_t result; + +#if __GNUC__ >= 5 || \ + (defined(__has_builtin) && __has_builtin(__builtin_add_overflow)) + if (__builtin_add_overflow(a, b, &result)) + errx(1, "Corrupt patch"); +#else + if ((b > 0 && a > OFF_MAX - b) || (b < 0 && a < OFF_MIN - b)) + errx(1, "Corrupt patch"); + result = a + b; +#endif + return result; +} + static off_t offtin(u_char *buf) { off_t y; @@ -199,12 +216,12 @@ int main(int argc, char *argv[]) err(1, "fseeko(%s, %jd)", argv[3], (intmax_t)offset); if ((cpfbz2 = BZ2_bzReadOpen(&cbz2err, cpf, 0, 0, NULL, 0)) == NULL) errx(1, "BZ2_bzReadOpen, bz2err = %d", cbz2err); - offset += bzctrllen; + offset = add_off_t(offset, bzctrllen); if (fseeko(dpf, offset, SEEK_SET)) err(1, "fseeko(%s, %jd)", argv[3], (intmax_t)offset); if ((dpfbz2 = BZ2_bzReadOpen(&dbz2err, dpf, 0, 0, NULL, 0)) == NULL) errx(1, "BZ2_bzReadOpen, bz2err = %d", dbz2err); - offset += bzdatalen; + offset = add_off_t(offset, bzdatalen); if (fseeko(epf, offset, SEEK_SET)) err(1, "fseeko(%s, %jd)", argv[3], (intmax_t)offset); if ((epfbz2 = BZ2_bzReadOpen(&ebz2err, epf, 0, 0, NULL, 0)) == NULL) @@ -238,7 +255,7 @@ int main(int argc, char *argv[]) errx(1, "Corrupt patch"); /* Sanity-check */ - if (newpos + ctrl[0] > newsize) + if (add_off_t(newpos, ctrl[0]) > newsize) errx(1, "Corrupt patch"); /* Read diff string */ @@ -249,15 +266,15 @@ int main(int argc, char *argv[]) /* Add old data to diff string */ for (i = 0; i < ctrl[0]; i++) - if ((oldpos + i >= 0) && (oldpos + i < oldsize)) + if (add_off_t(oldpos, i) < oldsize) new[newpos + i] += old[oldpos + i]; /* Adjust pointers */ - newpos += ctrl[0]; - oldpos += ctrl[0]; + newpos = add_off_t(newpos, ctrl[0]); + oldpos = add_off_t(oldpos, ctrl[0]); /* Sanity-check */ - if (newpos + ctrl[1] > newsize) + if (add_off_t(newpos, ctrl[1]) > newsize) errx(1, "Corrupt patch"); /* Read extra string */ @@ -267,8 +284,8 @@ int main(int argc, char *argv[]) errx(1, "Corrupt patch"); /* Adjust pointers */ - newpos+=ctrl[1]; - oldpos+=ctrl[2]; + newpos = add_off_t(newpos, ctrl[1]); + oldpos = add_off_t(oldpos, ctrl[2]); } /* Clean up the bzip2 reads */ From 
34a5c41c43ad38129e9bc3dfb923e6e01805097f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 26 Sep 2019 14:48:39 +0000 Subject: [PATCH 098/106] Add a kern.cam.da.X.quirks tunable, similar to the existing one for ada. Submitted by: Michael Lass MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D20677 --- sys/cam/scsi/scsi_da.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index f2f60d35b2e2..451af7a398f7 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -2694,6 +2694,7 @@ daregister(struct cam_periph *periph, void *arg) struct ccb_getdev *cgd; char tmpstr[80]; caddr_t match; + int quirks; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { @@ -2750,6 +2751,13 @@ daregister(struct cam_periph *periph, void *arg) if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE)) softc->quirks |= DA_Q_NO_6_BYTE; + /* Override quirks if tunable is set */ + snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.quirks", + periph->unit_number); + quirks = softc->quirks; + TUNABLE_INT_FETCH(tmpstr, &quirks); + softc->quirks = quirks; + if (SID_TYPE(&cgd->inq_data) == T_ZBC_HM) softc->zone_mode = DA_ZONE_HOST_MANAGED; else if (softc->quirks & DA_Q_SMR_DM) From 867e98f8ee02f00d822f6f7068e506f7ccf9cc65 Mon Sep 17 00:00:00 2001 From: "Jonathan T. Looney" Date: Thu, 26 Sep 2019 15:02:34 +0000 Subject: [PATCH 099/106] Remove the unused sch parameter to the syncache_respond() function. The use of this parameter was removed in r313330. This commit stops passing the now-unused parameter. Reviewed by: gallatin, rrs Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D21644 --- sys/netinet/tcp_syncache.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index fd88e69b25a2..c36466dae374 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -130,8 +130,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack, static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); -static int syncache_respond(struct syncache *, struct syncache_head *, - const struct mbuf *, int); +static int syncache_respond(struct syncache *, const struct mbuf *, int); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, @@ -495,7 +494,7 @@ syncache_timer(void *xsch) free(s, M_TCPLOG); } - syncache_respond(sc, sch, NULL, TH_SYN|TH_ACK); + syncache_respond(sc, NULL, TH_SYN|TH_ACK); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } @@ -632,7 +631,7 @@ syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m) "sending challenge ACK\n", s, __func__, th->th_seq, sc->sc_irs + 1, sc->sc_wnd); - syncache_respond(sc, sch, m, TH_ACK); + syncache_respond(sc, m, TH_ACK); } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) @@ -1475,7 +1474,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, s, __func__); free(s, M_TCPLOG); } - if (syncache_respond(sc, sch, m, TH_SYN|TH_ACK) == 0) { + if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); @@ -1640,7 +1639,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct
tcphdr *th, /* * Do a standard 3-way handshake. */ - if (syncache_respond(sc, sch, m, TH_SYN|TH_ACK) == 0) { + if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) @@ -1685,8 +1684,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. */ static int -syncache_respond(struct syncache *sc, struct syncache_head *sch, - const struct mbuf *m0, int flags) +syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) { struct ip *ip = NULL; struct mbuf *m; From 0bee4d631a16bddf55b44beff1bba5fab1c08f11 Mon Sep 17 00:00:00 2001 From: "Jonathan T. Looney" Date: Thu, 26 Sep 2019 15:06:46 +0000 Subject: [PATCH 100/106] Access the syncache secret directly from the V_tcp_syncache variable, rather than indirectly through the backpointer to the tcp_syncache structure stored in the hashtable bucket. This also allows us to remove the requirement in syncookie_generate() and syncookie_lookup() that the syncache hashtable bucket must be locked. Reviewed by: gallatin, rrs Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D21644 --- sys/netinet/tcp_syncache.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index c36466dae374..c36a10e08b8b 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -2061,8 +2061,6 @@ syncookie_generate(struct syncache_head *sch, struct syncache *sc) uint8_t *secbits; union syncookie cookie; - SCH_LOCK_ASSERT(sch); - cookie.cookie = 0; /* Map our computed MSS into the 3-bit index. */ @@ -2090,10 +2088,10 @@ syncookie_generate(struct syncache_head *sch, struct syncache *sc) cookie.flags.sack_ok = 1; /* Which of the two secrets to use. */ - secbit = sch->sch_sc->secret.oddeven & 0x1; + secbit = V_tcp_syncache.secret.oddeven & 0x1; cookie.flags.odd_even = secbit; - secbits = sch->sch_sc->secret.key[secbit]; + secbits = V_tcp_syncache.secret.key[secbit]; hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, (uintptr_t)sch); @@ -2121,8 +2119,6 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, int wnd, wscale = 0; union syncookie cookie; - SCH_LOCK_ASSERT(sch); - /* * Pull information out of SYN-ACK/ACK and revert sequence number * advances. @@ -2137,7 +2133,7 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, cookie.cookie = (ack & 0xff) ^ (ack >> 24); /* Which of the two secrets to use. */ - secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; + secbits = V_tcp_syncache.secret.key[cookie.flags.odd_even]; hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); From 0b18fb079891afa98e6fa32a458d4b61fbc5910f Mon Sep 17 00:00:00 2001 From: "Jonathan T. Looney" Date: Thu, 26 Sep 2019 15:18:57 +0000 Subject: [PATCH 101/106] Add new functionality to switch to using cookies exclusively when the syn cache overflows. Whether this is due to an attack or due to the system having more legitimate connections than the syn cache can hold, this situation can quickly impact performance. To make the system perform better during these periods, the code will now switch to exclusively using cookies until the syn cache stops overflowing. In order for this to occur, the system must be configured to use the syn cache with syn cookie fallback.
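The overflow-triggered pause described so far can be pictured with a small userland analogue; this is an illustrative sketch only (using C11 atomics and a pthread mutex rather than the kernel primitives in the patch below), and the commit message continues after it. The overflow path takes a lock to flip a pause flag, while other paths merely read the flag:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool paused;              /* analogue of V_tcp_syncache.paused */
    static pthread_mutex_t pause_mtx = PTHREAD_MUTEX_INITIALIZER;

    /* Hot path: decide whether to rely on cookies without taking the lock. */
    static bool
    cookies_only(bool syncookies, bool syncookies_only)
    {
            return (syncookies &&
                (atomic_load(&paused) || syncookies_only));
    }

    /* Overflow path: take the lock, re-check, and activate the pause. */
    static void
    pause_on_overflow(void)
    {
            if (atomic_load(&paused))
                    return;                 /* another thread already paused */
            pthread_mutex_lock(&pause_mtx);
            if (!atomic_load(&paused)) {
                    atomic_store(&paused, true);
                    /* a real implementation would also arm an un-pause timer */
            }
            pthread_mutex_unlock(&pause_mtx);
    }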
If syn cookies are completely disabled, this change should have no functional impact. When the system is exclusively using syn cookies (either due to configuration or the overflow detection enabled by this change), the code will now skip acquiring a lock on the syn cache bucket. Additionally, the code will now skip lookups in several places (such as when the system receives a RST in response to a SYN|ACK frame). Reviewed by: rrs, gallatin (previous version) Discussed with: tuexen Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D21644 --- sys/netinet/tcp_syncache.c | 224 ++++++++++++++++++++++++++++++++++--- sys/netinet/tcp_syncache.h | 8 ++ 2 files changed, 214 insertions(+), 18 deletions(-) diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index c36a10e08b8b..cbc4bc222fc0 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -144,6 +144,8 @@ static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *); +static void syncache_pause(struct in_conninfo *); +static void syncache_unpause(void *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, @@ -300,6 +302,14 @@ syncache_init(void) arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, syncookie_reseed, &V_tcp_syncache); + + /* Initialize the pause machinery. */ + mtx_init(&V_tcp_syncache.pause_mtx, "tcp_sc_pause", NULL, MTX_DEF); + callout_init_mtx(&V_tcp_syncache.pause_co, &V_tcp_syncache.pause_mtx, + 0); + V_tcp_syncache.pause_until = time_uptime - TCP_SYNCACHE_PAUSE_TIME; + V_tcp_syncache.pause_backoff = 0; + V_tcp_syncache.paused = false; } #ifdef VIMAGE @@ -316,6 +326,14 @@ syncache_destroy(void) */ callout_drain(&V_tcp_syncache.secret.reseed); + /* Stop the SYN cache pause callout. */ + mtx_lock(&V_tcp_syncache.pause_mtx); + if (callout_stop(&V_tcp_syncache.pause_co) == 0) { + mtx_unlock(&V_tcp_syncache.pause_mtx); + callout_drain(&V_tcp_syncache.pause_co); + } else + mtx_unlock(&V_tcp_syncache.pause_mtx); + /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { @@ -339,6 +357,7 @@ syncache_destroy(void) /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); + mtx_destroy(&V_tcp_syncache.pause_mtx); } #endif @@ -360,10 +379,10 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch) if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); + syncache_pause(&sc->sc_inc); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); sch->sch_last_overflow = time_uptime; syncache_drop(sc2, sch); - TCPSTAT_INC(tcps_sc_bucketoverflow); } /* Put it into the bucket. */ @@ -450,6 +469,7 @@ syncache_timer(void *xsch) struct syncache *sc, *nsc; int tick = ticks; char *s; + bool paused; CURVNET_SET(sch->sch_sc->vnet); @@ -462,7 +482,19 @@ syncache_timer(void *xsch) */ sch->sch_nextc = tick + INT_MAX; + /* + * If we have paused processing, unconditionally remove + * all syncache entries. 
+ */ + mtx_lock(&V_tcp_syncache.pause_mtx); + paused = V_tcp_syncache.paused; + mtx_unlock(&V_tcp_syncache.pause_mtx); + TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { + if (paused) { + syncache_drop(sc, sch); + continue; + } /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be @@ -505,14 +537,24 @@ syncache_timer(void *xsch) } /* - * Find an entry in the syncache. - * Returns always with locked syncache_head plus a matching entry or NULL. + * Returns true if the system is only using cookies at the moment. + * This could be due to a sysadmin decision to only use cookies, or it + * could be due to the system detecting an attack. */ -static struct syncache * -syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) +static inline bool +syncache_cookiesonly(void) +{ + + return (V_tcp_syncookies && (V_tcp_syncache.paused || + V_tcp_syncookiesonly)); +} + +/* + * Find the hash bucket for the given connection. + */ +static struct syncache_head * +syncache_hashbucket(struct in_conninfo *inc) { - struct syncache *sc; - struct syncache_head *sch; uint32_t hash; /* @@ -525,8 +567,20 @@ syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5, V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask; - sch = &V_tcp_syncache.hashbase[hash]; - *schp = sch; + return (&V_tcp_syncache.hashbase[hash]); +} + +/* + * Find an entry in the syncache. + * Returns always with locked syncache_head plus a matching entry or NULL. + */ +static struct syncache * +syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) +{ + struct syncache *sc; + struct syncache_head *sch; + + *schp = sch = syncache_hashbucket(inc); SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ @@ -551,6 +605,8 @@ syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m) struct syncache_head *sch; char *s = NULL; + if (syncache_cookiesonly()) + return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); @@ -654,6 +710,8 @@ syncache_badack(struct in_conninfo *inc) struct syncache *sc; struct syncache_head *sch; + if (syncache_cookiesonly()) + return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { @@ -669,6 +727,8 @@ syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq) struct syncache *sc; struct syncache_head *sch; + if (syncache_cookiesonly()) + return; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) @@ -1029,6 +1089,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct syncache_head *sch; struct syncache scs; char *s; + bool locked; /* * Global TCP locks are held because we manipulate the PCB lists @@ -1038,8 +1099,15 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); - sc = syncache_lookup(inc, &sch); /* returns locked sch */ - SCH_LOCK_ASSERT(sch); + if (syncache_cookiesonly()) { + sc = NULL; + sch = syncache_hashbucket(inc); + locked = false; + } else { + sc = syncache_lookup(inc, &sch); /* returns locked sch */ + locked = true; + SCH_LOCK_ASSERT(sch); + } #ifdef INVARIANTS /* @@ -1063,7 +1131,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * C. check that the syncookie is valid. 
If it is, then * cobble up a fake syncache entry, and return. */ - if (!V_tcp_syncookies) { + if (locked && !V_tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " @@ -1071,7 +1139,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, s, __func__); goto failed; } - if (!V_tcp_syncookiesonly && + if (locked && !V_tcp_syncookiesonly && sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) @@ -1082,7 +1150,8 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); - SCH_UNLOCK(sch); + if (locked) + SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " @@ -1331,6 +1400,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, unsigned int *tfo_pending = NULL; int tfo_cookie_valid = 0; int tfo_response_cookie_valid = 0; + bool locked; INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, @@ -1436,8 +1506,15 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ - sc = syncache_lookup(inc, &sch); /* returns locked entry */ - SCH_LOCK_ASSERT(sch); + if (syncache_cookiesonly()) { + sc = NULL; + sch = syncache_hashbucket(inc); + locked = false; + } else { + sc = syncache_lookup(inc, &sch); /* returns locked sch */ + locked = true; + SCH_LOCK_ASSERT(sch); + } if (sc != NULL) { if (tfo_cookie_valid) INP_WUNLOCK(inp); @@ -1490,7 +1567,15 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, goto skip_alloc; } - sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); + /* + * Skip allocating a syncache entry if we are just going to discard + * it later. + */ + if (!locked) { + bzero(&scs, sizeof(scs)); + sc = &scs; + } else + sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. @@ -1501,6 +1586,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) { sch->sch_last_overflow = time_uptime; syncache_drop(sc, sch); + syncache_pause(inc); } sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { @@ -1508,6 +1594,9 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, bzero(&scs, sizeof(scs)); sc = &scs; } else { + KASSERT(locked, + ("%s: bucket unexpectedly unlocked", + __func__)); SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); @@ -1626,7 +1715,8 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } #endif - SCH_UNLOCK(sch); + if (locked) + SCH_UNLOCK(sch); if (tfo_cookie_valid) { syncache_tfo_expand(sc, lsop, m, tfo_response_cookie); @@ -2262,6 +2352,104 @@ syncookie_reseed(void *arg) callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); } +/* + * We have overflowed a bucket. Let's pause dealing with the syncache. + * This function will increment the bucketoverflow statistics appropriately + * (once per pause when pausing is enabled; otherwise, once per overflow). 
+ */ +static void +syncache_pause(struct in_conninfo *inc) +{ + time_t delta; + const char *s; + + /* XXX: + * 2. Add sysctl read here so we don't get the benefit of this + * change without the new sysctl. + */ + + /* + * Try an unlocked read. If we already know that another thread + * has activated the feature, there is no need to proceed. + */ + if (V_tcp_syncache.paused) + return; + + /* Are cookied enabled? If not, we can't pause. */ + if (!V_tcp_syncookies) { + TCPSTAT_INC(tcps_sc_bucketoverflow); + return; + } + + /* + * We may be the first thread to find an overflow. Get the lock + * and evaluate if we need to take action. + */ + mtx_lock(&V_tcp_syncache.pause_mtx); + if (V_tcp_syncache.paused) { + mtx_unlock(&V_tcp_syncache.pause_mtx); + return; + } + + /* Activate protection. */ + V_tcp_syncache.paused = true; + TCPSTAT_INC(tcps_sc_bucketoverflow); + + /* + * Determine the last backoff time. If we are seeing a re-newed + * attack within that same time after last reactivating the syncache, + * consider it an extension of the same attack. + */ + delta = TCP_SYNCACHE_PAUSE_TIME << V_tcp_syncache.pause_backoff; + if (V_tcp_syncache.pause_until + delta - time_uptime > 0) { + if (V_tcp_syncache.pause_backoff < TCP_SYNCACHE_MAX_BACKOFF) { + delta <<= 1; + V_tcp_syncache.pause_backoff++; + } + } else { + delta = TCP_SYNCACHE_PAUSE_TIME; + V_tcp_syncache.pause_backoff = 0; + } + + /* Log a warning, including IP addresses, if able. */ + if (inc != NULL) + s = tcp_log_addrs(inc, NULL, NULL, NULL); + else + s = (const char *)NULL; + log(LOG_WARNING, "TCP syncache overflow detected; using syncookies for " + "the next %lld seconds%s%s%s\n", (long long)delta, + (s != NULL) ? " (last SYN: " : "", (s != NULL) ? s : "", + (s != NULL) ? ")" : ""); + free(__DECONST(void *, s), M_TCPLOG); + + /* Use the calculated delta to set a new pause time. */ + V_tcp_syncache.pause_until = time_uptime + delta; + callout_reset(&V_tcp_syncache.pause_co, delta * hz, syncache_unpause, + &V_tcp_syncache); + mtx_unlock(&V_tcp_syncache.pause_mtx); +} + +/* Evaluate whether we need to unpause. */ +static void +syncache_unpause(void *arg) +{ + struct tcp_syncache *sc; + time_t delta; + + sc = arg; + mtx_assert(&sc->pause_mtx, MA_OWNED | MA_NOTRECURSED); + callout_deactivate(&sc->pause_co); + + /* + * Check to make sure we are not running early. If the pause + * time has expired, then deactivate the protection. + */ + if ((delta = sc->pause_until - time_uptime) > 0) + callout_schedule(&sc->pause_co, delta * hz); + else + sc->paused = false; +} + /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be diff --git a/sys/netinet/tcp_syncache.h b/sys/netinet/tcp_syncache.h index 3067b7efeded..b7d4ba1fe28c 100644 --- a/sys/netinet/tcp_syncache.h +++ b/sys/netinet/tcp_syncache.h @@ -111,6 +111,9 @@ struct syncookie_secret { u_int lifetime; }; +#define TCP_SYNCACHE_PAUSE_TIME SYNCOOKIE_LIFETIME +#define TCP_SYNCACHE_MAX_BACKOFF 6 /* 16 minutes */ + struct tcp_syncache { struct syncache_head *hashbase; uma_zone_t zone; @@ -122,6 +125,11 @@ struct tcp_syncache { uint32_t hash_secret; struct vnet *vnet; struct syncookie_secret secret; + struct mtx pause_mtx; + struct callout pause_co; + time_t pause_until; + uint8_t pause_backoff; + volatile bool paused; }; /* Internal use for the syncookie functions. 
*/ From 9afb12bab41f6a715f171e00e5099f93a93252be Mon Sep 17 00:00:00 2001 From: David Bright Date: Thu, 26 Sep 2019 15:32:28 +0000 Subject: [PATCH 102/106] Add an shm_rename syscall Add an atomic shm rename operation, similar in spirit to a file rename. Atomically unlink an shm from a source path and link it to a destination path. If an existing shm is linked at the destination path, unlink it as part of the same atomic operation. The caller needs the same permissions as shm_unlink to the shm being renamed, and the same permissions for the shm at the destination which is being unlinked, if it exists. If those fail, EACCES is returned, as with the other shm_* syscalls. truss support is included; audit support will come later. This commit includes only the implementation; the sysent-generated bits will come in a follow-on commit. Submitted by: Matthew Bryan Reviewed by: jilles (earlier revision) Reviewed by: brueffer (manpages, earlier revision) Relnotes: yes Sponsored by: Dell EMC Isilon Differential Revision: https://reviews.freebsd.org/D21423 --- lib/libc/sys/Makefile.inc | 3 +- lib/libc/sys/Symbol.map | 1 + lib/libc/sys/shm_open.2 | 92 ++++++++- sys/compat/freebsd32/syscalls.master | 2 + sys/kern/syscalls.master | 7 + sys/kern/uipc_shm.c | 157 +++++++++++++- sys/sys/mman.h | 9 + tests/sys/posixshm/posixshm_test.c | 292 +++++++++++++++++++++++++-- usr.bin/truss/syscalls.c | 2 + 9 files changed, 536 insertions(+), 29 deletions(-) diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc index 9f3ec84b517c..6dec508065c0 100644 --- a/lib/libc/sys/Makefile.inc +++ b/lib/libc/sys/Makefile.inc @@ -477,7 +477,8 @@ MLINKS+=setuid.2 setegid.2 \ setuid.2 setgid.2 MLINKS+=shmat.2 shmdt.2 MLINKS+=shm_open.2 memfd_create.3 \ - shm_open.2 shm_unlink.2 + shm_open.2 shm_unlink.2 \ + shm_rename.2 MLINKS+=sigwaitinfo.2 sigtimedwait.2 MLINKS+=stat.2 fstat.2 \ stat.2 fstatat.2 \ diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map index 711ddaa6ea9f..a31cf1616ddc 100644 --- a/lib/libc/sys/Symbol.map +++ b/lib/libc/sys/Symbol.map @@ -410,6 +410,7 @@ FBSD_1.6 { getfhat; funlinkat; memfd_create; + shm_rename; }; FBSDprivate_1.0 { diff --git a/lib/libc/sys/shm_open.2 b/lib/libc/sys/shm_open.2 index f089f177e1d6..e231115613cf 100644 --- a/lib/libc/sys/shm_open.2 +++ b/lib/libc/sys/shm_open.2 @@ -28,11 +28,11 @@ .\" .\" $FreeBSD$ .\" -.Dd September 24, 2019 +.Dd September 26, 2019 .Dt SHM_OPEN 2 .Os .Sh NAME -.Nm memfd_create , shm_open , shm_unlink +.Nm memfd_create , shm_open , shm_rename, shm_unlink .Nd "shared memory object operations" .Sh LIBRARY .Lb libc @@ -45,6 +45,8 @@ .Ft int .Fn shm_open "const char *path" "int flags" "mode_t mode" .Ft int +.Fn shm_rename "const char *path_from" "const char *path_to" "int flags" +.Ft int .Fn shm_unlink "const char *path" .Sh DESCRIPTION The @@ -112,8 +114,9 @@ see and .Xr fcntl 2 . .Pp -As a FreeBSD extension, -the constant +As a +.Fx +extension, the constant .Dv SHM_ANON may be used for the .Fa path @@ -122,7 +125,9 @@ argument to In this case, an anonymous, unnamed shared memory object is created. Since the object has no name, it cannot be removed via a subsequent call to -.Fn shm_unlink . +.Fn shm_unlink , +or moved with a call to +.Fn shm_rename . Instead, the shared memory object will be garbage collected when the last reference to the shared memory object is removed. @@ -138,6 +143,31 @@ will fail with All other flags are ignored. 
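For illustration only (this fragment is not part of the patch), a minimal userland sketch of the publish-by-rename pattern the new syscall enables, following the shm_rename() semantics described in the commit message above; the object names are made up for the example:

    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <err.h>

    int
    main(void)
    {
            int fd;

            /* Build the new object under a temporary name. */
            fd = shm_open("/myapp_staging", O_RDWR | O_CREAT | O_EXCL, 0600);
            if (fd == -1)
                    err(1, "shm_open");
            if (ftruncate(fd, 4096) == -1)
                    err(1, "ftruncate");
            /* ... fill the object here ... */

            /* Atomically publish it, replacing any previous object. */
            if (shm_rename("/myapp_staging", "/myapp_live", 0) == -1)
                    err(1, "shm_rename");
            close(fd);
            return (0);
    }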
.Pp The +.Fn shm_rename +system call atomically removes a shared memory object named +.Fa path_from +and relinks it at +.Fa path_to . +If another object is already linked at +.Fa path_to , +that object will be unlinked, unless one of the following flags are provided: +.Bl -tag -offset indent -width Er +.It Er SHM_RENAME_EXCHANGE +Atomically exchange the shms at +.Fa path_from +and +.Fa path_to . +.It Er SHM_RENAME_NOREPLACE +Return an error if an shm exists at +.Fa path_to , +rather than unlinking it. +.El +.Fn shm_rename +is also a +.Fx +extension. +.Pp +The .Fn shm_unlink system call removes a shared memory object named .Fa path . @@ -196,15 +226,20 @@ and .Fn shm_open both return a non-negative integer, and +.Fn shm_rename +and .Fn shm_unlink -returns zero. -All three functions return -1 on failure, and set +return zero. +All functions return -1 on failure, and set .Va errno to indicate the error. .Sh COMPATIBILITY The -.Fa path -argument does not necessarily represent a pathname (although it does in +.Fa path , +.Fa path_from , +and +.Fa path_to +arguments do not necessarily represent a pathname (although they do in most other implementations). Two processes opening the same .Fa path @@ -325,7 +360,7 @@ The .Fa path argument points outside the process' allocated address space. .It Bq Er ENAMETOOLONG -The entire pathname exceeded 1023 characters. +The entire pathname exceeds 1023 characters. .It Bq Er EINVAL The .Fa path @@ -344,6 +379,31 @@ are specified and the named shared memory object does exist. The required permissions (for reading or reading and writing) are denied. .El .Pp +The following errors are defined for +.Fn shm_rename : +.Bl -tag -width Er +.It Bq Er EFAULT +The +.Fa path_from +or +.Fa path_to +argument points outside the process' allocated address space. +.It Bq Er ENAMETOOLONG +The entire pathname exceeds 1023 characters. +.It Bq Er ENOENT +The shared memory object at +.Fa path_from +does not exist. +.It Bq Er EACCES +The required permissions are denied. +.It Bq Er EEXIST +An shm exists at +.Fa path_to , +and the +.Dv SHM_RENAME_NOREPLACE +flag was provided. +.El +.Pp .Fn shm_unlink fails with these error codes for these conditions: .Bl -tag -width Er @@ -352,7 +412,7 @@ The .Fa path argument points outside the process' allocated address space. .It Bq Er ENAMETOOLONG -The entire pathname exceeded 1023 characters. +The entire pathname exceeds 1023 characters. .It Bq Er ENOENT The named shared memory object does not exist. .It Bq Er EACCES @@ -394,9 +454,19 @@ functions first appeared in The functions were reimplemented as system calls using shared memory objects directly rather than files in .Fx 8.0 . +.Pp +.Fn shm_rename +first appeared in +.Fx 13.0 +as a +.Fx +extension. .Sh AUTHORS .An Garrett A. 
Wollman Aq Mt wollman@FreeBSD.org (C library support and this manual page) .Pp .An Matthew Dillon Aq Mt dillon@FreeBSD.org .Pq Dv MAP_NOSYNC +.Pp +.An Matthew Bryan Aq Mt matthew.bryan@isilon.com +.Pq Dv shm_rename implementation diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index b212e72652fa..2662b31aa595 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -1157,5 +1157,7 @@ 571 AUE_SHMOPEN NOPROTO { int shm_open2( \ const char *path, int flags, mode_t mode, \ int shmflags, const char *name); } +572 AUE_NULL NOPROTO { int shm_rename(const char *path_from, \ + const char *path_to, int flags); } ; vim: syntax=off diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index cd9a825e926c..7635c006df5f 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3204,6 +3204,13 @@ _In_z_ const char *name ); } +572 AUE_NULL STD { + int shm_rename( + _In_z_ const char *path_from, + _In_z_ const char *path_to, + int flags + ); + } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index f25f47201b9a..33ca9aacab1c 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -33,8 +33,9 @@ /* * Support for shared swap-backed anonymous memory objects via - * shm_open(2) and shm_unlink(2). While most of the implementation is - * here, vm_mmap.c contains mapping logic changes. + * shm_open(2), shm_rename(2), and shm_unlink(2). + * While most of the implementation is here, vm_mmap.c contains + * mapping logic changes. * * posixshmcontrol(1) allows users to inspect the state of the memory * objects. Per-uid swap resource limit controls total amount of @@ -947,6 +948,158 @@ sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) return (error); } +int +sys_shm_rename(struct thread *td, struct shm_rename_args *uap) +{ + char *path_from = NULL, *path_to = NULL; + Fnv32_t fnv_from, fnv_to; + struct shmfd *fd_from; + struct shmfd *fd_to; + int error; + int flags; + + flags = uap->flags; + + /* + * Make sure the user passed only valid flags. + * If you add a new flag, please add a new term here. + */ + if ((flags & ~( + SHM_RENAME_NOREPLACE | + SHM_RENAME_EXCHANGE + )) != 0) { + error = EINVAL; + goto out; + } + + /* + * EXCHANGE and NOREPLACE don't quite make sense together. Let's + * force the user to choose one or the other. + */ + if ((flags & SHM_RENAME_NOREPLACE) != 0 && + (flags & SHM_RENAME_EXCHANGE) != 0) { + error = EINVAL; + goto out; + } + + /* + * Malloc zone M_SHMFD, since this path may end up freed later from + * M_SHMFD if we end up doing an insert. 
+ */ + path_from = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); + error = copyinstr(uap->path_from, path_from, MAXPATHLEN, NULL); + if (error) + goto out; + + path_to = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); + error = copyinstr(uap->path_to, path_to, MAXPATHLEN, NULL); + if (error) + goto out; + + /* Rename with from/to equal is a no-op */ + if (strncmp(path_from, path_to, MAXPATHLEN) == 0) + goto out; + + fnv_from = fnv_32_str(path_from, FNV1_32_INIT); + fnv_to = fnv_32_str(path_to, FNV1_32_INIT); + + sx_xlock(&shm_dict_lock); + + fd_from = shm_lookup(path_from, fnv_from); + if (fd_from == NULL) { + sx_xunlock(&shm_dict_lock); + error = ENOENT; + goto out; + } + + fd_to = shm_lookup(path_to, fnv_to); + if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) { + sx_xunlock(&shm_dict_lock); + error = EEXIST; + goto out; + } + + /* + * Unconditionally prevents shm_remove from invalidating the 'from' + * shm's state. + */ + shm_hold(fd_from); + error = shm_remove(path_from, fnv_from, td->td_ucred); + + /* + * One of my assumptions failed if ENOENT (e.g. locking didn't + * protect us) + */ + KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s", + path_from)); + if (error) { + shm_drop(fd_from); + sx_xunlock(&shm_dict_lock); + goto out; + } + + /* + * If we are exchanging, we need to ensure the shm_remove below + * doesn't invalidate the dest shm's state. + */ + if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) + shm_hold(fd_to); + + /* + * NOTE: if path_to is not already in the hash, c'est la vie; + * it simply means we have nothing already at path_to to unlink. + * That is the ENOENT case. + * + * If we somehow don't have access to unlink this guy, but + * did for the shm at path_from, then relink the shm to path_from + * and abort with EACCES. + * + * All other errors: that is weird; let's relink and abort the + * operation. 
+ */ + error = shm_remove(path_to, fnv_to, td->td_ucred); + if (error && error != ENOENT) { + shm_insert(path_from, fnv_from, fd_from); + shm_drop(fd_from); + /* Don't free path_from now, since the hash references it */ + path_from = NULL; + sx_xunlock(&shm_dict_lock); + goto out; + } + + shm_insert(path_to, fnv_to, fd_from); + + /* Don't free path_to now, since the hash references it */ + path_to = NULL; + + /* We kept a ref when we removed, and incremented again in insert */ + shm_drop(fd_from); +#ifdef DEBUG + KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n", + fd_from->shm_refs)); +#endif + + if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) { + shm_insert(path_from, fnv_from, fd_to); + path_from = NULL; + shm_drop(fd_to); +#ifdef DEBUG + KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n", + fd_to->shm_refs)); +#endif + } + + error = 0; + sx_xunlock(&shm_dict_lock); + +out: + if (path_from != NULL) + free(path_from, M_SHMFD); + if (path_to != NULL) + free(path_to, M_SHMFD); + return(error); +} + int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, diff --git a/sys/sys/mman.h b/sys/sys/mman.h index a5c66f3596ea..b0776c0556df 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -133,6 +133,14 @@ */ #define MAP_FAILED ((void *)-1) +/* + * Flags provided to shm_rename + */ +/* Don't overwrite dest, if it exists */ +#define SHM_RENAME_NOREPLACE (1 << 0) +/* Atomically swap src and dest */ +#define SHM_RENAME_EXCHANGE (1 << 1) + /* * msync() flags */ @@ -313,6 +321,7 @@ int posix_madvise(void *, size_t, int); int mlockall(int); int munlockall(void); int shm_open(const char *, int, mode_t); +int shm_rename(const char *, const char *, int); int shm_unlink(const char *); #endif #if __BSD_VISIBLE diff --git a/tests/sys/posixshm/posixshm_test.c b/tests/sys/posixshm/posixshm_test.c index fd4101f6a842..935a2a5742fe 100644 --- a/tests/sys/posixshm/posixshm_test.c +++ b/tests/sys/posixshm/posixshm_test.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -46,18 +47,34 @@ __FBSDID("$FreeBSD$"); #define TEST_PATH_LEN 256 static char test_path[TEST_PATH_LEN]; +static char test_path2[TEST_PATH_LEN]; +static unsigned int test_path_idx = 0; + +static void +gen_a_test_path(char *path) +{ + snprintf(path, TEST_PATH_LEN, "%s/tmp.XXXXXX%d", + getenv("TMPDIR") == NULL ? "/tmp" : getenv("TMPDIR"), + test_path_idx); + + test_path_idx++; + + ATF_REQUIRE_MSG(mkstemp(path) != -1, + "mkstemp failed; errno=%d", errno); + ATF_REQUIRE_MSG(unlink(path) == 0, + "unlink failed; errno=%d", errno); +} static void gen_test_path(void) { + gen_a_test_path(test_path); +} - snprintf(test_path, sizeof(test_path), "%s/tmp.XXXXXX", - getenv("TMPDIR") == NULL ? "/tmp" : getenv("TMPDIR")); - test_path[sizeof(test_path) - 1] = '\0'; - ATF_REQUIRE_MSG(mkstemp(test_path) != -1, - "mkstemp failed; errno=%d", errno); - ATF_REQUIRE_MSG(unlink(test_path) == 0, - "unlink failed; errno=%d", errno); +static void +gen_test_path2(void) +{ + gen_a_test_path(test_path2); } /* @@ -89,20 +106,18 @@ shm_unlink_should_fail(const char *path, int error) } /* - * Open the test object and write '1' to the first byte. Returns valid fd + * Open the test object and write a value to the first byte. Returns valid fd * on success and -1 on failure. 
*/ static int -scribble_object(void) +scribble_object(const char *path, char value) { char *page; int fd, pagesize; - gen_test_path(); - ATF_REQUIRE(0 < (pagesize = getpagesize())); - fd = shm_open(test_path, O_CREAT|O_EXCL|O_RDWR, 0777); + fd = shm_open(path, O_CREAT|O_EXCL|O_RDWR, 0777); if (fd < 0 && errno == EEXIST) { if (shm_unlink(test_path) < 0) atf_tc_fail("shm_unlink"); @@ -117,13 +132,45 @@ scribble_object(void) if (page == MAP_FAILED) atf_tc_fail("mmap failed; errno=%d", errno); - page[0] = '1'; + page[0] = value; ATF_REQUIRE_MSG(munmap(page, pagesize) == 0, "munmap failed; errno=%d", errno); return (fd); } +/* + * Fail the test case if the 'path' does not refer to an shm whose first byte + * is equal to expected_value + */ +static void +verify_object(const char *path, char expected_value) +{ + int fd; + int pagesize; + char *page; + + ATF_REQUIRE(0 < (pagesize = getpagesize())); + + fd = shm_open(path, O_RDONLY, 0777); + if (fd < 0) + atf_tc_fail("shm_open failed in verify_object; errno=%d, path=%s", + errno, path); + + page = mmap(0, pagesize, PROT_READ, MAP_SHARED, fd, 0); + if (page == MAP_FAILED) + atf_tc_fail("mmap(1)"); + if (page[0] != expected_value) + atf_tc_fail("Renamed object has incorrect value; has" + "%d (0x%x, '%c'), expected %d (0x%x, '%c')\n", + page[0], page[0], isprint(page[0]) ? page[0] : ' ', + expected_value, expected_value, + isprint(expected_value) ? expected_value : ' '); + ATF_REQUIRE_MSG(munmap(page, pagesize) == 0, "munmap failed; errno=%d", + errno); + close(fd); +} + ATF_TC_WITHOUT_HEAD(remap_object); ATF_TC_BODY(remap_object, tc) { @@ -132,7 +179,8 @@ ATF_TC_BODY(remap_object, tc) ATF_REQUIRE(0 < (pagesize = getpagesize())); - fd = scribble_object(); + gen_test_path(); + fd = scribble_object(test_path, '1'); page = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (page == MAP_FAILED) @@ -149,6 +197,209 @@ ATF_TC_BODY(remap_object, tc) "shm_unlink failed; errno=%d", errno); } +ATF_TC_WITHOUT_HEAD(rename_from_anon); +ATF_TC_BODY(rename_from_anon, tc) +{ + int rc; + + gen_test_path(); + rc = shm_rename(SHM_ANON, test_path, 0); + if (rc != -1) + atf_tc_fail("shm_rename from SHM_ANON succeeded unexpectedly"); +} + +ATF_TC_WITHOUT_HEAD(rename_bad_path_pointer); +ATF_TC_BODY(rename_bad_path_pointer, tc) +{ + const char *bad_path; + int rc; + + bad_path = (const char *)0x1; + + gen_test_path(); + rc = shm_rename(test_path, bad_path, 0); + if (rc != -1) + atf_tc_fail("shm_rename of nonexisting shm succeeded unexpectedly"); + + rc = shm_rename(bad_path, test_path, 0); + if (rc != -1) + atf_tc_fail("shm_rename of nonexisting shm succeeded unexpectedly"); +} + +ATF_TC_WITHOUT_HEAD(rename_from_nonexisting); +ATF_TC_BODY(rename_from_nonexisting, tc) +{ + int rc; + + gen_test_path(); + rc = shm_rename(test_path, test_path2, 0); + if (rc != -1) + atf_tc_fail("shm_rename of nonexisting shm succeeded unexpectedly"); + + if (errno != ENOENT) + atf_tc_fail("Expected ENOENT to rename of nonexistent shm"); +} + +ATF_TC_WITHOUT_HEAD(rename_to_anon); +ATF_TC_BODY(rename_to_anon, tc) +{ + int rc; + + gen_test_path(); + rc = shm_rename(test_path, SHM_ANON, 0); + if (rc != -1) + atf_tc_fail("shm_rename to SHM_ANON succeeded unexpectedly"); +} + +ATF_TC_WITHOUT_HEAD(rename_to_replace); +ATF_TC_BODY(rename_to_replace, tc) +{ + char expected_value; + int fd; + int fd2; + + // Some contents we can verify later + expected_value = 'g'; + + gen_test_path(); + fd = scribble_object(test_path, expected_value); + close(fd); + + // Give the other some different value 
so we can detect success + gen_test_path2(); + fd2 = scribble_object(test_path2, 'h'); + close(fd2); + + ATF_REQUIRE_MSG(shm_rename(test_path, test_path2, 0) == 0, + "shm_rename failed; errno=%d", errno); + + // Read back renamed; verify contents + verify_object(test_path2, expected_value); +} + +ATF_TC_WITHOUT_HEAD(rename_to_noreplace); +ATF_TC_BODY(rename_to_noreplace, tc) +{ + char expected_value_from; + char expected_value_to; + int fd_from; + int fd_to; + int rc; + + // Some contents we can verify later + expected_value_from = 'g'; + gen_test_path(); + fd_from = scribble_object(test_path, expected_value_from); + close(fd_from); + + // Give the other some different value so we can detect success + expected_value_to = 'h'; + gen_test_path2(); + fd_to = scribble_object(test_path2, expected_value_to); + close(fd_to); + + rc = shm_rename(test_path, test_path2, SHM_RENAME_NOREPLACE); + ATF_REQUIRE_MSG((rc == -1) && (errno == EEXIST), + "shm_rename didn't fail as expected; errno: %d; return: %d", errno, + rc); + + // Read back renamed; verify contents + verify_object(test_path2, expected_value_to); +} + +ATF_TC_WITHOUT_HEAD(rename_to_exchange); +ATF_TC_BODY(rename_to_exchange, tc) +{ + char expected_value_from; + char expected_value_to; + int fd_from; + int fd_to; + + // Some contents we can verify later + expected_value_from = 'g'; + gen_test_path(); + fd_from = scribble_object(test_path, expected_value_from); + close(fd_from); + + // Give the other some different value so we can detect success + expected_value_to = 'h'; + gen_test_path2(); + fd_to = scribble_object(test_path2, expected_value_to); + close(fd_to); + + ATF_REQUIRE_MSG(shm_rename(test_path, test_path2, + SHM_RENAME_EXCHANGE) == 0, + "shm_rename failed; errno=%d", errno); + + // Read back renamed; verify contents + verify_object(test_path, expected_value_to); + verify_object(test_path2, expected_value_from); +} + +ATF_TC_WITHOUT_HEAD(rename_to_exchange_nonexisting); +ATF_TC_BODY(rename_to_exchange_nonexisting, tc) +{ + char expected_value_from; + int fd_from; + + // Some contents we can verify later + expected_value_from = 'g'; + gen_test_path(); + fd_from = scribble_object(test_path, expected_value_from); + close(fd_from); + + gen_test_path2(); + + ATF_REQUIRE_MSG(shm_rename(test_path, test_path2, + SHM_RENAME_EXCHANGE) == 0, + "shm_rename failed; errno=%d", errno); + + // Read back renamed; verify contents + verify_object(test_path2, expected_value_from); +} + +ATF_TC_WITHOUT_HEAD(rename_to_self); +ATF_TC_BODY(rename_to_self, tc) +{ + int fd; + char expected_value; + + expected_value = 't'; + + gen_test_path(); + fd = scribble_object(test_path, expected_value); + close(fd); + + ATF_REQUIRE_MSG(shm_rename(test_path, test_path, 0) == 0, + "shm_rename failed; errno=%d", errno); + + verify_object(test_path, expected_value); +} + +ATF_TC_WITHOUT_HEAD(rename_bad_flag); +ATF_TC_BODY(rename_bad_flag, tc) +{ + int fd; + int rc; + + /* Make sure we don't fail out due to ENOENT */ + gen_test_path(); + gen_test_path2(); + fd = scribble_object(test_path, 'd'); + close(fd); + fd = scribble_object(test_path2, 'd'); + close(fd); + + /* + * Note: if we end up with enough flags that we use all the bits, + * then remove this test completely. 
+ */ + rc = shm_rename(test_path, test_path2, INT_MIN); + ATF_REQUIRE_MSG((rc == -1) && (errno == EINVAL), + "shm_rename should have failed with EINVAL; got: return=%d, " + "errno=%d", rc, errno); +} + ATF_TC_WITHOUT_HEAD(reopen_object); ATF_TC_BODY(reopen_object, tc) { @@ -157,7 +408,8 @@ ATF_TC_BODY(reopen_object, tc) ATF_REQUIRE(0 < (pagesize = getpagesize())); - fd = scribble_object(); + gen_test_path(); + fd = scribble_object(test_path, '1'); close(fd); fd = shm_open(test_path, O_RDONLY, 0777); @@ -634,6 +886,16 @@ ATF_TP_ADD_TCS(tp) { ATF_TP_ADD_TC(tp, remap_object); + ATF_TP_ADD_TC(tp, rename_from_anon); + ATF_TP_ADD_TC(tp, rename_bad_path_pointer); + ATF_TP_ADD_TC(tp, rename_from_nonexisting); + ATF_TP_ADD_TC(tp, rename_to_anon); + ATF_TP_ADD_TC(tp, rename_to_replace); + ATF_TP_ADD_TC(tp, rename_to_noreplace); + ATF_TP_ADD_TC(tp, rename_to_exchange); + ATF_TP_ADD_TC(tp, rename_to_exchange_nonexisting); + ATF_TP_ADD_TC(tp, rename_to_self); + ATF_TP_ADD_TC(tp, rename_bad_flag); ATF_TP_ADD_TC(tp, reopen_object); ATF_TP_ADD_TC(tp, readonly_mmap_write); ATF_TP_ADD_TC(tp, open_after_link); diff --git a/usr.bin/truss/syscalls.c b/usr.bin/truss/syscalls.c index e6a1fbd21bf8..e06a818b721f 100644 --- a/usr.bin/truss/syscalls.c +++ b/usr.bin/truss/syscalls.c @@ -471,6 +471,8 @@ static struct syscall decoded_syscalls[] = { { Ptr | IN, 3 }, { Socklent, 4 } } }, { .name = "shm_open", .ret_type = 1, .nargs = 3, .args = { { ShmName | IN, 0 }, { Open, 1 }, { Octal, 2 } } }, + { .name = "shm_rename", .ret_type = 1, .nargs = 3, + .args = { { Name | IN, 0 }, { Name | IN, 1 }, { Hex, 2 } } }, { .name = "shm_unlink", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "shutdown", .ret_type = 1, .nargs = 2, From 55248d32f20af9f3ef5061a0fcf5f886b481ba6c Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 26 Sep 2019 15:35:35 +0000 Subject: [PATCH 103/106] Fix handling of invalid pages in exec_map_first_page(). exec_map_first_page() would unconditionally free an unbacked, invalid page from the executable image. However, it is possible that the page is wired, in which case it is incorrect to free the page, so check for additional wirings first. 
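In outline, the corrected error paths only free a page whose last wiring was just dropped. The following is a simplified sketch of that pattern for a single page pointer m (a stand-in variable, not part of the patch, which operates on the busied page array ma[] in the hunks below), assuming vm_page_unwire_noq() returns true exactly when it removes the last wiring:

	/*
	 * Sketch: dispose of a busied page that turned out to be invalid.
	 * Free it only if dropping our wiring released the last one;
	 * otherwise another holder still relies on the page, so just
	 * clear the exclusive busy state and leave the page in place.
	 */
	if (vm_page_unwire_noq(m))	/* last wiring gone? */
		vm_page_free(m);
	else
		vm_page_xunbusy(m);
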
Reported by: syzkaller Tested by: pho Reviewed by: kib MFC after: 1 week Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D21767 --- sys/kern/kern_exec.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 2dd04d2417db..5abced8fbbaa 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -981,8 +981,10 @@ exec_map_first_page(struct image_params *imgp) if (ma[0]->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(ma[0]); if (!vm_pager_has_page(object, 0, NULL, &after)) { - vm_page_unwire_noq(ma[0]); - vm_page_free(ma[0]); + if (vm_page_unwire_noq(ma[0])) + vm_page_free(ma[0]); + else + vm_page_xunbusy(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } @@ -1006,9 +1008,16 @@ exec_map_first_page(struct image_params *imgp) initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { - vm_page_unwire_noq(ma[0]); - for (i = 0; i < initial_pagein; i++) - vm_page_free(ma[i]); + if (vm_page_unwire_noq(ma[0])) + vm_page_free(ma[0]); + else + vm_page_xunbusy(ma[0]); + for (i = 1; i < initial_pagein; i++) { + if (!vm_page_wired(ma[i])) + vm_page_free(ma[i]); + else + vm_page_xunbusy(ma[i]); + } VM_OBJECT_WUNLOCK(object); return (EIO); } From c4571256af14765dbfa5649d51419cc7cfb03a31 Mon Sep 17 00:00:00 2001 From: David Bright Date: Thu, 26 Sep 2019 15:41:10 +0000 Subject: [PATCH 104/106] sysent: regenerate after r352747. Sponsored by: Dell EMC Isilon --- sys/compat/freebsd32/freebsd32_syscall.h | 3 +- sys/compat/freebsd32/freebsd32_syscalls.c | 1 + sys/compat/freebsd32/freebsd32_sysent.c | 1 + .../freebsd32/freebsd32_systrace_args.c | 30 +++++++++++++++++++ sys/kern/init_sysent.c | 1 + sys/kern/syscalls.c | 1 + sys/kern/systrace_args.c | 30 +++++++++++++++++++ sys/sys/syscall.h | 3 +- sys/sys/syscall.mk | 3 +- sys/sys/sysproto.h | 7 +++++ 10 files changed, 77 insertions(+), 3 deletions(-) diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index c535b03a05d8..bcdb1579cb57 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -498,4 +498,5 @@ #define FREEBSD32_SYS_copy_file_range 569 #define FREEBSD32_SYS_freebsd32___sysctlbyname 570 #define FREEBSD32_SYS_shm_open2 571 -#define FREEBSD32_SYS_MAXSYSCALL 572 +#define FREEBSD32_SYS_shm_rename 572 +#define FREEBSD32_SYS_MAXSYSCALL 573 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index 1e42d2aaf0ce..223c6772829b 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -608,4 +608,5 @@ const char *freebsd32_syscallnames[] = { "copy_file_range", /* 569 = copy_file_range */ "freebsd32___sysctlbyname", /* 570 = freebsd32___sysctlbyname */ "shm_open2", /* 571 = shm_open2 */ + "shm_rename", /* 572 = shm_rename */ }; diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index c64cd81c1f3c..5b5e1117d16f 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -661,4 +661,5 @@ struct sysent freebsd32_sysent[] = { { AS(copy_file_range_args), (sy_call_t *)sys_copy_file_range, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 569 = copy_file_range */ { AS(freebsd32___sysctlbyname_args), (sy_call_t *)freebsd32___sysctlbyname, AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 570 = freebsd32___sysctlbyname 
*/ { AS(shm_open2_args), (sy_call_t *)sys_shm_open2, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 571 = shm_open2 */ + { AS(shm_rename_args), (sy_call_t *)sys_shm_rename, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 572 = shm_rename */ }; diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c index 45ed055967e2..c4cf8a7ec72d 100644 --- a/sys/compat/freebsd32/freebsd32_systrace_args.c +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c @@ -3346,6 +3346,15 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 5; break; } + /* shm_rename */ + case 572: { + struct shm_rename_args *p = params; + uarg[0] = (intptr_t) p->path_from; /* const char * */ + uarg[1] = (intptr_t) p->path_to; /* const char * */ + iarg[2] = p->flags; /* int */ + *n_args = 3; + break; + } default: *n_args = 0; break; @@ -9016,6 +9025,22 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* shm_rename */ + case 572: + switch(ndx) { + case 0: + p = "userland const char *"; + break; + case 1: + p = "userland const char *"; + break; + case 2: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -10900,6 +10925,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* shm_rename */ + case 572: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 4727e8189254..850fbb281ad2 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -627,4 +627,5 @@ struct sysent sysent[] = { { AS(copy_file_range_args), (sy_call_t *)sys_copy_file_range, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 569 = copy_file_range */ { AS(__sysctlbyname_args), (sy_call_t *)sys___sysctlbyname, AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 570 = __sysctlbyname */ { AS(shm_open2_args), (sy_call_t *)sys_shm_open2, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 571 = shm_open2 */ + { AS(shm_rename_args), (sy_call_t *)sys_shm_rename, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 572 = shm_rename */ }; diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 1e5a69ce6a68..3daa26a41f09 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -578,4 +578,5 @@ const char *syscallnames[] = { "copy_file_range", /* 569 = copy_file_range */ "__sysctlbyname", /* 570 = __sysctlbyname */ "shm_open2", /* 571 = shm_open2 */ + "shm_rename", /* 572 = shm_rename */ }; diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 3816389eb3d0..8b40379c66ad 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3338,6 +3338,15 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 5; break; } + /* shm_rename */ + case 572: { + struct shm_rename_args *p = params; + uarg[0] = (intptr_t) p->path_from; /* const char * */ + uarg[1] = (intptr_t) p->path_to; /* const char * */ + iarg[2] = p->flags; /* int */ + *n_args = 3; + break; + } default: *n_args = 0; break; @@ -8921,6 +8930,22 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* shm_rename */ + case 572: + switch(ndx) { + case 0: + p = "userland const char *"; + break; + case 1: + p = "userland const char *"; + break; + case 2: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -10832,6 +10857,11 @@ systrace_return_setargdesc(int 
sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* shm_rename */ + case 572: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index 57b764ec9085..bfe648fa0808 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -507,4 +507,5 @@ #define SYS_copy_file_range 569 #define SYS___sysctlbyname 570 #define SYS_shm_open2 571 -#define SYS_MAXSYSCALL 572 +#define SYS_shm_rename 572 +#define SYS_MAXSYSCALL 573 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk index e813af1e7bcf..269010e25e71 100644 --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -412,4 +412,5 @@ MIASM = \ funlinkat.o \ copy_file_range.o \ __sysctlbyname.o \ - shm_open2.o + shm_open2.o \ + shm_rename.o diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index 0b9364830128..941bd1b64c35 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -1814,6 +1814,11 @@ struct shm_open2_args { char shmflags_l_[PADL_(int)]; int shmflags; char shmflags_r_[PADR_(int)]; char name_l_[PADL_(const char *)]; const char * name; char name_r_[PADR_(const char *)]; }; +struct shm_rename_args { + char path_from_l_[PADL_(const char *)]; const char * path_from; char path_from_r_[PADR_(const char *)]; + char path_to_l_[PADL_(const char *)]; const char * path_to; char path_to_r_[PADR_(const char *)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_sys_exit(struct thread *, struct sys_exit_args *); int sys_fork(struct thread *, struct fork_args *); @@ -2201,6 +2206,7 @@ int sys_funlinkat(struct thread *, struct funlinkat_args *); int sys_copy_file_range(struct thread *, struct copy_file_range_args *); int sys___sysctlbyname(struct thread *, struct __sysctlbyname_args *); int sys_shm_open2(struct thread *, struct shm_open2_args *); +int sys_shm_rename(struct thread *, struct shm_rename_args *); #ifdef COMPAT_43 @@ -3123,6 +3129,7 @@ int freebsd12_shm_open(struct thread *, struct freebsd12_shm_open_args *); #define SYS_AUE_copy_file_range AUE_NULL #define SYS_AUE___sysctlbyname AUE_SYSCTL #define SYS_AUE_shm_open2 AUE_SHMOPEN +#define SYS_AUE_shm_rename AUE_NULL #undef PAD_ #undef PADL_ From d4f44305039b3f432896fe8a5d011473573d46f3 Mon Sep 17 00:00:00 2001 From: David Bright Date: Thu, 26 Sep 2019 16:13:17 +0000 Subject: [PATCH 105/106] Correct mistake in MLINKS introduced in r352747 Messed up a merge conflict resolution and didn't catch that before commit. 
Sponsored by: Dell EMC Isilon --- lib/libc/sys/Makefile.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc index 6dec508065c0..4e048b5f3927 100644 --- a/lib/libc/sys/Makefile.inc +++ b/lib/libc/sys/Makefile.inc @@ -478,7 +478,7 @@ MLINKS+=setuid.2 setegid.2 \ MLINKS+=shmat.2 shmdt.2 MLINKS+=shm_open.2 memfd_create.3 \ shm_open.2 shm_unlink.2 \ - shm_rename.2 + shm_open.2 shm_rename.2 MLINKS+=sigwaitinfo.2 sigtimedwait.2 MLINKS+=stat.2 fstat.2 \ stat.2 fstatat.2 \ From e12ff891366cf94db4bfe4c2c810b26a5531053d Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Thu, 26 Sep 2019 16:19:22 +0000 Subject: [PATCH 106/106] Further normalize copyright notices - s/C/c/ where I've been inconsistent about it - +SPDX tags - Remove "All rights reserved" where possible Requested by: rgrimes (all rights reserved) --- lib/libc/sys/shm_open.c | 1 - lib/libregex/tests/libregex_test.sh | 3 ++- stand/lua/config.lua | 2 +- stand/lua/menu.lua | 2 +- stand/lua/password.lua | 2 +- tests/sys/kern/memfd_test.c | 1 - 6 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/libc/sys/shm_open.c b/lib/libc/sys/shm_open.c index 2c91eb2c8e1e..03c98a0c8421 100644 --- a/lib/libc/sys/shm_open.c +++ b/lib/libc/sys/shm_open.c @@ -2,7 +2,6 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 Kyle Evans - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/lib/libregex/tests/libregex_test.sh b/lib/libregex/tests/libregex_test.sh index 071f407cdb10..9e41db67e10b 100755 --- a/lib/libregex/tests/libregex_test.sh +++ b/lib/libregex/tests/libregex_test.sh @@ -1,6 +1,7 @@ # +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# # Copyright (c) 2017 Kyle Evans -# All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/stand/lua/config.lua b/stand/lua/config.lua index c3f79314b20c..873e778810b1 100644 --- a/stand/lua/config.lua +++ b/stand/lua/config.lua @@ -2,7 +2,7 @@ -- SPDX-License-Identifier: BSD-2-Clause-FreeBSD -- -- Copyright (c) 2015 Pedro Souza --- Copyright (C) 2018 Kyle Evans +-- Copyright (c) 2018 Kyle Evans -- All rights reserved. -- -- Redistribution and use in source and binary forms, with or without diff --git a/stand/lua/menu.lua b/stand/lua/menu.lua index 51098844e87e..e8d9b91b6a54 100644 --- a/stand/lua/menu.lua +++ b/stand/lua/menu.lua @@ -2,7 +2,7 @@ -- SPDX-License-Identifier: BSD-2-Clause-FreeBSD -- -- Copyright (c) 2015 Pedro Souza --- Copyright (C) 2018 Kyle Evans +-- Copyright (c) 2018 Kyle Evans -- All rights reserved. -- -- Redistribution and use in source and binary forms, with or without diff --git a/stand/lua/password.lua b/stand/lua/password.lua index 8042a5d3d0b5..8edd4edd7ec3 100644 --- a/stand/lua/password.lua +++ b/stand/lua/password.lua @@ -2,7 +2,7 @@ -- SPDX-License-Identifier: BSD-2-Clause-FreeBSD -- -- Copyright (c) 2015 Pedro Souza --- Copyright (C) 2018 Kyle Evans +-- Copyright (c) 2018 Kyle Evans -- All rights reserved. 
-- -- Redistribution and use in source and binary forms, with or without diff --git a/tests/sys/kern/memfd_test.c b/tests/sys/kern/memfd_test.c index 6b48ab85bd60..6ab4c2a14258 100644 --- a/tests/sys/kern/memfd_test.c +++ b/tests/sys/kern/memfd_test.c @@ -2,7 +2,6 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 Kyle Evans - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions