Here we update the modular TCP to be able to switch to an
alternate TCP stack in a state other than the closed state (pre-listen/connect). The idea is that *if* that is supported by the alternate stack, it is asked if it's OK to switch. If it approves the "handoff" then we allow the switch to happen. Also, the fini() function now gets a flag to tell whether you are switching away *or* the tcb is being destroyed. The init() call into the alternate stack is moved to the end so the tcb is more fully formed before the init transpires. Sponsored by: Netflix Inc. Differential Revision: D6790
This commit is contained in:
parent
f13741f250
commit
4d7e0cd8cd
@ -633,7 +633,8 @@ when trying to use a TCP function block that is not available;
|
|||||||
.Xr mod_cc 4 ,
|
.Xr mod_cc 4 ,
|
||||||
.Xr siftr 4 ,
|
.Xr siftr 4 ,
|
||||||
.Xr syncache 4 ,
|
.Xr syncache 4 ,
|
||||||
.Xr setkey 8
|
.Xr setkey 8 ,
|
||||||
|
.Xr tcp_functions 9
|
||||||
.Rs
|
.Rs
|
||||||
.%A "V. Jacobson"
|
.%A "V. Jacobson"
|
||||||
.%A "R. Braden"
|
.%A "R. Braden"
|
||||||
|
@ -114,14 +114,17 @@ struct tcp_function_block {
|
|||||||
struct inpcb *inp, struct tcpcb *tp);
|
struct inpcb *inp, struct tcpcb *tp);
|
||||||
/* Optional memory allocation/free routine */
|
/* Optional memory allocation/free routine */
|
||||||
void (*tfb_tcp_fb_init)(struct tcpcb *);
|
void (*tfb_tcp_fb_init)(struct tcpcb *);
|
||||||
void (*tfb_tcp_fb_fini)(struct tcpcb *);
|
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
|
||||||
/* Optional timers, must define all if you define one */
|
/* Optional timers, must define all if you define one */
|
||||||
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
|
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
|
||||||
void (*tfb_tcp_timer_activate)(struct tcpcb *,
|
void (*tfb_tcp_timer_activate)(struct tcpcb *,
|
||||||
uint32_t, u_int);
|
uint32_t, u_int);
|
||||||
int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
|
int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
|
||||||
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
|
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
|
||||||
|
/* Optional functions */
|
||||||
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
|
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
|
||||||
|
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
|
||||||
|
/* System use */
|
||||||
volatile uint32_t tfb_refcnt;
|
volatile uint32_t tfb_refcnt;
|
||||||
uint32_t tfb_flags;
|
uint32_t tfb_flags;
|
||||||
};
|
};
|
||||||
@ -157,6 +160,16 @@ in the
|
|||||||
.Va tfb_tcp_fb_fini
|
.Va tfb_tcp_fb_fini
|
||||||
field.
|
field.
|
||||||
.Pp
|
.Pp
|
||||||
|
If the
|
||||||
|
.Va tfb_tcp_fb_fini
|
||||||
|
argument is non-NULL, the function to which it points is called when the
|
||||||
|
kernel is destroying the TCP control block or when the socket is transitioning
|
||||||
|
to use a different TCP stack.
|
||||||
|
The function is called with arguments of the TCP control block and an integer
|
||||||
|
flag.
|
||||||
|
The flag will be zero if the socket is transitioning to use another TCP stack
|
||||||
|
or one if the TCP control block is being destroyed.
|
||||||
|
.Pp
|
||||||
If the TCP stack implements additional timers, the TCP stack should set a
|
If the TCP stack implements additional timers, the TCP stack should set a
|
||||||
non-NULL pointer in the
|
non-NULL pointer in the
|
||||||
.Va tfb_tcp_timer_stop_all ,
|
.Va tfb_tcp_timer_stop_all ,
|
||||||
@ -193,6 +206,37 @@ However, care must be taken to ensure the retransmit timer leaves the
|
|||||||
TCP control block in a valid state for the remainder of the retransmit
|
TCP control block in a valid state for the remainder of the retransmit
|
||||||
timer logic.
|
timer logic.
|
||||||
.Pp
|
.Pp
|
||||||
|
A user may select a new TCP stack before calling
|
||||||
|
.Xr connect 2
|
||||||
|
or
|
||||||
|
.Xr listen 2 .
|
||||||
|
Optionally, a TCP stack may also allow a user to begin using the TCP stack for
|
||||||
|
a connection that is in a later state by setting a non-NULL function pointer in
|
||||||
|
the
|
||||||
|
.Va tfb_tcp_handoff_ok
|
||||||
|
field.
|
||||||
|
If this field is non-NULL and a user attempts to select that TCP stack after
|
||||||
|
calling
|
||||||
|
.Xr connect 2
|
||||||
|
or
|
||||||
|
.Xr listen 2
|
||||||
|
for that socket, the kernel will call the function pointed to by the
|
||||||
|
.Va tfb_tcp_handoff_ok
|
||||||
|
field.
|
||||||
|
The function should return 0 if the user is allowed to switch the socket to use
|
||||||
|
the TCP stack. Otherwise, the function should return an error code, which will
|
||||||
|
be returned to the user.
|
||||||
|
If the
|
||||||
|
.Va tfb_tcp_handoff_ok
|
||||||
|
field is
|
||||||
|
.Dv NULL
|
||||||
|
and a user attempts to select the TCP stack after calling
|
||||||
|
.Xr connect 2
|
||||||
|
or
|
||||||
|
.Xr listen 2
|
||||||
|
for that socket, the operation will fail and the kernel will return
|
||||||
|
.Er EINVAL .
|
||||||
|
.Pp
|
||||||
The
|
The
|
||||||
.Va tfb_refcnt
|
.Va tfb_refcnt
|
||||||
and
|
and
|
||||||
@ -269,8 +313,10 @@ The
|
|||||||
.Fa blk
|
.Fa blk
|
||||||
argument references a function block that is not currently registered.
|
argument references a function block that is not currently registered.
|
||||||
.Sh SEE ALSO
|
.Sh SEE ALSO
|
||||||
.Xr malloc 9 ,
|
.Xr connect 2 ,
|
||||||
.Xr tcp 4
|
.Xr listen 2 ,
|
||||||
|
.Xr tcp 4 ,
|
||||||
|
.Xr malloc 9
|
||||||
.Sh HISTORY
|
.Sh HISTORY
|
||||||
This framework first appeared in
|
This framework first appeared in
|
||||||
.Fx 11.0 .
|
.Fx 11.0 .
|
||||||
|
@ -1187,9 +1187,6 @@ tcp_newtcpcb(struct inpcb *inp)
|
|||||||
tp->t_fb = tcp_func_set_ptr;
|
tp->t_fb = tcp_func_set_ptr;
|
||||||
refcount_acquire(&tp->t_fb->tfb_refcnt);
|
refcount_acquire(&tp->t_fb->tfb_refcnt);
|
||||||
rw_runlock(&tcp_function_lock);
|
rw_runlock(&tcp_function_lock);
|
||||||
if (tp->t_fb->tfb_tcp_fb_init) {
|
|
||||||
(*tp->t_fb->tfb_tcp_fb_init)(tp);
|
|
||||||
}
|
|
||||||
/*
|
/*
|
||||||
* Use the current system default CC algorithm.
|
* Use the current system default CC algorithm.
|
||||||
*/
|
*/
|
||||||
@ -1201,7 +1198,7 @@ tcp_newtcpcb(struct inpcb *inp)
|
|||||||
if (CC_ALGO(tp)->cb_init != NULL)
|
if (CC_ALGO(tp)->cb_init != NULL)
|
||||||
if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
|
if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
|
||||||
if (tp->t_fb->tfb_tcp_fb_fini)
|
if (tp->t_fb->tfb_tcp_fb_fini)
|
||||||
(*tp->t_fb->tfb_tcp_fb_fini)(tp);
|
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
|
||||||
refcount_release(&tp->t_fb->tfb_refcnt);
|
refcount_release(&tp->t_fb->tfb_refcnt);
|
||||||
uma_zfree(V_tcpcb_zone, tm);
|
uma_zfree(V_tcpcb_zone, tm);
|
||||||
return (NULL);
|
return (NULL);
|
||||||
@ -1210,7 +1207,7 @@ tcp_newtcpcb(struct inpcb *inp)
|
|||||||
tp->osd = &tm->osd;
|
tp->osd = &tm->osd;
|
||||||
if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
|
if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
|
||||||
if (tp->t_fb->tfb_tcp_fb_fini)
|
if (tp->t_fb->tfb_tcp_fb_fini)
|
||||||
(*tp->t_fb->tfb_tcp_fb_fini)(tp);
|
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
|
||||||
refcount_release(&tp->t_fb->tfb_refcnt);
|
refcount_release(&tp->t_fb->tfb_refcnt);
|
||||||
uma_zfree(V_tcpcb_zone, tm);
|
uma_zfree(V_tcpcb_zone, tm);
|
||||||
return (NULL);
|
return (NULL);
|
||||||
@ -1271,6 +1268,9 @@ tcp_newtcpcb(struct inpcb *inp)
|
|||||||
*/
|
*/
|
||||||
tcp_pcap_tcpcb_init(tp);
|
tcp_pcap_tcpcb_init(tp);
|
||||||
#endif
|
#endif
|
||||||
|
if (tp->t_fb->tfb_tcp_fb_init) {
|
||||||
|
(*tp->t_fb->tfb_tcp_fb_init)(tp);
|
||||||
|
}
|
||||||
return (tp); /* XXX */
|
return (tp); /* XXX */
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1484,7 +1484,7 @@ tcp_discardcb(struct tcpcb *tp)
|
|||||||
if (tp->t_timers->tt_draincnt == 0) {
|
if (tp->t_timers->tt_draincnt == 0) {
|
||||||
/* We own the last reference on tcpcb, let's free it. */
|
/* We own the last reference on tcpcb, let's free it. */
|
||||||
if (tp->t_fb->tfb_tcp_fb_fini)
|
if (tp->t_fb->tfb_tcp_fb_fini)
|
||||||
(*tp->t_fb->tfb_tcp_fb_fini)(tp);
|
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
|
||||||
refcount_release(&tp->t_fb->tfb_refcnt);
|
refcount_release(&tp->t_fb->tfb_refcnt);
|
||||||
tp->t_inpcb = NULL;
|
tp->t_inpcb = NULL;
|
||||||
uma_zfree(V_tcpcb_zone, tp);
|
uma_zfree(V_tcpcb_zone, tp);
|
||||||
@ -1513,7 +1513,7 @@ tcp_timer_discard(void *ptp)
|
|||||||
if (tp->t_timers->tt_draincnt == 0) {
|
if (tp->t_timers->tt_draincnt == 0) {
|
||||||
/* We own the last reference on this tcpcb, let's free it. */
|
/* We own the last reference on this tcpcb, let's free it. */
|
||||||
if (tp->t_fb->tfb_tcp_fb_fini)
|
if (tp->t_fb->tfb_tcp_fb_fini)
|
||||||
(*tp->t_fb->tfb_tcp_fb_fini)(tp);
|
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
|
||||||
refcount_release(&tp->t_fb->tfb_refcnt);
|
refcount_release(&tp->t_fb->tfb_refcnt);
|
||||||
tp->t_inpcb = NULL;
|
tp->t_inpcb = NULL;
|
||||||
uma_zfree(V_tcpcb_zone, tp);
|
uma_zfree(V_tcpcb_zone, tp);
|
||||||
|
@ -842,7 +842,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
|
|||||||
KASSERT(rblk != NULL,
|
KASSERT(rblk != NULL,
|
||||||
("cannot find blk %p out of syncache?", blk));
|
("cannot find blk %p out of syncache?", blk));
|
||||||
if (tp->t_fb->tfb_tcp_fb_fini)
|
if (tp->t_fb->tfb_tcp_fb_fini)
|
||||||
(*tp->t_fb->tfb_tcp_fb_fini)(tp);
|
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
|
||||||
refcount_release(&tp->t_fb->tfb_refcnt);
|
refcount_release(&tp->t_fb->tfb_refcnt);
|
||||||
tp->t_fb = rblk;
|
tp->t_fb = rblk;
|
||||||
if (tp->t_fb->tfb_tcp_fb_init) {
|
if (tp->t_fb->tfb_tcp_fb_init) {
|
||||||
|
@ -1420,40 +1420,59 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
|
|||||||
if (error)
|
if (error)
|
||||||
return (error);
|
return (error);
|
||||||
INP_WLOCK_RECHECK(inp);
|
INP_WLOCK_RECHECK(inp);
|
||||||
if (tp->t_state != TCPS_CLOSED) {
|
|
||||||
/*
|
|
||||||
* The user has advanced the state
|
|
||||||
* past the initial point, we can't
|
|
||||||
* switch since we are down the road
|
|
||||||
* and a new set of functions may
|
|
||||||
* not be compatibile.
|
|
||||||
*/
|
|
||||||
INP_WUNLOCK(inp);
|
|
||||||
return(EINVAL);
|
|
||||||
}
|
|
||||||
blk = find_and_ref_tcp_functions(&fsn);
|
blk = find_and_ref_tcp_functions(&fsn);
|
||||||
if (blk == NULL) {
|
if (blk == NULL) {
|
||||||
INP_WUNLOCK(inp);
|
INP_WUNLOCK(inp);
|
||||||
return (ENOENT);
|
return (ENOENT);
|
||||||
}
|
}
|
||||||
if (tp->t_fb != blk) {
|
if (tp->t_fb == blk) {
|
||||||
if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
|
/* You already have this */
|
||||||
|
refcount_release(&blk->tfb_refcnt);
|
||||||
|
INP_WUNLOCK(inp);
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
if (tp->t_state != TCPS_CLOSED) {
|
||||||
|
int error=EINVAL;
|
||||||
|
/*
|
||||||
|
* The user has advanced the state
|
||||||
|
* past the initial point, we may not
|
||||||
|
* be able to switch.
|
||||||
|
*/
|
||||||
|
if (blk->tfb_tcp_handoff_ok != NULL) {
|
||||||
|
/*
|
||||||
|
* Does the stack provide a
|
||||||
|
* query mechanism, if so it may
|
||||||
|
* still be possible?
|
||||||
|
*/
|
||||||
|
error = (*blk->tfb_tcp_handoff_ok)(tp);
|
||||||
|
}
|
||||||
|
if (error) {
|
||||||
refcount_release(&blk->tfb_refcnt);
|
refcount_release(&blk->tfb_refcnt);
|
||||||
INP_WUNLOCK(inp);
|
INP_WUNLOCK(inp);
|
||||||
return (ENOENT);
|
return(error);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
|
||||||
|
refcount_release(&blk->tfb_refcnt);
|
||||||
|
INP_WUNLOCK(inp);
|
||||||
|
return (ENOENT);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Release the old refcnt, the
|
||||||
|
* lookup acquired a ref on the
|
||||||
|
* new one already.
|
||||||
|
*/
|
||||||
|
if (tp->t_fb->tfb_tcp_fb_fini) {
|
||||||
/*
|
/*
|
||||||
* Release the old refcnt, the
|
* Tell the stack to cleanup with 0 i.e.
|
||||||
* lookup acquires a ref on the
|
* the tcb is not going away.
|
||||||
* new one.
|
|
||||||
*/
|
*/
|
||||||
if (tp->t_fb->tfb_tcp_fb_fini)
|
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
|
||||||
(*tp->t_fb->tfb_tcp_fb_fini)(tp);
|
}
|
||||||
refcount_release(&tp->t_fb->tfb_refcnt);
|
refcount_release(&tp->t_fb->tfb_refcnt);
|
||||||
tp->t_fb = blk;
|
tp->t_fb = blk;
|
||||||
if (tp->t_fb->tfb_tcp_fb_init) {
|
if (tp->t_fb->tfb_tcp_fb_init) {
|
||||||
(*tp->t_fb->tfb_tcp_fb_init)(tp);
|
(*tp->t_fb->tfb_tcp_fb_init)(tp);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#ifdef TCP_OFFLOAD
|
#ifdef TCP_OFFLOAD
|
||||||
if (tp->t_flags & TF_TOE) {
|
if (tp->t_flags & TF_TOE) {
|
||||||
|
@ -116,6 +116,18 @@ struct socket;
|
|||||||
* does not know your callbacks you must provide a
|
* does not know your callbacks you must provide a
|
||||||
* stop_all function that loops through and calls
|
* stop_all function that loops through and calls
|
||||||
* tcp_timer_stop() with each of your defined timers.
|
* tcp_timer_stop() with each of your defined timers.
|
||||||
|
* Adding a tfb_tcp_handoff_ok function allows the socket
|
||||||
|
* option to change stacks to query you even if the
|
||||||
|
* connection is in a later stage. You return 0 to
|
||||||
|
* say you can take over and run your stack, you return
|
||||||
|
* non-zero (an error number) to say no you can't.
|
||||||
|
* If the function is undefined you can only change
|
||||||
|
* in the early states (before connect or listen).
|
||||||
|
* tfb_tcp_fb_fini is changed to add a flag to tell
|
||||||
|
* the old stack if the tcb is being destroyed or
|
||||||
|
* not. A one in the flag means the TCB is being
|
||||||
|
* destroyed, a zero indicates it's transitioning to
|
||||||
|
* another stack (via socket option).
|
||||||
*/
|
*/
|
||||||
struct tcp_function_block {
|
struct tcp_function_block {
|
||||||
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
|
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
|
||||||
@ -128,7 +140,7 @@ struct tcp_function_block {
|
|||||||
struct inpcb *inp, struct tcpcb *tp);
|
struct inpcb *inp, struct tcpcb *tp);
|
||||||
/* Optional memory allocation/free routine */
|
/* Optional memory allocation/free routine */
|
||||||
void (*tfb_tcp_fb_init)(struct tcpcb *);
|
void (*tfb_tcp_fb_init)(struct tcpcb *);
|
||||||
void (*tfb_tcp_fb_fini)(struct tcpcb *);
|
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
|
||||||
/* Optional timers, must define all if you define one */
|
/* Optional timers, must define all if you define one */
|
||||||
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
|
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
|
||||||
void (*tfb_tcp_timer_activate)(struct tcpcb *,
|
void (*tfb_tcp_timer_activate)(struct tcpcb *,
|
||||||
@ -136,6 +148,7 @@ struct tcp_function_block {
|
|||||||
int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
|
int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
|
||||||
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
|
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
|
||||||
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
|
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
|
||||||
|
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
|
||||||
volatile uint32_t tfb_refcnt;
|
volatile uint32_t tfb_refcnt;
|
||||||
uint32_t tfb_flags;
|
uint32_t tfb_flags;
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user