6d9b42b486
PR: bin/154928 Submitted by: Eitan Adler <lists at eitanadler.com> MFC after: 3 days
355 lines
12 KiB
C
355 lines
12 KiB
C
/*-
|
|
* Copyright (c) 2007, Chelsio Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Neither the name of the Chelsio Corporation nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* $FreeBSD$
|
|
*/
|
|
|
|
#ifndef _NETINET_TCP_OFFLOAD_H_
|
|
#define _NETINET_TCP_OFFLOAD_H_
|
|
|
|
#ifndef _KERNEL
|
|
#error "no user-serviceable parts inside"
|
|
#endif
|
|
|
|
/*
|
|
* A driver publishes that it provides offload services
|
|
* by setting IFCAP_TOE in the ifnet. The offload connect
|
|
* will bypass any further work if the interface that a
|
|
* connection would use does not support TCP offload.
|
|
*
|
|
* The TOE API assumes that the tcp offload engine can offload the
|
|
* entire connection from set up to teardown, with some provision
|
|
* being made to allow the software stack to handle time wait. If
|
|
* the device does not meet these criteria, it is the driver's responsibility
|
|
* to overload the functions that it needs to in tcp_usrreqs and make
|
|
* its own calls to tcp_output if it needs to do so.
|
|
*
|
|
* There is currently no provision for the device advertising the congestion
|
|
* control algorithms it supports as there is currently no API for querying
|
|
* an operating system for the protocols that it has loaded. This is a desirable
|
|
* future extension.
|
|
*
|
|
*
|
|
*
|
|
* It is assumed that individuals deploying TOE will want connections
|
|
* to be offloaded without software changes so all connections on an
|
|
* interface providing TOE are offloaded unless the SO_NO_OFFLOAD
|
|
* flag is set on the socket.
|
|
*
|
|
*
|
|
* The toe_usrreqs structure constitutes the TOE driver's
|
|
* interface to the TCP stack for functionality that doesn't
|
|
* interact directly with userspace. If one wants to provide
|
|
* (optional) functionality to do zero-copy to/from
|
|
* userspace one still needs to override soreceive/sosend
|
|
* with functions that fault in and pin the user buffers.
|
|
*
|
|
* + tu_send
|
|
* - tells the driver that new data may have been added to the
|
|
* socket's send buffer - the driver should not fail if the
|
|
* buffer is in fact unchanged
|
|
* - the driver is responsible for providing credits (bytes in the send window)
|
|
* back to the socket by calling sbdrop() as segments are acknowledged.
|
|
* - The driver expects the inpcb lock to be held - the driver is expected
|
|
* not to drop the lock. Hence the driver is not allowed to acquire the
|
|
* pcbinfo lock during this call.
|
|
*
|
|
* + tu_rcvd
|
|
* - returns credits to the driver and triggers window updates
|
|
* to the peer (a credit as used here is a byte in the peer's receive window)
|
|
* - the driver is expected to determine how many bytes have been
|
|
* consumed and credit that back to the card so that it can grow
|
|
* the window again by maintaining its own state between invocations.
|
|
* - In principle this could be used to shrink the window as well as
|
|
* grow the window, although it is not used for that now.
|
|
* - this function needs to correctly handle being called any number of
|
|
* times without any bytes being consumed from the receive buffer.
|
|
* - The driver expects the inpcb lock to be held - the driver is expected
|
|
* not to drop the lock. Hence the driver is not allowed to acquire the
|
|
* pcbinfo lock during this call.
|
|
*
|
|
* + tu_disconnect
|
|
* - tells the driver to send FIN to peer
|
|
* - driver is expected to send the remaining data and then do a clean half close
|
|
* - disconnect implies at least half-close so only send, reset, and detach
|
|
* are legal
|
|
* - the driver is expected to handle transition through the shutdown
|
|
* state machine and allow the stack to support SO_LINGER.
|
|
* - The driver expects the inpcb lock to be held - the driver is expected
|
|
* not to drop the lock. Hence the driver is not allowed to acquire the
|
|
* pcbinfo lock during this call.
|
|
*
|
|
* + tu_reset
|
|
* - closes the connection and sends a RST to peer
|
|
* - driver is expected to trigger an RST and detach the toepcb
|
|
* - no further calls are legal after reset
|
|
* - The driver expects the inpcb lock to be held - the driver is expected
|
|
* not to drop the lock. Hence the driver is not allowed to acquire the
|
|
* pcbinfo lock during this call.
|
|
*
|
|
* The following fields in the tcpcb are expected to be referenced by the driver:
|
|
* + iss
|
|
* + rcv_nxt
|
|
* + rcv_wnd
|
|
* + snd_isn
|
|
* + snd_max
|
|
* + snd_nxt
|
|
* + snd_una
|
|
* + t_flags
|
|
* + t_inpcb
|
|
* + t_maxseg
|
|
* + t_toe
|
|
*
|
|
* The following fields in the inpcb are expected to be referenced by the driver:
|
|
* + inp_lport
|
|
* + inp_fport
|
|
* + inp_laddr
|
|
* + inp_faddr
|
|
* + inp_socket
|
|
* + inp_ip_tos
|
|
*
|
|
* The following fields in the socket are expected to be referenced by the
|
|
* driver:
|
|
* + so_comp
|
|
* + so_error
|
|
* + so_linger
|
|
* + so_options
|
|
* + so_rcv
|
|
* + so_snd
|
|
* + so_state
|
|
* + so_timeo
|
|
*
|
|
* These functions all return 0 on success and can return the following errors
|
|
* as appropriate:
|
|
* + EPERM:
|
|
* + ENOBUFS: memory allocation failed
|
|
* + EMSGSIZE: MTU changed during the call
|
|
* + EHOSTDOWN:
|
|
* + EHOSTUNREACH:
|
|
* + ENETDOWN:
|
|
* + ENETUNREACH: the peer is no longer reachable
|
|
*
|
|
* + tu_detach
|
|
* - tells driver that the socket is going away so disconnect
|
|
* the toepcb and free appropriate resources
|
|
* - allows the driver to cleanly handle the case of connection state
|
|
* outliving the socket
|
|
* - no further calls are legal after detach
|
|
* - the driver is expected to provide its own synchronization between
|
|
* detach and receiving new data.
|
|
*
|
|
* + tu_syncache_event
|
|
* - even if it is not actually needed, the driver is expected to
|
|
* call syncache_add for the initial SYN and then syncache_expand
|
|
* for the SYN,ACK
|
|
* - tells driver that a connection either has not been added or has
|
|
* been dropped from the syncache
|
|
* - the driver is expected to maintain state that lives outside the
|
|
* software stack so the syncache needs to be able to notify the
|
|
* toe driver that the software stack is not going to create a connection
|
|
* for a received SYN
|
|
* - The driver is responsible for any synchronization required between
|
|
* the syncache dropping an entry and the driver processing the SYN,ACK.
|
|
*
|
|
*/
|
|
/*
 * Hooks a TOE driver supplies to the TCP stack for an offloaded connection.
 * The int-returning hooks return 0 on success or an errno value.
 */
struct toe_usrreqs {
	/* New data may have been appended to the socket's send buffer. */
	int	(*tu_send)(struct tcpcb *tp);

	/* Return receive credits to the driver / trigger a window update. */
	int	(*tu_rcvd)(struct tcpcb *tp);

	/* Send FIN to the peer (clean half close). */
	int	(*tu_disconnect)(struct tcpcb *tp);

	/* Close the connection and send a RST to the peer. */
	int	(*tu_reset)(struct tcpcb *tp);

	/* The socket is going away; release the driver's connection state. */
	void	(*tu_detach)(struct tcpcb *tp);

	/* A connection was not added to, or was dropped from, the syncache. */
	void	(*tu_syncache_event)(int event, void *toep);
};
|
|
|
|
/*
|
|
* Proxy for struct tcpopt between TOE drivers and TCP functions.
|
|
*/
|
|
struct toeopt {
|
|
u_int64_t to_flags; /* see tcpopt in tcp_var.h */
|
|
u_int16_t to_mss; /* maximum segment size */
|
|
u_int8_t to_wscale; /* window scaling */
|
|
|
|
u_int8_t _pad1; /* explicit pad for 64bit alignment */
|
|
u_int32_t _pad2; /* explicit pad for 64bit alignment */
|
|
u_int64_t _pad3[4]; /* TBD */
|
|
};
|
|
|
|
/*
 * Syncache event codes.
 * NOTE(review): presumably the 'event' argument given to the driver's
 * tu_syncache_event hook -- confirm against the syncache code.
 */
#define	TOE_SC_ENTRY_PRESENT	1	/* 4-tuple already present */
#define	TOE_SC_DROP		2	/* connection was timed out */
|
|
|
|
/*
|
|
* Because listen is a one-to-many relationship (a socket can be listening
|
|
* on all interfaces on a machine some of which may be using different TCP
|
|
* offload devices), listen uses a publish/subscribe mechanism. The TCP
|
|
* offload driver registers a listen notification function with the stack.
|
|
* When a listen socket is created all TCP offload devices are notified
|
|
* so that they can do the appropriate set up to offload connections on the
|
|
* port to which the socket is bound. When the listen socket is closed,
|
|
* the offload devices are notified so that they will stop listening on that
|
|
* port and free any associated resources as well as sending RSTs on any
|
|
* connections in the SYN_RCVD state.
|
|
*
|
|
*/
|
|
|
|
typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
|
|
typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
|
|
|
|
EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
|
|
EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
|
|
|
|
/*
 * Decide whether the connection on 'so' to 'nam' can be offloaded:
 *  - determine the egress interface
 *  - check that the interface is TOE capable and that TOE is enabled
 *  - check that the device has the resources to offload the connection
 */
int	tcp_offload_connect(struct socket *so, struct sockaddr *nam);
|
|
|
|
/*
|
|
* The tcp_output_* routines are wrappers around the toe_usrreqs calls
|
|
* which trigger packet transmission. In the non-offloaded case they
|
|
* translate to tcp_output. The tcp_offload_* routines notify TOE
|
|
* of specific events. In the non-offloaded case they are no-ops.
|
|
*
|
|
* Listen is a special case because it is a 1 to many relationship
|
|
* and there can be more than one offload driver in the system.
|
|
*/
|
|
|
|
/*
 * Non-zero when the connection is offloaded, i.e. TF_TOE is set in the
 * tcpcb's t_flags.  The value is the masked flag bit, not a normalized 0/1.
 */
#define	tp_offload(tp)	((tp)->t_flags & TF_TOE)
|
|
|
|
/*
|
|
* hackish way of allowing this file to also be included by TOE
|
|
* which needs to be kept ignorant of socket implementation details
|
|
*/
|
|
#ifdef _SYS_SOCKETVAR_H_
|
|
/*
 * True when the socket has not been marked as "do not offload", i.e. the
 * SO_NO_OFFLOAD bit is clear in so_options.
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (for example "p + 1" or "&sock") may be passed, not only a plain name.
 */
#define	SO_OFFLOADABLE(so)	(((so)->so_options & SO_NO_OFFLOAD) == 0)
|
|
|
|
/*
 * Start a connection on 'so' towards 'nam'.
 *
 * Unless offload support is compiled out (TCP_OFFLOAD_DISABLE), a socket
 * that is not marked SO_NO_OFFLOAD is first offered to
 * tcp_offload_connect(); if that returns 0 nothing more is done and 0 is
 * returned.  In every other case (offload compiled out, offload disabled
 * on the socket, or the offload attempt failing) tcp_output() is called to
 * start the TCP state machine and its result is returned.
 */
static __inline int
tcp_output_connect(struct socket *so, struct sockaddr *nam)
{
	struct tcpcb *tp;

	/* Look the tcpcb up before any offload attempt is made. */
	tp = sototcpcb(so);

#ifndef TCP_OFFLOAD_DISABLE
	if (SO_OFFLOADABLE(so) && tcp_offload_connect(so, nam) == 0)
		return (0);
#endif
	return (tcp_output(tp));
}
|
|
|
|
static __inline int
|
|
tcp_output_send(struct tcpcb *tp)
|
|
{
|
|
|
|
#ifndef TCP_OFFLOAD_DISABLE
|
|
if (tp_offload(tp))
|
|
return (tp->t_tu->tu_send(tp));
|
|
#endif
|
|
return (tcp_output(tp));
|
|
}
|
|
|
|
static __inline int
|
|
tcp_output_rcvd(struct tcpcb *tp)
|
|
{
|
|
|
|
#ifndef TCP_OFFLOAD_DISABLE
|
|
if (tp_offload(tp))
|
|
return (tp->t_tu->tu_rcvd(tp));
|
|
#endif
|
|
return (tcp_output(tp));
|
|
}
|
|
|
|
static __inline int
|
|
tcp_output_disconnect(struct tcpcb *tp)
|
|
{
|
|
|
|
#ifndef TCP_OFFLOAD_DISABLE
|
|
if (tp_offload(tp))
|
|
return (tp->t_tu->tu_disconnect(tp));
|
|
#endif
|
|
return (tcp_output(tp));
|
|
}
|
|
|
|
static __inline int
|
|
tcp_output_reset(struct tcpcb *tp)
|
|
{
|
|
|
|
#ifndef TCP_OFFLOAD_DISABLE
|
|
if (tp_offload(tp))
|
|
return (tp->t_tu->tu_reset(tp));
|
|
#endif
|
|
return (tcp_output(tp));
|
|
}
|
|
|
|
static __inline void
|
|
tcp_offload_detach(struct tcpcb *tp)
|
|
{
|
|
|
|
#ifndef TCP_OFFLOAD_DISABLE
|
|
if (tp_offload(tp))
|
|
tp->t_tu->tu_detach(tp);
|
|
#endif
|
|
}
|
|
|
|
static __inline void
|
|
tcp_offload_listen_open(struct tcpcb *tp)
|
|
{
|
|
|
|
#ifndef TCP_OFFLOAD_DISABLE
|
|
if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
|
|
EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
|
|
#endif
|
|
}
|
|
|
|
static __inline void
|
|
tcp_offload_listen_close(struct tcpcb *tp)
|
|
{
|
|
|
|
#ifndef TCP_OFFLOAD_DISABLE
|
|
EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
|
|
#endif
|
|
}
|
|
#undef SO_OFFLOADABLE
|
|
#endif /* _SYS_SOCKETVAR_H_ */
|
|
#undef tp_offload
|
|
|
|
/*
 * Offload entry points implemented outside this header.
 * NOTE(review): judging by the names these handle time-wait start, close
 * and drop for a connection; the implementations are not visible here --
 * confirm against the definitions.
 */
void		 tcp_offload_twstart(struct tcpcb *tp);
struct tcpcb	*tcp_offload_close(struct tcpcb *tp);
struct tcpcb	*tcp_offload_drop(struct tcpcb *tp, int error);
|
|
|
|
#endif /* _NETINET_TCP_OFFLOAD_H_ */
|