freebsd-nq/sys/netinet/tcp_offload.h

355 lines
12 KiB
C
Raw Normal View History

/*-
* Copyright (c) 2007, Chelsio Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Neither the name of the Chelsio Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _NETINET_TCP_OFFLOAD_H_
#define _NETINET_TCP_OFFLOAD_H_
#ifndef _KERNEL
#error "no user-serviceable parts inside"
#endif
/*
* A driver publishes that it provides offload services
* by setting IFCAP_TOE in the ifnet. The offload connect
* will bypass any further work if the interface that a
* connection would use does not support TCP offload.
*
* The TOE API assumes that the tcp offload engine can offload the
* the entire connection from set up to teardown, with some provision
* being made to allowing the software stack to handle time wait. If
* the device does not meet these criteria, it is the driver's responsibility
* to overload the functions that it needs to in tcp_usrreqs and make
* its own calls to tcp_output if it needs to do so.
*
* There is currently no provision for the device advertising the congestion
* control algorithms it supports as there is currently no API for querying
* an operating system for the protocols that it has loaded. This is a desirable
* future extension.
*
*
*
* It is assumed that individuals deploying TOE will want connections
* to be offloaded without software changes so all connections on an
* interface providing TOE are offloaded unless the the SO_NO_OFFLOAD
* flag is set on the socket.
*
*
* The toe_usrreqs structure constitutes the TOE driver's
* interface to the TCP stack for functionality that doesn't
* interact directly with userspace. If one wants to provide
* (optional) functionality to do zero-copy to/from
* userspace one still needs to override soreceive/sosend
* with functions that fault in and pin the user buffers.
*
* + tu_send
* - tells the driver that new data may have been added to the
* socket's send buffer - the driver should not fail if the
* buffer is in fact unchanged
* - the driver is responsible for providing credits (bytes in the send window)
* back to the socket by calling sbdrop() as segments are acknowledged.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_rcvd
* - returns credits to the driver and triggers window updates
* to the peer (a credit as used here is a byte in the peer's receive window)
* - the driver is expected to determine how many bytes have been
* consumed and credit that back to the card so that it can grow
* the window again by maintaining its own state between invocations.
* - In principle this could be used to shrink the window as well as
* grow the window, although it is not used for that now.
* - this function needs to correctly handle being called any number of
* times without any bytes being consumed from the receive buffer.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_disconnect
* - tells the driver to send FIN to peer
* - driver is expected to send the remaining data and then do a clean half close
* - disconnect implies at least half-close so only send, reset, and detach
* are legal
* - the driver is expected to handle transition through the shutdown
* state machine and allow the stack to support SO_LINGER.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_reset
* - closes the connection and sends a RST to peer
* - driver is expectd to trigger an RST and detach the toepcb
* - no further calls are legal after reset
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* The following fields in the tcpcb are expected to be referenced by the driver:
* + iss
* + rcv_nxt
* + rcv_wnd
* + snd_isn
* + snd_max
* + snd_nxt
* + snd_una
* + t_flags
* + t_inpcb
* + t_maxseg
* + t_toe
*
* The following fields in the inpcb are expected to be referenced by the driver:
* + inp_lport
* + inp_fport
* + inp_laddr
* + inp_fport
* + inp_socket
* + inp_ip_tos
*
* The following fields in the socket are expected to be referenced by the
* driver:
* + so_comp
* + so_error
* + so_linger
* + so_options
* + so_rcv
* + so_snd
* + so_state
* + so_timeo
*
* These functions all return 0 on success and can return the following errors
* as appropriate:
* + EPERM:
* + ENOBUFS: memory allocation failed
* + EMSGSIZE: MTU changed during the call
* + EHOSTDOWN:
* + EHOSTUNREACH:
* + ENETDOWN:
* * ENETUNREACH: the peer is no longer reachable
*
* + tu_detach
* - tells driver that the socket is going away so disconnect
* the toepcb and free appropriate resources
* - allows the driver to cleanly handle the case of connection state
* outliving the socket
* - no further calls are legal after detach
* - the driver is expected to provide its own synchronization between
* detach and receiving new data.
*
* + tu_syncache_event
* - even if it is not actually needed, the driver is expected to
* call syncache_add for the initial SYN and then syncache_expand
* for the SYN,ACK
* - tells driver that a connection either has not been added or has
* been dropped from the syncache
* - the driver is expected to maintain state that lives outside the
* software stack so the syncache needs to be able to notify the
* toe driver that the software stack is not going to create a connection
* for a received SYN
* - The driver is responsible for any synchronization required between
* the syncache dropping an entry and the driver processing the SYN,ACK.
*
*/
struct toe_usrreqs {
int (*tu_send)(struct tcpcb *tp);
int (*tu_rcvd)(struct tcpcb *tp);
int (*tu_disconnect)(struct tcpcb *tp);
int (*tu_reset)(struct tcpcb *tp);
void (*tu_detach)(struct tcpcb *tp);
void (*tu_syncache_event)(int event, void *toep);
};
/*
* Proxy for struct tcpopt between TOE drivers and TCP functions.
*/
struct toeopt {
u_int64_t to_flags; /* see tcpopt in tcp_var.h */
u_int16_t to_mss; /* maximum segment size */
u_int8_t to_wscale; /* window scaling */
u_int8_t _pad1; /* explicit pad for 64bit alignment */
u_int32_t _pad2; /* explicit pad for 64bit alignment */
u_int64_t _pad3[4]; /* TBD */
};
#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */
#define TOE_SC_DROP 2 /* connection was timed out */
/*
* Because listen is a one-to-many relationship (a socket can be listening
* on all interfaces on a machine some of which may be using different TCP
* offload devices), listen uses a publish/subscribe mechanism. The TCP
* offload driver registers a listen notification function with the stack.
* When a listen socket is created all TCP offload devices are notified
* so that they can do the appropriate set up to offload connections on the
* port to which the socket is bound. When the listen socket is closed,
* the offload devices are notified so that they will stop listening on that
* port and free any associated resources as well as sending RSTs on any
* connections in the SYN_RCVD state.
*
*/
typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
/*
* Check if the socket can be offloaded by the following steps:
* - determine the egress interface
* - check the interface for TOE capability and TOE is enabled
* - check if the device has resources to offload the connection
*/
int tcp_offload_connect(struct socket *so, struct sockaddr *nam);
/*
* The tcp_output_* routines are wrappers around the toe_usrreqs calls
* which trigger packet transmission. In the non-offloaded case they
* translate to tcp_output. The tcp_offload_* routines notify TOE
* of specific events. I the non-offloaded case they are no-ops.
*
* Listen is a special case because it is a 1 to many relationship
* and there can be more than one offload driver in the system.
*/
/*
* Connection is offloaded
*/
#define tp_offload(tp) ((tp)->t_flags & TF_TOE)
/*
* hackish way of allowing this file to also be included by TOE
* which needs to be kept ignorant of socket implementation details
*/
#ifdef _SYS_SOCKETVAR_H_
/*
* The socket has not been marked as "do not offload"
*/
#define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0)
static __inline int
tcp_output_connect(struct socket *so, struct sockaddr *nam)
{
struct tcpcb *tp = sototcpcb(so);
int error;
/*
* If offload has been disabled for this socket or the
* connection cannot be offloaded just call tcp_output
* to start the TCP state machine.
*/
#ifndef TCP_OFFLOAD_DISABLE
if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0)
#endif
error = tcp_output(tp);
return (error);
}
static __inline int
tcp_output_send(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
return (tp->t_tu->tu_send(tp));
#endif
return (tcp_output(tp));
}
static __inline int
tcp_output_rcvd(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
return (tp->t_tu->tu_rcvd(tp));
#endif
return (tcp_output(tp));
}
static __inline int
tcp_output_disconnect(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
return (tp->t_tu->tu_disconnect(tp));
#endif
return (tcp_output(tp));
}
static __inline int
tcp_output_reset(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
return (tp->t_tu->tu_reset(tp));
#endif
return (tcp_output(tp));
}
static __inline void
tcp_offload_detach(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
tp->t_tu->tu_detach(tp);
#endif
}
static __inline void
tcp_offload_listen_open(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
#endif
}
static __inline void
tcp_offload_listen_close(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
#endif
}
#undef SO_OFFLOADABLE
#endif /* _SYS_SOCKETVAR_H_ */
#undef tp_offload
void tcp_offload_twstart(struct tcpcb *tp);
struct tcpcb *tcp_offload_close(struct tcpcb *tp);
struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error);
#endif /* _NETINET_TCP_OFFLOAD_H_ */