freebsd-skq/sys/netinet/tcp_offload.h
kmacy 4eae976a6a move cxgb_lt2.[ch] from NIC to TOE
move most offload functionality from NIC to TOE
factor out all socket and inpcb direct access
factor out access to locking in incpb, pcbinfo, and sockbuf
2008-04-19 03:22:43 +00:00

337 lines
11 KiB
C

/*-
* Copyright (c) 2007, Chelsio Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Neither the name of the Chelsio Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _NETINET_TCP_OFFLOAD_H_
#define _NETINET_TCP_OFFLOAD_H_
#ifndef _KERNEL
#error "no user-serviceable parts inside"
#endif
/*
* A driver publishes that it provides offload services
* by setting IFCAP_TOE in the ifnet. The offload connect
* will bypass any further work if the interface that a
* connection would use does not support TCP offload.
*
* The TOE API assumes that the tcp offload engine can offload the
* entire connection from set up to teardown, with some provision
* being made to allow the software stack to handle time wait. If
* the device does not meet these criteria, it is the driver's responsibility
* to overload the functions that it needs to in tcp_usrreqs and make
* its own calls to tcp_output if it needs to do so.
*
* There is currently no provision for the device advertising the congestion
* control algorithms it supports as there is currently no API for querying
* an operating system for the protocols that it has loaded. This is a desirable
* future extension.
*
*
*
* It is assumed that individuals deploying TOE will want connections
* to be offloaded without software changes so all connections on an
* interface providing TOE are offloaded unless the SO_NO_OFFLOAD
* flag is set on the socket.
*
*
* The toe_usrreqs structure constitutes the TOE driver's
* interface to the TCP stack for functionality that doesn't
* interact directly with userspace. If one wants to provide
* (optional) functionality to do zero-copy to/from
* userspace one still needs to override soreceive/sosend
* with functions that fault in and pin the user buffers.
*
* + tu_send
* - tells the driver that new data may have been added to the
* socket's send buffer - the driver should not fail if the
* buffer is in fact unchanged
* - the driver is responsible for providing credits (bytes in the send window)
* back to the socket by calling sbdrop() as segments are acknowledged.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_rcvd
* - returns credits to the driver and triggers window updates
* to the peer (a credit as used here is a byte in the peer's receive window)
* - the driver is expected to determine how many bytes have been
* consumed and credit that back to the card so that it can grow
* the window again by maintaining its own state between invocations.
* - In principle this could be used to shrink the window as well as
* grow the window, although it is not used for that now.
* - this function needs to correctly handle being called any number of
* times without any bytes being consumed from the receive buffer.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_disconnect
* - tells the driver to send FIN to peer
* - driver is expected to send the remaining data and then do a clean half close
* - disconnect implies at least half-close so only send, reset, and detach
* are legal
* - the driver is expected to handle transition through the shutdown
* state machine and allow the stack to support SO_LINGER.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_reset
* - closes the connection and sends a RST to peer
* - driver is expected to trigger an RST and detach the toepcb
* - no further calls are legal after reset
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* The following fields in the tcpcb are expected to be referenced by the driver:
* + iss
* + rcv_nxt
* + rcv_wnd
* + snd_isn
* + snd_max
* + snd_nxt
* + snd_una
* + t_flags
* + t_inpcb
* + t_maxseg
* + t_toe
*
* The following fields in the inpcb are expected to be referenced by the driver:
* + inp_lport
* + inp_fport
* + inp_laddr
* + inp_faddr
* + inp_socket
* + inp_ip_tos
*
* The following fields in the socket are expected to be referenced by the
* driver:
* + so_comp
* + so_error
* + so_linger
* + so_options
* + so_rcv
* + so_snd
* + so_state
* + so_timeo
*
* These functions all return 0 on success and can return the following errors
* as appropriate:
* + EPERM:
* + ENOBUFS: memory allocation failed
* + EMSGSIZE: MTU changed during the call
* + EHOSTDOWN:
* + EHOSTUNREACH:
* + ENETDOWN:
* + ENETUNREACH: the peer is no longer reachable
*
* + tu_detach
* - tells driver that the socket is going away so disconnect
* the toepcb and free appropriate resources
* - allows the driver to cleanly handle the case of connection state
* outliving the socket
* - no further calls are legal after detach
* - the driver is expected to provide its own synchronization between
* detach and receiving new data.
*
* + tu_syncache_event
* - even if it is not actually needed, the driver is expected to
* call syncache_add for the initial SYN and then syncache_expand
* for the SYN,ACK
* - tells driver that a connection either has not been added or has
* been dropped from the syncache
* - the driver is expected to maintain state that lives outside the
* software stack so the syncache needs to be able to notify the
* toe driver that the software stack is not going to create a connection
* for a received SYN
* - The driver is responsible for any synchronization required between
* the syncache dropping an entry and the driver processing the SYN,ACK.
*
*/
/*
* Entry points a TOE driver supplies to the TCP stack. The inline
* wrappers in this header reach them through tp->t_tu.
*/
struct toe_usrreqs {
/* New data may have been added to the socket's send buffer. */
int (*tu_send)(struct tcpcb *tp);
/* Return receive-window credits to the driver / trigger window updates. */
int (*tu_rcvd)(struct tcpcb *tp);
/* Send remaining data, then a FIN (clean half close). */
int (*tu_disconnect)(struct tcpcb *tp);
/* Close the connection and send a RST to the peer. */
int (*tu_reset)(struct tcpcb *tp);
/* The socket is going away: disconnect the toepcb and free resources. */
void (*tu_detach)(struct tcpcb *tp);
/*
* A connection was not added to, or was dropped from, the syncache.
* NOTE(review): `event' is presumably one of the TOE_SC_* codes and
* `toep' the driver's own per-connection state -- confirm with callers.
*/
void (*tu_syncache_event)(int event, void *toep);
};
/*
* Syncache event codes.
* NOTE(review): presumably the values passed as the `event' argument of
* tu_syncache_event -- confirm against the syncache callers.
*/
#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */
#define TOE_SC_DROP 2 /* connection was timed out */
/*
* Because listen is a one-to-many relationship (a socket can be listening
* on all interfaces on a machine some of which may be using different TCP
* offload devices), listen uses a publish/subscribe mechanism. The TCP
* offload driver registers a listen notification function with the stack.
* When a listen socket is created all TCP offload devices are notified
* so that they can do the appropriate set up to offload connections on the
* port to which the socket is bound. When the listen socket is closed,
* the offload devices are notified so that they will stop listening on that
* port and free any associated resources as well as sending RSTs on any
* connections in the SYN_RCVD state.
*
*/
/*
* Handler signatures and event declarations for the listen start/stop
* notifications. The struct tcpcb * is the one handed to
* EVENTHANDLER_INVOKE by tcp_offload_listen_open() and
* tcp_offload_listen_close().
* NOTE(review): the leading void * is presumably the argument supplied
* when the handler was registered (eventhandler(9) convention) -- confirm.
*/
typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
/*
* Check if the socket can be offloaded by the following steps:
* - determine the egress interface
* - check that the interface has TOE capability and that TOE is enabled
* - check if the device has resources to offload the connection
*
* Returns 0 on success. A non-zero return is treated by
* tcp_output_connect() as "not offloaded"; it then falls back to
* tcp_output().
*/
int tcp_offload_connect(struct socket *so, struct sockaddr *nam);
/*
* The tcp_output_* routines are wrappers around the toe_usrreqs calls
* which trigger packet transmission. In the non-offloaded case they
* translate to tcp_output. The tcp_offload_* routines notify TOE
* of specific events. In the non-offloaded case they are no-ops.
*
* Listen is a special case because it is a 1 to many relationship
* and there can be more than one offload driver in the system.
*/
/*
* Connection is offloaded: evaluates to non-zero when TF_TOE is set in
* tp->t_flags. The macro is private to this header and is #undef'd
* again at the end of the file.
*/
#define tp_offload(tp) ((tp)->t_flags & TF_TOE)
/*
* hackish way of allowing this file to also be included by TOE
* which needs to be kept ignorant of socket implementation details
*/
#ifdef _SYS_SOCKETVAR_H_
/*
* The socket has not been marked as "do not offload": evaluates to
* non-zero when SO_NO_OFFLOAD is clear in so->so_options.
*
* The argument is parenthesized so that any pointer-valued expression
* (e.g. `base + 1'), not just a plain identifier, expands correctly.
*/
#define SO_OFFLOADABLE(so) (((so)->so_options & SO_NO_OFFLOAD) == 0)
/*
 * Start a connection, offloading it when possible.
 *
 * Unless offload support is compiled out (TCP_OFFLOAD_DISABLE), an
 * offloadable socket is first handed to tcp_offload_connect(); if that
 * succeeds nothing more is done and 0 is returned.  In every other
 * case (offload compiled out, socket marked "do not offload", or the
 * offload attempt failed) tcp_output() starts the software TCP state
 * machine and its result is returned.
 */
static __inline int
tcp_output_connect(struct socket *so, struct sockaddr *nam)
{
	struct tcpcb *tp = sototcpcb(so);

#ifndef TCP_OFFLOAD_DISABLE
	/* Offload succeeded: the software stack has nothing to send. */
	if (SO_OFFLOADABLE(so) && tcp_offload_connect(so, nam) == 0)
		return (0);
#endif
	return (tcp_output(tp));
}
/*
 * Transmit path wrapper: an offloaded connection is handed to the
 * driver's tu_send hook, anything else goes through tcp_output().
 * Returns whatever the chosen routine returns.
 */
static __inline int
tcp_output_send(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	if (!tp_offload(tp))
		return (tcp_output(tp));
	return (tp->t_tu->tu_send(tp));
#else
	return (tcp_output(tp));
#endif
}
/*
 * Receive-side wrapper: an offloaded connection is handed to the
 * driver's tu_rcvd hook, anything else goes through tcp_output().
 * Returns whatever the chosen routine returns.
 */
static __inline int
tcp_output_rcvd(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	if (!tp_offload(tp))
		return (tcp_output(tp));
	return (tp->t_tu->tu_rcvd(tp));
#else
	return (tcp_output(tp));
#endif
}
/*
 * Disconnect wrapper: an offloaded connection is handed to the
 * driver's tu_disconnect hook, anything else goes through tcp_output().
 * Returns whatever the chosen routine returns.
 */
static __inline int
tcp_output_disconnect(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	if (!tp_offload(tp))
		return (tcp_output(tp));
	return (tp->t_tu->tu_disconnect(tp));
#else
	return (tcp_output(tp));
#endif
}
/*
 * Reset wrapper: an offloaded connection is handed to the driver's
 * tu_reset hook, anything else goes through tcp_output().
 * Returns whatever the chosen routine returns.
 */
static __inline int
tcp_output_reset(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	if (!tp_offload(tp))
		return (tcp_output(tp));
	return (tp->t_tu->tu_reset(tp));
#else
	return (tcp_output(tp));
#endif
}
/*
 * Notify the TOE driver (tu_detach) that the connection is being
 * detached.  Does nothing for connections that are not offloaded or
 * when offload support is compiled out.
 */
static __inline void
tcp_offload_detach(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	if (!tp_offload(tp))
		return;
	tp->t_tu->tu_detach(tp);
#endif
}
/*
 * Fire the tcp_offload_listen_start event for tp, unless the socket
 * behind it has been marked "do not offload".  Compiles to nothing
 * when offload support is disabled.
 */
static __inline void
tcp_offload_listen_open(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	struct socket *so = tp->t_inpcb->inp_socket;

	if (!SO_OFFLOADABLE(so))
		return;
	EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
#endif
}
/*
* Fire the tcp_offload_listen_stop event for tp. Unlike
* tcp_offload_listen_open() this is not gated on SO_OFFLOADABLE: the
* event is invoked for every tcpcb passed in. Compiles to nothing when
* TCP_OFFLOAD_DISABLE is defined.
*/
static __inline void
tcp_offload_listen_close(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
#endif
}
#undef SO_OFFLOADABLE
#endif /* _SYS_SOCKETVAR_H_ */
#undef tp_offload
#endif /* _NETINET_TCP_OFFLOAD_H_ */