freebsd-nq/sys/netinet/tcp_offload.h
Lawrence Stewart 237fbe0a1c Replace struct tcpopt with a proxy toeopt struct in the TOE driver interface to
the TCP syncache. This returns struct tcpopt to being private within the TCP
implementation, thus allowing it to be modified without ABI concerns.

The patch breaks the ABI. Bump __FreeBSD_version to 800103 accordingly. The cxgb
driver is the only TOE consumer affected by this change, and needs to be
recompiled along with the kernel.

Suggested by:	rwatson
Reviewed by:	rwatson, kmacy
Approved by:	re (kensmith), kensmith (mentor temporarily unavailable)
2009-07-13 11:51:02 +00:00

355 lines
12 KiB
C

/*-
* Copyright (c) 2007, Chelsio Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Neither the name of the Chelsio Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _NETINET_TCP_OFFLOAD_H_
#define _NETINET_TCP_OFFLOAD_H_
#ifndef _KERNEL
#error "no user-serviceable parts inside"
#endif
/*
* A driver publishes that it provides offload services
* by setting IFCAP_TOE in the ifnet. The offload connect
* will bypass any further work if the interface that a
* connection would use does not support TCP offload.
*
* The TOE API assumes that the tcp offload engine can offload the
* entire connection from set up to teardown, with some provision
* being made to allow the software stack to handle time wait. If
* the device does not meet these criteria, it is the driver's responsibility
* to overload the functions that it needs to in tcp_usrreqs and make
* its own calls to tcp_output if it needs to do so.
*
* There is currently no provision for the device advertising the congestion
* control algorithms it supports as there is currently no API for querying
* an operating system for the protocols that it has loaded. This is a desirable
* future extension.
*
*
*
* It is assumed that individuals deploying TOE will want connections
* to be offloaded without software changes so all connections on an
* interface providing TOE are offloaded unless the SO_NO_OFFLOAD
* flag is set on the socket.
*
*
* The toe_usrreqs structure constitutes the TOE driver's
* interface to the TCP stack for functionality that doesn't
* interact directly with userspace. If one wants to provide
* (optional) functionality to do zero-copy to/from
* userspace one still needs to override soreceive/sosend
* with functions that fault in and pin the user buffers.
*
* + tu_send
* - tells the driver that new data may have been added to the
* socket's send buffer - the driver should not fail if the
* buffer is in fact unchanged
* - the driver is responsible for providing credits (bytes in the send window)
* back to the socket by calling sbdrop() as segments are acknowledged.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_rcvd
* - returns credits to the driver and triggers window updates
* to the peer (a credit as used here is a byte in the peer's receive window)
* - the driver is expected to determine how many bytes have been
* consumed and credit that back to the card so that it can grow
* the window again by maintaining its own state between invocations.
* - In principle this could be used to shrink the window as well as
* grow the window, although it is not used for that now.
* - this function needs to correctly handle being called any number of
* times without any bytes being consumed from the receive buffer.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_disconnect
* - tells the driver to send FIN to peer
* - driver is expected to send the remaining data and then do a clean half close
* - disconnect implies at least half-close so only send, reset, and detach
* are legal
* - the driver is expected to handle transition through the shutdown
* state machine and allow the stack to support SO_LINGER.
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* + tu_reset
* - closes the connection and sends a RST to peer
* - driver is expected to trigger an RST and detach the toepcb
* - no further calls are legal after reset
* - The driver expects the inpcb lock to be held - the driver is expected
* not to drop the lock. Hence the driver is not allowed to acquire the
* pcbinfo lock during this call.
*
* The following fields in the tcpcb are expected to be referenced by the driver:
* + iss
* + rcv_nxt
* + rcv_wnd
* + snd_isn
* + snd_max
* + snd_nxt
* + snd_una
* + t_flags
* + t_inpcb
* + t_maxseg
* + t_toe
*
* The following fields in the inpcb are expected to be referenced by the driver:
* + inp_lport
* + inp_fport
* + inp_laddr
* + inp_faddr
* + inp_socket
* + inp_ip_tos
*
* The following fields in the socket are expected to be referenced by the
* driver:
* + so_comp
* + so_error
* + so_linger
* + so_options
* + so_rcv
* + so_snd
* + so_state
* + so_timeo
*
* These functions all return 0 on success and can return the following errors
* as appropriate:
* + EPERM:
* + ENOBUFS: memory allocation failed
* + EMSGSIZE: MTU changed during the call
* + EHOSTDOWN:
* + EHOSTUNREACH:
* + ENETDOWN:
* + ENETUNREACH: the peer is no longer reachable
*
* + tu_detach
* - tells driver that the socket is going away so disconnect
* the toepcb and free appropriate resources
* - allows the driver to cleanly handle the case of connection state
* outliving the socket
* - no further calls are legal after detach
* - the driver is expected to provide its own synchronization between
* detach and receiving new data.
*
* + tu_syncache_event
* - even if it is not actually needed, the driver is expected to
* call syncache_add for the initial SYN and then syncache_expand
* for the SYN,ACK
* - tells driver that a connection either has not been added or has
* been dropped from the syncache
* - the driver is expected to maintain state that lives outside the
* software stack so the syncache needs to be able to notify the
* toe driver that the software stack is not going to create a connection
* for a received SYN
* - The driver is responsible for any synchronization required between
* the syncache dropping an entry and the driver processing the SYN,ACK.
*
*/
/*
* Callback table a TOE driver supplies to the TCP stack. For offloaded
* connections the inline wrappers later in this header reach these
* methods through tp->t_tu. The int-returning methods return 0 on
* success or an errno value.
*/
struct toe_usrreqs {
/* New data may have been added to the send buffer; must tolerate no change. */
int (*tu_send)(struct tcpcb *tp);
/* Return receive-window credits to the driver and trigger window updates. */
int (*tu_rcvd)(struct tcpcb *tp);
/* Send remaining data, then FIN (clean half close). */
int (*tu_disconnect)(struct tcpcb *tp);
/* Send RST and close; no further calls are legal afterwards. */
int (*tu_reset)(struct tcpcb *tp);
/* Socket is going away: disconnect the toepcb and free driver resources. */
void (*tu_detach)(struct tcpcb *tp);
/* Syncache did not add, or has dropped, the entry associated with toep. */
void (*tu_syncache_event)(int event, void *toep);
};
/*
* Proxy for struct tcpopt between TOE drivers and TCP functions.
* Drivers fill in this structure instead of struct tcpopt, so the real
* tcpopt layout can stay private to the TCP implementation. The explicit
* pad members keep the layout 64-bit aligned; do not reorder or resize
* fields without treating it as an ABI change.
*/
struct toeopt {
u_int64_t to_flags; /* option flags; see tcpopt in tcp_var.h */
u_int16_t to_mss; /* maximum segment size */
u_int8_t to_wscale; /* window scaling */
u_int8_t _pad1; /* explicit pad for 64bit alignment */
u_int32_t _pad2; /* explicit pad for 64bit alignment */
u_int64_t _pad3[4]; /* unused; purpose still to be decided (TBD) */
};
/*
* Syncache event codes.
* NOTE(review): presumably these are the values passed as the `event'
* argument of tu_syncache_event - confirm against the syncache code.
*/
#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */
#define TOE_SC_DROP 2 /* connection was timed out */
/*
* Because listen is a one-to-many relationship (a socket can be listening
* on all interfaces on a machine some of which may be using different TCP
* offload devices), listen uses a publish/subscribe mechanism. The TCP
* offload driver registers a listen notification function with the stack.
* When a listen socket is created all TCP offload devices are notified
* so that they can do the appropriate set up to offload connections on the
* port to which the socket is bound. When the listen socket is closed,
* the offload devices are notified so that they will stop listening on that
* port and free any associated resources as well as sending RSTs on any
* connections in the SYN_RCVD state.
*
*/
/*
* Handler signatures and event declarations for the listen notifications.
* The struct tcpcb * is the listening connection's tcpcb, as passed by
* tcp_offload_listen_open() and tcp_offload_listen_close() in this header.
* NOTE(review): the leading void * is presumably the argument given when
* the handler was registered (eventhandler(9) convention) - confirm.
*/
typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
/*
* Check if the socket can be offloaded by the following steps:
* - determine the egress interface
* - check that the interface has TOE capability and that TOE is enabled
* - check if the device has resources to offload the connection
*
* A non-zero return is treated by tcp_output_connect() as "not offloaded"
* and makes it fall back to tcp_output().
*/
int tcp_offload_connect(struct socket *so, struct sockaddr *nam);
/*
* The tcp_output_* routines are wrappers around the toe_usrreqs calls
* which trigger packet transmission. In the non-offloaded case they
* translate to tcp_output. The tcp_offload_* routines notify TOE
* of specific events. In the non-offloaded case they are no-ops.
*
* Listen is a special case because it is a 1 to many relationship
* and there can be more than one offload driver in the system.
*/
/*
* Connection is offloaded.
* Evaluates to the TF_TOE bit of tp->t_flags, i.e. non-zero (not
* necessarily 1) when the connection is offloaded. The macro is private
* to this header and is #undef'd before the header ends.
*/
#define tp_offload(tp) ((tp)->t_flags & TF_TOE)
/*
* hackish way of allowing this file to also be included by TOE
* which needs to be kept ignorant of socket implementation details
*/
#ifdef _SYS_SOCKETVAR_H_
/*
 * The socket has not been marked as "do not offload".
 * Evaluates to 1 when SO_NO_OFFLOAD is clear in so->so_options, 0 otherwise.
 * The argument is parenthesized so that any pointer expression (for example
 * `&sock' or `base + 1') expands correctly.
 */
#define	SO_OFFLOADABLE(so)	((((so)->so_options) & SO_NO_OFFLOAD) == 0)
/*
 * Start an outgoing connection on `so'.
 *
 * Unless TCP_OFFLOAD_DISABLE is defined, an offloadable socket is first
 * offered to tcp_offload_connect(); if that succeeds (returns 0) nothing
 * more is done here.  In every other case - offload compiled out, socket
 * marked as not offloadable, or the offload attempt failed - tcp_output()
 * is called to start the software TCP state machine and its result is
 * returned.
 */
static __inline int
tcp_output_connect(struct socket *so, struct sockaddr *nam)
{
	struct tcpcb *tp = sototcpcb(so);

#ifndef TCP_OFFLOAD_DISABLE
	/* Offload accepted the connection: the software path is not needed. */
	if (SO_OFFLOADABLE(so) && tcp_offload_connect(so, nam) == 0)
		return (0);
#endif
	return (tcp_output(tp));
}
static __inline int
tcp_output_send(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
return (tp->t_tu->tu_send(tp));
#endif
return (tcp_output(tp));
}
static __inline int
tcp_output_rcvd(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
return (tp->t_tu->tu_rcvd(tp));
#endif
return (tcp_output(tp));
}
static __inline int
tcp_output_disconnect(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
return (tp->t_tu->tu_disconnect(tp));
#endif
return (tcp_output(tp));
}
static __inline int
tcp_output_reset(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
if (tp_offload(tp))
return (tp->t_tu->tu_reset(tp));
#endif
return (tcp_output(tp));
}
/*
 * Tell the TOE driver that the socket behind tp is going away.
 * Only offloaded connections have driver state to release, so this is a
 * no-op for software connections and when TCP_OFFLOAD_DISABLE is defined.
 */
static __inline void
tcp_offload_detach(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	if (!tp_offload(tp))
		return;
	tp->t_tu->tu_detach(tp);
#endif
}
/*
 * Announce a new listen socket to the registered TOE drivers by invoking
 * the tcp_offload_listen_start event handlers with tp.  Sockets flagged
 * SO_NO_OFFLOAD are not announced.  Compiles to nothing when
 * TCP_OFFLOAD_DISABLE is defined.
 */
static __inline void
tcp_offload_listen_open(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
	struct socket *lso = tp->t_inpcb->inp_socket;

	if (!SO_OFFLOADABLE(lso))
		return;
	EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
#endif
}
/*
* Announce the closing of a listen socket to the registered TOE drivers
* by invoking the tcp_offload_listen_stop event handlers with tp.
* Compiles to nothing when TCP_OFFLOAD_DISABLE is defined.
*
* NOTE(review): unlike tcp_offload_listen_open(), there is no
* SO_OFFLOADABLE check here, so handlers can see a stop event for a
* socket they never received a start event for - confirm drivers
* tolerate this.
*/
static __inline void
tcp_offload_listen_close(struct tcpcb *tp)
{
#ifndef TCP_OFFLOAD_DISABLE
EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
#endif
}
#undef SO_OFFLOADABLE
#endif /* _SYS_SOCKETVAR_H_ */
#undef tp_offload
/*
* Entry points implemented outside this header.
* NOTE(review): the definitions are not visible here; judging by the names
* these are presumably the time-wait/close/drop operations exposed to TOE
* drivers (the header comment mentions letting the software stack handle
* time wait) - confirm against the implementation file.
*/
void tcp_offload_twstart(struct tcpcb *tp);
struct tcpcb *tcp_offload_close(struct tcpcb *tp);
struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error);
#endif /* _NETINET_TCP_OFFLOAD_H_ */