freebsd-nq/sys/netipx/spx_usrreq.c

2078 lines
49 KiB
C
Raw Normal View History

/*-
* Copyright (c) 1984, 1985, 1986, 1987, 1993
* The Regents of the University of California.
* Copyright (c) 1995, Mike Mitchell
* Copyright (c) 2004-2006 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)spx_usrreq.h
*/
2003-06-11 05:37:42 +00:00
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/systm.h>
#include <net/route.h>
#include <netinet/tcp_fsm.h>
#include <netipx/ipx.h>
#include <netipx/ipx_pcb.h>
#include <netipx/ipx_var.h>
#include <netipx/spx.h>
#include <netipx/spx_debug.h>
#include <netipx/spx_timer.h>
#include <netipx/spx_var.h>
/*
* SPX protocol implementation.
*/
static struct mtx spx_mtx; /* Protects only spx_iss. */
1998-02-09 06:11:36 +00:00
static u_short spx_iss;
static u_short spx_newchecks[50];
static int spx_hardnosed;
static int spx_use_delack = 0;
static int traceallspxs = 0;
static struct spx_istat spx_istat;
static int spxrexmtthresh = 3;
#define SPX_LOCK_INIT() mtx_init(&spx_mtx, "spx_mtx", NULL, MTX_DEF)
#define SPX_LOCK() mtx_lock(&spx_mtx)
#define SPX_UNLOCK() mtx_unlock(&spx_mtx)
/* Following was struct spxstat spxstat; */
2005-01-02 15:13:59 +00:00
#ifndef spxstat
#define spxstat spx_istat.newstats
2005-01-02 15:13:59 +00:00
#endif
2004-07-12 19:35:29 +00:00
static const int spx_backoff[SPX_MAXRXTSHIFT+1] =
{ 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
/*
 * Forward declarations: protocol-internal helpers operating on a
 * struct spxpcb.
 */
static void	spx_close(struct spxpcb *cb);
static void	spx_disconnect(struct spxpcb *cb);
static void	spx_drop(struct spxpcb *cb, int errno);
static int	spx_output(struct spxpcb *cb, struct mbuf *m0);
static int	spx_reass(struct spxpcb *cb, struct spx *si);
static void	spx_setpersist(struct spxpcb *cb);
static void	spx_template(struct spxpcb *cb);
static void	spx_timers(struct spxpcb *cb, int timer);
static void	spx_usrclosed(struct spxpcb *cb);

/*
 * Forward declarations: socket-layer entry points referenced from the
 * pr_usrreqs tables below.
 */
static void	spx_usr_abort(struct socket *so);
static int	spx_accept(struct socket *so, struct sockaddr **nam);
static int	spx_attach(struct socket *so, int proto, struct thread *td);
static int	spx_bind(struct socket *so, struct sockaddr *nam,
		    struct thread *td);
static void	spx_usr_close(struct socket *so);
static int	spx_connect(struct socket *so, struct sockaddr *nam,
		    struct thread *td);
static void	spx_detach(struct socket *so);
static void	spx_pcbdetach(struct ipxpcb *ipxp);
static int	spx_usr_disconnect(struct socket *so);
static int	spx_listen(struct socket *so, int backlog, struct thread *td);
static int	spx_rcvd(struct socket *so, int flags);
static int	spx_rcvoob(struct socket *so, struct mbuf *m, int flags);
static int	spx_send(struct socket *so, int flags, struct mbuf *m,
		    struct sockaddr *addr, struct mbuf *control,
		    struct thread *td);
static int	spx_shutdown(struct socket *so);
static int	spx_sp_attach(struct socket *so, int proto, struct thread *td);
/*
 * User-request dispatch table whose attach handler is spx_attach().
 * Address and ioctl requests are delegated to the generic IPX handlers.
 */
struct pr_usrreqs spx_usrreqs = {
	/* Socket lifetime. */
	.pru_attach =		spx_attach,
	.pru_detach =		spx_detach,
	.pru_close =		spx_usr_close,
	.pru_abort =		spx_usr_abort,

	/* Connection management. */
	.pru_bind =		spx_bind,
	.pru_listen =		spx_listen,
	.pru_accept =		spx_accept,
	.pru_connect =		spx_connect,
	.pru_disconnect =	spx_usr_disconnect,
	.pru_shutdown =		spx_shutdown,

	/* Data transfer. */
	.pru_send =		spx_send,
	.pru_rcvd =		spx_rcvd,
	.pru_rcvoob =		spx_rcvoob,

	/* Handled by the IPX layer. */
	.pru_control =		ipx_control,
	.pru_peeraddr =		ipx_peeraddr,
	.pru_sockaddr =		ipx_sockaddr,
};
/*
 * User-request dispatch table identical to spx_usrreqs except that
 * attach goes through spx_sp_attach().
 */
struct pr_usrreqs spx_usrreq_sps = {
	/* Socket lifetime. */
	.pru_attach =		spx_sp_attach,
	.pru_detach =		spx_detach,
	.pru_close =		spx_usr_close,
	.pru_abort =		spx_usr_abort,

	/* Connection management. */
	.pru_bind =		spx_bind,
	.pru_listen =		spx_listen,
	.pru_accept =		spx_accept,
	.pru_connect =		spx_connect,
	.pru_disconnect =	spx_usr_disconnect,
	.pru_shutdown =		spx_shutdown,

	/* Data transfer. */
	.pru_send =		spx_send,
	.pru_rcvd =		spx_rcvd,
	.pru_rcvoob =		spx_rcvoob,

	/* Handled by the IPX layer. */
	.pru_control =		ipx_control,
	.pru_peeraddr =		ipx_peeraddr,
	.pru_sockaddr =		ipx_sockaddr,
};
/*
 * Protocol initialisation: set up the mutex guarding spx_iss and give
 * spx_iss its starting value.
 */
void
spx_init(void)
{

	SPX_LOCK_INIT();

	/* WRONG !! should fish it out of TODR */
	spx_iss = 1;
}
/*
 * Handle one received SPX packet (m) that was matched to the pcb ipxp.
 *
 * The caller holds the pcb list lock and the ipxp lock; both are
 * released on every return path.  The sequence, ack and allocation
 * fields of the header are byte-swapped in place.  If the receiving
 * socket is listening, a new socket is created with sonewconn() and
 * processing continues on the new socket's pcb.  After the connection
 * state machine has run, the packet is handed to spx_reass() and, if
 * anything needs to be sent, spx_output() is called.
 */
void
spx_input(struct mbuf *m, struct ipxpcb *ipxp)
{
	struct spxpcb *cb;
	struct spx *si = mtod(m, struct spx *);
	struct socket *so;
	struct spx spx_savesi;		/* header snapshot for spx_trace() */
	int dropsocket = 0;		/* nonzero while new socket is tentative */
	short ostate = 0;		/* state before processing, for tracing */

	spxstat.spxs_rcvtotal++;
	KASSERT(ipxp != NULL, ("spx_input: ipxpcb == NULL"));

	/*
	 * spx_input() assumes that the caller will hold both the pcb list
	 * lock and also the ipxp lock.  spx_input() will release both before
	 * returning, and may in fact trade in the ipxp lock for another pcb
	 * lock following sonewconn().
	 */
	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(ipxp);

	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_input: cb == NULL"));

	/*
	 * NOTE(review): this jump reaches the trace call at "drop" before
	 * spx_savesi has been assigned -- confirm spx_trace() tolerates
	 * that when tracing is enabled.
	 */
	if (ipxp->ipxp_flags & IPXP_DROPPED)
		goto drop;

	/* Make the whole SPX header contiguous in the first mbuf. */
	if (m->m_len < sizeof(*si)) {
		if ((m = m_pullup(m, sizeof(*si))) == NULL) {
			/*
			 * No m_freem() here; presumably m_pullup() has
			 * already disposed of the chain on failure.
			 */
			IPX_UNLOCK(ipxp);
			IPX_LIST_UNLOCK();
			spxstat.spxs_rcvshort++;
			return;
		}
		si = mtod(m, struct spx *);
	}

	/* Convert the fields compared below to host byte order, in place. */
	si->si_seq = ntohs(si->si_seq);
	si->si_ack = ntohs(si->si_ack);
	si->si_alo = ntohs(si->si_alo);

	so = ipxp->ipxp_socket;
	KASSERT(so != NULL, ("spx_input: so == NULL"));

	/* Snapshot state and header so the trace shows the "before" view. */
	if (so->so_options & SO_DEBUG || traceallspxs) {
		ostate = cb->s_state;
		spx_savesi = *si;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		struct spxpcb *ocb = cb;	/* listening socket's spxpcb */

		so = sonewconn(so, 0);
		if (so == NULL)
			goto drop;

		/*
		 * This is ugly, but ....
		 *
		 * Mark socket as temporary until we're committed to keeping
		 * it.  The code at ``drop'' and ``dropwithreset'' check the
		 * flag dropsocket to see if the temporary socket created
		 * here should be discarded.  We mark the socket as
		 * discardable until we're committed to it below in
		 * TCPS_LISTEN.
		 *
		 * XXXRW: In the new world order of real kernel parallelism,
		 * temporarily allocating the socket when we're "not sure"
		 * seems like a bad idea, as we might race to remove it if
		 * the listen socket is closed...?
		 *
		 * We drop the lock of the listen socket ipxp, and acquire
		 * the lock of the new socket ipxp.
		 */
		dropsocket++;
		IPX_UNLOCK(ipxp);
		ipxp = (struct ipxpcb *)so->so_pcb;
		IPX_LOCK(ipxp);
		ipxp->ipxp_laddr = si->si_dna;
		cb = ipxtospxpcb(ipxp);
		cb->s_mtu = ocb->s_mtu;		/* preserve sockopts */
		cb->s_flags = ocb->s_flags;	/* preserve sockopts */
		cb->s_flags2 = ocb->s_flags2;	/* preserve sockopts */
		cb->s_state = TCPS_LISTEN;
	}
	IPX_LOCK_ASSERT(ipxp);

	/*
	 * Packet received on connection.  Reset idle time and keep-alive
	 * timer.
	 */
	cb->s_idle = 0;
	cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;

	/*
	 * Connection state machine.  States without a case label fall
	 * straight through to the common processing after the switch.
	 */
	switch (cb->s_state) {
	case TCPS_LISTEN:{
		struct sockaddr_ipx *sipx, ssipx;
		struct ipx_addr laddr;

		/*
		 * If somebody here was carrying on a conversation and went
		 * away, and his pen pal thinks he can still talk, we get the
		 * misdirected packet.
		 */
		if (spx_hardnosed && (si->si_did != 0 || si->si_seq != 0)) {
			spx_istat.gonawy++;
			goto dropwithreset;
		}

		/* Build the peer address from the packet's source address. */
		sipx = &ssipx;
		bzero(sipx, sizeof *sipx);
		sipx->sipx_len = sizeof(*sipx);
		sipx->sipx_family = AF_IPX;
		sipx->sipx_addr = si->si_sna;

		/* Remember the local address so it can be restored on failure. */
		laddr = ipxp->ipxp_laddr;
		if (ipx_nullhost(laddr))
			ipxp->ipxp_laddr = si->si_dna;
		if (ipx_pcbconnect(ipxp, (struct sockaddr *)sipx, &thread0)) {
			ipxp->ipxp_laddr = laddr;
			spx_istat.noconn++;
			goto drop;
		}
		spx_template(cb);
		dropsocket = 0;		/* committed to socket */
		cb->s_did = si->si_sid;
		cb->s_rack = si->si_ack;
		cb->s_ralo = si->si_alo;
		/*
		 * With THREEWAYSHAKE defined (it always is, just below) the
		 * LISTEN case ends here in SYN_RECEIVED and a separate
		 * SYN_RECEIVED case follows; without it the LISTEN case
		 * would run straight on into the "established" code.
		 */
#define THREEWAYSHAKE
#ifdef THREEWAYSHAKE
		cb->s_state = TCPS_SYN_RECEIVED;
		cb->s_force = 1 + SPXT_KEEP;
		spxstat.spxs_accepts++;
		cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;
		}
		break;

	case TCPS_SYN_RECEIVED: {
		/*
		 * This state means that we have heard a response to our
		 * acceptance of their connection.  It is probably logically
		 * unnecessary in this implementation.
		 */
		if (si->si_did != cb->s_sid) {
			spx_istat.wrncon++;
			goto drop;
		}
#endif
		ipxp->ipxp_fport = si->si_sport;
		cb->s_timer[SPXT_REXMT] = 0;
		cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;
		soisconnected(so);
		cb->s_state = TCPS_ESTABLISHED;
		spxstat.spxs_accepts++;
		}
		break;

	case TCPS_SYN_SENT:
		/*
		 * This state means that we have gotten a response to our
		 * attempt to establish a connection.  We fill in the data
		 * from the other side, telling us which port to respond to,
		 * instead of the well-known one we might have sent to in the
		 * first place.  We also require that this is a response to
		 * our connection id.
		 */
		if (si->si_did != cb->s_sid) {
			spx_istat.notme++;
			goto drop;
		}
		spxstat.spxs_connects++;
		cb->s_did = si->si_sid;
		cb->s_rack = si->si_ack;
		cb->s_ralo = si->si_alo;
		cb->s_dport = ipxp->ipxp_fport = si->si_sport;
		cb->s_timer[SPXT_REXMT] = 0;
		cb->s_flags |= SF_ACKNOW;
		soisconnected(so);
		cb->s_state = TCPS_ESTABLISHED;

		/*
		 * Use roundtrip time of connection request for initial rtt.
		 */
		if (cb->s_rtt) {
			cb->s_srtt = cb->s_rtt << 3;
			cb->s_rttvar = cb->s_rtt << 1;
			SPXT_RANGESET(cb->s_rxtcur,
			    ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1,
			    SPXTV_MIN, SPXTV_REXMTMAX);
			cb->s_rtt = 0;
		}
	}

	if (so->so_options & SO_DEBUG || traceallspxs)
		spx_trace(SA_INPUT, (u_char)ostate, cb, &spx_savesi, 0);

	/* Step the mbuf past the leading struct ipx before queueing. */
	m->m_len -= sizeof(struct ipx);
	m->m_pkthdr.len -= sizeof(struct ipx);
	m->m_data += sizeof(struct ipx);

	/* A nonzero return means spx_reass() did not keep the packet. */
	if (spx_reass(cb, si))
		m_freem(m);

	/* Send if an ack/probe is forced or window/retransmit state changed. */
	if (cb->s_force || (cb->s_flags & (SF_ACKNOW|SF_WIN|SF_RXT)))
		spx_output(cb, NULL);
	cb->s_flags &= ~(SF_WIN|SF_RXT);
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return;

dropwithreset:
	/*
	 * Discard the packet and, if a socket was tentatively created for
	 * it, take that socket off the listener's incomplete queue and
	 * abort it.
	 */
	IPX_LOCK_ASSERT(ipxp);
	if (cb == NULL || (cb->s_ipxpcb->ipxp_socket->so_options & SO_DEBUG ||
	    traceallspxs))
		spx_trace(SA_DROP, (u_char)ostate, cb, &spx_savesi, 0);
	IPX_UNLOCK(ipxp);
	if (dropsocket) {
		struct socket *head;

		ACCEPT_LOCK();
		KASSERT((so->so_qstate & SQ_INCOMP) != 0,
		    ("spx_input: nascent socket not SQ_INCOMP on soabort()"));
		head = so->so_head;
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
		ACCEPT_UNLOCK();
		soabort(so);
	}
	IPX_LIST_UNLOCK();
	m_freem(dtom(si));
	return;

drop:
	/*
	 * Discard the packet.
	 *
	 * NOTE(review): unlike dropwithreset, this path never looks at
	 * dropsocket, yet the ipx_pcbconnect() failure in TCPS_LISTEN jumps
	 * here while dropsocket may still be set -- confirm whether the
	 * tentative socket is cleaned up elsewhere.
	 */
	IPX_LOCK_ASSERT(ipxp);
	if (cb->s_ipxpcb->ipxp_socket->so_options & SO_DEBUG || traceallspxs)
		spx_trace(SA_DROP, (u_char)ostate, cb, &spx_savesi, 0);
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	m_freem(m);
}
/*
 * This is structurally similar to the tcp reassembly routine but its
 * function is somewhat different:  It merely queues packets up, and
 * suppresses duplicates.
 *
 * Processes the ack/window information carried by si, inserts the
 * packet into cb's sequence-ordered queue, and moves every in-order
 * queued packet to the socket receive buffer.  Passing SI(0) skips
 * straight to the "present queued data" step.
 *
 * Returns 1 when the packet was not kept (spx_input() then frees the
 * mbuf) and 0 when it was queued or has already been freed here.
 * The ipxp lock of cb must be held.
 */
static int
spx_reass(struct spxpcb *cb, struct spx *si)
{
	struct spx_q *q;
	struct mbuf *m;
	struct socket *so = cb->s_ipxpcb->ipxp_socket;
	char packetp = cb->s_flags & SF_HI;	/* headers wanted on input */
	int incr;
	char wakeup = 0;			/* set once data was delivered */

	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	if (si == SI(0))
		goto present;

	/*
	 * Update our news from them.
	 */
	if (si->si_cc & SPX_SA)
		cb->s_flags |= (spx_use_delack ? SF_DELACK : SF_ACKNOW);
	if (SSEQ_GT(si->si_alo, cb->s_ralo))
		cb->s_flags |= SF_WIN;
	if (SSEQ_LEQ(si->si_ack, cb->s_rack)) {
		/* The ack does not advance what the peer already acked. */
		if ((si->si_cc & SPX_SP) && cb->s_rack != (cb->s_smax + 1)) {
			spxstat.spxs_rcvdupack++;

			/*
			 * If this is a completely duplicate ack and other
			 * conditions hold, we assume a packet has been
			 * dropped and retransmit it exactly as in
			 * tcp_input().
			 */
			if (si->si_ack != cb->s_rack ||
			    si->si_alo != cb->s_ralo)
				cb->s_dupacks = 0;
			else if (++cb->s_dupacks == spxrexmtthresh) {
				u_short onxt = cb->s_snxt;
				int cwnd = cb->s_cwnd;

				/*
				 * Resend from the acked point with a
				 * one-unit congestion window, then restore
				 * s_snxt and halve the saved window.
				 */
				cb->s_snxt = si->si_ack;
				cb->s_cwnd = CUNIT;
				cb->s_force = 1 + SPXT_REXMT;
				spx_output(cb, NULL);
				cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;
				cb->s_rtt = 0;
				if (cwnd >= 4 * CUNIT)
					cb->s_cwnd = cwnd / 2;
				if (SSEQ_GT(onxt, cb->s_snxt))
					cb->s_snxt = onxt;
				return (1);
			}
		} else
			cb->s_dupacks = 0;
		goto update_window;
	}
	cb->s_dupacks = 0;

	/*
	 * If our correspondent acknowledges data we haven't sent TCP would
	 * drop the packet after acking.  We'll be a little more permissive.
	 */
	if (SSEQ_GT(si->si_ack, (cb->s_smax + 1))) {
		spxstat.spxs_rcvacktoomuch++;
		si->si_ack = cb->s_smax + 1;
	}
	spxstat.spxs_rcvackpack++;

	/*
	 * If transmit timer is running and timed sequence number was acked,
	 * update smoothed round trip time.  See discussion of algorithm in
	 * tcp_input.c
	 */
	if (cb->s_rtt && SSEQ_GT(si->si_ack, cb->s_rtseq)) {
		spxstat.spxs_rttupdated++;
		if (cb->s_srtt != 0) {
			short delta;

			/* s_srtt is kept scaled by 8 (see the << 3 below). */
			delta = cb->s_rtt - (cb->s_srtt >> 3);
			if ((cb->s_srtt += delta) <= 0)
				cb->s_srtt = 1;
			if (delta < 0)
				delta = -delta;
			delta -= (cb->s_rttvar >> 2);
			if ((cb->s_rttvar += delta) <= 0)
				cb->s_rttvar = 1;
		} else {
			/*
			 * No rtt measurement yet.
			 */
			cb->s_srtt = cb->s_rtt << 3;
			cb->s_rttvar = cb->s_rtt << 1;
		}
		cb->s_rtt = 0;
		cb->s_rxtshift = 0;
		SPXT_RANGESET(cb->s_rxtcur,
		    ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1,
		    SPXTV_MIN, SPXTV_REXMTMAX);
	}

	/*
	 * If all outstanding data is acked, stop retransmit timer and
	 * remember to restart (more output or persist).  If there is more
	 * data to be acked, restart retransmit timer, using current
	 * (possibly backed-off) value;
	 */
	if (si->si_ack == cb->s_smax + 1) {
		cb->s_timer[SPXT_REXMT] = 0;
		cb->s_flags |= SF_RXT;
	} else if (cb->s_timer[SPXT_PERSIST] == 0)
		cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;

	/*
	 * When new data is acked, open the congestion window.  If the window
	 * gives us less than ssthresh packets in flight, open exponentially
	 * (maxseg at a time).  Otherwise open linearly (maxseg^2 / cwnd at a
	 * time).
	 */
	incr = CUNIT;
	if (cb->s_cwnd > cb->s_ssthresh)
		incr = max(incr * incr / cb->s_cwnd, 1);
	cb->s_cwnd = min(cb->s_cwnd + incr, cb->s_cwmx);

	/*
	 * Trim Acked data from output queue.
	 */
	SOCKBUF_LOCK(&so->so_snd);
	while ((m = so->so_snd.sb_mb) != NULL) {
		if (SSEQ_LT((mtod(m, struct spx *))->si_seq, si->si_ack))
			sbdroprecord_locked(&so->so_snd);
		else
			break;
	}
	/* No explicit unlock follows; presumably done by sowwakeup_locked(). */
	sowwakeup_locked(so);
	cb->s_rack = si->si_ack;
update_window:
	if (SSEQ_LT(cb->s_snxt, cb->s_rack))
		cb->s_snxt = cb->s_rack;

	/*
	 * Accept the window information if the packet is newer than the one
	 * that last updated the window (by sequence, then by ack), or if it
	 * carries the same ack with a larger allocation.
	 */
	if (SSEQ_LT(cb->s_swl1, si->si_seq) || ((cb->s_swl1 == si->si_seq &&
	    (SSEQ_LT(cb->s_swl2, si->si_ack))) ||
	     (cb->s_swl2 == si->si_ack && SSEQ_LT(cb->s_ralo, si->si_alo)))) {
		/* keep track of pure window updates */
		if ((si->si_cc & SPX_SP) && cb->s_swl2 == si->si_ack
		    && SSEQ_LT(cb->s_ralo, si->si_alo)) {
			spxstat.spxs_rcvwinupd++;
			spxstat.spxs_rcvdupack--;
		}
		cb->s_ralo = si->si_alo;
		cb->s_swl1 = si->si_seq;
		cb->s_swl2 = si->si_ack;
		cb->s_swnd = (1 + si->si_alo - si->si_ack);
		if (cb->s_swnd > cb->s_smxw)
			cb->s_smxw = cb->s_swnd;
		cb->s_flags |= SF_WIN;
	}

	/*
	 * If this packet number is higher than that which we have allocated
	 * refuse it, unless urgent.
	 */
	if (SSEQ_GT(si->si_seq, cb->s_alo)) {
		if (si->si_cc & SPX_SP) {
			spxstat.spxs_rcvwinprobe++;
			return (1);
		} else
			spxstat.spxs_rcvpackafterwin++;
		if (si->si_cc & SPX_OB) {
			/* Urgent data is tolerated up to 60 past s_alo. */
			if (SSEQ_GT(si->si_seq, cb->s_alo + 60)) {
				m_freem(dtom(si));
				return (0);
			} /* else queue this packet; */
		} else {
#ifdef BROKEN
			/*
			 * XXXRW: This is broken on at least one count:
			 * spx_close() will free the ipxp and related parts,
			 * which are then touched by spx_input() after the
			 * return from spx_reass().
			 */
			/*struct socket *so = cb->s_ipxpcb->ipxp_socket;
			if (so->so_state && SS_NOFDREF) {
				spx_close(cb);
			} else
				       would crash system*/
#endif
			spx_istat.notyet++;
			/* Freed here, so report 0 to stop the caller freeing it. */
			m_freem(dtom(si));
			return (0);
		}
	}

	/*
	 * If this is a system packet, we don't need to queue it up, and
	 * won't update acknowledge #.
	 */
	if (si->si_cc & SPX_SP)
		return (1);

	/*
	 * We have already seen this packet, so drop.
	 */
	if (SSEQ_LT(si->si_seq, cb->s_ack)) {
		spx_istat.bdreas++;
		spxstat.spxs_rcvduppack++;
		if (si->si_seq == cb->s_ack - 1)
			spx_istat.lstdup++;
		return (1);
	}

	/*
	 * Loop through all packets queued up to insert in appropriate
	 * sequence.
	 */
	for (q = cb->s_q.si_next; q != &cb->s_q; q = q->si_next) {
		if (si->si_seq == SI(q)->si_seq) {
			spxstat.spxs_rcvduppack++;
			return (1);
		}
		if (SSEQ_LT(si->si_seq, SI(q)->si_seq)) {
			spxstat.spxs_rcvoopack++;
			break;
		}
	}
	/* q is the first entry with a higher sequence (or the queue head). */
	insque(si, q->si_prev);

	/*
	 * If this packet is urgent, inform process
	 */
	if (si->si_cc & SPX_OB) {
		cb->s_iobc = ((char *)si)[1 + sizeof(*si)];
		sohasoutofband(so);
		cb->s_oobflags |= SF_IOOB;
	}
present:
#define SPINC sizeof(struct spxhdr)
	SOCKBUF_LOCK(&so->so_rcv);

	/*
	 * Loop through all packets queued up to update acknowledge number,
	 * and present all acknowledged data to user; if in packet interface
	 * mode, show packet headers.
	 */
	for (q = cb->s_q.si_next; q != &cb->s_q; q = q->si_next) {
		if (SI(q)->si_seq == cb->s_ack) {
			cb->s_ack++;
			m = dtom(q);
			if (SI(q)->si_cc & SPX_OB) {
				cb->s_oobflags &= ~SF_IOOB;
				if (so->so_rcv.sb_cc)
					so->so_oobmark = so->so_rcv.sb_cc;
				else
					so->so_rcv.sb_state |= SBS_RCVATMARK;
			}
			/*
			 * Step back before unlinking so the loop's
			 * q = q->si_next continues from a live entry.
			 */
			q = q->si_prev;
			remque(q->si_next);
			wakeup = 1;
			spxstat.spxs_rcvpack++;
#ifdef SF_NEWCALL
			if (cb->s_flags2 & SF_NEWCALL) {
				struct spxhdr *sp = mtod(m, struct spxhdr *);
				u_char dt = sp->spx_dt;

				spx_newchecks[4]++;
				/*
				 * When the datastream type changes, prepend
				 * an MT_CONTROL mbuf carrying the new type.
				 */
				if (dt != cb->s_rhdr.spx_dt) {
					struct mbuf *mm =
					   m_getclr(M_DONTWAIT, MT_CONTROL);
					spx_newchecks[0]++;
					if (mm != NULL) {
						u_short *s =
							mtod(mm, u_short *);
						cb->s_rhdr.spx_dt = dt;
						mm->m_len = 5; /*XXX*/
						s[0] = 5;
						s[1] = 1;
						*(u_char *)(&s[2]) = dt;
						sbappend_locked(&so->so_rcv, mm);
					}
				}
				if (sp->spx_cc & SPX_OB) {
					MCHTYPE(m, MT_OOBDATA);
					spx_newchecks[1]++;
					so->so_oobmark = 0;
					so->so_rcv.sb_state &= ~SBS_RCVATMARK;
				}
				/* Strip the SPX header unless headers are wanted. */
				if (packetp == 0) {
					m->m_data += SPINC;
					m->m_len -= SPINC;
					m->m_pkthdr.len -= SPINC;
				}
				if ((sp->spx_cc & SPX_EM) || packetp) {
					sbappendrecord_locked(&so->so_rcv, m);
					spx_newchecks[9]++;
				} else
					sbappend_locked(&so->so_rcv, m);
			} else
#endif
			if (packetp)
				sbappendrecord_locked(&so->so_rcv, m);
			else {
				/* Remember the last header, then strip it. */
				cb->s_rhdr = *mtod(m, struct spxhdr *);
				m->m_data += SPINC;
				m->m_len -= SPINC;
				m->m_pkthdr.len -= SPINC;
				sbappend_locked(&so->so_rcv, m);
			}
		} else
			break;
	}
	/* Only the else branch unlocks; presumably sorwakeup_locked() does too. */
	if (wakeup)
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (0);
}
/*
 * Control-input hook for SPX.  All three arguments are ignored; the
 * routine is a placeholder.
 */
void
spx_ctlinput(int cmd, struct sockaddr *arg_as_sa, void *dummy)
{

	/* Currently, nothing. */
}
static int
spx_output(struct spxpcb *cb, struct mbuf *m0)
{
struct socket *so = cb->s_ipxpcb->ipxp_socket;
struct mbuf *m;
struct spx *si = NULL;
struct sockbuf *sb = &so->so_snd;
int len = 0, win, rcv_win;
short span, off, recordp = 0;
u_short alo;
int error = 0, sendalot;
#ifdef notdef
int idle;
#endif
struct mbuf *mprev;
IPX_LOCK_ASSERT(cb->s_ipxpcb);
if (m0 != NULL) {
int mtu = cb->s_mtu;
int datalen;
/*
* Make sure that packet isn't too big.
*/
for (m = m0; m != NULL; m = m->m_next) {
mprev = m;
len += m->m_len;
if (m->m_flags & M_EOR)
recordp = 1;
}
datalen = (cb->s_flags & SF_HO) ?
len - sizeof(struct spxhdr) : len;
if (datalen > mtu) {
if (cb->s_flags & SF_PI) {
m_freem(m0);
return (EMSGSIZE);
} else {
int oldEM = cb->s_cc & SPX_EM;
cb->s_cc &= ~SPX_EM;
while (len > mtu) {
m = m_copym(m0, 0, mtu, M_DONTWAIT);
if (m == NULL) {
cb->s_cc |= oldEM;
m_freem(m0);
return (ENOBUFS);
}
if (cb->s_flags & SF_NEWCALL) {
struct mbuf *mm = m;
spx_newchecks[7]++;
while (mm != NULL) {
mm->m_flags &= ~M_EOR;
mm = mm->m_next;
}
}
error = spx_output(cb, m);
if (error) {
cb->s_cc |= oldEM;
m_freem(m0);
return (error);
}
m_adj(m0, mtu);
len -= mtu;
}
cb->s_cc |= oldEM;
}
}
/*
* Force length even, by adding a "garbage byte" if
* necessary.
*/
if (len & 1) {
m = mprev;
if (M_TRAILINGSPACE(m) >= 1)
m->m_len++;
else {
struct mbuf *m1 = m_get(M_DONTWAIT, MT_DATA);
if (m1 == NULL) {
m_freem(m0);
return (ENOBUFS);
}
m1->m_len = 1;
*(mtod(m1, u_char *)) = 0;
m->m_next = m1;
}
}
m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m == NULL) {
m_freem(m0);
return (ENOBUFS);
}
/*
* Fill in mbuf with extended SP header and addresses and
* length put into network format.
*/
MH_ALIGN(m, sizeof(struct spx));
m->m_len = sizeof(struct spx);
m->m_next = m0;
si = mtod(m, struct spx *);
si->si_i = *cb->s_ipx;
si->si_s = cb->s_shdr;
if ((cb->s_flags & SF_PI) && (cb->s_flags & SF_HO)) {
struct spxhdr *sh;
if (m0->m_len < sizeof(*sh)) {
if((m0 = m_pullup(m0, sizeof(*sh))) == NULL) {
m_free(m);
m_freem(m0);
return (EINVAL);
}
m->m_next = m0;
}
sh = mtod(m0, struct spxhdr *);
si->si_dt = sh->spx_dt;
si->si_cc |= sh->spx_cc & SPX_EM;
m0->m_len -= sizeof(*sh);
m0->m_data += sizeof(*sh);
len -= sizeof(*sh);
}
len += sizeof(*si);
if ((cb->s_flags2 & SF_NEWCALL) && recordp) {
si->si_cc |= SPX_EM;
spx_newchecks[8]++;
}
if (cb->s_oobflags & SF_SOOB) {
/*
* Per jqj@cornell: Make sure OB packets convey
* exactly 1 byte. If the packet is 1 byte or
* larger, we have already guaranted there to be at
* least one garbage byte for the checksum, and extra
* bytes shouldn't hurt!
*/
if (len > sizeof(*si)) {
si->si_cc |= SPX_OB;
len = (1 + sizeof(*si));
}
}
si->si_len = htons((u_short)len);
m->m_pkthdr.len = ((len - 1) | 1) + 1;
/*
* Queue stuff up for output.
*/
sbappendrecord(sb, m);
cb->s_seq++;
}
#ifdef notdef
idle = (cb->s_smax == (cb->s_rack - 1));
#endif
again:
sendalot = 0;
off = cb->s_snxt - cb->s_rack;
win = min(cb->s_swnd, (cb->s_cwnd / CUNIT));
/*
* If in persist timeout with window of 0, send a probe. Otherwise,
* if window is small but nonzero and timer expired, send what we can
* and go into transmit state.
*/
if (cb->s_force == 1 + SPXT_PERSIST) {
if (win != 0) {
cb->s_timer[SPXT_PERSIST] = 0;
cb->s_rxtshift = 0;
}
}
span = cb->s_seq - cb->s_rack;
len = min(span, win) - off;
if (len < 0) {
/*
* Window shrank after we went into it. If window shrank to
* 0, cancel pending restransmission and pull s_snxt back to
* (closed) window. We will enter persist state below. If
* the widndow didn't close completely, just wait for an ACK.
*/
len = 0;
if (win == 0) {
cb->s_timer[SPXT_REXMT] = 0;
cb->s_snxt = cb->s_rack;
}
}
if (len > 1)
sendalot = 1;
rcv_win = sbspace(&so->so_rcv);
/*
* Send if we owe peer an ACK.
*/
if (cb->s_oobflags & SF_SOOB) {
/*
* Must transmit this out of band packet.
*/
cb->s_oobflags &= ~ SF_SOOB;
sendalot = 1;
spxstat.spxs_sndurg++;
goto found;
}
if (cb->s_flags & SF_ACKNOW)
goto send;
if (cb->s_state < TCPS_ESTABLISHED)
goto send;
/*
* Silly window can't happen in spx. Code from TCP deleted.
*/
if (len)
goto send;
/*
* Compare available window to amount of window known to peer (as
* advertised window less next expected input.) If the difference is
* at least two packets or at least 35% of the mximum possible
* window, then want to send a window update to peer.
*/
if (rcv_win > 0) {
u_short delta = 1 + cb->s_alo - cb->s_ack;
int adv = rcv_win - (delta * cb->s_mtu);
2005-01-02 15:13:59 +00:00
if ((so->so_rcv.sb_cc == 0 && adv >= (2 * cb->s_mtu)) ||
(100 * adv / so->so_rcv.sb_hiwat >= 35)) {
spxstat.spxs_sndwinup++;
cb->s_flags |= SF_ACKNOW;
goto send;
}
}
/*
* Many comments from tcp_output.c are appropriate here including ...
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state. If
* nothing happens soon, send when timer expires: if window is
* nonzero, transmit what we can, otherwise send a probe.
*/
if (so->so_snd.sb_cc && cb->s_timer[SPXT_REXMT] == 0 &&
cb->s_timer[SPXT_PERSIST] == 0) {
cb->s_rxtshift = 0;
spx_setpersist(cb);
}
/*
* No reason to send a packet, just return.
*/
cb->s_outx = 1;
return (0);
send:
/*
* Find requested packet.
*/
si = 0;
if (len > 0) {
cb->s_want = cb->s_snxt;
for (m = sb->sb_mb; m != NULL; m = m->m_act) {
si = mtod(m, struct spx *);
if (SSEQ_LEQ(cb->s_snxt, si->si_seq))
break;
}
found:
if (si != NULL) {
if (si->si_seq == cb->s_snxt)
cb->s_snxt++;
else
spxstat.spxs_sndvoid++, si = 0;
}
}
/*
* Update window.
*/
if (rcv_win < 0)
rcv_win = 0;
alo = cb->s_ack - 1 + (rcv_win / ((short)cb->s_mtu));
2005-01-02 15:13:59 +00:00
if (SSEQ_LT(alo, cb->s_alo))
alo = cb->s_alo;
if (si != NULL) {
/*
* Must make a copy of this packet for ipx_output to monkey
* with.
*/
m = m_copy(dtom(si), 0, (int)M_COPYALL);
if (m == NULL)
return (ENOBUFS);
si = mtod(m, struct spx *);
if (SSEQ_LT(si->si_seq, cb->s_smax))
spxstat.spxs_sndrexmitpack++;
else
spxstat.spxs_sndpack++;
} else if (cb->s_force || cb->s_flags & SF_ACKNOW) {
/*
* Must send an acknowledgement or a probe.
*/
if (cb->s_force)
spxstat.spxs_sndprobe++;
if (cb->s_flags & SF_ACKNOW)
spxstat.spxs_sndacks++;
m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m == NULL)
return (ENOBUFS);
/*
* Fill in mbuf with extended SP header and addresses and
* length put into network format.
*/
MH_ALIGN(m, sizeof(struct spx));
m->m_len = sizeof(*si);
m->m_pkthdr.len = sizeof(*si);
si = mtod(m, struct spx *);
si->si_i = *cb->s_ipx;
si->si_s = cb->s_shdr;
si->si_seq = cb->s_smax + 1;
si->si_len = htons(sizeof(*si));
si->si_cc |= SPX_SP;
} else {
cb->s_outx = 3;
if (so->so_options & SO_DEBUG || traceallspxs)
spx_trace(SA_OUTPUT, cb->s_state, cb, si, 0);
return (0);
}
/*
* Stuff checksum and output datagram.
*/
if ((si->si_cc & SPX_SP) == 0) {
if (cb->s_force != (1 + SPXT_PERSIST) ||
cb->s_timer[SPXT_PERSIST] == 0) {
/*
2005-01-02 15:13:59 +00:00
* If this is a new packet and we are not currently
* timing anything, time this one.
*/
if (SSEQ_LT(cb->s_smax, si->si_seq)) {
cb->s_smax = si->si_seq;
if (cb->s_rtt == 0) {
spxstat.spxs_segstimed++;
cb->s_rtseq = si->si_seq;
cb->s_rtt = 1;
}
}
/*
* Set rexmt timer if not currently set, initial
* value for retransmit timer is smoothed round-trip
* time + 2 * round-trip time variance. Initialize
* shift counter which is used for backoff of
* retransmit time.
*/
if (cb->s_timer[SPXT_REXMT] == 0 &&
cb->s_snxt != cb->s_rack) {
cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;
if (cb->s_timer[SPXT_PERSIST]) {
cb->s_timer[SPXT_PERSIST] = 0;
cb->s_rxtshift = 0;
}
}
} else if (SSEQ_LT(cb->s_smax, si->si_seq))
cb->s_smax = si->si_seq;
} else if (cb->s_state < TCPS_ESTABLISHED) {
if (cb->s_rtt == 0)
cb->s_rtt = 1; /* Time initial handshake */
if (cb->s_timer[SPXT_REXMT] == 0)
cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;
}
/*
* Do not request acks when we ack their data packets or when we do a
* gratuitous window update.
*/
if (((si->si_cc & SPX_SP) == 0) || cb->s_force)
si->si_cc |= SPX_SA;
si->si_seq = htons(si->si_seq);
si->si_alo = htons(alo);
si->si_ack = htons(cb->s_ack);
if (ipxcksum)
si->si_sum = ipx_cksum(m, ntohs(si->si_len));
else
si->si_sum = 0xffff;
cb->s_outx = 4;
if (so->so_options & SO_DEBUG || traceallspxs)
spx_trace(SA_OUTPUT, cb->s_state, cb, si, 0);
if (so->so_options & SO_DONTROUTE)
error = ipx_outputfl(m, NULL, IPX_ROUTETOIF);
else
error = ipx_outputfl(m, &cb->s_ipxpcb->ipxp_route, 0);
if (error)
return (error);
spxstat.spxs_sndtotal++;
/*
* Data sent (as far as we can tell). If this advertises a larger
* window than any other segment, then remember the size of the
* advertized window. Any pending ACK has now been sent.
*/
cb->s_force = 0;
cb->s_flags &= ~(SF_ACKNOW|SF_DELACK);
if (SSEQ_GT(alo, cb->s_alo))
cb->s_alo = alo;
if (sendalot)
goto again;
cb->s_outx = 5;
return (0);
}
/*
 * When nonzero, spx_setpersist() panics if it is called while the
 * retransmit timer is still running.
 */
static int spx_do_persist_panics = 0;

/*
 * Arm (or re-arm) the persist timer of cb.
 *
 * The interval is derived from the smoothed round-trip time and its
 * variance, multiplied by the spx_backoff[] entry for the current
 * shift, and clamped to [SPXTV_PERSMIN, SPXTV_PERSMAX].  The shift is
 * then advanced, saturating at SPX_MAXRXTSHIFT.  The ipxp lock of cb
 * must be held.
 */
static void
spx_setpersist(struct spxpcb *cb)
{
	int t = ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1;

	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	if (cb->s_timer[SPXT_REXMT] && spx_do_persist_panics)
		panic("spx_output REXMT");

	/*
	 * Start/restart persistence timer.
	 */
	SPXT_RANGESET(cb->s_timer[SPXT_PERSIST],
	    t*spx_backoff[cb->s_rxtshift],
	    SPXTV_PERSMIN, SPXTV_PERSMAX);
	if (cb->s_rxtshift < SPX_MAXRXTSHIFT)
		cb->s_rxtshift++;
}
/*
 * Get or set an SPX-level socket option.
 *
 * Options at any level other than IPXPROTO_SPX are passed on to
 * ipx_ctloutput().  Returns 0 on success, ECONNRESET if the pcb has
 * been dropped, ENOPROTOOPT for an unknown option name, EINVAL when a
 * header option is set on a socket without SF_PI, or the error from
 * sooptcopyin()/sooptcopyout().
 *
 * Locking: the ipxp lock is taken exactly once on entry.  It is always
 * released before copying to or from the caller, and re-taken around
 * the update on the set paths that modify flags or default headers.
 */
int
spx_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct spxhdr spxhdr;
	struct ipxpcb *ipxp;
	struct spxpcb *cb;
	int mask, error;
	short soptval;
	u_short usoptval;
	int optval;

	ipxp = sotoipxpcb(so);
	KASSERT(ipxp != NULL, ("spx_ctloutput: ipxp == NULL"));

	/*
	 * This will have to be changed when we do more general stacking of
	 * protocols.
	 */
	if (sopt->sopt_level != IPXPROTO_SPX)
		return (ipx_ctloutput(so, sopt));

	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		IPX_UNLOCK(ipxp);
		return (ECONNRESET);
	}

	/*
	 * The lock taken above is still held here; every path below
	 * releases it exactly once before copying to or from the caller.
	 */
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_ctloutput: cb == NULL"));

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		/* Snapshot the value under the lock, then copy out unlocked. */
		switch (sopt->sopt_name) {
		case SO_HEADERS_ON_INPUT:
			mask = SF_HI;
			goto get_flags;

		case SO_HEADERS_ON_OUTPUT:
			mask = SF_HO;
		get_flags:
			soptval = cb->s_flags & mask;
			IPX_UNLOCK(ipxp);
			error = sooptcopyout(sopt, &soptval,
			    sizeof(soptval));
			break;

		case SO_MTU:
			usoptval = cb->s_mtu;
			IPX_UNLOCK(ipxp);
			error = sooptcopyout(sopt, &usoptval,
			    sizeof(usoptval));
			break;

		case SO_LAST_HEADER:
			spxhdr = cb->s_rhdr;
			IPX_UNLOCK(ipxp);
			error = sooptcopyout(sopt, &spxhdr, sizeof(spxhdr));
			break;

		case SO_DEFAULT_HEADERS:
			spxhdr = cb->s_shdr;
			IPX_UNLOCK(ipxp);
			error = sooptcopyout(sopt, &spxhdr, sizeof(spxhdr));
			break;

		default:
			IPX_UNLOCK(ipxp);
			error = ENOPROTOOPT;
		}
		break;

	case SOPT_SET:
		/*
		 * XXX Why are these shorts on get and ints on set?  That
		 * doesn't make any sense...
		 *
		 * XXXRW: Note, when we re-acquire the ipxp lock, we should
		 * re-check that it's not dropped.
		 */
		IPX_UNLOCK(ipxp);
		switch (sopt->sopt_name) {
		case SO_HEADERS_ON_INPUT:
			mask = SF_HI;
			goto set_head;

		case SO_HEADERS_ON_OUTPUT:
			mask = SF_HO;
		set_head:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				break;

			/* Header options are only valid with SF_PI set. */
			IPX_LOCK(ipxp);
			if (cb->s_flags & SF_PI) {
				if (optval)
					cb->s_flags |= mask;
				else
					cb->s_flags &= ~mask;
			} else error = EINVAL;
			IPX_UNLOCK(ipxp);
			break;

		case SO_MTU:
			error = sooptcopyin(sopt, &usoptval, sizeof usoptval,
			    sizeof usoptval);
			if (error)
				break;
			/* Unlocked write. */
			cb->s_mtu = usoptval;
			break;

#ifdef SF_NEWCALL
		case SO_NEWCALL:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				break;
			IPX_LOCK(ipxp);
			if (optval) {
				cb->s_flags2 |= SF_NEWCALL;
				spx_newchecks[5]++;
			} else {
				cb->s_flags2 &= ~SF_NEWCALL;
				spx_newchecks[6]++;
			}
			IPX_UNLOCK(ipxp);
			break;
#endif

		case SO_DEFAULT_HEADERS:
			{
				struct spxhdr sp;

				error = sooptcopyin(sopt, &sp, sizeof sp,
				    sizeof sp);
				if (error)
					break;
				/* Only the type and the EM bit are taken. */
				IPX_LOCK(ipxp);
				cb->s_dt = sp.spx_dt;
				cb->s_cc = sp.spx_cc & SPX_EM;
				IPX_UNLOCK(ipxp);
			}
			break;

		default:
			error = ENOPROTOOPT;
		}
		break;

	default:
		panic("spx_ctloutput: bad socket option direction");
	}
	return (error);
}
/*
 * pru_abort handler: drop the connection behind so with ECONNABORTED.
 * The pcb list lock is taken before the per-pcb lock, and released
 * after it.
 */
static void
spx_usr_abort(struct socket *so)
{
	struct ipxpcb *pcb = sotoipxpcb(so);
	struct spxpcb *spxcb;

	KASSERT(pcb != NULL, ("spx_usr_abort: ipxp == NULL"));

	spxcb = ipxtospxpcb(pcb);
	KASSERT(spxcb != NULL, ("spx_usr_abort: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(pcb);

	spx_drop(spxcb, ECONNABORTED);

	IPX_UNLOCK(pcb);
	IPX_LIST_UNLOCK();
}
/*
* Accept a connection. Essentially all the work is done at higher levels;
* just return the address of the peer, storing through addr.
*/
static int
spx_accept(struct socket *so, struct sockaddr **nam)
{
struct ipxpcb *ipxp;
struct sockaddr_ipx *sipx, ssipx;
ipxp = sotoipxpcb(so);
KASSERT(ipxp != NULL, ("spx_accept: ipxp == NULL"));
sipx = &ssipx;
bzero(sipx, sizeof *sipx);
sipx->sipx_len = sizeof *sipx;
sipx->sipx_family = AF_IPX;
IPX_LOCK(ipxp);
sipx->sipx_addr = ipxp->ipxp_faddr;
IPX_UNLOCK(ipxp);
*nam = sodupsockaddr((struct sockaddr *)sipx, M_WAITOK);
return (0);
}
static int
spx_attach(struct socket *so, int proto, struct thread *td)
{
struct ipxpcb *ipxp;
struct spxpcb *cb;
struct mbuf *mm;
struct sockbuf *sb;
int error;
ipxp = sotoipxpcb(so);
KASSERT(ipxp == NULL, ("spx_attach: ipxp != NULL"));
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, (u_long) 3072, (u_long) 3072);
if (error)
return (error);
}
MALLOC(cb, struct spxpcb *, sizeof *cb, M_PCB, M_NOWAIT | M_ZERO);
if (cb == NULL)
return (ENOBUFS);
mm = m_getclr(M_DONTWAIT, MT_DATA);
if (mm == NULL) {
FREE(cb, M_PCB);
return (ENOBUFS);
}
IPX_LIST_LOCK();
error = ipx_pcballoc(so, &ipxpcb_list, td);
if (error) {
IPX_LIST_UNLOCK();
m_free(mm);
FREE(cb, M_PCB);
return (error);
}
ipxp = sotoipxpcb(so);
ipxp->ipxp_flags |= IPXP_SPX;
cb->s_ipx = mtod(mm, struct ipx *);
cb->s_state = TCPS_LISTEN;
cb->s_smax = -1;
cb->s_swl1 = -1;
cb->s_q.si_next = cb->s_q.si_prev = &cb->s_q;
cb->s_ipxpcb = ipxp;
cb->s_mtu = 576 - sizeof(struct spx);
sb = &so->so_snd;
cb->s_cwnd = sbspace(sb) * CUNIT / cb->s_mtu;
cb->s_ssthresh = cb->s_cwnd;
cb->s_cwmx = sbspace(sb) * CUNIT / (2 * sizeof(struct spx));
/*
* Above is recomputed when connecting to account for changed
* buffering or mtu's.
*/
cb->s_rtt = SPXTV_SRTTBASE;
cb->s_rttvar = SPXTV_SRTTDFLT << 2;
SPXT_RANGESET(cb->s_rxtcur,
((SPXTV_SRTTBASE >> 2) + (SPXTV_SRTTDFLT << 2)) >> 1,
SPXTV_MIN, SPXTV_REXMTMAX);
2005-01-02 15:13:59 +00:00
ipxp->ipxp_pcb = (caddr_t)cb;
IPX_LIST_UNLOCK();
return (0);
}
/*
 * Release the SPX-specific state attached to an ipxpcb: free every mbuf
 * still linked on the control block's s_q queue, free the mbuf holding the
 * header template (cb->s_ipx), free the control block itself, and clear
 * ipxp->ipxp_pcb.  The caller must hold the pcb lock.
 */
static void
spx_pcbdetach(struct ipxpcb *ipxp)
{
	struct spxpcb *cb;
	struct spx_q *s;

	IPX_LOCK_ASSERT(ipxp);

	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_pcbdetach: cb == NULL"));

	/*
	 * Drain the queue by repeatedly unlinking the element at the front.
	 * The cursor must not be advanced before unlinking: doing so skips
	 * (and leaks) the first element and, on the last pass, unlinks and
	 * frees the queue head that is embedded in cb rather than in an mbuf.
	 */
	while ((s = cb->s_q.si_next) != &cb->s_q) {
		remque(s);
		m_freem(dtom(s));
	}

	m_free(dtom(cb->s_ipx));
	FREE(cb, M_PCB);
	ipxp->ipxp_pcb = NULL;
}
static int
spx_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
2005-01-02 15:13:59 +00:00
{
struct ipxpcb *ipxp;
int error;
ipxp = sotoipxpcb(so);
KASSERT(ipxp != NULL, ("spx_bind: ipxp == NULL"));
IPX_LIST_LOCK();
IPX_LOCK(ipxp);
if (ipxp->ipxp_flags & IPXP_DROPPED) {
error = EINVAL;
goto out;
}
error = ipx_pcbbind(ipxp, nam, td);
out:
IPX_UNLOCK(ipxp);
IPX_LIST_UNLOCK();
return (error);
2005-01-02 15:13:59 +00:00
}
/*
 * Close handler for an SPX socket.  A control block whose state is beyond
 * TCPS_LISTEN goes through spx_disconnect(); any other state is handed
 * straight to spx_close().
 */
static void
spx_usr_close(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;

	KASSERT(ipxp != NULL, ("spx_usr_close: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_usr_close: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (cb->s_state <= TCPS_LISTEN)
		spx_close(cb);
	else
		spx_disconnect(cb);
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
}
/*
* Initiate connection to peer. Enter SYN_SENT state, and mark socket as
* connecting. Start keep-alive timer, setup prototype header, send initial
* system packet requesting connection.
*/
static int
spx_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct ipxpcb *ipxp;
struct spxpcb *cb;
int error;
ipxp = sotoipxpcb(so);
KASSERT(ipxp != NULL, ("spx_connect: ipxp == NULL"));
cb = ipxtospxpcb(ipxp);
KASSERT(cb != NULL, ("spx_connect: cb == NULL"));
IPX_LIST_LOCK();
IPX_LOCK(ipxp);
if (ipxp->ipxp_flags & IPXP_DROPPED) {
error = EINVAL;
goto spx_connect_end;
}
if (ipxp->ipxp_lport == 0) {
error = ipx_pcbbind(ipxp, NULL, td);
if (error)
goto spx_connect_end;
}
error = ipx_pcbconnect(ipxp, nam, td);
if (error)
goto spx_connect_end;
soisconnecting(so);
spxstat.spxs_connattempt++;
cb->s_state = TCPS_SYN_SENT;
cb->s_did = 0;
spx_template(cb);
cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;
cb->s_force = 1 + SPXTV_KEEP;
/*
* Other party is required to respond to the port I send from, but he
* is not required to answer from where I am sending to, so allow
* wildcarding. Original port I am sending to is still saved in
* cb->s_dport.
*/
ipxp->ipxp_fport = 0;
error = spx_output(cb, NULL);
spx_connect_end:
IPX_UNLOCK(ipxp);
IPX_LIST_UNLOCK();
return (error);
}
/*
 * Detach handler: tear down the SPX control block with spx_pcbdetach()
 * and then release the ipxpcb with ipx_pcbfree().
 *
 * The pcb lock acquired here is not unlocked in this function; presumably
 * ipx_pcbfree() disposes of it together with the pcb -- TODO confirm.
 *
 * XXXRW: Should assert appropriately detached.
 */
static void
spx_detach(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;

	KASSERT(ipxp != NULL, ("spx_detach: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_detach: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	spx_pcbdetach(ipxp);
	ipx_pcbfree(ipxp);
	IPX_LIST_UNLOCK();
}
/*
 * Disconnect handler.  If connection-closing handshaking is ever
 * implemented at the spx level, this is the hook for it.  For now it calls
 * spx_disconnect() unless the pcb has already been dropped, in which case
 * EINVAL is returned.
 */
static int
spx_usr_disconnect(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error;

	KASSERT(ipxp != NULL, ("spx_usr_disconnect: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_usr_disconnect: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
	} else {
		spx_disconnect(cb);
		error = 0;
	}
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return (error);
}
static int
spx_listen(struct socket *so, int backlog, struct thread *td)
{
int error;
struct ipxpcb *ipxp;
struct spxpcb *cb;
error = 0;
ipxp = sotoipxpcb(so);
KASSERT(ipxp != NULL, ("spx_listen: ipxp == NULL"));
cb = ipxtospxpcb(ipxp);
KASSERT(cb != NULL, ("spx_listen: cb == NULL"));
IPX_LIST_LOCK();
IPX_LOCK(ipxp);
if (ipxp->ipxp_flags & IPXP_DROPPED) {
error = EINVAL;
goto out;
}
In the current world order, solisten() implements the state transition of a socket from a regular socket to a listening socket able to accept new connections. As part of this state transition, solisten() calls into the protocol to update protocol-layer state. There were several bugs in this implementation that could result in a race wherein a TCP SYN received in the interval between the protocol state transition and the shortly following socket layer transition would result in a panic in the TCP code, as the socket would be in the TCPS_LISTEN state, but the socket would not have the SO_ACCEPTCONN flag set. This change does the following: - Pushes the socket state transition from the socket layer solisten() to to socket "library" routines called from the protocol. This permits the socket routines to be called while holding the protocol mutexes, preventing a race exposing the incomplete socket state transition to TCP after the TCP state transition has completed. The check for a socket layer state transition is performed by solisten_proto_check(), and the actual transition is performed by solisten_proto(). - Holds the socket lock for the duration of the socket state test and set, and over the protocol layer state transition, which is now possible as the socket lock is acquired by the protocol layer, rather than vice versa. This prevents additional state related races in the socket layer. This permits the dual transition of socket layer and protocol layer state to occur while holding locks for both layers, making the two changes atomic with respect to one another. Similar changes are likely require elsewhere in the socket/protocol code. Reported by: Peter Holm <peter@holm.cc> Review and fixes from: emax, Antoine Brodin <antoine.brodin@laposte.net> Philosophical head nod: gnn
2005-02-21 21:58:17 +00:00
SOCK_LOCK(so);
error = solisten_proto_check(so);
if (error == 0 && ipxp->ipxp_lport == 0)
error = ipx_pcbbind(ipxp, NULL, td);
In the current world order, solisten() implements the state transition of a socket from a regular socket to a listening socket able to accept new connections. As part of this state transition, solisten() calls into the protocol to update protocol-layer state. There were several bugs in this implementation that could result in a race wherein a TCP SYN received in the interval between the protocol state transition and the shortly following socket layer transition would result in a panic in the TCP code, as the socket would be in the TCPS_LISTEN state, but the socket would not have the SO_ACCEPTCONN flag set. This change does the following: - Pushes the socket state transition from the socket layer solisten() to to socket "library" routines called from the protocol. This permits the socket routines to be called while holding the protocol mutexes, preventing a race exposing the incomplete socket state transition to TCP after the TCP state transition has completed. The check for a socket layer state transition is performed by solisten_proto_check(), and the actual transition is performed by solisten_proto(). - Holds the socket lock for the duration of the socket state test and set, and over the protocol layer state transition, which is now possible as the socket lock is acquired by the protocol layer, rather than vice versa. This prevents additional state related races in the socket layer. This permits the dual transition of socket layer and protocol layer state to occur while holding locks for both layers, making the two changes atomic with respect to one another. Similar changes are likely require elsewhere in the socket/protocol code. Reported by: Peter Holm <peter@holm.cc> Review and fixes from: emax, Antoine Brodin <antoine.brodin@laposte.net> Philosophical head nod: gnn
2005-02-21 21:58:17 +00:00
if (error == 0) {
cb->s_state = TCPS_LISTEN;
solisten_proto(so, backlog);
In the current world order, solisten() implements the state transition of a socket from a regular socket to a listening socket able to accept new connections. As part of this state transition, solisten() calls into the protocol to update protocol-layer state. There were several bugs in this implementation that could result in a race wherein a TCP SYN received in the interval between the protocol state transition and the shortly following socket layer transition would result in a panic in the TCP code, as the socket would be in the TCPS_LISTEN state, but the socket would not have the SO_ACCEPTCONN flag set. This change does the following: - Pushes the socket state transition from the socket layer solisten() to to socket "library" routines called from the protocol. This permits the socket routines to be called while holding the protocol mutexes, preventing a race exposing the incomplete socket state transition to TCP after the TCP state transition has completed. The check for a socket layer state transition is performed by solisten_proto_check(), and the actual transition is performed by solisten_proto(). - Holds the socket lock for the duration of the socket state test and set, and over the protocol layer state transition, which is now possible as the socket lock is acquired by the protocol layer, rather than vice versa. This prevents additional state related races in the socket layer. This permits the dual transition of socket layer and protocol layer state to occur while holding locks for both layers, making the two changes atomic with respect to one another. Similar changes are likely require elsewhere in the socket/protocol code. Reported by: Peter Holm <peter@holm.cc> Review and fixes from: emax, Antoine Brodin <antoine.brodin@laposte.net> Philosophical head nod: gnn
2005-02-21 21:58:17 +00:00
}
SOCK_UNLOCK(so);
out:
IPX_UNLOCK(ipxp);
IPX_LIST_UNLOCK();
return (error);
}
/*
 * After a receive, possibly send an acknowledgment updating the
 * allocation.  SF_RVD is set in the control block flags only for the
 * duration of the spx_output() call.  Returns EINVAL if the pcb has been
 * dropped, 0 otherwise (the result of spx_output() is not propagated).
 */
static int
spx_rcvd(struct socket *so, int flags)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error;

	KASSERT(ipxp != NULL, ("spx_rcvd: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_rcvd: cb == NULL"));

	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
	} else {
		cb->s_flags |= SF_RVD;
		spx_output(cb, NULL);
		cb->s_flags &= ~SF_RVD;
		error = 0;
	}
	IPX_UNLOCK(ipxp);
	return (error);
}
/*
 * Fetch the out-of-band byte.  If the control block has SF_IOOB set, or
 * the socket has an OOB mark, or the receive buffer is at the mark, the
 * saved byte (cb->s_iobc) is stored as the single byte of mbuf m and 0 is
 * returned.  Otherwise, or if the pcb has been dropped, returns EINVAL.
 */
static int
spx_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error, oob_ready;

	KASSERT(ipxp != NULL, ("spx_rcvoob: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_rcvoob: cb == NULL"));

	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
	} else {
		/* The test is made with the receive buffer locked. */
		SOCKBUF_LOCK(&so->so_rcv);
		oob_ready = (cb->s_oobflags & SF_IOOB) || so->so_oobmark ||
		    (so->so_rcv.sb_state & SBS_RCVATMARK);
		SOCKBUF_UNLOCK(&so->so_rcv);

		if (oob_ready) {
			m->m_len = 1;
			*mtod(m, caddr_t) = cb->s_iobc;
			error = 0;
		} else {
			error = EINVAL;
		}
	}
	IPX_UNLOCK(ipxp);
	return (error);
}
/*
 * Send data on an SPX socket.
 *
 * Returns ECONNRESET if the pcb has been dropped, and ENOBUFS for an
 * out-of-band send when the send buffer space is below -512.  Otherwise
 * the data mbuf chain m is handed to spx_output() and its result is
 * returned.  A control mbuf, if supplied, is inspected (test hook: the
 * values 5 and 1 in its first two shorts select a datastream type) and
 * freed.  On the early error paths both m and controlp are freed here.
 */
static int
spx_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
    struct mbuf *controlp, struct thread *td)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error = 0;

	KASSERT(ipxp != NULL, ("spx_send: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_send: cb == NULL"));

	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = ECONNRESET;
		goto release;
	}
	if (flags & PRUS_OOB) {
		if (sbspace(&so->so_snd) < -512) {
			error = ENOBUFS;
			goto release;
		}
		cb->s_oobflags |= SF_SOOB;
	}
	if (controlp != NULL) {
		u_short *p = mtod(controlp, u_short *);

		spx_newchecks[2]++;
		if ((p[0] == 5) && (p[1] == 1)) { /* XXXX, for testing */
			cb->s_shdr.spx_dt = *(u_char *)(&p[2]);
			spx_newchecks[3]++;
		}
		m_freem(controlp);
		controlp = NULL;
	}

	/* From here on the data chain belongs to spx_output(). */
	error = spx_output(cb, m);
	m = NULL;

release:
	IPX_UNLOCK(ipxp);
	/* Only the early error exits still own these mbufs. */
	if (controlp != NULL)
		m_freem(controlp);
	if (m != NULL)
		m_freem(m);
	return (error);
}
/*
 * Shutdown handler: mark the socket as unable to send any more, then,
 * unless the pcb has already been dropped (EINVAL), run spx_usrclosed()
 * on the control block.
 */
static int
spx_shutdown(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error;

	KASSERT(ipxp != NULL, ("spx_shutdown: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_shutdown: cb == NULL"));

	/* Done before any pcb lock is taken, and even for a dropped pcb. */
	socantsendmore(so);

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
	} else {
		spx_usrclosed(cb);
		error = 0;
	}
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return (error);
}
static int
spx_sp_attach(struct socket *so, int proto, struct thread *td)
{
struct ipxpcb *ipxp;
struct spxpcb *cb;
int error;
KASSERT(so->so_pcb == NULL, ("spx_sp_attach: so_pcb != NULL"));
error = spx_attach(so, proto, td);
if (error)
return (error);
ipxp = sotoipxpcb(so);
KASSERT(ipxp != NULL, ("spx_sp_attach: ipxp == NULL"));
cb = ipxtospxpcb(ipxp);
KASSERT(cb != NULL, ("spx_sp_attach: cb == NULL"));
IPX_LOCK(ipxp);
cb->s_flags |= (SF_HI | SF_HO | SF_PI);
IPX_UNLOCK(ipxp);
return (0);
}
/*
 * Create the template used to send spx packets on a connection.  Fills in
 * a skeletal header (packet type, source and destination addresses),
 * chooses the connection id from spx_iss, and recomputes the congestion
 * window values from the current send buffer space and mtu, so that little
 * work remains when the connection is actually used.  The pcb lock must be
 * held.
 */
static void
spx_template(struct spxpcb *cb)
{
	struct ipxpcb *ipxp;
	struct ipx *ipx;
	struct sockbuf *sb;

	ipxp = cb->s_ipxpcb;
	ipx = cb->s_ipx;
	sb = &(ipxp->ipxp_socket->so_snd);

	IPX_LOCK_ASSERT(ipxp);

	ipx->ipx_pt = IPXPROTO_SPX;
	ipx->ipx_sna = ipxp->ipxp_laddr;
	ipx->ipx_dna = ipxp->ipxp_faddr;

	/* spx_iss is read and advanced under the SPX lock. */
	SPX_LOCK();
	cb->s_sid = htons(spx_iss);
	spx_iss += SPX_ISSINCR/2;
	SPX_UNLOCK();

	cb->s_alo = 1;
	cb->s_cwnd = (sbspace(sb) * CUNIT) / cb->s_mtu;

	/* Try to expand fast to full complement of large packets. */
	cb->s_ssthresh = cb->s_cwnd;
	cb->s_cwmx = (sbspace(sb) * CUNIT) / (2 * sizeof(struct spx));

	/* But allow for lots of little packets as well. */
	cb->s_cwmx = max(cb->s_cwmx, cb->s_cwnd);
}
/*
 * Close an SPX control block: mark the pcb IPXP_DROPPED, mark the socket
 * disconnected and bump the "closed" statistic.  Queued packets and
 * cb->s_ipx used to be freed here, but that is now deferred until the pcb
 * is discarded.
 *
 * Both the pcb list lock and the pcb lock must be held by the caller.
 */
void
spx_close(struct spxpcb *cb)
{
	struct ipxpcb *ipxp;
	struct socket *so;

	/*
	 * Check the pcb pointer before it is dereferenced, so that the
	 * assertion can actually catch a NULL pcb.
	 */
	ipxp = cb->s_ipxpcb;
	KASSERT(ipxp != NULL, ("spx_close: ipxp == NULL"));
	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(ipxp);

	so = ipxp->ipxp_socket;
	ipxp->ipxp_flags |= IPXP_DROPPED;
	soisdisconnected(so);
	spxstat.spxs_closed++;
}
/*
 * User has closed the sending side.  Someday level 3 handshaking may be
 * done here to close a connection, or a xerox style error sent; for now
 * the control block is simply closed.  Callers should treat cb as no
 * longer usable after this call.
 *
 * Requires the pcb list lock and the pcb lock.
 */
static void
spx_usrclosed(struct spxpcb *cb)
{

	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	spx_close(cb);
}
/*
 * Disconnect a connection; currently this just closes the control block.
 * Callers should treat cb as no longer usable after this call.
 *
 * Requires the pcb list lock and the pcb lock.
 */
static void
spx_disconnect(struct spxpcb *cb)
{

	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	spx_close(cb);
}
/*
 * Drop a connection, reporting the specified error through the socket's
 * so_error, and then close the control block.  Callers should treat cb as
 * no longer usable after this call.
 *
 * Requires the pcb list lock and the pcb lock.
 */
static void
spx_drop(struct spxpcb *cb, int errno)
{
	struct socket *so;

	so = cb->s_ipxpcb->ipxp_socket;
	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	/*
	 * Someday, in the xerox world, error protocol packets will be
	 * generated here to announce that the socket has gone away.
	 */
	if (!TCPS_HAVERCVDSYN(cb->s_state)) {
		spxstat.spxs_conndrops++;
	} else {
		spxstat.spxs_drops++;
		cb->s_state = TCPS_CLOSED;
	}
	so->so_error = errno;
	spx_close(cb);
}
/*
 * Fast timeout routine for processing delayed acks.  Walks the pcb list
 * and, for every SPX pcb that has not been dropped and has SF_DELACK set,
 * converts the delayed ack into an immediate one and calls spx_output().
 */
void
spx_fasttimo(void)
{
	struct ipxpcb *ipxp;
	struct spxpcb *cb;

	IPX_LIST_LOCK();
	LIST_FOREACH(ipxp, &ipxpcb_list, ipxp_list) {
		IPX_LOCK(ipxp);
		if ((ipxp->ipxp_flags & IPXP_SPX) &&
		    !(ipxp->ipxp_flags & IPXP_DROPPED)) {
			cb = ipxtospxpcb(ipxp);
			if (cb->s_flags & SF_DELACK) {
				cb->s_flags &= ~SF_DELACK;
				cb->s_flags |= SF_ACKNOW;
				spxstat.spxs_delack++;
				spx_output(cb, NULL);
			}
		}
		IPX_UNLOCK(ipxp);
	}
	IPX_LIST_UNLOCK();
}
/*
 * spx protocol timeout routine called every 500 ms.  Updates the timers in
 * all active pcb's and causes finite state machine actions if timers
 * expire, then advances the global spx_iss.
 */
void
spx_slowtimo(void)
{
	struct ipxpcb *ipxp;
	struct spxpcb *cb;
	int i;

	/*
	 * Walk the pcb list and tick the active timers.  Timers could once
	 * free ipxp's, but that now happens only when a socket is detached.
	 */
	IPX_LIST_LOCK();
	LIST_FOREACH(ipxp, &ipxpcb_list, ipxp_list) {
		IPX_LOCK(ipxp);
		if ((ipxp->ipxp_flags & IPXP_SPX) &&
		    !(ipxp->ipxp_flags & IPXP_DROPPED)) {
			cb = (struct spxpcb *)ipxp->ipxp_pcb;
			KASSERT(cb != NULL, ("spx_slowtimo: cb == NULL"));

			for (i = 0; i < SPXT_NTIMERS; i++) {
				/* Skip idle timers and those not yet due. */
				if (cb->s_timer[i] == 0)
					continue;
				if (--cb->s_timer[i] != 0)
					continue;
				spx_timers(cb, i);
				/* Stop once a timer action dropped the pcb. */
				if (ipxp->ipxp_flags & IPXP_DROPPED)
					break;
			}
			if (!(ipxp->ipxp_flags & IPXP_DROPPED)) {
				cb->s_idle++;
				if (cb->s_rtt)
					cb->s_rtt++;
			}
		}
		IPX_UNLOCK(ipxp);
	}
	IPX_LIST_UNLOCK();

	SPX_LOCK();
	spx_iss += SPX_ISSINCR/PR_SLOWHZ;	/* increment iss */
	SPX_UNLOCK();
}
/*
 * SPX timer processing.
 *
 * Called with the pcb list lock and the pcb lock held when timer number
 * `timer' of control block `cb' has expired.  Records the timer in
 * cb->s_force and then performs the action for that timer: retransmit with
 * backoff, send a persist probe, send a keep-alive probe, or drop the
 * connection with ETIMEDOUT.  An unknown timer number panics.
 */
static void
spx_timers(struct spxpcb *cb, int timer)
{
long rexmt;
int win;
IPX_LIST_LOCK_ASSERT();
IPX_LOCK_ASSERT(cb->s_ipxpcb);
cb->s_force = 1 + timer;
switch (timer) {
case SPXT_2MSL:
/*
 * 2 MSL timeout in shutdown went off.  TCP deletes the connection
 * control block at this point; here the event is only logged and the
 * timer cleared.
 */
printf("spx: SPXT_2MSL went off for no reason\n");
cb->s_timer[timer] = 0;
break;
case SPXT_REXMT:
/*
 * Retransmission timer went off.  Message has not been acked
 * within retransmit interval.  Back off to a longer
 * retransmit interval and retransmit one packet.  Once the
 * backoff shift exceeds SPX_MAXRXTSHIFT the connection is dropped.
 */
if (++cb->s_rxtshift > SPX_MAXRXTSHIFT) {
cb->s_rxtshift = SPX_MAXRXTSHIFT;
spxstat.spxs_timeoutdrop++;
spx_drop(cb, ETIMEDOUT);
break;
}
spxstat.spxs_rexmttimeo++;
rexmt = ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1;
rexmt *= spx_backoff[cb->s_rxtshift];
SPXT_RANGESET(cb->s_rxtcur, rexmt, SPXTV_MIN, SPXTV_REXMTMAX);
cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;
/*
 * If we have backed off fairly far, our srtt estimate is
 * probably bogus.  Clobber it so we'll take the next rtt
 * measurement as our srtt; move the current srtt into rttvar
 * to keep the current retransmit times until then.
 */
if (cb->s_rxtshift > SPX_MAXRXTSHIFT / 4 ) {
cb->s_rttvar += (cb->s_srtt >> 2);
cb->s_srtt = 0;
}
/* Resume sending from s_rack. */
cb->s_snxt = cb->s_rack;
/*
 * If timing a packet, stop the timer.
 */
cb->s_rtt = 0;
/*
 * See very long discussion in tcp_timer.c about congestion
 * window and ssthresh.
 */
win = min(cb->s_swnd, (cb->s_cwnd/CUNIT)) / 2;
if (win < 2)
win = 2;
cb->s_cwnd = CUNIT;
cb->s_ssthresh = win * CUNIT;
spx_output(cb, NULL);
break;
case SPXT_PERSIST:
/*
 * Persistence timer into zero window.  Force a probe to be
 * sent.
 */
spxstat.spxs_persisttimeo++;
spx_setpersist(cb);
spx_output(cb, NULL);
break;
case SPXT_KEEP:
/*
 * Keep-alive timer went off; send something or drop
 * connection if idle for too long.  A connection that has not
 * reached TCPS_ESTABLISHED is dropped outright.
 */
spxstat.spxs_keeptimeo++;
if (cb->s_state < TCPS_ESTABLISHED)
goto dropit;
if (cb->s_ipxpcb->ipxp_socket->so_options & SO_KEEPALIVE) {
if (cb->s_idle >= SPXTV_MAXIDLE)
goto dropit;
spxstat.spxs_keepprobe++;
spx_output(cb, NULL);
} else
cb->s_idle = 0;
/* Re-arm the keep-alive timer. */
cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;
break;
/* Shared drop path for the keep-alive case above. */
dropit:
spxstat.spxs_keepdrops++;
spx_drop(cb, ETIMEDOUT);
break;
default:
panic("spx_timers: unknown timer %d", timer);
}
}