/* freebsd-skq/sys/netipx/spx_usrreq.c */

/*-
 * Copyright (c) 1984, 1985, 1986, 1987, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004-2006 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1995, Mike Mitchell
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spx_usrreq.h
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/systm.h>

#include <net/route.h>
#include <netinet/tcp_fsm.h>

#include <netipx/ipx.h>
#include <netipx/ipx_pcb.h>
#include <netipx/ipx_var.h>
#include <netipx/spx.h>
#include <netipx/spx_debug.h>
#include <netipx/spx_timer.h>
#include <netipx/spx_var.h>

/*
 * SPX protocol implementation.
 */
static struct	mtx spx_mtx;			/* Protects only spx_iss. */
/*
 * Sequence seed, set to 1 by spx_init().  Presumably the initial send
 * sequence number; its consumers are outside this part of the file.
 */
static u_short 	spx_iss;
/*
 * Ad-hoc event counters; individual slots are bumped on the SF_NEWCALL
 * code paths in spx_reass() and spx_output().
 */
static u_short	spx_newchecks[50];
/*
 * When nonzero, spx_input() refuses packets arriving on a LISTEN-state
 * connection whose destination id or sequence number is nonzero.
 */
static int	spx_hardnosed;
/*
 * When nonzero, spx_reass() answers an SPX_SA request by setting SF_DELACK
 * instead of SF_ACKNOW.
 */
static int	spx_use_delack = 0;
/* When nonzero, spx_trace() is called even without SO_DEBUG on the socket. */
static int	traceallspxs = 0;
/* Internal statistics; the spxstat macro below aliases its newstats member. */
static struct	spx_istat spx_istat;
/*
 * Number of consecutive identical acknowledgements after which spx_reass()
 * retransmits.
 */
static int	spxrexmtthresh = 3;

/* Wrappers around spx_mtx. */
#define	SPX_LOCK_INIT()	mtx_init(&spx_mtx, "spx_mtx", NULL, MTX_DEF)
#define	SPX_LOCK()	mtx_lock(&spx_mtx)
#define	SPX_UNLOCK()	mtx_unlock(&spx_mtx)

/*
 * The statistics used to be a separate "struct spxstat spxstat"; the name is
 * kept as an alias for the copy embedded in spx_istat.
 */
#ifndef spxstat
#define spxstat spx_istat.newstats
#endif

/*
 * Timer multipliers indexed by cb->s_rxtshift; spx_setpersist() uses them to
 * scale the persist timeout.
 */
static const int spx_backoff[SPX_MAXRXTSHIFT+1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

/* Protocol-internal helpers operating on an SPX control block. */
static	void spx_close(struct spxpcb *cb);
static	void spx_disconnect(struct spxpcb *cb);
static	void spx_drop(struct spxpcb *cb, int errno);
static	int spx_output(struct spxpcb *cb, struct mbuf *m0);
static	int spx_reass(struct spxpcb *cb, struct spx *si);
static	void spx_setpersist(struct spxpcb *cb);
static	void spx_template(struct spxpcb *cb);
static	void spx_timers(struct spxpcb *cb, int timer);
static	void spx_usrclosed(struct spxpcb *cb);

/*
 * Socket-level entry points; most of them are installed in the pr_usrreqs
 * tables that follow.
 */
static	void spx_usr_abort(struct socket *so);
static	int spx_accept(struct socket *so, struct sockaddr **nam);
static	int spx_attach(struct socket *so, int proto, struct thread *td);
static	int spx_bind(struct socket *so, struct sockaddr *nam, struct thread *td);
static	void spx_usr_close(struct socket *so);
static	int spx_connect(struct socket *so, struct sockaddr *nam,
			struct thread *td);
static	void spx_detach(struct socket *so);
static	void spx_pcbdetach(struct ipxpcb *ipxp);
static	int spx_usr_disconnect(struct socket *so);
static	int spx_listen(struct socket *so, int backlog, struct thread *td);
static	int spx_rcvd(struct socket *so, int flags);
static	int spx_rcvoob(struct socket *so, struct mbuf *m, int flags);
static	int spx_send(struct socket *so, int flags, struct mbuf *m,
		     struct sockaddr *addr, struct mbuf *control,
		     struct thread *td);
static	int spx_shutdown(struct socket *so);
static	int spx_sp_attach(struct socket *so, int proto, struct thread *td);

/*
 * User-request dispatch table whose attach handler is spx_attach().
 * Address queries and control requests are delegated to the generic IPX
 * handlers.  Entries are listed alphabetically by field name.
 */
struct	pr_usrreqs spx_usrreqs = {
	.pru_abort =		spx_usr_abort,
	.pru_accept =		spx_accept,
	.pru_attach =		spx_attach,
	.pru_bind =		spx_bind,
	.pru_close =		spx_usr_close,
	.pru_connect =		spx_connect,
	.pru_control =		ipx_control,
	.pru_detach =		spx_detach,
	.pru_disconnect =	spx_usr_disconnect,
	.pru_listen =		spx_listen,
	.pru_peeraddr =		ipx_peeraddr,
	.pru_rcvd =		spx_rcvd,
	.pru_rcvoob =		spx_rcvoob,
	.pru_send =		spx_send,
	.pru_shutdown =		spx_shutdown,
	.pru_sockaddr =		ipx_sockaddr,
};

/*
 * User-request dispatch table whose attach handler is spx_sp_attach();
 * every other entry matches spx_usrreqs.  Entries are listed alphabetically
 * by field name.
 */
struct	pr_usrreqs spx_usrreq_sps = {
	.pru_abort =		spx_usr_abort,
	.pru_accept =		spx_accept,
	.pru_attach =		spx_sp_attach,
	.pru_bind =		spx_bind,
	.pru_close =		spx_usr_close,
	.pru_connect =		spx_connect,
	.pru_control =		ipx_control,
	.pru_detach =		spx_detach,
	.pru_disconnect =	spx_usr_disconnect,
	.pru_listen =		spx_listen,
	.pru_peeraddr =		ipx_peeraddr,
	.pru_rcvd =		spx_rcvd,
	.pru_rcvoob =		spx_rcvoob,
	.pru_send =		spx_send,
	.pru_shutdown =		spx_shutdown,
	.pru_sockaddr =		ipx_sockaddr,
};

/*
 * One-time SPX initialization: create the mutex that guards spx_iss and
 * give spx_iss its starting value.
 */
void
spx_init(void)
{

	SPX_LOCK_INIT();

	/*
	 * XXX: A fixed starting value is WRONG; it should be fished out of
	 * the time-of-day register (TODR) instead.
	 */
	spx_iss = 1;
}

void
spx_input(struct mbuf *m, struct ipxpcb *ipxp)
{
	struct spxpcb *cb;
	struct spx *si = mtod(m, struct spx *);
	struct socket *so;
	struct spx spx_savesi;
	int dropsocket = 0;
	short ostate = 0;

	spxstat.spxs_rcvtotal++;
	KASSERT(ipxp != NULL, ("spx_input: ipxpcb == NULL"));

	/*
	 * spx_input() assumes that the caller will hold both the pcb list
	 * lock and also the ipxp lock.  spx_input() will release both before
	 * returning, and may in fact trade in the ipxp lock for another pcb
	 * lock following sonewconn().
	 */
	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(ipxp);

	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_input: cb == NULL"));

	if (ipxp->ipxp_flags & IPXP_DROPPED)
		goto drop;

	if (m->m_len < sizeof(*si)) {
		if ((m = m_pullup(m, sizeof(*si))) == NULL) {
			IPX_UNLOCK(ipxp);
			IPX_LIST_UNLOCK();
			spxstat.spxs_rcvshort++;
			return;
		}
		si = mtod(m, struct spx *);
	}
	si->si_seq = ntohs(si->si_seq);
	si->si_ack = ntohs(si->si_ack);
	si->si_alo = ntohs(si->si_alo);

	so = ipxp->ipxp_socket;
	KASSERT(so != NULL, ("spx_input: so == NULL"));

	if (so->so_options & SO_DEBUG || traceallspxs) {
		ostate = cb->s_state;
		spx_savesi = *si;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		struct spxpcb *ocb = cb;

		so = sonewconn(so, 0);
		if (so == NULL)
			goto drop;

		/*
		 * This is ugly, but ....
		 *
		 * Mark socket as temporary until we're committed to keeping
		 * it.  The code at ``drop'' and ``dropwithreset'' check the
		 * flag dropsocket to see if the temporary socket created
		 * here should be discarded.  We mark the socket as
		 * discardable until we're committed to it below in
		 * TCPS_LISTEN.
		 *
		 * XXXRW: In the new world order of real kernel parallelism,
		 * temporarily allocating the socket when we're "not sure"
		 * seems like a bad idea, as we might race to remove it if
		 * the listen socket is closed...?
		 *
		 * We drop the lock of the listen socket ipxp, and acquire
		 * the lock of the new socket ippx.
		 */
		dropsocket++;
		IPX_UNLOCK(ipxp);
		ipxp = (struct ipxpcb *)so->so_pcb;
		IPX_LOCK(ipxp);
		ipxp->ipxp_laddr = si->si_dna;
		cb = ipxtospxpcb(ipxp);
		cb->s_mtu = ocb->s_mtu;		/* preserve sockopts */
		cb->s_flags = ocb->s_flags;	/* preserve sockopts */
		cb->s_flags2 = ocb->s_flags2;	/* preserve sockopts */
		cb->s_state = TCPS_LISTEN;
	}
	IPX_LOCK_ASSERT(ipxp);

	/*
	 * Packet received on connection.  Reset idle time and keep-alive
	 * timer.
	 */
	cb->s_idle = 0;
	cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;

	switch (cb->s_state) {
	case TCPS_LISTEN:{
		struct sockaddr_ipx *sipx, ssipx;
		struct ipx_addr laddr;

		/*
		 * If somebody here was carying on a conversation and went
		 * away, and his pen pal thinks he can still talk, we get the
		 * misdirected packet.
		 */
		if (spx_hardnosed && (si->si_did != 0 || si->si_seq != 0)) {
			spx_istat.gonawy++;
			goto dropwithreset;
		}
		sipx = &ssipx;
		bzero(sipx, sizeof *sipx);
		sipx->sipx_len = sizeof(*sipx);
		sipx->sipx_family = AF_IPX;
		sipx->sipx_addr = si->si_sna;
		laddr = ipxp->ipxp_laddr;
		if (ipx_nullhost(laddr))
			ipxp->ipxp_laddr = si->si_dna;
		if (ipx_pcbconnect(ipxp, (struct sockaddr *)sipx, &thread0)) {
			ipxp->ipxp_laddr = laddr;
			spx_istat.noconn++;
			goto drop;
		}
		spx_template(cb);
		dropsocket = 0;		/* committed to socket */
		cb->s_did = si->si_sid;
		cb->s_rack = si->si_ack;
		cb->s_ralo = si->si_alo;
#define THREEWAYSHAKE
#ifdef THREEWAYSHAKE
		cb->s_state = TCPS_SYN_RECEIVED;
		cb->s_force = 1 + SPXT_KEEP;
		spxstat.spxs_accepts++;
		cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;
		}
		break;

	 case TCPS_SYN_RECEIVED: {
		/*
		 * This state means that we have heard a response to our
		 * acceptance of their connection.  It is probably logically
		 * unnecessary in this implementation.
		 */
		if (si->si_did != cb->s_sid) {
			spx_istat.wrncon++;
			goto drop;
		}
#endif
		ipxp->ipxp_fport =  si->si_sport;
		cb->s_timer[SPXT_REXMT] = 0;
		cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;
		soisconnected(so);
		cb->s_state = TCPS_ESTABLISHED;
		spxstat.spxs_accepts++;
		}
		break;

	case TCPS_SYN_SENT:
		/*
		 * This state means that we have gotten a response to our
		 * attempt to establish a connection.  We fill in the data
		 * from the other side, telling us which port to respond to,
		 * instead of the well-known one we might have sent to in the
		 * first place.  We also require that this is a response to
		 * our connection id.
		 */
		if (si->si_did != cb->s_sid) {
			spx_istat.notme++;
			goto drop;
		}
		spxstat.spxs_connects++;
		cb->s_did = si->si_sid;
		cb->s_rack = si->si_ack;
		cb->s_ralo = si->si_alo;
		cb->s_dport = ipxp->ipxp_fport =  si->si_sport;
		cb->s_timer[SPXT_REXMT] = 0;
		cb->s_flags |= SF_ACKNOW;
		soisconnected(so);
		cb->s_state = TCPS_ESTABLISHED;
		/*
		 * Use roundtrip time of connection request for initial rtt.
		 */
		if (cb->s_rtt) {
			cb->s_srtt = cb->s_rtt << 3;
			cb->s_rttvar = cb->s_rtt << 1;
			SPXT_RANGESET(cb->s_rxtcur,
			    ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1,
			    SPXTV_MIN, SPXTV_REXMTMAX);
			    cb->s_rtt = 0;
		}
	}

	if (so->so_options & SO_DEBUG || traceallspxs)
		spx_trace(SA_INPUT, (u_char)ostate, cb, &spx_savesi, 0);

	m->m_len -= sizeof(struct ipx);
	m->m_pkthdr.len -= sizeof(struct ipx);
	m->m_data += sizeof(struct ipx);

	if (spx_reass(cb, si))
		m_freem(m);
	if (cb->s_force || (cb->s_flags & (SF_ACKNOW|SF_WIN|SF_RXT)))
		spx_output(cb, NULL);
	cb->s_flags &= ~(SF_WIN|SF_RXT);
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return;

dropwithreset:
	IPX_LOCK_ASSERT(ipxp);
	if (cb == NULL || (cb->s_ipxpcb->ipxp_socket->so_options & SO_DEBUG ||
	    traceallspxs))
		spx_trace(SA_DROP, (u_char)ostate, cb, &spx_savesi, 0);
	IPX_UNLOCK(ipxp);
	if (dropsocket) {
		struct socket *head;
		ACCEPT_LOCK();
		KASSERT((so->so_qstate & SQ_INCOMP) != 0,
		    ("spx_input: nascent socket not SQ_INCOMP on soabort()"));
		head = so->so_head;
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
		ACCEPT_UNLOCK();
		soabort(so);
	}
	IPX_LIST_UNLOCK();
	m_freem(dtom(si));
	return;

drop:
	IPX_LOCK_ASSERT(ipxp);
	if (cb->s_ipxpcb->ipxp_socket->so_options & SO_DEBUG || traceallspxs)
		spx_trace(SA_DROP, (u_char)ostate, cb, &spx_savesi, 0);
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	m_freem(m);
}

/*
 * This is structurally similar to the tcp reassembly routine but its
 * function is somewhat different:  It merely queues packets up, and
 * suppresses duplicates.
 */
/*
 * Digest the acknowledgement and window fields of a received header, queue
 * the packet in sequence order, and pass any in-order data to the socket's
 * receive buffer.
 *
 * cb	connection control block; the lock of cb->s_ipxpcb must be held.
 * si	header of the received packet (spx_input() converts si_seq, si_ack
 *	and si_alo to host order before calling), or SI(0) -- presumably a
 *	null header pointer -- to skip header processing and only deliver
 *	data that is already queued.
 *
 * Returns 1 when the packet was not kept and the caller should free its
 * mbuf chain; returns 0 when the packet was linked onto cb->s_q or has
 * already been freed here.
 */
static int
spx_reass(struct spxpcb *cb, struct spx *si)
{
	struct spx_q *q;
	struct mbuf *m;
	struct socket *so = cb->s_ipxpcb->ipxp_socket;
	/* Nonzero when SF_HI is set: delivered packets keep their headers. */
	char packetp = cb->s_flags & SF_HI;
	int incr;
	/* Set once at least one packet has been delivered to so_rcv. */
	char wakeup = 0;

	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	if (si == SI(0))
		goto present;
	/*
	 * Update our news from them.
	 */
	if (si->si_cc & SPX_SA)
		cb->s_flags |= (spx_use_delack ? SF_DELACK : SF_ACKNOW);
	if (SSEQ_GT(si->si_alo, cb->s_ralo))
		cb->s_flags |= SF_WIN;
	/* The ack does not advance past what has already been acknowledged. */
	if (SSEQ_LEQ(si->si_ack, cb->s_rack)) {
		if ((si->si_cc & SPX_SP) && cb->s_rack != (cb->s_smax + 1)) {
			spxstat.spxs_rcvdupack++;
			/*
			 * If this is a completely duplicate ack and other
			 * conditions hold, we assume a packet has been
			 * dropped and retransmit it exactly as in
			 * tcp_input().
			 */
			if (si->si_ack != cb->s_rack ||
			    si->si_alo != cb->s_ralo)
				cb->s_dupacks = 0;
			else if (++cb->s_dupacks == spxrexmtthresh) {
				u_short onxt = cb->s_snxt;
				int cwnd = cb->s_cwnd;

				/*
				 * Retransmit the packet the peer is asking
				 * for with a one-unit congestion window, then
				 * restore s_snxt and halve the old window if
				 * it was at least four units.
				 */
				cb->s_snxt = si->si_ack;
				cb->s_cwnd = CUNIT;
				cb->s_force = 1 + SPXT_REXMT;
				spx_output(cb, NULL);
				cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;
				cb->s_rtt = 0;
				if (cwnd >= 4 * CUNIT)
					cb->s_cwnd = cwnd / 2;
				if (SSEQ_GT(onxt, cb->s_snxt))
					cb->s_snxt = onxt;
				/* Packet was not queued; caller frees it. */
				return (1);
			}
		} else
			cb->s_dupacks = 0;
		goto update_window;
	}
	cb->s_dupacks = 0;

	/*
	 * If our correspondent acknowledges data we haven't sent TCP would
	 * drop the packet after acking.  We'll be a little more permissive.
	 */
	if (SSEQ_GT(si->si_ack, (cb->s_smax + 1))) {
		spxstat.spxs_rcvacktoomuch++;
		si->si_ack = cb->s_smax + 1;
	}
	spxstat.spxs_rcvackpack++;

	/*
	 * If transmit timer is running and timed sequence number was acked,
	 * update smoothed round trip time.  See discussion of algorithm in
	 * tcp_input.c
	 */
	if (cb->s_rtt && SSEQ_GT(si->si_ack, cb->s_rtseq)) {
		spxstat.spxs_rttupdated++;
		if (cb->s_srtt != 0) {
			short delta;
			delta = cb->s_rtt - (cb->s_srtt >> 3);
			if ((cb->s_srtt += delta) <= 0)
				cb->s_srtt = 1;
			if (delta < 0)
				delta = -delta;
			delta -= (cb->s_rttvar >> 2);
			if ((cb->s_rttvar += delta) <= 0)
				cb->s_rttvar = 1;
		} else {
			/*
			 * No rtt measurement yet.
			 */
			cb->s_srtt = cb->s_rtt << 3;
			cb->s_rttvar = cb->s_rtt << 1;
		}
		cb->s_rtt = 0;
		cb->s_rxtshift = 0;
		SPXT_RANGESET(cb->s_rxtcur,
			((cb->s_srtt >> 2) + cb->s_rttvar) >> 1,
			SPXTV_MIN, SPXTV_REXMTMAX);
	}

	/*
	 * If all outstanding data is acked, stop retransmit timer and
	 * remember to restart (more output or persist).  If there is more
	 * data to be acked, restart retransmit timer, using current
	 * (possibly backed-off) value;
	 */
	if (si->si_ack == cb->s_smax + 1) {
		cb->s_timer[SPXT_REXMT] = 0;
		cb->s_flags |= SF_RXT;
	} else if (cb->s_timer[SPXT_PERSIST] == 0)
		cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;

	/*
	 * When new data is acked, open the congestion window.  If the window
	 * gives us less than ssthresh packets in flight, open exponentially
	 * (maxseg at a time).  Otherwise open linearly (maxseg^2 / cwnd at a
	 * time).
	 */
	incr = CUNIT;
	if (cb->s_cwnd > cb->s_ssthresh)
		incr = max(incr * incr / cb->s_cwnd, 1);
	cb->s_cwnd = min(cb->s_cwnd + incr, cb->s_cwmx);

	/*
	 * Trim Acked data from output queue.
	 */
	SOCKBUF_LOCK(&so->so_snd);
	while ((m = so->so_snd.sb_mb) != NULL) {
		if (SSEQ_LT((mtod(m, struct spx *))->si_seq, si->si_ack))
			sbdroprecord_locked(&so->so_snd);
		else
			break;
	}
	/*
	 * No explicit unlock follows; sowwakeup_locked() is expected to
	 * release the send-buffer lock taken above.
	 */
	sowwakeup_locked(so);
	cb->s_rack = si->si_ack;
update_window:
	if (SSEQ_LT(cb->s_snxt, cb->s_rack))
		cb->s_snxt = cb->s_rack;
	/*
	 * Accept the advertised window when this packet is newer than the
	 * one that last updated it (by sequence, then by ack), or when it
	 * carries the same ack with a larger allocation.
	 */
	if (SSEQ_LT(cb->s_swl1, si->si_seq) || ((cb->s_swl1 == si->si_seq &&
	    (SSEQ_LT(cb->s_swl2, si->si_ack))) ||
	     (cb->s_swl2 == si->si_ack && SSEQ_LT(cb->s_ralo, si->si_alo)))) {
		/* keep track of pure window updates */
		if ((si->si_cc & SPX_SP) && cb->s_swl2 == si->si_ack
		    && SSEQ_LT(cb->s_ralo, si->si_alo)) {
			spxstat.spxs_rcvwinupd++;
			/*
			 * Presumably compensates for the duplicate-ack count
			 * taken above for the same packet.
			 */
			spxstat.spxs_rcvdupack--;
		}
		cb->s_ralo = si->si_alo;
		cb->s_swl1 = si->si_seq;
		cb->s_swl2 = si->si_ack;
		cb->s_swnd = (1 + si->si_alo - si->si_ack);
		if (cb->s_swnd > cb->s_smxw)
			cb->s_smxw = cb->s_swnd;
		cb->s_flags |= SF_WIN;
	}

	/*
	 * If this packet number is higher than that which we have allocated
	 * refuse it, unless urgent.
	 */
	if (SSEQ_GT(si->si_seq, cb->s_alo)) {
		if (si->si_cc & SPX_SP) {
			spxstat.spxs_rcvwinprobe++;
			return (1);
		} else
			spxstat.spxs_rcvpackafterwin++;
		if (si->si_cc & SPX_OB) {
			/*
			 * Out-of-band packets are tolerated up to 60
			 * sequence numbers beyond the allocation.
			 */
			if (SSEQ_GT(si->si_seq, cb->s_alo + 60)) {
				/* Freed here, so tell the caller not to. */
				m_freem(dtom(si));
				return (0);
			} /* else queue this packet; */
		} else {
#ifdef BROKEN
			/*
			 * XXXRW: This is broken on at least one count:
			 * spx_close() will free the ipxp and related parts,
			 * which are then touched by spx_input() after the
			 * return from spx_reass().
			 */
			/*struct socket *so = cb->s_ipxpcb->ipxp_socket;
			if (so->so_state && SS_NOFDREF) {
				spx_close(cb);
			} else
				       would crash system*/
#endif
			spx_istat.notyet++;
			/* Freed here, so tell the caller not to. */
			m_freem(dtom(si));
			return (0);
		}
	}

	/*
	 * If this is a system packet, we don't need to queue it up, and
	 * won't update acknowledge #.
	 */
	if (si->si_cc & SPX_SP)
		return (1);

	/*
	 * We have already seen this packet, so drop.
	 */
	if (SSEQ_LT(si->si_seq, cb->s_ack)) {
		spx_istat.bdreas++;
		spxstat.spxs_rcvduppack++;
		if (si->si_seq == cb->s_ack - 1)
			spx_istat.lstdup++;
		return (1);
	}

	/*
	 * Loop through all packets queued up to insert in appropriate
	 * sequence.
	 */
	for (q = cb->s_q.si_next; q != &cb->s_q; q = q->si_next) {
		if (si->si_seq == SI(q)->si_seq) {
			spxstat.spxs_rcvduppack++;
			return (1);
		}
		if (SSEQ_LT(si->si_seq, SI(q)->si_seq)) {
			spxstat.spxs_rcvoopack++;
			break;
		}
	}
	/*
	 * The header itself is linked into the queue, ahead of q (or at the
	 * tail when the loop ran off the end).
	 */
	insque(si, q->si_prev);
	/*
	 * If this packet is urgent, inform process
	 */
	if (si->si_cc & SPX_OB) {
		/*
		 * NOTE(review): this reads offset sizeof(*si) + 1, i.e. the
		 * second byte after the header -- confirm against the
		 * sender's layout.
		 */
		cb->s_iobc = ((char *)si)[1 + sizeof(*si)];
		sohasoutofband(so);
		cb->s_oobflags |= SF_IOOB;
	}
present:
#define SPINC sizeof(struct spxhdr)
	SOCKBUF_LOCK(&so->so_rcv);

	/*
	 * Loop through all packets queued up to update acknowledge number,
	 * and present all acknowledged data to user; if in packet interface
	 * mode, show packet headers.
	 */
	for (q = cb->s_q.si_next; q != &cb->s_q; q = q->si_next) {
		  if (SI(q)->si_seq == cb->s_ack) {
			cb->s_ack++;
			m = dtom(q);
			if (SI(q)->si_cc & SPX_OB) {
				cb->s_oobflags &= ~SF_IOOB;
				if (so->so_rcv.sb_cc)
					so->so_oobmark = so->so_rcv.sb_cc;
				else
					so->so_rcv.sb_state |= SBS_RCVATMARK;
			}
			/*
			 * Step back before unlinking so that the loop's
			 * q = q->si_next moves on to the following entry.
			 */
			q = q->si_prev;
			remque(q->si_next);
			wakeup = 1;
			spxstat.spxs_rcvpack++;
#ifdef SF_NEWCALL
			if (cb->s_flags2 & SF_NEWCALL) {
				struct spxhdr *sp = mtod(m, struct spxhdr *);
				u_char dt = sp->spx_dt;
				spx_newchecks[4]++;
				/*
				 * On a change of datastream type, queue a
				 * small control mbuf carrying the new type
				 * ahead of the data.
				 */
				if (dt != cb->s_rhdr.spx_dt) {
					struct mbuf *mm =
					   m_getclr(M_DONTWAIT, MT_CONTROL);
					spx_newchecks[0]++;
					if (mm != NULL) {
						u_short *s =
							mtod(mm, u_short *);
						cb->s_rhdr.spx_dt = dt;
						mm->m_len = 5; /*XXX*/
						s[0] = 5;
						s[1] = 1;
						*(u_char *)(&s[2]) = dt;
						sbappend_locked(&so->so_rcv, mm);
					}
				}
				if (sp->spx_cc & SPX_OB) {
					MCHTYPE(m, MT_OOBDATA);
					spx_newchecks[1]++;
					so->so_oobmark = 0;
					so->so_rcv.sb_state &= ~SBS_RCVATMARK;
				}
				/* Strip the SPX header unless headers are wanted. */
				if (packetp == 0) {
					m->m_data += SPINC;
					m->m_len -= SPINC;
					m->m_pkthdr.len -= SPINC;
				}
				if ((sp->spx_cc & SPX_EM) || packetp) {
					sbappendrecord_locked(&so->so_rcv, m);
					spx_newchecks[9]++;
				} else
					sbappend_locked(&so->so_rcv, m);
			} else
#endif
			if (packetp)
				sbappendrecord_locked(&so->so_rcv, m);
			else {
				/* Remember the last header, then strip it. */
				cb->s_rhdr = *mtod(m, struct spxhdr *);
				m->m_data += SPINC;
				m->m_len -= SPINC;
				m->m_pkthdr.len -= SPINC;
				sbappend_locked(&so->so_rcv, m);
			}
		  } else
			break;
	}
	/*
	 * Either branch leaves so_rcv unlocked: sorwakeup_locked() is
	 * expected to drop the lock itself.
	 */
	if (wakeup)
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (0);
}

/*
 * Control-input hook for SPX.  None of the arguments are examined and no
 * action is taken.
 */
void
spx_ctlinput(int cmd, struct sockaddr *arg_as_sa, void *dummy)
{

	/* No control requests are handled at present. */
}

/*
 * Queue new data for transmission (if any) and then send whatever the
 * current window, timers and flags call for.
 *
 * cb	connection control block; the lock of cb->s_ipxpcb must be held.
 * m0	optional chain of user data.  It is consumed on every path: either
 *	queued on the socket's send buffer behind a freshly built header or
 *	freed on error.  NULL means "just run the transmit logic", e.g. to
 *	emit an acknowledgement, a probe or a retransmission.
 *
 * Returns 0 on success or when there was nothing to send; EMSGSIZE when
 * data larger than the MTU is submitted with SF_PI set; ENOBUFS when an
 * mbuf cannot be allocated; EINVAL when the caller-supplied header cannot
 * be pulled up; otherwise the error returned by ipx_outputfl().
 */
static int
spx_output(struct spxpcb *cb, struct mbuf *m0)
{
	struct socket *so = cb->s_ipxpcb->ipxp_socket;
	struct mbuf *m;
	struct spx *si = NULL;
	struct sockbuf *sb = &so->so_snd;
	int len = 0, win, rcv_win;
	short span, off, recordp = 0;
	u_short alo;
	int error = 0, sendalot;
#ifdef notdef
	int idle;
#endif
	struct mbuf *mprev;

	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	if (m0 != NULL) {
		int mtu = cb->s_mtu;
		int datalen;

		/*
		 * Make sure that packet isn't too big.
		 */
		/* Also remembers the last mbuf and any end-of-record mark. */
		for (m = m0; m != NULL; m = m->m_next) {
			mprev = m;
			len += m->m_len;
			if (m->m_flags & M_EOR)
				recordp = 1;
		}
		/* With SF_HO the chain begins with a caller-supplied header. */
		datalen = (cb->s_flags & SF_HO) ?
				len - sizeof(struct spxhdr) : len;
		if (datalen > mtu) {
			if (cb->s_flags & SF_PI) {
				m_freem(m0);
				return (EMSGSIZE);
			} else {
				/*
				 * Send the data as a series of MTU-sized
				 * packets via recursive calls, with the
				 * end-of-message bit suppressed until the
				 * final piece.
				 */
				int oldEM = cb->s_cc & SPX_EM;

				cb->s_cc &= ~SPX_EM;
				while (len > mtu) {
					m = m_copym(m0, 0, mtu, M_DONTWAIT);
					if (m == NULL) {
					    cb->s_cc |= oldEM;
					    m_freem(m0);
					    return (ENOBUFS);
					}
					if (cb->s_flags & SF_NEWCALL) {
					    struct mbuf *mm = m;
					    spx_newchecks[7]++;
					    while (mm != NULL) {
						mm->m_flags &= ~M_EOR;
						mm = mm->m_next;
					    }
					}
					error = spx_output(cb, m);
					if (error) {
						cb->s_cc |= oldEM;
						m_freem(m0);
						return (error);
					}
					m_adj(m0, mtu);
					len -= mtu;
				}
				cb->s_cc |= oldEM;
			}
		}

		/*
		 * Force length even, by adding a "garbage byte" if
		 * necessary.
		 */
		if (len & 1) {
			m = mprev;
			if (M_TRAILINGSPACE(m) >= 1)
				m->m_len++;
			else {
				struct mbuf *m1 = m_get(M_DONTWAIT, MT_DATA);

				if (m1 == NULL) {
					m_freem(m0);
					return (ENOBUFS);
				}
				m1->m_len = 1;
				*(mtod(m1, u_char *)) = 0;
				m->m_next = m1;
			}
		}
		m = m_gethdr(M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(m0);
			return (ENOBUFS);
		}

		/*
		 * Fill in mbuf with extended SP header and addresses and
		 * length put into network format.
		 */
		MH_ALIGN(m, sizeof(struct spx));
		m->m_len = sizeof(struct spx);
		m->m_next = m0;
		si = mtod(m, struct spx *);
		si->si_i = *cb->s_ipx;
		si->si_s = cb->s_shdr;
		if ((cb->s_flags & SF_PI) && (cb->s_flags & SF_HO)) {
			/*
			 * Take the datastream type and end-of-message bit
			 * from the caller-supplied header, then strip that
			 * header from the data.
			 */
			struct spxhdr *sh;
			if (m0->m_len < sizeof(*sh)) {
				if((m0 = m_pullup(m0, sizeof(*sh))) == NULL) {
					/*
					 * m0 is NULL here, so only the new
					 * header mbuf is actually released.
					 */
					m_free(m);
					m_freem(m0);
					return (EINVAL);
				}
				m->m_next = m0;
			}
			sh = mtod(m0, struct spxhdr *);
			si->si_dt = sh->spx_dt;
			si->si_cc |= sh->spx_cc & SPX_EM;
			m0->m_len -= sizeof(*sh);
			m0->m_data += sizeof(*sh);
			len -= sizeof(*sh);
		}
		len += sizeof(*si);
		if ((cb->s_flags2 & SF_NEWCALL) && recordp) {
			si->si_cc |= SPX_EM;
			spx_newchecks[8]++;
		}
		if (cb->s_oobflags & SF_SOOB) {
			/*
			 * Per jqj@cornell: Make sure OB packets convey
			 * exactly 1 byte.  If the packet is 1 byte or
			 * larger, we have already guaranteed there to be at
			 * least one garbage byte for the checksum, and extra
			 * bytes shouldn't hurt!
			 */
			if (len > sizeof(*si)) {
				si->si_cc |= SPX_OB;
				len = (1 + sizeof(*si));
			}
		}
		si->si_len = htons((u_short)len);
		/* Packet-header length is len rounded up to an even number. */
		m->m_pkthdr.len = ((len - 1) | 1) + 1;

		/*
		 * Queue stuff up for output.
		 */
		sbappendrecord(sb, m);
		cb->s_seq++;
	}
#ifdef notdef
	idle = (cb->s_smax == (cb->s_rack - 1));
#endif
again:
	sendalot = 0;
	/* Packets already sent beyond the last acknowledged one. */
	off = cb->s_snxt - cb->s_rack;
	/* Usable window: the smaller of the peer's and the congestion window. */
	win = min(cb->s_swnd, (cb->s_cwnd / CUNIT));

	/*
	 * If in persist timeout with window of 0, send a probe.  Otherwise,
	 * if window is small but nonzero and timer expired, send what we can
	 * and go into transmit state.
	 */
	if (cb->s_force == 1 + SPXT_PERSIST) {
		if (win != 0) {
			cb->s_timer[SPXT_PERSIST] = 0;
			cb->s_rxtshift = 0;
		}
	}
	/* Packets queued but not yet acknowledged. */
	span = cb->s_seq - cb->s_rack;
	/* Number of packets that may be sent now. */
	len = min(span, win) - off;

	if (len < 0) {
		/*
		 * Window shrank after we went into it.  If window shrank to
		 * 0, cancel pending retransmission and pull s_snxt back to
		 * (closed) window.  We will enter persist state below.  If
		 * the window didn't close completely, just wait for an ACK.
		 */
		len = 0;
		if (win == 0) {
			cb->s_timer[SPXT_REXMT] = 0;
			cb->s_snxt = cb->s_rack;
		}
	}
	if (len > 1)
		sendalot = 1;
	rcv_win = sbspace(&so->so_rcv);

	/*
	 * Send if we owe peer an ACK.
	 */
	if (cb->s_oobflags & SF_SOOB) {
		/*
		 * Must transmit this out of band packet.
		 */
		cb->s_oobflags &= ~ SF_SOOB;
		sendalot = 1;
		spxstat.spxs_sndurg++;
		/*
		 * This jumps into the "if (len > 0)" block below, bypassing
		 * both the reset of si and the queue search; si is whatever
		 * it was set to above (the header just built from m0, or
		 * NULL).
		 */
		goto found;
	}
	if (cb->s_flags & SF_ACKNOW)
		goto send;
	if (cb->s_state < TCPS_ESTABLISHED)
		goto send;

	/*
	 * Silly window can't happen in spx.  Code from TCP deleted.
	 */
	if (len)
		goto send;

	/*
	 * Compare available window to amount of window known to peer (as
	 * advertised window less next expected input.)  If the difference is
	 * at least two packets or at least 35% of the maximum possible
	 * window, then want to send a window update to peer.
	 */
	if (rcv_win > 0) {
		u_short delta =  1 + cb->s_alo - cb->s_ack;
		int adv = rcv_win - (delta * cb->s_mtu);

		if ((so->so_rcv.sb_cc == 0 && adv >= (2 * cb->s_mtu)) ||
		    (100 * adv / so->so_rcv.sb_hiwat >= 35)) {
			spxstat.spxs_sndwinup++;
			cb->s_flags |= SF_ACKNOW;
			goto send;
		}

	}

	/*
	 * Many comments from tcp_output.c are appropriate here including ...
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.  If
	 * nothing happens soon, send when timer expires: if window is
	 * nonzero, transmit what we can, otherwise send a probe.
	 */
	if (so->so_snd.sb_cc && cb->s_timer[SPXT_REXMT] == 0 &&
	    cb->s_timer[SPXT_PERSIST] == 0) {
		cb->s_rxtshift = 0;
		spx_setpersist(cb);
	}

	/*
	 * No reason to send a packet, just return.
	 */
	/* s_outx records which exit path was taken. */
	cb->s_outx = 1;
	return (0);

send:
	/*
	 * Find requested packet.
	 */
	si = 0;
	if (len > 0) {
		cb->s_want = cb->s_snxt;
		/* Walk the records on so_snd for the first one at or past s_snxt. */
		for (m = sb->sb_mb; m != NULL; m = m->m_act) {
			si = mtod(m, struct spx *);
			if (SSEQ_LEQ(cb->s_snxt, si->si_seq))
				break;
		}
	found:
		/*
		 * Only the packet numbered exactly s_snxt is sent; anything
		 * else is counted as a void and dropped from consideration.
		 */
		if (si != NULL) {
			if (si->si_seq == cb->s_snxt)
					cb->s_snxt++;
				else
					spxstat.spxs_sndvoid++, si = 0;
		}
	}

	/*
	 * Update window.
	 */
	if (rcv_win < 0)
		rcv_win = 0;
	alo = cb->s_ack - 1 + (rcv_win / ((short)cb->s_mtu));
	/* Never advertise less than what was advertised before. */
	if (SSEQ_LT(alo, cb->s_alo))
		alo = cb->s_alo;

	if (si != NULL) {
		/*
		 * Must make a copy of this packet for ipx_output to monkey
		 * with.
		 */
		m = m_copy(dtom(si), 0, (int)M_COPYALL);
		if (m == NULL)
			return (ENOBUFS);
		si = mtod(m, struct spx *);
		if (SSEQ_LT(si->si_seq, cb->s_smax))
			spxstat.spxs_sndrexmitpack++;
		else
			spxstat.spxs_sndpack++;
	} else if (cb->s_force || cb->s_flags & SF_ACKNOW) {
		/*
		 * Must send an acknowledgement or a probe.
		 */
		if (cb->s_force)
			spxstat.spxs_sndprobe++;
		if (cb->s_flags & SF_ACKNOW)
			spxstat.spxs_sndacks++;
		m = m_gethdr(M_DONTWAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);

		/*
		 * Fill in mbuf with extended SP header and addresses and
		 * length put into network format.
		 */
		MH_ALIGN(m, sizeof(struct spx));
		m->m_len = sizeof(*si);
		m->m_pkthdr.len = sizeof(*si);
		si = mtod(m, struct spx *);
		si->si_i = *cb->s_ipx;
		si->si_s = cb->s_shdr;
		si->si_seq = cb->s_smax + 1;
		si->si_len = htons(sizeof(*si));
		si->si_cc |= SPX_SP;
	} else {
		/* Nothing to send after all. */
		cb->s_outx = 3;
		if (so->so_options & SO_DEBUG || traceallspxs)
			spx_trace(SA_OUTPUT, cb->s_state, cb, si, 0);
		return (0);
	}
	/*
	 * Stuff checksum and output datagram.
	 */
	if ((si->si_cc & SPX_SP) == 0) {
		if (cb->s_force != (1 + SPXT_PERSIST) ||
		    cb->s_timer[SPXT_PERSIST] == 0) {
			/*
			 * If this is a new packet and we are not currently
			 * timing anything, time this one.
			 */
			if (SSEQ_LT(cb->s_smax, si->si_seq)) {
				cb->s_smax = si->si_seq;
				if (cb->s_rtt == 0) {
					spxstat.spxs_segstimed++;
					cb->s_rtseq = si->si_seq;
					cb->s_rtt = 1;
				}
			}

			/*
			 * Set rexmt timer if not currently set, initial
			 * value for retransmit timer is smoothed round-trip
			 * time + 2 * round-trip time variance.  Initialize
			 * shift counter which is used for backoff of
			 * retransmit time.
			 */
			if (cb->s_timer[SPXT_REXMT] == 0 &&
			    cb->s_snxt != cb->s_rack) {
				cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;
				if (cb->s_timer[SPXT_PERSIST]) {
					cb->s_timer[SPXT_PERSIST] = 0;
					cb->s_rxtshift = 0;
				}
			}
		} else if (SSEQ_LT(cb->s_smax, si->si_seq))
			cb->s_smax = si->si_seq;
	} else if (cb->s_state < TCPS_ESTABLISHED) {
		if (cb->s_rtt == 0)
			cb->s_rtt = 1; /* Time initial handshake */
		if (cb->s_timer[SPXT_REXMT] == 0)
			cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;
	}

	/*
	 * Do not request acks when we ack their data packets or when we do a
	 * gratuitous window update.
	 */
	if (((si->si_cc & SPX_SP) == 0) || cb->s_force)
		si->si_cc |= SPX_SA;
	/* Sequence fields go on the wire in network byte order. */
	si->si_seq = htons(si->si_seq);
	si->si_alo = htons(alo);
	si->si_ack = htons(cb->s_ack);

	if (ipxcksum)
		si->si_sum = ipx_cksum(m, ntohs(si->si_len));
	else
		si->si_sum = 0xffff;

	cb->s_outx = 4;
	if (so->so_options & SO_DEBUG || traceallspxs)
		spx_trace(SA_OUTPUT, cb->s_state, cb, si, 0);

	if (so->so_options & SO_DONTROUTE)
		error = ipx_outputfl(m, NULL, IPX_ROUTETOIF);
	else
		error = ipx_outputfl(m, &cb->s_ipxpcb->ipxp_route, 0);
	if (error)
		return (error);
	spxstat.spxs_sndtotal++;

	/*
	 * Data sent (as far as we can tell).  If this advertises a larger
	 * window than any other segment, then remember the size of the
	 * advertised window.  Any pending ACK has now been sent.
	 */
	cb->s_force = 0;
	cb->s_flags &= ~(SF_ACKNOW|SF_DELACK);
	if (SSEQ_GT(alo, cb->s_alo))
		cb->s_alo = alo;
	/* More than one packet was sendable (or OOB was sent): go round again. */
	if (sendalot)
		goto again;
	cb->s_outx = 5;
	return (0);
}

static int spx_do_persist_panics = 0;

/*
 * Arm (or re-arm) the persist timer.  The interval is derived from the
 * smoothed round-trip time and its variance, multiplied by the backoff
 * table entry for the current shift, and clamped to
 * [SPXTV_PERSMIN, SPXTV_PERSMAX].  The shift is then advanced, saturating at
 * SPX_MAXRXTSHIFT.  The pcb lock must be held by the caller.
 */
static void
spx_setpersist(struct spxpcb *cb)
{
	int base;

	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	/* Optional consistency check, enabled by spx_do_persist_panics. */
	if (spx_do_persist_panics && cb->s_timer[SPXT_REXMT])
		panic("spx_output REXMT");

	base = ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1;

	/*
	 * Start/restart persistence timer.
	 */
	SPXT_RANGESET(cb->s_timer[SPXT_PERSIST],
	    base * spx_backoff[cb->s_rxtshift],
	    SPXTV_PERSMIN, SPXTV_PERSMAX);

	/* Back off further next time, up to the table limit. */
	if (cb->s_rxtshift < SPX_MAXRXTSHIFT)
		cb->s_rxtshift++;
}

int
spx_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct spxhdr spxhdr;
	struct ipxpcb *ipxp;
	struct spxpcb *cb;
	int mask, error;
	short soptval;
	u_short usoptval;
	int optval;

	ipxp = sotoipxpcb(so);
	KASSERT(ipxp != NULL, ("spx_ctloutput: ipxp == NULL"));

	/*
	 * This will have to be changed when we do more general stacking of
	 * protocols.
	 */
	if (sopt->sopt_level != IPXPROTO_SPX)
		return (ipx_ctloutput(so, sopt));

	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		IPX_UNLOCK(ipxp);
		return (ECONNRESET);
	}

	IPX_LOCK(ipxp);
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_ctloutput: cb == NULL"));

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case SO_HEADERS_ON_INPUT:
			mask = SF_HI;
			goto get_flags;

		case SO_HEADERS_ON_OUTPUT:
			mask = SF_HO;
		get_flags:
			soptval = cb->s_flags & mask;
			IPX_UNLOCK(ipxp);
			error = sooptcopyout(sopt, &soptval,
			    sizeof(soptval));
			break;

		case SO_MTU:
			usoptval = cb->s_mtu;
			IPX_UNLOCK(ipxp);
			error = sooptcopyout(sopt, &usoptval,
			    sizeof(usoptval));
			break;

		case SO_LAST_HEADER:
			spxhdr = cb->s_rhdr;
			IPX_UNLOCK(ipxp);
			error = sooptcopyout(sopt, &spxhdr, sizeof(spxhdr));
			break;

		case SO_DEFAULT_HEADERS:
			spxhdr = cb->s_shdr;
			IPX_UNLOCK(ipxp);
			error = sooptcopyout(sopt, &spxhdr, sizeof(spxhdr));
			break;

		default:
			IPX_UNLOCK(ipxp);
			error = ENOPROTOOPT;
		}
		break;

	case SOPT_SET:
		/*
		 * XXX Why are these shorts on get and ints on set?  That
		 * doesn't make any sense...
		 *
		 * XXXRW: Note, when we re-acquire the ipxp lock, we should
		 * re-check that it's not dropped.
		 */
		IPX_UNLOCK(ipxp);
		switch (sopt->sopt_name) {
		case SO_HEADERS_ON_INPUT:
			mask = SF_HI;
			goto set_head;

		case SO_HEADERS_ON_OUTPUT:
			mask = SF_HO;
		set_head:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				break;

			IPX_LOCK(ipxp);
			if (cb->s_flags & SF_PI) {
				if (optval)
					cb->s_flags |= mask;
				else
					cb->s_flags &= ~mask;
			} else error = EINVAL;
			IPX_UNLOCK(ipxp);
			break;

		case SO_MTU:
			error = sooptcopyin(sopt, &usoptval, sizeof usoptval,
					    sizeof usoptval);
			if (error)
				break;
			/* Unlocked write. */
			cb->s_mtu = usoptval;
			break;

#ifdef SF_NEWCALL
		case SO_NEWCALL:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				break;
			IPX_LOCK(ipxp);
			if (optval) {
				cb->s_flags2 |= SF_NEWCALL;
				spx_newchecks[5]++;
			} else {
				cb->s_flags2 &= ~SF_NEWCALL;
				spx_newchecks[6]++;
			}
			IPX_UNLOCK(ipxp);
			break;
#endif

		case SO_DEFAULT_HEADERS:
			{
				struct spxhdr sp;

				error = sooptcopyin(sopt, &sp, sizeof sp,
						    sizeof sp);
				if (error)
					break;
				IPX_LOCK(ipxp);
				cb->s_dt = sp.spx_dt;
				cb->s_cc = sp.spx_cc & SPX_EM;
				IPX_UNLOCK(ipxp);
			}
			break;

		default:
			error = ENOPROTOOPT;
		}
		break;

	default:
		panic("spx_ctloutput: bad socket option direction");
	}
	return (error);
}

/*
 * Abort the connection on a socket: the connection is dropped and
 * ECONNABORTED is left as the socket error.
 */
static void
spx_usr_abort(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;

	KASSERT(ipxp != NULL, ("spx_usr_abort: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_usr_abort: cb == NULL"));

	/* spx_drop() asserts both the list lock and the pcb lock. */
	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	spx_drop(cb, ECONNABORTED);
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 */
static int
spx_accept(struct socket *so, struct sockaddr **nam)
{
	struct ipxpcb *ipxp;
	struct sockaddr_ipx *sipx, ssipx;

	ipxp = sotoipxpcb(so);
	KASSERT(ipxp != NULL, ("spx_accept: ipxp == NULL"));

	sipx = &ssipx;
	bzero(sipx, sizeof *sipx);
	sipx->sipx_len = sizeof *sipx;
	sipx->sipx_family = AF_IPX;
	IPX_LOCK(ipxp);
	sipx->sipx_addr = ipxp->ipxp_faddr;
	IPX_UNLOCK(ipxp);
	*nam = sodupsockaddr((struct sockaddr *)sipx, M_WAITOK);
	return (0);
}

static int
spx_attach(struct socket *so, int proto, struct thread *td)
{
	struct ipxpcb *ipxp;
	struct spxpcb *cb;
	struct mbuf *mm;
	struct sockbuf *sb;
	int error;

	ipxp = sotoipxpcb(so);
	KASSERT(ipxp == NULL, ("spx_attach: ipxp != NULL"));

	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, (u_long) 3072, (u_long) 3072);
		if (error)
			return (error);
	}

	MALLOC(cb, struct spxpcb *, sizeof *cb, M_PCB, M_NOWAIT | M_ZERO);
	if (cb == NULL)
		return (ENOBUFS);
	mm = m_getclr(M_DONTWAIT, MT_DATA);
	if (mm == NULL) {
		FREE(cb, M_PCB);
		return (ENOBUFS);
	}

	IPX_LIST_LOCK();
	error = ipx_pcballoc(so, &ipxpcb_list, td);
	if (error) {
		IPX_LIST_UNLOCK();
		m_free(mm);
		FREE(cb, M_PCB);
		return (error);
	}
	ipxp = sotoipxpcb(so);
	ipxp->ipxp_flags |= IPXP_SPX;

	cb->s_ipx = mtod(mm, struct ipx *);
	cb->s_state = TCPS_LISTEN;
	cb->s_smax = -1;
	cb->s_swl1 = -1;
	cb->s_q.si_next = cb->s_q.si_prev = &cb->s_q;
	cb->s_ipxpcb = ipxp;
	cb->s_mtu = 576 - sizeof(struct spx);
	sb = &so->so_snd;
	cb->s_cwnd = sbspace(sb) * CUNIT / cb->s_mtu;
	cb->s_ssthresh = cb->s_cwnd;
	cb->s_cwmx = sbspace(sb) * CUNIT / (2 * sizeof(struct spx));
	/*
	 * Above is recomputed when connecting to account for changed
	 * buffering or mtu's.
	 */
	cb->s_rtt = SPXTV_SRTTBASE;
	cb->s_rttvar = SPXTV_SRTTDFLT << 2;
	SPXT_RANGESET(cb->s_rxtcur,
	    ((SPXTV_SRTTBASE >> 2) + (SPXTV_SRTTDFLT << 2)) >> 1,
	    SPXTV_MIN, SPXTV_REXMTMAX);
	ipxp->ipxp_pcb = (caddr_t)cb;
	IPX_LIST_UNLOCK();
	return (0);
}

/*
 * Release the SPX state attached to an ipxpcb: every mbuf still linked on
 * the control block's queue, the mbuf backing the header template
 * (cb->s_ipx), and the control block itself.  ipxp->ipxp_pcb is cleared
 * afterwards.  The pcb lock must be held by the caller.
 */
static void
spx_pcbdetach(struct ipxpcb *ipxp)
{
	struct spxpcb *cb;
	struct spx_q *s, *next;

	IPX_LOCK_ASSERT(ipxp);

	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_pcbdetach: cb == NULL"));

	/*
	 * Walk the queue, remembering the successor before unlinking and
	 * freeing the current element.  The list head lives inside cb, not
	 * in an mbuf, so it must never be passed to remque()/dtom().
	 */
	for (s = cb->s_q.si_next; s != &(cb->s_q); s = next) {
		next = s->si_next;
		remque(s);
		m_freem(dtom(s));
	}
	m_free(dtom(cb->s_ipx));
	FREE(cb, M_PCB);
	ipxp->ipxp_pcb = NULL;
}

/*
 * Bind the socket's ipxpcb to the local address in nam.  Returns EINVAL if
 * the pcb has already been dropped, otherwise the result of ipx_pcbbind().
 */
static int
spx_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	int error;

	KASSERT(ipxp != NULL, ("spx_bind: ipxp == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED)
		error = EINVAL;
	else
		error = ipx_pcbbind(ipxp, nam, td);
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return (error);
}

/*
 * Close the connection on a socket.  Control blocks that have progressed
 * beyond TCPS_LISTEN go through spx_disconnect(); all others are closed
 * directly with spx_close().
 */
static void
spx_usr_close(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;

	KASSERT(ipxp != NULL, ("spx_usr_close: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_usr_close: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (cb->s_state <= TCPS_LISTEN)
		spx_close(cb);
	else
		spx_disconnect(cb);
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
}

/*
 * Initiate connection to peer.  Enter SYN_SENT state, and mark socket as
 * connecting.  Start keep-alive timer, setup prototype header, send initial
 * system packet requesting connection.
 */
static int
spx_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error;

	KASSERT(ipxp != NULL, ("spx_connect: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_connect: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
		goto unlock;
	}

	/* Bind implicitly when no local port has been assigned yet. */
	if (ipxp->ipxp_lport == 0) {
		error = ipx_pcbbind(ipxp, NULL, td);
		if (error != 0)
			goto unlock;
	}
	error = ipx_pcbconnect(ipxp, nam, td);
	if (error != 0)
		goto unlock;

	/* Move to SYN_SENT and build the header template. */
	soisconnecting(so);
	spxstat.spxs_connattempt++;
	cb->s_state = TCPS_SYN_SENT;
	cb->s_did = 0;
	spx_template(cb);
	cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;
	cb->s_force = 1 + SPXTV_KEEP;

	/*
	 * The peer must reply to the port we send from, but need not reply
	 * from the port we sent to, so wildcard the foreign port.  The
	 * original destination port remains available in cb->s_dport.
	 */
	ipxp->ipxp_fport = 0;

	/* Send the initial packet; its result is the connect result. */
	error = spx_output(cb, NULL);
unlock:
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return (error);
}

/*
 * Detach SPX from a socket: free the SPX control block state with
 * spx_pcbdetach() and then release the ipxpcb with ipx_pcbfree().
 */
static void
spx_detach(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;

	/*
	 * XXXRW: Should assert appropriately detached.
	 */
	KASSERT(ipxp != NULL, ("spx_detach: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_detach: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	spx_pcbdetach(ipxp);
	/*
	 * NOTE(review): the pcb lock is not released here; presumably
	 * ipx_pcbfree() disposes of it together with the pcb -- confirm.
	 */
	ipx_pcbfree(ipxp);
	IPX_LIST_UNLOCK();
}

/*
 * We may decide later to implement connection closing handshaking at the spx
 * level optionally.  Here is the hook to do it:
 */
static int
spx_usr_disconnect(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error;

	KASSERT(ipxp != NULL, ("spx_usr_disconnect: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_usr_disconnect: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		/* Already dropped: report EINVAL and do nothing. */
		error = EINVAL;
	} else {
		spx_disconnect(cb);
		error = 0;
	}
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return (error);
}

/*
 * Put the socket into the listening state.  Returns EINVAL if the pcb has
 * been dropped, otherwise the first error from solisten_proto_check() or
 * the implicit ipx_pcbbind(); on success the control block is set to
 * TCPS_LISTEN and solisten_proto() is called with the requested backlog.
 */
static int
spx_listen(struct socket *so, int backlog, struct thread *td)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error = 0;

	KASSERT(ipxp != NULL, ("spx_listen: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_listen: cb == NULL"));

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
	} else {
		/* The socket lock covers the check, bind and transition. */
		SOCK_LOCK(so);
		error = solisten_proto_check(so);
		/* Bind implicitly when no local port is assigned yet. */
		if (error == 0 && ipxp->ipxp_lport == 0)
			error = ipx_pcbbind(ipxp, NULL, td);
		if (error == 0) {
			cb->s_state = TCPS_LISTEN;
			solisten_proto(so, backlog);
		}
		SOCK_UNLOCK(so);
	}
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return (error);
}

/*
 * After a receive, possibly send acknowledgment updating allocation.
 */
static int
spx_rcvd(struct socket *so, int flags)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error = 0;

	KASSERT(ipxp != NULL, ("spx_rcvd: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_rcvd: cb == NULL"));

	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
	} else {
		/*
		 * SF_RVD is set only for the duration of the output call;
		 * the result of spx_output() is deliberately not reported.
		 */
		cb->s_flags |= SF_RVD;
		(void)spx_output(cb, NULL);
		cb->s_flags &= ~SF_RVD;
	}
	IPX_UNLOCK(ipxp);
	return (error);
}

/*
 * Fetch the out-of-band byte.  When the control block has SF_IOOB set, or
 * the socket has an OOB mark or is at the mark, the saved byte cb->s_iobc
 * is stored as the single data byte of m and 0 is returned.  Otherwise, or
 * if the pcb has been dropped, EINVAL is returned.
 */
static int
spx_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error, have_oob;

	KASSERT(ipxp != NULL, ("spx_rcvoob: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_rcvoob: cb == NULL"));

	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
		goto unlock;
	}

	/* The OOB indicators are sampled under the receive buffer lock. */
	SOCKBUF_LOCK(&so->so_rcv);
	have_oob = (cb->s_oobflags & SF_IOOB) || so->so_oobmark ||
	    (so->so_rcv.sb_state & SBS_RCVATMARK);
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (have_oob) {
		m->m_len = 1;
		*mtod(m, caddr_t) = cb->s_iobc;
		error = 0;
	} else
		error = EINVAL;
unlock:
	IPX_UNLOCK(ipxp);
	return (error);
}

/*
 * Queue data for transmission on an SPX socket.
 *
 * Returns ECONNRESET if the pcb has been dropped, ENOBUFS for an
 * out-of-band send when sbspace() on the send buffer is below -512, and
 * otherwise the result of spx_output().  Both m and controlp are consumed
 * on every path: m is either handed to spx_output() or freed here, and
 * controlp is always freed here.
 */
static int
spx_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
    struct mbuf *controlp, struct thread *td)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	u_short *ctl;
	int error = 0;

	KASSERT(ipxp != NULL, ("spx_send: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_send: cb == NULL"));

	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = ECONNRESET;
		goto done;
	}
	if (flags & PRUS_OOB) {
		if (sbspace(&so->so_snd) < -512) {
			error = ENOBUFS;
			goto done;
		}
		cb->s_oobflags |= SF_SOOB;
	}
	if (controlp != NULL) {
		/*
		 * NOTE(review): the control mbuf's length is not checked
		 * before its first three u_short slots are read -- confirm
		 * callers always supply at least that much.
		 */
		ctl = mtod(controlp, u_short *);
		spx_newchecks[2]++;
		if ((ctl[0] == 5) && (ctl[1] == 1)) { /* XXXX, for testing */
			cb->s_shdr.spx_dt = *(u_char *)(&ctl[2]);
			spx_newchecks[3]++;
		}
		m_freem(controlp);
		controlp = NULL;
	}

	/* m is not touched again here once passed to spx_output(). */
	error = spx_output(cb, m);
	m = NULL;
done:
	IPX_UNLOCK(ipxp);
	/* Free whatever was not consumed above (early-exit paths). */
	if (controlp != NULL)
		m_freem(controlp);
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Shut down the sending side of a socket.  The socket is first marked as
 * unable to send more; then, unless the pcb has already been dropped
 * (EINVAL), the control block is passed to spx_usrclosed().
 */
static int
spx_shutdown(struct socket *so)
{
	struct ipxpcb *ipxp = sotoipxpcb(so);
	struct spxpcb *cb;
	int error;

	KASSERT(ipxp != NULL, ("spx_shutdown: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_shutdown: cb == NULL"));

	/* Done before any IPX lock is taken. */
	socantsendmore(so);

	IPX_LIST_LOCK();
	IPX_LOCK(ipxp);
	if (ipxp->ipxp_flags & IPXP_DROPPED) {
		error = EINVAL;
	} else {
		spx_usrclosed(cb);
		error = 0;
	}
	IPX_UNLOCK(ipxp);
	IPX_LIST_UNLOCK();
	return (error);
}

/*
 * Variant of spx_attach() that, after a successful attach, additionally
 * sets SF_HI, SF_HO and SF_PI on the new control block.  Returns the error
 * from spx_attach(), or 0.
 */
static int
spx_sp_attach(struct socket *so, int proto, struct thread *td)
{
	struct ipxpcb *ipxp;
	struct spxpcb *cb;
	int error;

	KASSERT(so->so_pcb == NULL, ("spx_sp_attach: so_pcb != NULL"));

	error = spx_attach(so, proto, td);
	if (error != 0)
		return (error);

	/* The pcb and control block now exist; look them up. */
	ipxp = sotoipxpcb(so);
	KASSERT(ipxp != NULL, ("spx_sp_attach: ipxp == NULL"));
	cb = ipxtospxpcb(ipxp);
	KASSERT(cb != NULL, ("spx_sp_attach: cb == NULL"));

	IPX_LOCK(ipxp);
	cb->s_flags |= SF_HI | SF_HO | SF_PI;
	IPX_UNLOCK(ipxp);
	return (0);
}

/*
 * Create template to be used to send spx packets on a connection.  Called
 * after host entry created, fills in a skeletal spx header (choosing
 * connection id), minimizing the amount of work necessary when the
 * connection is used.
 */
static void
spx_template(struct spxpcb *cb)
{
	struct ipxpcb *ipxp = cb->s_ipxpcb;
	struct sockbuf *snd = &(ipxp->ipxp_socket->so_snd);
	struct ipx *hdr = cb->s_ipx;

	IPX_LOCK_ASSERT(ipxp);

	/* Fixed IPX header fields: protocol type and both addresses. */
	hdr->ipx_pt = IPXPROTO_SPX;
	hdr->ipx_sna = ipxp->ipxp_laddr;
	hdr->ipx_dna = ipxp->ipxp_faddr;

	/* Take our connection id from the global counter and advance it. */
	SPX_LOCK();
	cb->s_sid = htons(spx_iss);
	spx_iss += SPX_ISSINCR/2;
	SPX_UNLOCK();

	cb->s_alo = 1;

	/*
	 * Congestion window sized for full-MTU packets; the slow-start
	 * threshold starts equal to it so the window can open quickly.
	 */
	cb->s_cwnd = (sbspace(snd) * CUNIT) / cb->s_mtu;
	cb->s_ssthresh = cb->s_cwnd;

	/*
	 * Maximum window sized for many small packets, but never smaller
	 * than the initial congestion window.
	 */
	cb->s_cwmx = (sbspace(snd) * CUNIT) / (2 * sizeof(struct spx));
	cb->s_cwmx = max(cb->s_cwmx, cb->s_cwnd);
}

/*
 * Close an SPX control block.  Wake up any sleepers.  We used to free any
 * queued packets and cb->s_ipx here, but now we defer that until the pcb is
 * discarded.
 */
void
spx_close(struct spxpcb *cb)
{
	struct ipxpcb *ipxp;
	struct socket *so;

	/*
	 * Validate the pcb pointer before it is dereferenced, so that the
	 * assertion can actually catch a NULL pcb.
	 */
	ipxp = cb->s_ipxpcb;
	KASSERT(ipxp != NULL, ("spx_close: ipxp == NULL"));
	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(ipxp);

	so = ipxp->ipxp_socket;

	/* Mark the pcb dropped and tell the socket layer. */
	ipxp->ipxp_flags |= IPXP_DROPPED;
	soisdisconnected(so);
	spxstat.spxs_closed++;
}

/*
 * Someday we may do level 3 handshaking to close a connection or send a
 * xerox style error.  For now, just close.  cb will always be invalid after
 * this call.
 */
static void
spx_usrclosed(struct spxpcb *cb)
{
	struct ipxpcb *ipxp = cb->s_ipxpcb;

	/* Same locking requirements as spx_close(), which does the work. */
	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(ipxp);

	spx_close(cb);
}

/*
 * cb will always be invalid after this call.
 */
static void
spx_disconnect(struct spxpcb *cb)
{
	struct ipxpcb *ipxp = cb->s_ipxpcb;

	/* Same locking requirements as spx_close(), which does the work. */
	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(ipxp);

	spx_close(cb);
}

/*
 * Drop connection, reporting the specified error.  cb will always be invalid
 * after this call.
 */
static void
spx_drop(struct spxpcb *cb, int errno)
{
	struct socket *so = cb->s_ipxpcb->ipxp_socket;

	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	/*
	 * Someday, in the xerox world we will generate error protocol
	 * packets announcing that the socket has gone away.
	 */
	if (TCPS_HAVERCVDSYN(cb->s_state)) {
		spxstat.spxs_drops++;
		cb->s_state = TCPS_CLOSED;
		/*tcp_output(cb);*/
	} else
		spxstat.spxs_conndrops++;
	so->so_error = errno;
	spx_close(cb);
}

/*
 * Fast timeout routine for processing delayed acks.
 */
void
spx_fasttimo(void)
{
	struct ipxpcb *ipxp;
	struct spxpcb *cb;

	IPX_LIST_LOCK();
	LIST_FOREACH(ipxp, &ipxpcb_list, ipxp_list) {
		IPX_LOCK(ipxp);
		/* Only live SPX pcbs are of interest. */
		if ((ipxp->ipxp_flags & IPXP_SPX) &&
		    !(ipxp->ipxp_flags & IPXP_DROPPED)) {
			cb = ipxtospxpcb(ipxp);
			if (cb->s_flags & SF_DELACK) {
				/* Turn the delayed ack into an immediate one. */
				cb->s_flags &= ~SF_DELACK;
				cb->s_flags |= SF_ACKNOW;
				spxstat.spxs_delack++;
				spx_output(cb, NULL);
			}
		}
		IPX_UNLOCK(ipxp);
	}
	IPX_LIST_UNLOCK();
}

/*
 * spx protocol timeout routine called every 500 ms.  Updates the timers in
 * all active pcb's and causes finite state machine actions if timers expire.
 */
void
spx_slowtimo(void)
{
	struct ipxpcb *ipxp;
	struct spxpcb *cb;
	int i;

	/*
	 * Walk every pcb and tick its active timers.  Timers once could
	 * free ipxp's, but that now happens only when a socket is detached,
	 * so the list can be walked without special care.
	 */
	IPX_LIST_LOCK();
	LIST_FOREACH(ipxp, &ipxpcb_list, ipxp_list) {
		IPX_LOCK(ipxp);
		if ((ipxp->ipxp_flags & IPXP_SPX) &&
		    !(ipxp->ipxp_flags & IPXP_DROPPED)) {
			cb = (struct spxpcb *)ipxp->ipxp_pcb;
			KASSERT(cb != NULL, ("spx_slowtimo: cb == NULL"));

			for (i = 0; i < SPXT_NTIMERS; i++) {
				/* Skip idle timers and those not yet due. */
				if (cb->s_timer[i] == 0)
					continue;
				if (--cb->s_timer[i] != 0)
					continue;
				spx_timers(cb, i);
				/* The handler may have dropped the pcb. */
				if (ipxp->ipxp_flags & IPXP_DROPPED)
					break;
			}

			/* Idle and RTT counters advance on live pcbs only. */
			if (!(ipxp->ipxp_flags & IPXP_DROPPED)) {
				cb->s_idle++;
				if (cb->s_rtt)
					cb->s_rtt++;
			}
		}
		IPX_UNLOCK(ipxp);
	}
	IPX_LIST_UNLOCK();

	SPX_LOCK();
	spx_iss += SPX_ISSINCR/PR_SLOWHZ;		/* increment iss */
	SPX_UNLOCK();
}

/*
 * SPX timer processing.
 */
static void
spx_timers(struct spxpcb *cb, int timer)
{
	long rexmt;
	int win;

	/* Both the pcb list lock and this pcb's lock must be held. */
	IPX_LIST_LOCK_ASSERT();
	IPX_LOCK_ASSERT(cb->s_ipxpcb);

	/*
	 * Record which timer fired, as 1 + its index.  spx_output() tests
	 * s_force and clears it again after a successful send.
	 */
	cb->s_force = 1 + timer;
	switch (timer) {
	case SPXT_2MSL:
		/*
		 * 2 MSL timeout in shutdown went off.  TCP deletes
		 * connection control block.  Here the event is only logged
		 * and the timer slot cleared.
		 */
		printf("spx: SPXT_2MSL went off for no reason\n");
		cb->s_timer[timer] = 0;
		break;

	case SPXT_REXMT:
		/*
		 * Retransmission timer went off.  Message has not been acked
		 * within retransmit interval.  Back off to a longer
		 * retransmit interval and retransmit one packet.  Once the
		 * shift would exceed SPX_MAXRXTSHIFT, the connection is
		 * dropped with ETIMEDOUT instead.
		 */
		if (++cb->s_rxtshift > SPX_MAXRXTSHIFT) {
			cb->s_rxtshift = SPX_MAXRXTSHIFT;
			spxstat.spxs_timeoutdrop++;
			spx_drop(cb, ETIMEDOUT);
			break;
		}
		spxstat.spxs_rexmttimeo++;
		/*
		 * New interval: the base estimate scaled by the backoff
		 * table entry, clamped to [SPXTV_MIN, SPXTV_REXMTMAX].
		 */
		rexmt = ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1;
		rexmt *= spx_backoff[cb->s_rxtshift];
		SPXT_RANGESET(cb->s_rxtcur, rexmt, SPXTV_MIN, SPXTV_REXMTMAX);
		cb->s_timer[SPXT_REXMT] = cb->s_rxtcur;

		/*
		 * If we have backed off fairly far, our srtt estimate is
		 * probably bogus.  Clobber it so we'll take the next rtt
		 * measurement as our srtt; move the current srtt into rttvar
		 * to keep the current retransmit times until then.
		 */
		if (cb->s_rxtshift > SPX_MAXRXTSHIFT / 4 ) {
			cb->s_rttvar += (cb->s_srtt >> 2);
			cb->s_srtt = 0;
		}
		/* Rewind the send pointer to the last acknowledged point. */
		cb->s_snxt = cb->s_rack;

		/*
		 * If timing a packet, stop the timer.
		 */
		cb->s_rtt = 0;

		/*
		 * See very long discussion in tcp_timer.c about congestion
		 * window and ssthresh.  The threshold becomes half of the
		 * smaller of the send window and the congestion window (in
		 * CUNIT units), but at least 2; the congestion window itself
		 * is reset to a single CUNIT.
		 */
		win = min(cb->s_swnd, (cb->s_cwnd/CUNIT)) / 2;
		if (win < 2)
			win = 2;
		cb->s_cwnd = CUNIT;
		cb->s_ssthresh = win * CUNIT;
		spx_output(cb, NULL);
		break;

	case SPXT_PERSIST:
		/*
		 * Persistence timer into zero window.  Re-arm the timer and
		 * force a probe to be sent.
		 */
		spxstat.spxs_persisttimeo++;
		spx_setpersist(cb);
		spx_output(cb, NULL);
		break;

	case SPXT_KEEP:
		/*
		 * Keep-alive timer went off; send something or drop
		 * connection if idle for too long.  A connection that has
		 * not reached TCPS_ESTABLISHED by now is dropped outright.
		 */
		spxstat.spxs_keeptimeo++;
		if (cb->s_state < TCPS_ESTABLISHED)
			goto dropit;
		if (cb->s_ipxpcb->ipxp_socket->so_options & SO_KEEPALIVE) {
		    	if (cb->s_idle >= SPXTV_MAXIDLE)
				goto dropit;
			spxstat.spxs_keepprobe++;
			spx_output(cb, NULL);
		} else
			/* Without SO_KEEPALIVE the idle count is just reset. */
			cb->s_idle = 0;
		cb->s_timer[SPXT_KEEP] = SPXTV_KEEP;
		break;

	/* Reached only by goto from the SPXT_KEEP case above. */
	dropit:
		spxstat.spxs_keepdrops++;
		spx_drop(cb, ETIMEDOUT);
		break;

	default:
		panic("spx_timers: unknown timer %d", timer);
	}
}