1. Basic PIM kernel support

Disabled by default. To enable it, the new "options PIM" must be
added to the kernel configuration file (in addition to MROUTING):

options	MROUTING		# Multicast routing
options	PIM			# Protocol Independent Multicast

2. Add support for advanced multicast API setup/configuration and
extensibility.

3. Add support for kernel-level PIM Register encapsulation.
Disabled by default.  Can be enabled by the advanced multicast API.

4. Implement a mechanism for "multicast bandwidth monitoring and upcalls".

Submitted by:	Pavlin Radoslavov <pavlin@icir.org>
This commit is contained in:
Jeffrey Hsu 2003-08-07 18:16:59 +00:00
parent 8285491e78
commit 1e78ac216e
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=118622
7 changed files with 1748 additions and 213 deletions

View File

@ -486,6 +486,22 @@ struct ip_mreq {
{ 0, 0 }, \
{ 0, 0 }, \
{ "ipsec", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "pim", CTLTYPE_NODE }, \
}
/*

View File

@ -36,6 +36,7 @@
#include "opt_ipdivert.h"
#include "opt_ipx.h"
#include "opt_mrouting.h"
#include "opt_ipsec.h"
#include "opt_inet6.h"
@ -57,6 +58,9 @@
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/igmp_var.h>
#ifdef PIM
#include <netinet/pim_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
@ -216,6 +220,14 @@ struct protosw inetsw[] = {
&rip_usrreqs
},
#endif
#ifdef PIM
{ SOCK_RAW, &inetdomain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
pim_input, 0, 0, rip_ctloutput,
0,
0, 0, 0, 0,
&rip_usrreqs
},
#endif /* PIM */
/* raw wildcard */
{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR,
rip_input, 0, 0, rip_ctloutput,
@ -260,4 +272,6 @@ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW");
#ifdef IPDIVERT
SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "DIVERT");
#endif
#ifdef PIM
SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
#endif

File diff suppressed because it is too large Load Diff

View File

@ -48,8 +48,12 @@
* Modified by Steve Deering, Stanford, February 1989.
* Modified by Ajit Thyagarajan, PARC, August 1993.
* Modified by Ajit Thyagarajan, PARC, August 1994.
* Modified by Ahmed Helmy, SGI, June 1996.
* Modified by Pavlin Radoslavov, ICSI, October 2002.
*
* MROUTING Revision: 3.3.1.3
* and PIM-SMv2 and PIM-DM support, advanced API support,
* bandwidth metering and signaling.
*/
@ -63,7 +67,12 @@
#define MRT_ADD_MFC 104 /* insert forwarding cache entry */
#define MRT_DEL_MFC 105 /* delete forwarding cache entry */
#define MRT_VERSION 106 /* get kernel version number */
#define MRT_ASSERT 107 /* enable PIM assert processing */
#define MRT_ASSERT 107 /* enable assert processing */
#define MRT_PIM MRT_ASSERT /* enable PIM processing */
#define MRT_API_SUPPORT 109 /* supported MRT API */
#define MRT_API_CONFIG 110 /* config MRT API */
#define MRT_ADD_BW_UPCALL 111 /* create bandwidth monitor */
#define MRT_DEL_BW_UPCALL 112 /* delete bandwidth monitor */
#define GET_TIME(t) microtime(&t)
@ -99,10 +108,11 @@ struct vifctl {
#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */
#define VIFF_SRCRT 0x2 /* tunnel uses IP source routing */
#define VIFF_REGISTER 0x4 /* used for PIM Register encap/decap */
/*
* Argument structure for MRT_ADD_MFC and MRT_DEL_MFC
* (mfcc_tos to be added at a future point)
* XXX if you change this, make sure to change struct mfcctl2 as well.
*/
struct mfcctl {
struct in_addr mfcc_origin; /* ip origin of mcasts */
@ -111,6 +121,94 @@ struct mfcctl {
u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
};
/*
* The new argument structure for MRT_ADD_MFC and MRT_DEL_MFC overlays
* and extends the old struct mfcctl.
*
* The leading fields must remain identical in type and order to struct
* mfcctl so the kernel can accept either layout through the same socket
* options (see the XXX note on struct mfcctl above).
*/
struct mfcctl2 {
/* the mfcctl fields -- must mirror struct mfcctl exactly */
struct in_addr mfcc_origin; /* ip origin of mcasts */
struct in_addr mfcc_mcastgrp; /* multicast group associated*/
vifi_t mfcc_parent; /* incoming vif */
u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
/* extension fields */
uint8_t mfcc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */
struct in_addr mfcc_rp; /* the RP address */
};
/*
* The advanced-API flags.
*
* The MRT_MFC_FLAGS_XXX API flags are also used as flags
* for the mfcc_flags field.
*/
#define MRT_MFC_FLAGS_DISABLE_WRONGVIF (1 << 0) /* disable WRONGVIF signals */
#define MRT_MFC_FLAGS_BORDER_VIF (1 << 1) /* border vif */
#define MRT_MFC_RP (1 << 8) /* enable RP address */
#define MRT_MFC_BW_UPCALL (1 << 9) /* enable bw upcalls */
#define MRT_MFC_FLAGS_ALL (MRT_MFC_FLAGS_DISABLE_WRONGVIF | \
MRT_MFC_FLAGS_BORDER_VIF)
#define MRT_API_FLAGS_ALL (MRT_MFC_FLAGS_ALL | \
MRT_MFC_RP | \
MRT_MFC_BW_UPCALL)
/*
* Structure for installing or delivering an upcall if the
* measured bandwidth is above or below a threshold.
*
* User programs (e.g. daemons) may have a need to know when the
* bandwidth used by some data flow is above or below some threshold.
* This interface allows the userland to specify the threshold (in
* bytes and/or packets) and the measurement interval. Flows are
* all packets with the same source and destination IP address.
* At the moment the code is only used for multicast destinations
* but there is nothing that prevents its use for unicast.
*
* The measurement interval cannot be shorter than some Tmin (currently, 3s).
* The threshold is set in packets and/or bytes per_interval.
*
* Measurement works as follows:
*
* For >= measurements:
* The first packet marks the start of a measurement interval.
* During an interval we count packets and bytes, and when we
* pass the threshold we deliver an upcall and we are done.
* The first packet after the end of the interval resets the
* count and restarts the measurement.
*
* For <= measurements:
* We start a timer to fire at the end of the interval, and
* then for each incoming packet we count packets and bytes.
* When the timer fires, we compare the value with the threshold,
* schedule an upcall if we are below, and restart the measurement
* (reschedule timer and zero counters).
*/
/*
* Bandwidth counters over a time interval. Used both to specify an
* upcall threshold and to report the measured bandwidth (see the
* bu_threshold and bu_measured fields of struct bw_upcall).
*/
struct bw_data {
struct timeval b_time; /* the measurement time interval */
uint64_t b_packets; /* number of packets */
uint64_t b_bytes; /* number of bytes */
};
/*
* Per-flow bandwidth upcall descriptor: identifies the flow (bu_src,
* bu_dst), the trigger condition (bu_flags), and the threshold and
* measured bandwidth.
*
* NOTE(review): presumably this is the argument of MRT_ADD_BW_UPCALL /
* MRT_DEL_BW_UPCALL and the payload delivered in IGMPMSG_BW_UPCALL
* messages -- confirm against ip_mroute.c.
*/
struct bw_upcall {
struct in_addr bu_src; /* source address */
struct in_addr bu_dst; /* destination address */
uint32_t bu_flags; /* misc flags (see below) */
#define BW_UPCALL_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
#define BW_UPCALL_UNIT_BYTES (1 << 1) /* threshold (in bytes) */
#define BW_UPCALL_GEQ (1 << 2) /* upcall if bw >= threshold */
#define BW_UPCALL_LEQ (1 << 3) /* upcall if bw <= threshold */
#define BW_UPCALL_DELETE_ALL (1 << 4) /* delete all upcalls for s,d*/
struct bw_data bu_threshold; /* the bw threshold */
struct bw_data bu_measured; /* the measured bw */
};
/* max. number of upcalls to deliver together */
#define BW_UPCALLS_MAX 128
/* min. threshold time interval for bandwidth measurement */
#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC 3
#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC 0
/*
* The kernel's multicast routing statistics.
*/
@ -179,17 +277,20 @@ struct vif {
* at a future point)
*/
struct mfc {
struct in_addr mfc_origin; /* IP origin of mcasts */
struct in_addr mfc_mcastgrp; /* multicast group associated*/
vifi_t mfc_parent; /* incoming vif */
u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
u_long mfc_pkt_cnt; /* pkt count for src-grp */
u_long mfc_byte_cnt; /* byte count for src-grp */
u_long mfc_wrong_if; /* wrong if for src-grp */
int mfc_expire; /* time to clean entry up */
struct timeval mfc_last_assert; /* last time I sent an assert*/
struct rtdetq *mfc_stall; /* q of packets awaiting mfc */
struct mfc *mfc_next; /* next mfc entry */
struct in_addr mfc_origin; /* IP origin of mcasts */
struct in_addr mfc_mcastgrp; /* multicast group associated*/
vifi_t mfc_parent; /* incoming vif */
u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
u_long mfc_pkt_cnt; /* pkt count for src-grp */
u_long mfc_byte_cnt; /* byte count for src-grp */
u_long mfc_wrong_if; /* wrong if for src-grp */
int mfc_expire; /* time to clean entry up */
struct timeval mfc_last_assert; /* last time I sent an assert*/
struct rtdetq *mfc_stall; /* q of packets awaiting mfc */
struct mfc *mfc_next; /* next mfc entry */
uint8_t mfc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */
struct in_addr mfc_rp; /* the RP address */
struct bw_meter *mfc_bw_meter; /* list of bandwidth meters */
};
/*
@ -200,8 +301,10 @@ struct igmpmsg {
u_long unused1;
u_long unused2;
u_char im_msgtype; /* what type of message */
#define IGMPMSG_NOCACHE 1
#define IGMPMSG_WRONGVIF 2
#define IGMPMSG_NOCACHE 1 /* no MFC in the kernel */
#define IGMPMSG_WRONGVIF 2 /* packet came from wrong interface */
#define IGMPMSG_WHOLEPKT 3 /* PIM pkt for user level encap. */
#define IGMPMSG_BW_UPCALL 4 /* BW monitoring upcall */
u_char im_mbz; /* must be zero */
u_char im_vif; /* vif rec'd on */
u_char unused3;
@ -246,6 +349,32 @@ struct tbf
struct mbuf *tbf_t; /* tail-insertion pointer */
};
/*
* Structure for measuring the bandwidth and sending an upcall if the
* measured bandwidth is above or below a threshold.
*
* Each meter is linked on two lists at once: per-mfc (bm_mfc_next) and,
* per the field comments, a "same time" list keyed by bm_time_hash
* (bm_time_next) -- see ip_mroute.c for the hashing scheme.
*/
struct bw_meter {
struct bw_meter *bm_mfc_next; /* next bw meter (same mfc) */
struct bw_meter *bm_time_next; /* next bw meter (same time) */
uint32_t bm_time_hash; /* the time hash value */
struct mfc *bm_mfc; /* the corresponding mfc */
uint32_t bm_flags; /* misc flags (see below) */
#define BW_METER_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
#define BW_METER_UNIT_BYTES (1 << 1) /* threshold (in bytes) */
#define BW_METER_GEQ (1 << 2) /* upcall if bw >= threshold */
#define BW_METER_LEQ (1 << 3) /* upcall if bw <= threshold */
#define BW_METER_USER_FLAGS (BW_METER_UNIT_PACKETS | \
BW_METER_UNIT_BYTES | \
BW_METER_GEQ | \
BW_METER_LEQ)
#define BW_METER_UPCALL_DELIVERED (1 << 24) /* upcall was delivered */
struct bw_data bm_threshold; /* the upcall threshold */
struct bw_data bm_measured; /* the measured bw */
struct timeval bm_start_time; /* abs. time */
};
#ifdef _KERNEL
struct sockopt;

View File

@ -124,15 +124,10 @@ extern struct protosw inetsw[];
* The mbuf opt, if present, will not be freed.
*/
int
ip_output(m0, opt, ro, flags, imo, inp)
struct mbuf *m0;
struct mbuf *opt;
struct route *ro;
int flags;
struct ip_moptions *imo;
struct inpcb *inp;
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
int flags, struct ip_moptions *imo, struct inpcb *inp)
{
struct ip *ip, *mhip;
struct ip *ip;
struct ifnet *ifp = NULL; /* keep compiler happy */
struct mbuf *m;
int hlen = sizeof (struct ip);
@ -478,7 +473,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
goto bad;
}
/* don't allow broadcast messages to be fragmented */
if ((u_short)ip->ip_len > ifp->if_mtu) {
if (ip->ip_len > ifp->if_mtu) {
error = EMSGSIZE;
goto bad;
}
@ -1014,8 +1009,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
* If small enough for interface, or the interface will take
* care of the fragmentation for us, can just send directly.
*/
if ((u_short)ip->ip_len <= ifp->if_mtu ||
ifp->if_hwassist & CSUM_FRAGMENT) {
if (ip->ip_len <= ifp->if_mtu || ifp->if_hwassist & CSUM_FRAGMENT) {
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
@ -1057,10 +1051,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
(struct sockaddr *)dst, ro->ro_rt);
goto done;
}
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
if (ip->ip_off & IP_DF) {
error = EMSGSIZE;
/*
@ -1070,149 +1061,23 @@ ip_output(m0, opt, ro, flags, imo, inp)
* them, there is no way for one to update all its
* routes when the MTU is changed.
*/
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
&& !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
&& (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
!(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
(ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
}
ipstat.ips_cantfrag++;
goto bad;
}
len = (ifp->if_mtu - hlen) &~ 7;
if (len < 8) {
error = EMSGSIZE;
/*
* Too large for interface; fragment if possible. If successful,
* on return, m will point to a list of packets to be sent.
*/
error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
if (error)
goto bad;
}
/*
* if the interface will not calculate checksums on
* fragmented packets, then do it here.
*/
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
(ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
if (len > PAGE_SIZE) {
/*
* Fragement large datagrams such that each segment
* contains a multiple of PAGE_SIZE amount of data,
* plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*/
int newlen;
struct mbuf *mtmp;
for (mtmp = m, off = 0;
mtmp && ((off + mtmp->m_len) <= ifp->if_mtu);
mtmp = mtmp->m_next) {
off += mtmp->m_len;
}
/*
* firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
goto smart_frag_failure;
off = ((off - hlen) & ~7) + hlen;
newlen = (~PAGE_MASK) & ifp->if_mtu;
if ((newlen + sizeof (struct ip)) > ifp->if_mtu) {
/* we failed, go back the default */
smart_frag_failure:
newlen = len;
off = hlen + len;
}
/* printf("ipfrag: len = %d, hlen = %d, mhlen = %d, newlen = %d, off = %d\n",
len, hlen, sizeof (struct ip), newlen, off);*/
len = newlen;
} else {
off = hlen + len;
}
{
int mhlen, firstlen = off - hlen;
struct mbuf **mnext = &m->m_nextpkt;
int nfrags = 1;
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
*/
m0 = m;
mhlen = sizeof (struct ip);
for (; off < (u_short)ip->ip_len; off += len) {
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == 0) {
error = ENOBUFS;
ipstat.ips_odropped++;
goto sendorfree;
}
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
if (hlen > sizeof (struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
mhip->ip_v = IPVERSION;
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
if (off + len >= (u_short)ip->ip_len)
len = (u_short)ip->ip_len - off;
else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
m->m_next = m_copy(m0, off, len);
if (m->m_next == 0) {
(void) m_free(m);
error = ENOBUFS; /* ??? */
ipstat.ips_odropped++;
goto sendorfree;
}
m->m_pkthdr.len = mhlen + len;
m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
mac_create_fragment(m0, m);
#endif
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
mhip->ip_sum = in_cksum(m, mhlen);
*mnext = m;
mnext = &m->m_nextpkt;
nfrags++;
}
ipstat.ips_ofragments += nfrags;
/* set first/last markers for fragment chain */
m->m_flags |= M_LASTFRAG;
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
m0->m_pkthdr.csum_data = nfrags;
/*
* Update first fragment by trimming what's been copied out
* and updating header, then send each fragment (in order).
*/
m = m0;
m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
m->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m->m_pkthdr.len);
ip->ip_off |= IP_MF;
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m, hlen);
sendorfree:
for (m = m0; m; m = m0) {
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
#ifdef IPSEC
@ -1234,7 +1099,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
if (error == 0)
ipstat.ips_fragmented++;
}
done:
#ifdef IPSEC
if (ro == &iproute && ro->ro_rt) {
@ -1246,7 +1111,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
printf("DP ip_output call free SP:%p\n", sp));
key_freesp(sp);
}
#endif /* IPSEC */
#endif
#ifdef FAST_IPSEC
if (ro == &iproute && ro->ro_rt) {
RTFREE(ro->ro_rt);
@ -1254,13 +1119,181 @@ ip_output(m0, opt, ro, flags, imo, inp)
}
if (sp != NULL)
KEY_FREESP(&sp);
#endif /* FAST_IPSEC */
#endif
return (error);
bad:
m_freem(m);
goto done;
}
/*
* Create a chain of fragments which fit the given mtu. m_frag points to the
* mbuf to be fragmented; on return it points to the chain with the fragments.
* Return 0 if no error. If error, m_frag may contain a partially built
* chain of fragments that should be freed by the caller.
*
* if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
* sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
*
* NOTE(review): ip->ip_len and ip->ip_off are used in host byte order
* throughout and converted with htons() only at the end, so the caller
* must pass the header in host byte order -- confirm against callers.
*/
int
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags, int sw_csum)
{
int error = 0;
int hlen = ip->ip_hl << 2;
int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
int off;
struct mbuf *m0 = *m_frag; /* the original packet */
int firstlen;
struct mbuf **mnext;
int nfrags;
if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
ipstat.ips_cantfrag++;
return EMSGSIZE;
}
/*
* Must be able to put at least 8 bytes per fragment.
*/
if (len < 8)
return EMSGSIZE;
/*
* If the interface will not calculate checksums on
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
if (len > PAGE_SIZE) {
/*
* Fragment large datagrams such that each segment
* contains a multiple of PAGE_SIZE amount of data,
* plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*
* XXX When does this help given that sender and receiver
* could have different page sizes, and also mtu could
* be less than the receiver's page size ?
*/
int newlen;
struct mbuf *m;
/* walk the chain to find how much fits in the first fragment */
for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
off += m->m_len;
/*
* firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
goto smart_frag_failure;
off = ((off - hlen) & ~7) + hlen;
newlen = (~PAGE_MASK) & mtu;
if ((newlen + sizeof (struct ip)) > mtu) {
/* we failed; fall back to the default */
smart_frag_failure:
newlen = len;
off = hlen + len;
}
len = newlen;
} else {
off = hlen + len;
}
firstlen = off - hlen; /* payload bytes retained in the first fragment */
mnext = &m0->m_nextpkt; /* pointer to next packet */
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
* Here, m0 is the original packet, m is the fragment being created.
* The fragments are linked off the m_nextpkt of the original
* packet, which after processing serves as the first fragment.
*/
for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
struct ip *mhip; /* ip header on the fragment */
struct mbuf *m;
int mhlen = sizeof (struct ip);
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == 0) {
error = ENOBUFS;
ipstat.ips_odropped++;
goto done;
}
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
/*
* In the first mbuf, leave room for the link header, then
* copy the original IP header including options. The payload
* goes into an additional mbuf chain returned by m_copy().
*/
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
if (hlen > sizeof (struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
mhip->ip_v = IPVERSION;
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
/* XXX do we need to add ip->ip_off below ? */
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
if (off + len >= ip->ip_len) { /* last fragment */
len = ip->ip_len - off;
m->m_flags |= M_LASTFRAG;
} else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
m->m_next = m_copy(m0, off, len);
if (m->m_next == 0) { /* copy failed */
m_free(m);
error = ENOBUFS; /* ??? */
ipstat.ips_odropped++;
goto done;
}
m->m_pkthdr.len = mhlen + len;
m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
mac_create_fragment(m0, m);
#endif
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
mhip->ip_sum = in_cksum(m, mhlen);
*mnext = m;
mnext = &m->m_nextpkt;
}
ipstat.ips_ofragments += nfrags;
/* set first marker for fragment chain */
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
m0->m_pkthdr.csum_data = nfrags;
/*
* Update first fragment by trimming what's been copied out
* and updating header.
*/
m_adj(m0, hlen + firstlen - ip->ip_len);
m0->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
ip->ip_off |= IP_MF;
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m0, hlen);
done:
*m_frag = m0;
return error;
}
void
in_delayed_cksum(struct mbuf *m)
{
@ -1307,7 +1340,7 @@ ip_insertoptions(m, opt, phlen)
unsigned optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
if (optlen + ip->ip_len > IP_MAXPACKET) {
*phlen = 0;
return (m); /* XXX should fail */
}

View File

@ -164,6 +164,8 @@ extern struct pr_usrreqs rip_usrreqs;
int ip_ctloutput(struct socket *, struct sockopt *sopt);
void ip_drain(void);
int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags, int sw_csum);
void ip_freemoptions(struct ip_moptions *);
void ip_init(void);
extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,

View File

@ -383,6 +383,10 @@ rip_ctloutput(so, sopt)
case MRT_DEL_MFC:
case MRT_VERSION:
case MRT_ASSERT:
case MRT_API_SUPPORT:
case MRT_API_CONFIG:
case MRT_ADD_BW_UPCALL:
case MRT_DEL_BW_UPCALL:
error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
EOPNOTSUPP;
break;
@ -448,6 +452,10 @@ rip_ctloutput(so, sopt)
case MRT_DEL_MFC:
case MRT_VERSION:
case MRT_ASSERT:
case MRT_API_SUPPORT:
case MRT_API_CONFIG:
case MRT_ADD_BW_UPCALL:
case MRT_DEL_BW_UPCALL:
error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
EOPNOTSUPP;
break;