1. Basic PIM kernel support

Disabled by default. To enable it, the new "options PIM" must be
added to the kernel configuration file (in addition to MROUTING):

options	MROUTING		# Multicast routing
options	PIM			# Protocol Independent Multicast

2. Add support for advanced multicast API setup/configuration and
extensibility.

3. Add support for kernel-level PIM Register encapsulation.
Disabled by default.  Can be enabled by the advanced multicast API.

4. Implement a mechanism for "multicast bandwidth monitoring and upcalls".

Submitted by:	Pavlin Radoslavov <pavlin@icir.org>
This commit is contained in:
Jeffrey Hsu 2003-08-07 18:16:59 +00:00
parent 8285491e78
commit 1e78ac216e
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=118622
7 changed files with 1748 additions and 213 deletions

View File

@ -486,6 +486,22 @@ struct ip_mreq {
{ 0, 0 }, \
{ 0, 0 }, \
{ "ipsec", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "pim", CTLTYPE_NODE }, \
}
/*

View File

@ -36,6 +36,7 @@
#include "opt_ipdivert.h"
#include "opt_ipx.h"
#include "opt_mrouting.h"
#include "opt_ipsec.h"
#include "opt_inet6.h"
@ -57,6 +58,9 @@
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/igmp_var.h>
#ifdef PIM
#include <netinet/pim_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
@ -216,6 +220,14 @@ struct protosw inetsw[] = {
&rip_usrreqs
},
#endif
#ifdef PIM
{ SOCK_RAW, &inetdomain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
pim_input, 0, 0, rip_ctloutput,
0,
0, 0, 0, 0,
&rip_usrreqs
},
#endif /* PIM */
/* raw wildcard */
{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR,
rip_input, 0, 0, rip_ctloutput,
@ -260,4 +272,6 @@ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW");
#ifdef IPDIVERT
SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "DIVERT");
#endif
#ifdef PIM
SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
#endif

File diff suppressed because it is too large Load Diff

View File

@ -48,8 +48,12 @@
* Modified by Steve Deering, Stanford, February 1989.
* Modified by Ajit Thyagarajan, PARC, August 1993.
* Modified by Ajit Thyagarajan, PARC, August 1994.
* Modified by Ahmed Helmy, SGI, June 1996.
* Modified by Pavlin Radoslavov, ICSI, October 2002.
*
* MROUTING Revision: 3.3.1.3
* and PIM-SMv2 and PIM-DM support, advanced API support,
* bandwidth metering and signaling.
*/
@ -63,7 +67,12 @@
#define MRT_ADD_MFC 104 /* insert forwarding cache entry */
#define MRT_DEL_MFC 105 /* delete forwarding cache entry */
#define MRT_VERSION 106 /* get kernel version number */
#define MRT_ASSERT 107 /* enable PIM assert processing */
#define MRT_ASSERT 107 /* enable assert processing */
#define MRT_PIM MRT_ASSERT /* enable PIM processing */
#define MRT_API_SUPPORT 109 /* supported MRT API */
#define MRT_API_CONFIG 110 /* config MRT API */
#define MRT_ADD_BW_UPCALL 111 /* create bandwidth monitor */
#define MRT_DEL_BW_UPCALL 112 /* delete bandwidth monitor */
#define GET_TIME(t) microtime(&t)
@ -99,10 +108,11 @@ struct vifctl {
#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */
#define VIFF_SRCRT 0x2 /* tunnel uses IP source routing */
#define VIFF_REGISTER 0x4 /* used for PIM Register encap/decap */
/*
* Argument structure for MRT_ADD_MFC and MRT_DEL_MFC
* (mfcc_tos to be added at a future point)
* XXX if you change this, make sure to change struct mfcctl2 as well.
*/
struct mfcctl {
struct in_addr mfcc_origin; /* ip origin of mcasts */
@ -111,6 +121,94 @@ struct mfcctl {
u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
};
/*
* The new argument structure for MRT_ADD_MFC and MRT_DEL_MFC overlays
* and extends the old struct mfcctl.
*
* The leading fields must remain identical in type and order to struct
* mfcctl so the kernel can accept either layout through the same socket
* options (see the XXX note on struct mfcctl above).
*/
struct mfcctl2 {
/* the mfcctl fields -- must mirror struct mfcctl exactly */
struct in_addr mfcc_origin; /* ip origin of mcasts */
struct in_addr mfcc_mcastgrp; /* multicast group associated*/
vifi_t mfcc_parent; /* incoming vif */
u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
/* extension fields */
uint8_t mfcc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */
struct in_addr mfcc_rp; /* the RP address */
};
/*
* The advanced-API flags.
*
* The MRT_MFC_FLAGS_XXX API flags are also used as flags
* for the mfcc_flags field.
*/
#define MRT_MFC_FLAGS_DISABLE_WRONGVIF (1 << 0) /* disable WRONGVIF signals */
#define MRT_MFC_FLAGS_BORDER_VIF (1 << 1) /* border vif */
#define MRT_MFC_RP (1 << 8) /* enable RP address */
#define MRT_MFC_BW_UPCALL (1 << 9) /* enable bw upcalls */
#define MRT_MFC_FLAGS_ALL (MRT_MFC_FLAGS_DISABLE_WRONGVIF | \
MRT_MFC_FLAGS_BORDER_VIF)
#define MRT_API_FLAGS_ALL (MRT_MFC_FLAGS_ALL | \
MRT_MFC_RP | \
MRT_MFC_BW_UPCALL)
/*
* Structure for installing or delivering an upcall if the
* measured bandwidth is above or below a threshold.
*
* User programs (e.g. daemons) may have a need to know when the
* bandwidth used by some data flow is above or below some threshold.
* This interface allows the userland to specify the threshold (in
* bytes and/or packets) and the measurement interval. Flows are
* all packets with the same source and destination IP address.
* At the moment the code is only used for multicast destinations
* but there is nothing that prevents its use for unicast.
*
* The measurement interval cannot be shorter than some Tmin (currently, 3s).
* The threshold is set in packets and/or bytes per_interval.
*
* Measurement works as follows:
*
* For >= measurements:
* The first packet marks the start of a measurement interval.
* During an interval we count packets and bytes, and when we
* pass the threshold we deliver an upcall and we are done.
* The first packet after the end of the interval resets the
* count and restarts the measurement.
*
* For <= measurements:
* We start a timer to fire at the end of the interval, and
* then for each incoming packet we count packets and bytes.
* When the timer fires, we compare the value with the threshold,
* schedule an upcall if we are below, and restart the measurement
* (reschedule timer and zero counters).
*/
/*
* Bandwidth counters over a time interval. Used both to specify an
* upcall threshold and to report the measured bandwidth (see the
* bu_threshold and bu_measured fields of struct bw_upcall).
*/
struct bw_data {
struct timeval b_time; /* the measurement time interval */
uint64_t b_packets; /* number of packets */
uint64_t b_bytes; /* number of bytes */
};
/*
* Per-flow bandwidth upcall descriptor: identifies the flow (bu_src,
* bu_dst), the trigger condition (bu_flags), and the threshold and
* measured bandwidth.
*
* NOTE(review): presumably this is the argument of MRT_ADD_BW_UPCALL /
* MRT_DEL_BW_UPCALL and the payload delivered in IGMPMSG_BW_UPCALL
* messages -- confirm against ip_mroute.c.
*/
struct bw_upcall {
struct in_addr bu_src; /* source address */
struct in_addr bu_dst; /* destination address */
uint32_t bu_flags; /* misc flags (see below) */
#define BW_UPCALL_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
#define BW_UPCALL_UNIT_BYTES (1 << 1) /* threshold (in bytes) */
#define BW_UPCALL_GEQ (1 << 2) /* upcall if bw >= threshold */
#define BW_UPCALL_LEQ (1 << 3) /* upcall if bw <= threshold */
#define BW_UPCALL_DELETE_ALL (1 << 4) /* delete all upcalls for s,d*/
struct bw_data bu_threshold; /* the bw threshold */
struct bw_data bu_measured; /* the measured bw */
};
/* max. number of upcalls to deliver together */
#define BW_UPCALLS_MAX 128
/* min. threshold time interval for bandwidth measurement */
#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC 3
#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC 0
/*
* The kernel's multicast routing statistics.
*/
@ -179,17 +277,20 @@ struct vif {
* at a future point)
*/
struct mfc {
struct in_addr mfc_origin; /* IP origin of mcasts */
struct in_addr mfc_mcastgrp; /* multicast group associated*/
vifi_t mfc_parent; /* incoming vif */
u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
u_long mfc_pkt_cnt; /* pkt count for src-grp */
u_long mfc_byte_cnt; /* byte count for src-grp */
u_long mfc_wrong_if; /* wrong if for src-grp */
int mfc_expire; /* time to clean entry up */
struct timeval mfc_last_assert; /* last time I sent an assert*/
struct rtdetq *mfc_stall; /* q of packets awaiting mfc */
struct mfc *mfc_next; /* next mfc entry */
struct in_addr mfc_origin; /* IP origin of mcasts */
struct in_addr mfc_mcastgrp; /* multicast group associated*/
vifi_t mfc_parent; /* incoming vif */
u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
u_long mfc_pkt_cnt; /* pkt count for src-grp */
u_long mfc_byte_cnt; /* byte count for src-grp */
u_long mfc_wrong_if; /* wrong if for src-grp */
int mfc_expire; /* time to clean entry up */
struct timeval mfc_last_assert; /* last time I sent an assert*/
struct rtdetq *mfc_stall; /* q of packets awaiting mfc */
struct mfc *mfc_next; /* next mfc entry */
uint8_t mfc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */
struct in_addr mfc_rp; /* the RP address */
struct bw_meter *mfc_bw_meter; /* list of bandwidth meters */
};
/*
@ -200,8 +301,10 @@ struct igmpmsg {
u_long unused1;
u_long unused2;
u_char im_msgtype; /* what type of message */
#define IGMPMSG_NOCACHE 1
#define IGMPMSG_WRONGVIF 2
#define IGMPMSG_NOCACHE 1 /* no MFC in the kernel */
#define IGMPMSG_WRONGVIF 2 /* packet came from wrong interface */
#define IGMPMSG_WHOLEPKT 3 /* PIM pkt for user level encap. */
#define IGMPMSG_BW_UPCALL 4 /* BW monitoring upcall */
u_char im_mbz; /* must be zero */
u_char im_vif; /* vif rec'd on */
u_char unused3;
@ -246,6 +349,32 @@ struct tbf
struct mbuf *tbf_t; /* tail-insertion pointer */
};
/*
* Structure for measuring the bandwidth and sending an upcall if the
* measured bandwidth is above or below a threshold.
*
* Each meter is linked on two lists at once: per-mfc (bm_mfc_next) and,
* per the field comments, a "same time" list keyed by bm_time_hash
* (bm_time_next) -- see ip_mroute.c for the hashing scheme.
*/
struct bw_meter {
struct bw_meter *bm_mfc_next; /* next bw meter (same mfc) */
struct bw_meter *bm_time_next; /* next bw meter (same time) */
uint32_t bm_time_hash; /* the time hash value */
struct mfc *bm_mfc; /* the corresponding mfc */
uint32_t bm_flags; /* misc flags (see below) */
#define BW_METER_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
#define BW_METER_UNIT_BYTES (1 << 1) /* threshold (in bytes) */
#define BW_METER_GEQ (1 << 2) /* upcall if bw >= threshold */
#define BW_METER_LEQ (1 << 3) /* upcall if bw <= threshold */
#define BW_METER_USER_FLAGS (BW_METER_UNIT_PACKETS | \
BW_METER_UNIT_BYTES | \
BW_METER_GEQ | \
BW_METER_LEQ)
#define BW_METER_UPCALL_DELIVERED (1 << 24) /* upcall was delivered */
struct bw_data bm_threshold; /* the upcall threshold */
struct bw_data bm_measured; /* the measured bw */
struct timeval bm_start_time; /* abs. time */
};
#ifdef _KERNEL
struct sockopt;

View File

@ -124,15 +124,10 @@ extern struct protosw inetsw[];
* The mbuf opt, if present, will not be freed.
*/
int
ip_output(m0, opt, ro, flags, imo, inp)
struct mbuf *m0;
struct mbuf *opt;
struct route *ro;
int flags;
struct ip_moptions *imo;
struct inpcb *inp;
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
int flags, struct ip_moptions *imo, struct inpcb *inp)
{
struct ip *ip, *mhip;
struct ip *ip;
struct ifnet *ifp = NULL; /* keep compiler happy */
struct mbuf *m;
int hlen = sizeof (struct ip);
@ -478,7 +473,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
goto bad;
}
/* don't allow broadcast messages to be fragmented */
if ((u_short)ip->ip_len > ifp->if_mtu) {
if (ip->ip_len > ifp->if_mtu) {
error = EMSGSIZE;
goto bad;
}
@ -1014,8 +1009,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
* If small enough for interface, or the interface will take
* care of the fragmentation for us, can just send directly.
*/
if ((u_short)ip->ip_len <= ifp->if_mtu ||
ifp->if_hwassist & CSUM_FRAGMENT) {
if (ip->ip_len <= ifp->if_mtu || ifp->if_hwassist & CSUM_FRAGMENT) {
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
@ -1057,10 +1051,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
(struct sockaddr *)dst, ro->ro_rt);
goto done;
}
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
if (ip->ip_off & IP_DF) {
error = EMSGSIZE;
/*
@ -1070,149 +1061,23 @@ ip_output(m0, opt, ro, flags, imo, inp)
* them, there is no way for one to update all its
* routes when the MTU is changed.
*/
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
&& !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
&& (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
!(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
(ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
}
ipstat.ips_cantfrag++;
goto bad;
}
len = (ifp->if_mtu - hlen) &~ 7;
if (len < 8) {
error = EMSGSIZE;
/*
* Too large for interface; fragment if possible. If successful,
* on return, m will point to a list of packets to be sent.
*/
error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
if (error)
goto bad;
}
/*
* if the interface will not calculate checksums on
* fragmented packets, then do it here.
*/
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
(ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
if (len > PAGE_SIZE) {
/*
* Fragement large datagrams such that each segment
* contains a multiple of PAGE_SIZE amount of data,
* plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*/
int newlen;
struct mbuf *mtmp;
for (mtmp = m, off = 0;
mtmp && ((off + mtmp->m_len) <= ifp->if_mtu);
mtmp = mtmp->m_next) {
off += mtmp->m_len;
}
/*
* firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
goto smart_frag_failure;
off = ((off - hlen) & ~7) + hlen;
newlen = (~PAGE_MASK) & ifp->if_mtu;
if ((newlen + sizeof (struct ip)) > ifp->if_mtu) {
/* we failed, go back the default */
smart_frag_failure:
newlen = len;
off = hlen + len;
}
/* printf("ipfrag: len = %d, hlen = %d, mhlen = %d, newlen = %d, off = %d\n",
len, hlen, sizeof (struct ip), newlen, off);*/
len = newlen;
} else {
off = hlen + len;
}
{
int mhlen, firstlen = off - hlen;
struct mbuf **mnext = &m->m_nextpkt;
int nfrags = 1;
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
*/
m0 = m;
mhlen = sizeof (struct ip);
for (; off < (u_short)ip->ip_len; off += len) {
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == 0) {
error = ENOBUFS;
ipstat.ips_odropped++;
goto sendorfree;
}
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
if (hlen > sizeof (struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
mhip->ip_v = IPVERSION;
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
if (off + len >= (u_short)ip->ip_len)
len = (u_short)ip->ip_len - off;
else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
m->m_next = m_copy(m0, off, len);
if (m->m_next == 0) {
(void) m_free(m);
error = ENOBUFS; /* ??? */
ipstat.ips_odropped++;
goto sendorfree;
}
m->m_pkthdr.len = mhlen + len;
m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
mac_create_fragment(m0, m);
#endif
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
mhip->ip_sum = in_cksum(m, mhlen);
*mnext = m;
mnext = &m->m_nextpkt;
nfrags++;
}
ipstat.ips_ofragments += nfrags;
/* set first/last markers for fragment chain */
m->m_flags |= M_LASTFRAG;
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
m0->m_pkthdr.csum_data = nfrags;
/*
* Update first fragment by trimming what's been copied out
* and updating header, then send each fragment (in order).
*/
m = m0;
m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
m->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m->m_pkthdr.len);
ip->ip_off |= IP_MF;
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m, hlen);
sendorfree:
for (m = m0; m; m = m0) {
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
#ifdef IPSEC
@ -1234,7 +1099,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
if (error == 0)
ipstat.ips_fragmented++;
}
done:
#ifdef IPSEC
if (ro == &iproute && ro->ro_rt) {
@ -1246,7 +1111,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
printf("DP ip_output call free SP:%p\n", sp));
key_freesp(sp);
}
#endif /* IPSEC */
#endif
#ifdef FAST_IPSEC
if (ro == &iproute && ro->ro_rt) {
RTFREE(ro->ro_rt);
@ -1254,13 +1119,181 @@ ip_output(m0, opt, ro, flags, imo, inp)
}
if (sp != NULL)
KEY_FREESP(&sp);
#endif /* FAST_IPSEC */
#endif
return (error);
bad:
m_freem(m);
goto done;
}
/*
* Create a chain of fragments which fit the given mtu. m_frag points to the
* mbuf to be fragmented; on return it points to the chain with the fragments.
* Return 0 if no error. If error, m_frag may contain a partially built
* chain of fragments that should be freed by the caller.
*
* if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
* sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
*
* NOTE(review): ip->ip_len and ip->ip_off are used in host byte order
* throughout and converted with htons() only at the end, so the caller
* must pass the header in host byte order -- confirm against callers.
*/
int
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags, int sw_csum)
{
int error = 0;
int hlen = ip->ip_hl << 2;
int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
int off;
struct mbuf *m0 = *m_frag; /* the original packet */
int firstlen;
struct mbuf **mnext;
int nfrags;
if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
ipstat.ips_cantfrag++;
return EMSGSIZE;
}
/*
* Must be able to put at least 8 bytes per fragment.
*/
if (len < 8)
return EMSGSIZE;
/*
* If the interface will not calculate checksums on
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
if (len > PAGE_SIZE) {
/*
* Fragment large datagrams such that each segment
* contains a multiple of PAGE_SIZE amount of data,
* plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*
* XXX When does this help given that sender and receiver
* could have different page sizes, and also mtu could
* be less than the receiver's page size ?
*/
int newlen;
struct mbuf *m;
/* walk the chain to find how much fits in the first fragment */
for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
off += m->m_len;
/*
* firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
goto smart_frag_failure;
off = ((off - hlen) & ~7) + hlen;
newlen = (~PAGE_MASK) & mtu;
if ((newlen + sizeof (struct ip)) > mtu) {
/* we failed; fall back to the default */
smart_frag_failure:
newlen = len;
off = hlen + len;
}
len = newlen;
} else {
off = hlen + len;
}
firstlen = off - hlen; /* payload bytes retained in the first fragment */
mnext = &m0->m_nextpkt; /* pointer to next packet */
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
* Here, m0 is the original packet, m is the fragment being created.
* The fragments are linked off the m_nextpkt of the original
* packet, which after processing serves as the first fragment.
*/
for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
struct ip *mhip; /* ip header on the fragment */
struct mbuf *m;
int mhlen = sizeof (struct ip);
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == 0) {
error = ENOBUFS;
ipstat.ips_odropped++;
goto done;
}
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
/*
* In the first mbuf, leave room for the link header, then
* copy the original IP header including options. The payload
* goes into an additional mbuf chain returned by m_copy().
*/
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
if (hlen > sizeof (struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
mhip->ip_v = IPVERSION;
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
/* XXX do we need to add ip->ip_off below ? */
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
if (off + len >= ip->ip_len) { /* last fragment */
len = ip->ip_len - off;
m->m_flags |= M_LASTFRAG;
} else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
m->m_next = m_copy(m0, off, len);
if (m->m_next == 0) { /* copy failed */
m_free(m);
error = ENOBUFS; /* ??? */
ipstat.ips_odropped++;
goto done;
}
m->m_pkthdr.len = mhlen + len;
m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
mac_create_fragment(m0, m);
#endif
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
mhip->ip_sum = in_cksum(m, mhlen);
*mnext = m;
mnext = &m->m_nextpkt;
}
ipstat.ips_ofragments += nfrags;
/* set first marker for fragment chain */
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
m0->m_pkthdr.csum_data = nfrags;
/*
* Update first fragment by trimming what's been copied out
* and updating header.
*/
m_adj(m0, hlen + firstlen - ip->ip_len);
m0->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
ip->ip_off |= IP_MF;
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m0, hlen);
done:
*m_frag = m0;
return error;
}
void
in_delayed_cksum(struct mbuf *m)
{
@ -1307,7 +1340,7 @@ ip_insertoptions(m, opt, phlen)
unsigned optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
if (optlen + ip->ip_len > IP_MAXPACKET) {
*phlen = 0;
return (m); /* XXX should fail */
}

View File

@ -164,6 +164,8 @@ extern struct pr_usrreqs rip_usrreqs;
int ip_ctloutput(struct socket *, struct sockopt *sopt);
void ip_drain(void);
int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags, int sw_csum);
void ip_freemoptions(struct ip_moptions *);
void ip_init(void);
extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,

View File

@ -383,6 +383,10 @@ rip_ctloutput(so, sopt)
case MRT_DEL_MFC:
case MRT_VERSION:
case MRT_ASSERT:
case MRT_API_SUPPORT:
case MRT_API_CONFIG:
case MRT_ADD_BW_UPCALL:
case MRT_DEL_BW_UPCALL:
error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
EOPNOTSUPP;
break;
@ -448,6 +452,10 @@ rip_ctloutput(so, sopt)
case MRT_DEL_MFC:
case MRT_VERSION:
case MRT_ASSERT:
case MRT_API_SUPPORT:
case MRT_API_CONFIG:
case MRT_ADD_BW_UPCALL:
case MRT_DEL_BW_UPCALL:
error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
EOPNOTSUPP;
break;