1. Basic PIM kernel support
Disabled by default. To enable it, the new "options PIM" must be added to the kernel configuration file (in addition to MROUTING):

options MROUTING # Multicast routing
options PIM # Protocol Independent Multicast

2. Add support for advanced multicast API setup/configuration and extensibility.
3. Add support for kernel-level PIM Register encapsulation. Disabled by default. Can be enabled by the advanced multicast API.
4. Implement a mechanism for "multicast bandwidth monitoring and upcalls".

Submitted by: Pavlin Radoslavov <pavlin@icir.org>
This commit is contained in:
parent
29ad90976f
commit
22b74d7669
@ -486,6 +486,22 @@ struct ip_mreq {
|
||||
{ 0, 0 }, \
|
||||
{ 0, 0 }, \
|
||||
{ "ipsec", CTLTYPE_NODE }, \
|
||||
{ 0, 0 }, \
|
||||
{ 0, 0 }, \
|
||||
{ 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
|
||||
{ 0, 0 }, \
|
||||
{ 0, 0 }, \
|
||||
{ 0, 0 }, \
|
||||
{ "pim", CTLTYPE_NODE }, \
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -36,6 +36,7 @@
|
||||
|
||||
#include "opt_ipdivert.h"
|
||||
#include "opt_ipx.h"
|
||||
#include "opt_mrouting.h"
|
||||
#include "opt_ipsec.h"
|
||||
#include "opt_inet6.h"
|
||||
|
||||
@ -57,6 +58,9 @@
|
||||
#include <netinet/ip_var.h>
|
||||
#include <netinet/ip_icmp.h>
|
||||
#include <netinet/igmp_var.h>
|
||||
#ifdef PIM
|
||||
#include <netinet/pim_var.h>
|
||||
#endif
|
||||
#include <netinet/tcp.h>
|
||||
#include <netinet/tcp_timer.h>
|
||||
#include <netinet/tcp_var.h>
|
||||
@ -216,6 +220,14 @@ struct protosw inetsw[] = {
|
||||
&rip_usrreqs
|
||||
},
|
||||
#endif
|
||||
#ifdef PIM
|
||||
{ SOCK_RAW, &inetdomain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
|
||||
pim_input, 0, 0, rip_ctloutput,
|
||||
0,
|
||||
0, 0, 0, 0,
|
||||
&rip_usrreqs
|
||||
},
|
||||
#endif /* PIM */
|
||||
/* raw wildcard */
|
||||
{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR,
|
||||
rip_input, 0, 0, rip_ctloutput,
|
||||
@ -260,4 +272,6 @@ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW");
|
||||
#ifdef IPDIVERT
|
||||
SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "DIVERT");
|
||||
#endif
|
||||
|
||||
#ifdef PIM
|
||||
SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
|
||||
#endif
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -48,8 +48,12 @@
|
||||
* Modified by Steve Deering, Stanford, February 1989.
|
||||
* Modified by Ajit Thyagarajan, PARC, August 1993.
|
||||
* Modified by Ajit Thyagarajan, PARC, August 1994.
|
||||
* Modified by Ahmed Helmy, SGI, June 1996.
|
||||
* Modified by Pavlin Radoslavov, ICSI, October 2002.
|
||||
*
|
||||
* MROUTING Revision: 3.3.1.3
|
||||
* and PIM-SMv2 and PIM-DM support, advanced API support,
|
||||
* bandwidth metering and signaling.
|
||||
*/
|
||||
|
||||
|
||||
@ -63,7 +67,12 @@
|
||||
#define MRT_ADD_MFC 104 /* insert forwarding cache entry */
|
||||
#define MRT_DEL_MFC 105 /* delete forwarding cache entry */
|
||||
#define MRT_VERSION 106 /* get kernel version number */
|
||||
#define MRT_ASSERT 107 /* enable PIM assert processing */
|
||||
#define MRT_ASSERT 107 /* enable assert processing */
|
||||
#define MRT_PIM MRT_ASSERT /* enable PIM processing */
|
||||
#define MRT_API_SUPPORT 109 /* supported MRT API */
|
||||
#define MRT_API_CONFIG 110 /* config MRT API */
|
||||
#define MRT_ADD_BW_UPCALL 111 /* create bandwidth monitor */
|
||||
#define MRT_DEL_BW_UPCALL 112 /* delete bandwidth monitor */
|
||||
|
||||
|
||||
#define GET_TIME(t) microtime(&t)
|
||||
@ -99,10 +108,11 @@ struct vifctl {
|
||||
|
||||
#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */
|
||||
#define VIFF_SRCRT 0x2 /* tunnel uses IP source routing */
|
||||
#define VIFF_REGISTER 0x4 /* used for PIM Register encap/decap */
|
||||
|
||||
/*
|
||||
* Argument structure for MRT_ADD_MFC and MRT_DEL_MFC
|
||||
* (mfcc_tos to be added at a future point)
|
||||
* XXX if you change this, make sure to change struct mfcctl2 as well.
|
||||
*/
|
||||
struct mfcctl {
|
||||
struct in_addr mfcc_origin; /* ip origin of mcasts */
|
||||
@ -111,6 +121,94 @@ struct mfcctl {
|
||||
u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
|
||||
};
|
||||
|
||||
/*
|
||||
* The new argument structure for MRT_ADD_MFC and MRT_DEL_MFC overlays
|
||||
* and extends the old struct mfcctl.
* The leading fields are meant to match struct mfcctl field for field
* (struct mfcctl carries a matching "keep in sync" note); the extension
* fields follow them.
|
||||
*/
|
||||
struct mfcctl2 {
|
||||
/* the mfcctl fields -- keep identical to struct mfcctl, same order */
|
||||
struct in_addr mfcc_origin; /* ip origin of mcasts */
|
||||
struct in_addr mfcc_mcastgrp; /* associated multicast group */
|
||||
vifi_t mfcc_parent; /* incoming vif */
|
||||
u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
|
||||
|
||||
/* extension fields (not present in struct mfcctl) */
|
||||
uint8_t mfcc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags; sized MAXVIFS like mfcc_ttls, presumably one entry per vif -- confirm */
|
||||
struct in_addr mfcc_rp; /* the RP address */
|
||||
};
|
||||
/*
|
||||
* The advanced-API flags.
|
||||
*
|
||||
* The MRT_MFC_FLAGS_XXX API flags are also used as flags
|
||||
* for the mfcc_flags field.
|
||||
*/
|
||||
#define MRT_MFC_FLAGS_DISABLE_WRONGVIF (1 << 0) /* disable WRONGVIF signals */
|
||||
#define MRT_MFC_FLAGS_BORDER_VIF (1 << 1) /* border vif */
|
||||
#define MRT_MFC_RP (1 << 8) /* enable RP address */
|
||||
#define MRT_MFC_BW_UPCALL (1 << 9) /* enable bw upcalls */
|
||||
#define MRT_MFC_FLAGS_ALL (MRT_MFC_FLAGS_DISABLE_WRONGVIF | \
|
||||
MRT_MFC_FLAGS_BORDER_VIF)
|
||||
#define MRT_API_FLAGS_ALL (MRT_MFC_FLAGS_ALL | \
|
||||
MRT_MFC_RP | \
|
||||
MRT_MFC_BW_UPCALL)
|
||||
|
||||
/*
|
||||
* Structure for installing or delivering an upcall if the
|
||||
* measured bandwidth is above or below a threshold.
|
||||
*
|
||||
* User programs (e.g. daemons) may have a need to know when the
|
||||
* bandwidth used by some data flow is above or below some threshold.
|
||||
* This interface allows the userland to specify the threshold (in
|
||||
* bytes and/or packets) and the measurement interval. Flows are
|
||||
* all packets with the same source and destination IP address.
|
||||
* At the moment the code is only used for multicast destinations
|
||||
* but there is nothing that prevents its use for unicast.
|
||||
*
|
||||
* The measurement interval cannot be shorter than some Tmin (currently, 3s).
|
||||
* The threshold is set in packets and/or bytes per interval.
|
||||
*
|
||||
* Measurement works as follows:
|
||||
*
|
||||
* For >= measurements:
|
||||
* The first packet marks the start of a measurement interval.
|
||||
* During an interval we count packets and bytes, and when we
|
||||
* pass the threshold we deliver an upcall and we are done.
|
||||
* The first packet after the end of the interval resets the
|
||||
* count and restarts the measurement.
|
||||
*
|
||||
* For <= measurement:
|
||||
* We start a timer to fire at the end of the interval, and
|
||||
* then for each incoming packet we count packets and bytes.
|
||||
* When the timer fires, we compare the value with the threshold,
|
||||
* schedule an upcall if we are below, and restart the measurement
|
||||
* (reschedule timer and zero counters).
|
||||
*/
|
||||
|
||||
/*
 * One bandwidth value: a time plus packet and byte counters.
 * Used for both the threshold and the measured value in struct bw_upcall
 * (bu_threshold, bu_measured) and struct bw_meter (bm_threshold,
 * bm_measured).
 */
struct bw_data {
|
||||
struct timeval b_time; /* time component; presumably the measurement interval when used as a threshold -- confirm */
|
||||
uint64_t b_packets; /* packet count */
|
||||
uint64_t b_bytes; /* byte count */
|
||||
};
|
||||
|
||||
/*
 * Describes one bandwidth upcall for the flow (bu_src, bu_dst): the
 * comparison to perform, the threshold, and the value measured.
 * Presumably the argument of MRT_ADD_BW_UPCALL / MRT_DEL_BW_UPCALL and the
 * payload delivered to userland -- confirm against the users of this struct.
 */
struct bw_upcall {
|
||||
struct in_addr bu_src; /* source address */
|
||||
struct in_addr bu_dst; /* destination address */
|
||||
uint32_t bu_flags; /* misc flags (BW_UPCALL_* below) */
|
||||
#define BW_UPCALL_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
|
||||
#define BW_UPCALL_UNIT_BYTES (1 << 1) /* threshold (in bytes) */
|
||||
#define BW_UPCALL_GEQ (1 << 2) /* upcall if bw >= threshold */
|
||||
#define BW_UPCALL_LEQ (1 << 3) /* upcall if bw <= threshold */
|
||||
#define BW_UPCALL_DELETE_ALL (1 << 4) /* delete all upcalls for this (src, dst) */
|
||||
struct bw_data bu_threshold; /* the bw threshold */
|
||||
struct bw_data bu_measured; /* the measured bw */
|
||||
};
|
||||
|
||||
/* max. number of upcalls to deliver together */
|
||||
#define BW_UPCALLS_MAX 128
|
||||
/* min. threshold time interval for bandwidth measurement */
|
||||
#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC 3
|
||||
#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC 0
|
||||
|
||||
/*
|
||||
* The kernel's multicast routing statistics.
|
||||
*/
|
||||
@ -179,17 +277,20 @@ struct vif {
|
||||
* at a future point)
|
||||
*/
|
||||
struct mfc {
|
||||
struct in_addr mfc_origin; /* IP origin of mcasts */
|
||||
struct in_addr mfc_mcastgrp; /* multicast group associated*/
|
||||
vifi_t mfc_parent; /* incoming vif */
|
||||
u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
|
||||
u_long mfc_pkt_cnt; /* pkt count for src-grp */
|
||||
u_long mfc_byte_cnt; /* byte count for src-grp */
|
||||
u_long mfc_wrong_if; /* wrong if for src-grp */
|
||||
int mfc_expire; /* time to clean entry up */
|
||||
struct timeval mfc_last_assert; /* last time I sent an assert*/
|
||||
struct rtdetq *mfc_stall; /* q of packets awaiting mfc */
|
||||
struct mfc *mfc_next; /* next mfc entry */
|
||||
struct in_addr mfc_origin; /* IP origin of mcasts */
|
||||
struct in_addr mfc_mcastgrp; /* multicast group associated*/
|
||||
vifi_t mfc_parent; /* incoming vif */
|
||||
u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
|
||||
u_long mfc_pkt_cnt; /* pkt count for src-grp */
|
||||
u_long mfc_byte_cnt; /* byte count for src-grp */
|
||||
u_long mfc_wrong_if; /* wrong if for src-grp */
|
||||
int mfc_expire; /* time to clean entry up */
|
||||
struct timeval mfc_last_assert; /* last time I sent an assert*/
|
||||
struct rtdetq *mfc_stall; /* q of packets awaiting mfc */
|
||||
struct mfc *mfc_next; /* next mfc entry */
|
||||
uint8_t mfc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */
|
||||
struct in_addr mfc_rp; /* the RP address */
|
||||
struct bw_meter *mfc_bw_meter; /* list of bandwidth meters */
|
||||
};
|
||||
|
||||
/*
|
||||
@ -200,8 +301,10 @@ struct igmpmsg {
|
||||
u_long unused1;
|
||||
u_long unused2;
|
||||
u_char im_msgtype; /* what type of message */
|
||||
#define IGMPMSG_NOCACHE 1
|
||||
#define IGMPMSG_WRONGVIF 2
|
||||
#define IGMPMSG_NOCACHE 1 /* no MFC in the kernel */
|
||||
#define IGMPMSG_WRONGVIF 2 /* packet came from wrong interface */
|
||||
#define IGMPMSG_WHOLEPKT 3 /* PIM pkt for user level encap. */
|
||||
#define IGMPMSG_BW_UPCALL 4 /* BW monitoring upcall */
|
||||
u_char im_mbz; /* must be zero */
|
||||
u_char im_vif; /* vif rec'd on */
|
||||
u_char unused3;
|
||||
@ -246,6 +349,32 @@ struct tbf
|
||||
struct mbuf *tbf_t; /* tail-insertion pointer */
|
||||
};
|
||||
|
||||
/*
|
||||
* Structure for measuring the bandwidth and sending an upcall if the
|
||||
* measured bandwidth is above or below a threshold.
|
||||
*/
|
||||
/*
 * Kernel-side bandwidth meter.  Each meter sits on two singly linked
 * lists: the meters of one mfc entry (bm_mfc_next) and the meters sharing
 * a time hash value (bm_time_next).
 */
struct bw_meter {
|
||||
struct bw_meter *bm_mfc_next; /* next bw meter (same mfc) */
|
||||
struct bw_meter *bm_time_next; /* next bw meter (same time) */
|
||||
uint32_t bm_time_hash; /* the time hash value */
|
||||
struct mfc *bm_mfc; /* the corresponding mfc */
|
||||
uint32_t bm_flags; /* misc flags (BW_METER_* below) */
/* The low four bits use the same values as the BW_UPCALL_* bits of struct bw_upcall. */
#define BW_METER_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
|
||||
#define BW_METER_UNIT_BYTES (1 << 1) /* threshold (in bytes) */
|
||||
#define BW_METER_GEQ (1 << 2) /* upcall if bw >= threshold */
|
||||
#define BW_METER_LEQ (1 << 3) /* upcall if bw <= threshold */
|
||||
/* Mask of the four flags above; presumably the ones settable from userland -- confirm */
#define BW_METER_USER_FLAGS (BW_METER_UNIT_PACKETS | \
|
||||
BW_METER_UNIT_BYTES | \
|
||||
BW_METER_GEQ | \
|
||||
BW_METER_LEQ)
|
||||
|
||||
#define BW_METER_UPCALL_DELIVERED (1 << 24) /* upcall was delivered */
|
||||
|
||||
struct bw_data bm_threshold; /* the upcall threshold */
|
||||
struct bw_data bm_measured; /* the measured bw */
|
||||
struct timeval bm_start_time; /* abs. time */
|
||||
};
|
||||
|
||||
#ifdef _KERNEL
|
||||
|
||||
struct sockopt;
|
||||
|
@ -124,15 +124,10 @@ extern struct protosw inetsw[];
|
||||
* The mbuf opt, if present, will not be freed.
|
||||
*/
|
||||
int
|
||||
ip_output(m0, opt, ro, flags, imo, inp)
|
||||
struct mbuf *m0;
|
||||
struct mbuf *opt;
|
||||
struct route *ro;
|
||||
int flags;
|
||||
struct ip_moptions *imo;
|
||||
struct inpcb *inp;
|
||||
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
|
||||
int flags, struct ip_moptions *imo, struct inpcb *inp)
|
||||
{
|
||||
struct ip *ip, *mhip;
|
||||
struct ip *ip;
|
||||
struct ifnet *ifp = NULL; /* keep compiler happy */
|
||||
struct mbuf *m;
|
||||
int hlen = sizeof (struct ip);
|
||||
@ -478,7 +473,7 @@ ip_output(m0, opt, ro, flags, imo, inp)
|
||||
goto bad;
|
||||
}
|
||||
/* don't allow broadcast messages to be fragmented */
|
||||
if ((u_short)ip->ip_len > ifp->if_mtu) {
|
||||
if (ip->ip_len > ifp->if_mtu) {
|
||||
error = EMSGSIZE;
|
||||
goto bad;
|
||||
}
|
||||
@ -1014,8 +1009,7 @@ pass:
|
||||
* If small enough for interface, or the interface will take
|
||||
* care of the fragmentation for us, can just send directly.
|
||||
*/
|
||||
if ((u_short)ip->ip_len <= ifp->if_mtu ||
|
||||
ifp->if_hwassist & CSUM_FRAGMENT) {
|
||||
if (ip->ip_len <= ifp->if_mtu || ifp->if_hwassist & CSUM_FRAGMENT) {
|
||||
ip->ip_len = htons(ip->ip_len);
|
||||
ip->ip_off = htons(ip->ip_off);
|
||||
ip->ip_sum = 0;
|
||||
@ -1057,10 +1051,7 @@ pass:
|
||||
(struct sockaddr *)dst, ro->ro_rt);
|
||||
goto done;
|
||||
}
|
||||
/*
|
||||
* Too large for interface; fragment if possible.
|
||||
* Must be able to put at least 8 bytes per fragment.
|
||||
*/
|
||||
|
||||
if (ip->ip_off & IP_DF) {
|
||||
error = EMSGSIZE;
|
||||
/*
|
||||
@ -1070,149 +1061,23 @@ pass:
|
||||
* them, there is no way for one to update all its
|
||||
* routes when the MTU is changed.
|
||||
*/
|
||||
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
|
||||
&& !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
|
||||
&& (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
|
||||
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
|
||||
!(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
|
||||
(ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
|
||||
ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
|
||||
}
|
||||
ipstat.ips_cantfrag++;
|
||||
goto bad;
|
||||
}
|
||||
len = (ifp->if_mtu - hlen) &~ 7;
|
||||
if (len < 8) {
|
||||
error = EMSGSIZE;
|
||||
|
||||
/*
|
||||
* Too large for interface; fragment if possible. If successful,
|
||||
* on return, m will point to a list of packets to be sent.
|
||||
*/
|
||||
error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
|
||||
if (error)
|
||||
goto bad;
|
||||
}
|
||||
|
||||
/*
|
||||
* if the interface will not calculate checksums on
|
||||
* fragmented packets, then do it here.
|
||||
*/
|
||||
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
|
||||
(ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
|
||||
in_delayed_cksum(m);
|
||||
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
|
||||
}
|
||||
|
||||
if (len > PAGE_SIZE) {
|
||||
/*
|
||||
* Fragement large datagrams such that each segment
|
||||
* contains a multiple of PAGE_SIZE amount of data,
|
||||
* plus headers. This enables a receiver to perform
|
||||
* page-flipping zero-copy optimizations.
|
||||
*/
|
||||
|
||||
int newlen;
|
||||
struct mbuf *mtmp;
|
||||
|
||||
for (mtmp = m, off = 0;
|
||||
mtmp && ((off + mtmp->m_len) <= ifp->if_mtu);
|
||||
mtmp = mtmp->m_next) {
|
||||
off += mtmp->m_len;
|
||||
}
|
||||
/*
|
||||
* firstlen (off - hlen) must be aligned on an
|
||||
* 8-byte boundary
|
||||
*/
|
||||
if (off < hlen)
|
||||
goto smart_frag_failure;
|
||||
off = ((off - hlen) & ~7) + hlen;
|
||||
newlen = (~PAGE_MASK) & ifp->if_mtu;
|
||||
if ((newlen + sizeof (struct ip)) > ifp->if_mtu) {
|
||||
/* we failed, go back the default */
|
||||
smart_frag_failure:
|
||||
newlen = len;
|
||||
off = hlen + len;
|
||||
}
|
||||
|
||||
/* printf("ipfrag: len = %d, hlen = %d, mhlen = %d, newlen = %d, off = %d\n",
|
||||
len, hlen, sizeof (struct ip), newlen, off);*/
|
||||
|
||||
len = newlen;
|
||||
|
||||
} else {
|
||||
off = hlen + len;
|
||||
}
|
||||
|
||||
|
||||
|
||||
{
|
||||
int mhlen, firstlen = off - hlen;
|
||||
struct mbuf **mnext = &m->m_nextpkt;
|
||||
int nfrags = 1;
|
||||
|
||||
/*
|
||||
* Loop through length of segment after first fragment,
|
||||
* make new header and copy data of each part and link onto chain.
|
||||
*/
|
||||
m0 = m;
|
||||
mhlen = sizeof (struct ip);
|
||||
for (; off < (u_short)ip->ip_len; off += len) {
|
||||
MGETHDR(m, M_DONTWAIT, MT_HEADER);
|
||||
if (m == 0) {
|
||||
error = ENOBUFS;
|
||||
ipstat.ips_odropped++;
|
||||
goto sendorfree;
|
||||
}
|
||||
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
|
||||
m->m_data += max_linkhdr;
|
||||
mhip = mtod(m, struct ip *);
|
||||
*mhip = *ip;
|
||||
if (hlen > sizeof (struct ip)) {
|
||||
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
|
||||
mhip->ip_v = IPVERSION;
|
||||
mhip->ip_hl = mhlen >> 2;
|
||||
}
|
||||
m->m_len = mhlen;
|
||||
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
|
||||
if (off + len >= (u_short)ip->ip_len)
|
||||
len = (u_short)ip->ip_len - off;
|
||||
else
|
||||
mhip->ip_off |= IP_MF;
|
||||
mhip->ip_len = htons((u_short)(len + mhlen));
|
||||
m->m_next = m_copy(m0, off, len);
|
||||
if (m->m_next == 0) {
|
||||
(void) m_free(m);
|
||||
error = ENOBUFS; /* ??? */
|
||||
ipstat.ips_odropped++;
|
||||
goto sendorfree;
|
||||
}
|
||||
m->m_pkthdr.len = mhlen + len;
|
||||
m->m_pkthdr.rcvif = (struct ifnet *)0;
|
||||
#ifdef MAC
|
||||
mac_create_fragment(m0, m);
|
||||
#endif
|
||||
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
|
||||
mhip->ip_off = htons(mhip->ip_off);
|
||||
mhip->ip_sum = 0;
|
||||
if (sw_csum & CSUM_DELAY_IP)
|
||||
mhip->ip_sum = in_cksum(m, mhlen);
|
||||
*mnext = m;
|
||||
mnext = &m->m_nextpkt;
|
||||
nfrags++;
|
||||
}
|
||||
ipstat.ips_ofragments += nfrags;
|
||||
|
||||
/* set first/last markers for fragment chain */
|
||||
m->m_flags |= M_LASTFRAG;
|
||||
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
|
||||
m0->m_pkthdr.csum_data = nfrags;
|
||||
|
||||
/*
|
||||
* Update first fragment by trimming what's been copied out
|
||||
* and updating header, then send each fragment (in order).
|
||||
*/
|
||||
m = m0;
|
||||
m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
|
||||
m->m_pkthdr.len = hlen + firstlen;
|
||||
ip->ip_len = htons((u_short)m->m_pkthdr.len);
|
||||
ip->ip_off |= IP_MF;
|
||||
ip->ip_off = htons(ip->ip_off);
|
||||
ip->ip_sum = 0;
|
||||
if (sw_csum & CSUM_DELAY_IP)
|
||||
ip->ip_sum = in_cksum(m, hlen);
|
||||
sendorfree:
|
||||
for (m = m0; m; m = m0) {
|
||||
for (; m; m = m0) {
|
||||
m0 = m->m_nextpkt;
|
||||
m->m_nextpkt = 0;
|
||||
#ifdef IPSEC
|
||||
@ -1234,7 +1099,7 @@ sendorfree:
|
||||
|
||||
if (error == 0)
|
||||
ipstat.ips_fragmented++;
|
||||
}
|
||||
|
||||
done:
|
||||
#ifdef IPSEC
|
||||
if (ro == &iproute && ro->ro_rt) {
|
||||
@ -1246,7 +1111,7 @@ done:
|
||||
printf("DP ip_output call free SP:%p\n", sp));
|
||||
key_freesp(sp);
|
||||
}
|
||||
#endif /* IPSEC */
|
||||
#endif
|
||||
#ifdef FAST_IPSEC
|
||||
if (ro == &iproute && ro->ro_rt) {
|
||||
RTFREE(ro->ro_rt);
|
||||
@ -1254,13 +1119,181 @@ done:
|
||||
}
|
||||
if (sp != NULL)
|
||||
KEY_FREESP(&sp);
|
||||
#endif /* FAST_IPSEC */
|
||||
#endif
|
||||
return (error);
|
||||
bad:
|
||||
m_freem(m);
|
||||
goto done;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a chain of fragments which fit the given mtu. m_frag points to the
|
||||
* mbuf to be fragmented; on return it points to the chain with the fragments.
|
||||
* Return 0 if no error. If error, m_frag may contain a partially built
|
||||
* chain of fragments that should be freed by the caller.
|
||||
*
|
||||
* if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
|
||||
* sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
|
||||
*/
|
||||
/*
 * Returns 0 on success, EMSGSIZE if IP_DF is set or fewer than 8 payload
 * bytes fit in a fragment, ENOBUFS if an mbuf allocation or copy fails.
 * On every path that reaches "done", *m_frag is set to the original packet
 * m0, with whatever fragments were built linked through m_nextpkt.
 */
int
|
||||
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
|
||||
u_long if_hwassist_flags, int sw_csum)
|
||||
{
|
||||
int error = 0;
|
||||
int hlen = ip->ip_hl << 2; /* header length in bytes (ip_hl counts 4-byte words) */
|
||||
int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
|
||||
int off; /* byte offset into m0 (header included) of the next fragment's data */
|
||||
struct mbuf *m0 = *m_frag; /* the original packet */
|
||||
int firstlen; /* payload bytes kept in the first fragment */
|
||||
struct mbuf **mnext; /* where to link the next fragment */
|
||||
int nfrags;
|
||||
|
||||
/* Early returns below leave *m_frag untouched. */
if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
|
||||
ipstat.ips_cantfrag++;
|
||||
return EMSGSIZE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Must be able to put at least 8 bytes per fragment.
|
||||
*/
|
||||
if (len < 8)
|
||||
return EMSGSIZE;
|
||||
|
||||
/*
|
||||
* If the interface will not calculate checksums on
|
||||
* fragmented packets, then do it here.
|
||||
*/
|
||||
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
|
||||
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
|
||||
in_delayed_cksum(m0);
|
||||
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
|
||||
}
|
||||
|
||||
if (len > PAGE_SIZE) {
|
||||
/*
|
||||
* Fragment large datagrams such that each segment
|
||||
* contains a multiple of PAGE_SIZE amount of data,
|
||||
* plus headers. This enables a receiver to perform
|
||||
* page-flipping zero-copy optimizations.
|
||||
*
|
||||
* XXX When does this help given that sender and receiver
|
||||
* could have different page sizes, and also mtu could
|
||||
* be less than the receiver's page size ?
|
||||
*/
|
||||
int newlen;
|
||||
struct mbuf *m;
|
||||
|
||||
/* Sum the lengths of the leading mbufs that fit within mtu. */
for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
|
||||
off += m->m_len;
|
||||
|
||||
/*
|
||||
* firstlen (off - hlen) must be aligned on an
|
||||
* 8-byte boundary
|
||||
*/
|
||||
if (off < hlen)
|
||||
goto smart_frag_failure;
|
||||
off = ((off - hlen) & ~7) + hlen;
|
||||
newlen = (~PAGE_MASK) & mtu;
|
||||
if ((newlen + sizeof (struct ip)) > mtu) {
|
||||
/* we failed, go back to the default */
|
||||
smart_frag_failure:
|
||||
newlen = len;
|
||||
off = hlen + len;
|
||||
}
|
||||
len = newlen;
|
||||
|
||||
} else {
|
||||
off = hlen + len;
|
||||
}
|
||||
|
||||
firstlen = off - hlen;
|
||||
mnext = &m0->m_nextpkt; /* pointer to next packet */
|
||||
|
||||
/*
|
||||
* Loop through length of segment after first fragment,
|
||||
* make new header and copy data of each part and link onto chain.
|
||||
* Here, m0 is the original packet, m is the fragment being created.
|
||||
* The fragments are linked off the m_nextpkt of the original
|
||||
* packet, which after processing serves as the first fragment.
|
||||
*/
|
||||
/*
 * nfrags starts at 1 to count the original packet as the first fragment.
 * ip->ip_len and ip->ip_off are used as host-order values throughout the
 * loop; they are converted with htons() only after it.
 */
for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
|
||||
struct ip *mhip; /* ip header on the fragment */
|
||||
struct mbuf *m;
|
||||
int mhlen = sizeof (struct ip);
|
||||
|
||||
MGETHDR(m, M_DONTWAIT, MT_HEADER);
|
||||
if (m == 0) {
|
||||
error = ENOBUFS;
|
||||
ipstat.ips_odropped++;
|
||||
goto done;
|
||||
}
|
||||
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
|
||||
/*
|
||||
* In the first mbuf, leave room for the link header, then
|
||||
* copy the original IP header including options. The payload
|
||||
* goes into an additional mbuf chain returned by m_copy().
|
||||
*/
|
||||
m->m_data += max_linkhdr;
|
||||
mhip = mtod(m, struct ip *);
|
||||
*mhip = *ip;
|
||||
/* Original header carries options: rebuild them and the header length for this fragment. */
if (hlen > sizeof (struct ip)) {
|
||||
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
|
||||
mhip->ip_v = IPVERSION;
|
||||
mhip->ip_hl = mhlen >> 2;
|
||||
}
|
||||
m->m_len = mhlen;
|
||||
/* XXX do we need to add ip->ip_off below ? */
|
||||
/* Fragment offset field is expressed in 8-byte units, hence the >> 3. */
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
|
||||
if (off + len >= ip->ip_len) { /* last fragment */
|
||||
/* Shrink len to the bytes that remain; the loop's off += len then reaches ip_len and ends the loop. */
len = ip->ip_len - off;
|
||||
m->m_flags |= M_LASTFRAG;
|
||||
} else
|
||||
mhip->ip_off |= IP_MF;
|
||||
mhip->ip_len = htons((u_short)(len + mhlen));
|
||||
m->m_next = m_copy(m0, off, len);
|
||||
if (m->m_next == 0) { /* copy failed */
|
||||
/* m is not linked onto the chain yet, so it must be freed here. */
m_free(m);
|
||||
error = ENOBUFS; /* ??? */
|
||||
ipstat.ips_odropped++;
|
||||
goto done;
|
||||
}
|
||||
m->m_pkthdr.len = mhlen + len;
|
||||
m->m_pkthdr.rcvif = (struct ifnet *)0;
|
||||
#ifdef MAC
|
||||
mac_create_fragment(m0, m);
|
||||
#endif
|
||||
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
|
||||
mhip->ip_off = htons(mhip->ip_off);
|
||||
mhip->ip_sum = 0;
|
||||
if (sw_csum & CSUM_DELAY_IP)
|
||||
mhip->ip_sum = in_cksum(m, mhlen);
|
||||
/* Append the finished fragment to the m_nextpkt chain. */
*mnext = m;
|
||||
mnext = &m->m_nextpkt;
|
||||
}
|
||||
ipstat.ips_ofragments += nfrags;
|
||||
|
||||
/* set first marker for fragment chain */
|
||||
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
|
||||
/* The fragment count is stored in the first fragment's csum_data field. */
m0->m_pkthdr.csum_data = nfrags;
|
||||
|
||||
/*
|
||||
* Update first fragment by trimming what's been copied out
|
||||
* and updating header.
|
||||
*/
|
||||
/* NOTE(review): the count passed is negative here (hlen + firstlen < ip_len); presumably m_adj() then trims from the tail -- confirm. */
m_adj(m0, hlen + firstlen - ip->ip_len);
|
||||
m0->m_pkthdr.len = hlen + firstlen;
|
||||
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
|
||||
ip->ip_off |= IP_MF;
|
||||
ip->ip_off = htons(ip->ip_off);
|
||||
ip->ip_sum = 0;
|
||||
if (sw_csum & CSUM_DELAY_IP)
|
||||
ip->ip_sum = in_cksum(m0, hlen);
|
||||
|
||||
done:
|
||||
/* Reached on success and on ENOBUFS alike; the caller receives m0 plus any fragments linked so far. */
*m_frag = m0;
|
||||
return error;
|
||||
}
|
||||
|
||||
void
|
||||
in_delayed_cksum(struct mbuf *m)
|
||||
{
|
||||
@ -1307,7 +1340,7 @@ ip_insertoptions(m, opt, phlen)
|
||||
unsigned optlen;
|
||||
|
||||
optlen = opt->m_len - sizeof(p->ipopt_dst);
|
||||
if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
|
||||
if (optlen + ip->ip_len > IP_MAXPACKET) {
|
||||
*phlen = 0;
|
||||
return (m); /* XXX should fail */
|
||||
}
|
||||
|
@ -164,6 +164,8 @@ extern struct pr_usrreqs rip_usrreqs;
|
||||
|
||||
int ip_ctloutput(struct socket *, struct sockopt *sopt);
|
||||
void ip_drain(void);
|
||||
int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
|
||||
u_long if_hwassist_flags, int sw_csum);
|
||||
void ip_freemoptions(struct ip_moptions *);
|
||||
void ip_init(void);
|
||||
extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
|
||||
|
@ -383,6 +383,10 @@ rip_ctloutput(so, sopt)
|
||||
case MRT_DEL_MFC:
|
||||
case MRT_VERSION:
|
||||
case MRT_ASSERT:
|
||||
case MRT_API_SUPPORT:
|
||||
case MRT_API_CONFIG:
|
||||
case MRT_ADD_BW_UPCALL:
|
||||
case MRT_DEL_BW_UPCALL:
|
||||
error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
|
||||
EOPNOTSUPP;
|
||||
break;
|
||||
@ -448,6 +452,10 @@ rip_ctloutput(so, sopt)
|
||||
case MRT_DEL_MFC:
|
||||
case MRT_VERSION:
|
||||
case MRT_ASSERT:
|
||||
case MRT_API_SUPPORT:
|
||||
case MRT_API_CONFIG:
|
||||
case MRT_ADD_BW_UPCALL:
|
||||
case MRT_DEL_BW_UPCALL:
|
||||
error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
|
||||
EOPNOTSUPP;
|
||||
break;
|
||||
|
Loading…
x
Reference in New Issue
Block a user