diff --git a/share/man/man9/mbuf.9 b/share/man/man9/mbuf.9 index 1074d85e844a..e80df0df922b 100644 --- a/share/man/man9/mbuf.9 +++ b/share/man/man9/mbuf.9 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd November 18, 2005 +.Dd March 15, 2006 .Dt MBUF 9 .Os .\" @@ -132,6 +132,8 @@ .Fn m_getptr "struct mbuf *mbuf" "int loc" "int *off" .Ft struct mbuf * .Fn m_defrag "struct mbuf *m0" "int how" +.Ft struct mbuf * +.Fn m_unshare "struct mbuf *m0" "int how" .\" .Sh DESCRIPTION An @@ -886,6 +888,26 @@ depending on the caller's preference. This function is especially useful in network drivers, where certain long mbuf chains must be shortened before being added to TX descriptor lists. +.It Fn m_unshare m0 how +Create a version of the specified mbuf chain whose +contents can be safely modified without affecting other users. +If allocation fails and this operation cannot be completed, +.Dv NULL +will be returned. +The original mbuf chain is always reclaimed and the reference +count of any shared mbuf clusters is decremented. +.Fa how +should be either +.Dv M_TRYWAIT +or +.Dv M_DONTWAIT , +depending on the caller's preference. +As a side-effect of this process the returned +mbuf chain may be compacted. +.Pp +This function is especially useful in the transmit path of +network code, when data must be encrypted or otherwise +altered prior to transmission. .El .Sh HARDWARE-ASSISTED CHECKSUM CALCULATION This section currently applies to TCP/IP only. diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 887db0048b2c..5c4c5bc51a86 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -1679,3 +1679,156 @@ m_align(struct mbuf *m, int len) adjust = MLEN - len; m->m_data += adjust &~ (sizeof(long)-1); } + +/* + * Create a writable copy of the mbuf chain. While doing this + * we compact the chain with a goal of producing a chain with + * at most two mbufs. The second mbuf in this chain is likely + * to be a cluster. 
The primary purpose of this work is to create + * a writable packet for encryption, compression, etc. The + * secondary goal is to linearize the data so the data can be + * passed to crypto hardware in the most efficient manner possible. + */ +struct mbuf * +m_unshare(struct mbuf *m0, int how) +{ + struct mbuf *m, *mprev; + struct mbuf *n, *mfirst, *mlast; + int len, off; + + mprev = NULL; + for (m = m0; m != NULL; m = mprev->m_next) { + /* + * Regular mbufs are ignored unless there's a cluster + * in front of it that we can use to coalesce. We do + * the latter mainly so later clusters can be coalesced + * also w/o having to handle them specially (i.e. convert + * mbuf+cluster -> cluster). This optimization is heavily + * influenced by the assumption that we're running over + * Ethernet where MCLBYTES is large enough that the max + * packet size will permit lots of coalescing into a + * single cluster. This in turn permits efficient + * crypto operations, especially when using hardware. + */ + if ((m->m_flags & M_EXT) == 0) { + if (mprev && (mprev->m_flags & M_EXT) && + m->m_len <= M_TRAILINGSPACE(mprev)) { + /* XXX: this ignores mbuf types */ + memcpy(mtod(mprev, caddr_t) + mprev->m_len, + mtod(m, caddr_t), m->m_len); + mprev->m_len += m->m_len; + mprev->m_next = m->m_next; /* unlink from chain */ + m_free(m); /* reclaim mbuf */ +#if 0 + newipsecstat.ips_mbcoalesced++; +#endif + } else { + mprev = m; + } + continue; + } + /* + * Writable mbufs are left alone (for now). + */ + if (M_WRITABLE(m)) { + mprev = m; + continue; + } + + /* + * Not writable, replace with a copy or coalesce with + * the previous mbuf if possible (since we have to copy + * it anyway, we try to reduce the number of mbufs and + * clusters so that future work is easier). 
+ */ + KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); + /* NB: we only coalesce into a cluster or larger */ + if (mprev != NULL && (mprev->m_flags & M_EXT) && + m->m_len <= M_TRAILINGSPACE(mprev)) { + /* XXX: this ignores mbuf types */ + memcpy(mtod(mprev, caddr_t) + mprev->m_len, + mtod(m, caddr_t), m->m_len); + mprev->m_len += m->m_len; + mprev->m_next = m->m_next; /* unlink from chain */ + m_free(m); /* reclaim mbuf */ +#if 0 + newipsecstat.ips_clcoalesced++; +#endif + continue; + } + + /* + * Allocate new space to hold the copy... + */ + /* XXX why can M_PKTHDR be set past the first mbuf? */ + if (mprev == NULL && (m->m_flags & M_PKTHDR)) { + /* + * NB: if a packet header is present we must + * allocate the mbuf separately from any cluster + * because M_MOVE_PKTHDR will smash the data + * pointer and drop the M_EXT marker. + */ + MGETHDR(n, how, m->m_type); + if (n == NULL) { + m_freem(m0); + return (NULL); + } + M_MOVE_PKTHDR(n, m); + MCLGET(n, how); + if ((n->m_flags & M_EXT) == 0) { + m_free(n); + m_freem(m0); + return (NULL); + } + } else { + n = m_getcl(how, m->m_type, m->m_flags); + if (n == NULL) { + m_freem(m0); + return (NULL); + } + } + /* + * ... and copy the data. We deal with jumbo mbufs + * (i.e. m_len > MCLBYTES) by splitting them into + * clusters. We could just malloc a buffer and make + * it external but too many device drivers don't know + * how to break up the non-contiguous memory when + * doing DMA. 
+ */ + len = m->m_len; + off = 0; + mfirst = n; + mlast = NULL; + for (;;) { + int cc = min(len, MCLBYTES); + memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); + n->m_len = cc; + if (mlast != NULL) + mlast->m_next = n; + mlast = n; +#if 0 + newipsecstat.ips_clcopied++; +#endif + + len -= cc; + if (len <= 0) + break; + off += cc; + + n = m_getcl(how, m->m_type, m->m_flags); + if (n == NULL) { + m_freem(mfirst); + m_freem(m0); + return (NULL); + } + } + n->m_next = m->m_next; + if (mprev == NULL) + m0 = mfirst; /* new head of chain */ + else + mprev->m_next = mfirst; /* replace old mbuf */ + m_free(m); /* release old mbuf */ + mprev = mfirst; + } + return (m0); +} diff --git a/sys/netipsec/ipsec.h b/sys/netipsec/ipsec.h index e08b94acc691..8fd10686801b 100644 --- a/sys/netipsec/ipsec.h +++ b/sys/netipsec/ipsec.h @@ -410,7 +410,6 @@ extern struct mbuf *ipsec_copypkt __P((struct mbuf *)); extern void m_checkalignment(const char* where, struct mbuf *m0, int off, int len); -extern struct mbuf *m_clone(struct mbuf *m0); extern struct mbuf *m_makespace(struct mbuf *m0, int skip, int hlen, int *off); extern caddr_t m_pad(struct mbuf *m, int n); extern int m_striphdr(struct mbuf *m, int skip, int hlen); diff --git a/sys/netipsec/ipsec_mbuf.c b/sys/netipsec/ipsec_mbuf.c index ade7d3313bd0..b63a5afc8f3b 100644 --- a/sys/netipsec/ipsec_mbuf.c +++ b/sys/netipsec/ipsec_mbuf.c @@ -42,155 +42,6 @@ #include -/* - * Create a writable copy of the mbuf chain. While doing this - * we compact the chain with a goal of producing a chain with - * at most two mbufs. The second mbuf in this chain is likely - * to be a cluster. The primary purpose of this work is to create - * a writable packet for encryption, compression, etc. The - * secondary goal is to linearize the data so the data can be - * passed to crypto hardware in the most efficient manner possible. 
- */ -struct mbuf * -m_clone(struct mbuf *m0) -{ - struct mbuf *m, *mprev; - struct mbuf *n, *mfirst, *mlast; - int len, off; - - IPSEC_ASSERT(m0 != NULL, ("null mbuf")); - - mprev = NULL; - for (m = m0; m != NULL; m = mprev->m_next) { - /* - * Regular mbufs are ignored unless there's a cluster - * in front of it that we can use to coalesce. We do - * the latter mainly so later clusters can be coalesced - * also w/o having to handle them specially (i.e. convert - * mbuf+cluster -> cluster). This optimization is heavily - * influenced by the assumption that we're running over - * Ethernet where MCLBYTES is large enough that the max - * packet size will permit lots of coalescing into a - * single cluster. This in turn permits efficient - * crypto operations, especially when using hardware. - */ - if ((m->m_flags & M_EXT) == 0) { - if (mprev && (mprev->m_flags & M_EXT) && - m->m_len <= M_TRAILINGSPACE(mprev)) { - /* XXX: this ignores mbuf types */ - memcpy(mtod(mprev, caddr_t) + mprev->m_len, - mtod(m, caddr_t), m->m_len); - mprev->m_len += m->m_len; - mprev->m_next = m->m_next; /* unlink from chain */ - m_free(m); /* reclaim mbuf */ - newipsecstat.ips_mbcoalesced++; - } else { - mprev = m; - } - continue; - } - /* - * Writable mbufs are left alone (for now). - */ - if (M_WRITABLE(m)) { - mprev = m; - continue; - } - - /* - * Not writable, replace with a copy or coalesce with - * the previous mbuf if possible (since we have to copy - * it anyway, we try to reduce the number of mbufs and - * clusters so that future work is easier). 
- */ - IPSEC_ASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); - /* NB: we only coalesce into a cluster or larger */ - if (mprev != NULL && (mprev->m_flags & M_EXT) && - m->m_len <= M_TRAILINGSPACE(mprev)) { - /* XXX: this ignores mbuf types */ - memcpy(mtod(mprev, caddr_t) + mprev->m_len, - mtod(m, caddr_t), m->m_len); - mprev->m_len += m->m_len; - mprev->m_next = m->m_next; /* unlink from chain */ - m_free(m); /* reclaim mbuf */ - newipsecstat.ips_clcoalesced++; - continue; - } - - /* - * Allocate new space to hold the copy... - */ - /* XXX why can M_PKTHDR be set past the first mbuf? */ - if (mprev == NULL && (m->m_flags & M_PKTHDR)) { - /* - * NB: if a packet header is present we must - * allocate the mbuf separately from any cluster - * because M_MOVE_PKTHDR will smash the data - * pointer and drop the M_EXT marker. - */ - MGETHDR(n, M_DONTWAIT, m->m_type); - if (n == NULL) { - m_freem(m0); - return (NULL); - } - M_MOVE_PKTHDR(n, m); - MCLGET(n, M_DONTWAIT); - if ((n->m_flags & M_EXT) == 0) { - m_free(n); - m_freem(m0); - return (NULL); - } - } else { - n = m_getcl(M_DONTWAIT, m->m_type, m->m_flags); - if (n == NULL) { - m_freem(m0); - return (NULL); - } - } - /* - * ... and copy the data. We deal with jumbo mbufs - * (i.e. m_len > MCLBYTES) by splitting them into - * clusters. We could just malloc a buffer and make - * it external but too many device drivers don't know - * how to break up the non-contiguous memory when - * doing DMA. 
- */ - len = m->m_len; - off = 0; - mfirst = n; - mlast = NULL; - for (;;) { - int cc = min(len, MCLBYTES); - memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); - n->m_len = cc; - if (mlast != NULL) - mlast->m_next = n; - mlast = n; - newipsecstat.ips_clcopied++; - - len -= cc; - if (len <= 0) - break; - off += cc; - - n = m_getcl(M_DONTWAIT, m->m_type, m->m_flags); - if (n == NULL) { - m_freem(mfirst); - m_freem(m0); - return (NULL); - } - } - n->m_next = m->m_next; - if (mprev == NULL) - m0 = mfirst; /* new head of chain */ - else - mprev->m_next = mfirst; /* replace old mbuf */ - m_free(m); /* release old mbuf */ - mprev = mfirst; - } - return (m0); -} - /* * Make space for a new header of length hlen at skip bytes * into the packet. When doing this we allocate new mbufs only diff --git a/sys/netipsec/xform_ah.c b/sys/netipsec/xform_ah.c index 9b830d0e641c..3b1f66538fb1 100644 --- a/sys/netipsec/xform_ah.c +++ b/sys/netipsec/xform_ah.c @@ -942,7 +942,7 @@ ah_output( /* Update the counters. */ ahstat.ahs_obytes += m->m_pkthdr.len - skip; - m = m_clone(m); + m = m_unshare(m, M_NOWAIT); if (m == NULL) { DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), diff --git a/sys/netipsec/xform_esp.c b/sys/netipsec/xform_esp.c index 7fe303e04929..fdcee7f95ae1 100644 --- a/sys/netipsec/xform_esp.c +++ b/sys/netipsec/xform_esp.c @@ -713,7 +713,7 @@ esp_output( /* Update the counters. 
*/ espstat.esps_obytes += m->m_pkthdr.len - skip; - m = m_clone(m); + m = m_unshare(m, M_NOWAIT); if (m == NULL) { DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); diff --git a/sys/netipsec/xform_ipcomp.c b/sys/netipsec/xform_ipcomp.c index 90e74894de02..1ad6c492eb62 100644 --- a/sys/netipsec/xform_ipcomp.c +++ b/sys/netipsec/xform_ipcomp.c @@ -385,7 +385,7 @@ ipcomp_output( /* Update the counters */ ipcompstat.ipcomps_obytes += m->m_pkthdr.len - skip; - m = m_clone(m); + m = m_unshare(m, M_NOWAIT); if (m == NULL) { ipcompstat.ipcomps_hdrops++; DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n", diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index 4fb1ae8ad115..fee31976f19e 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -677,6 +677,7 @@ struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int); +struct mbuf *m_unshare(struct mbuf *, int how); /*- * Network packets may have annotations attached by affixing a list