From 11cf751df214820c6fc8508f89595518a89aff8d Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Tue, 5 Jan 2016 21:20:46 +0000 Subject: [PATCH 01/48] Disable abi variant hook until strangeness with packages can be sorted out. --- libexec/rtld-elf/rtld.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libexec/rtld-elf/rtld.c b/libexec/rtld-elf/rtld.c index 6daef2d13405..424f2eafd7ce 100644 --- a/libexec/rtld-elf/rtld.c +++ b/libexec/rtld-elf/rtld.c @@ -435,7 +435,7 @@ _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp) trust = !issetugid(); - md_abi_variant_hook(aux_info); +/* md_abi_variant_hook(aux_info); */ ld_bind_now = getenv(_LD("BIND_NOW")); /* From 71b902050d159268fdc5e15c4fc9d143bf72bcaf Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Tue, 5 Jan 2016 21:20:47 +0000 Subject: [PATCH 02/48] Use the more proper -f. Leave /bin/rm in place since that's what other rc scripts have, though it isn't strictly necessary. --- etc/rc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etc/rc b/etc/rc index 2c90f385b88b..2127d7895925 100644 --- a/etc/rc +++ b/etc/rc @@ -132,9 +132,9 @@ done # Remove the firstboot sentinel, and reboot if it was requested. if [ -e ${firstboot_sentinel} ]; then [ ${root_rw_mount} = "yes" ] || mount -uw / - /bin/rm ${firstboot_sentinel} + /bin/rm -f ${firstboot_sentinel} if [ -e ${firstboot_sentinel}-reboot ]; then - /bin/rm ${firstboot_sentinel}-reboot + /bin/rm -f ${firstboot_sentinel}-reboot [ ${root_rw_mount} = "yes" ] || mount -ur / kill -INT 1 fi From a85f12322cd3cf3d794dc78ecbfa09728f23948b Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Wed, 6 Jan 2016 00:00:11 +0000 Subject: [PATCH 03/48] "source routing" in rpcbind Fix a bug in rpcbind for multihomed hosts. If the server had interfaces on two separate subnets, and a client on the first subnet contacted rpcbind at the address on the second subnet, rpcbind would advertise addresses on the first subnet. 
This is a bug, because it should prefer to advertise the address where it was contacted. The requested service might be firewalled off from the address on the first subnet, for example. usr.sbin/rpcbind/check_bound.c If the address on which a request was received is known, pass that to addrmerge as the clnt_uaddr parameter. That is what addrmerge's comment indicates the parameter is supposed to mean. The previous behavior is that clnt_uaddr would contain the address from which the client sent the request. usr.sbin/rpcbind/util.c Modify addrmerge to prefer to use an IP that is equal to clnt_uaddr, if one is found. Refactor the relevant portion of the function for clarity, and to reduce the number of ifdefs. etc/mtree/BSD.tests.dist usr.sbin/rpcbind/tests/Makefile usr.sbin/rpcbind/tests/addrmerge_test.c Add unit tests for usr.sbin/rpcbind/util.c:addrmerge. usr.sbin/rpcbind/check_bound.c usr.sbin/rpcbind/rpcbind.h usr.sbin/rpcbind/util.c Constify some function arguments Reviewed by: imp MFC after: 4 weeks Sponsored by: Spectra Logic Corp Differential Revision: https://reviews.freebsd.org/D4690 --- etc/mtree/BSD.tests.dist | 2 + usr.sbin/rpcbind/Makefile | 4 + usr.sbin/rpcbind/check_bound.c | 15 +- usr.sbin/rpcbind/rpcbind.h | 6 +- usr.sbin/rpcbind/tests/Makefile | 17 + usr.sbin/rpcbind/tests/addrmerge_test.c | 849 ++++++++++++++++++++++++ usr.sbin/rpcbind/util.c | 145 ++-- 7 files changed, 985 insertions(+), 53 deletions(-) create mode 100644 usr.sbin/rpcbind/tests/Makefile create mode 100644 usr.sbin/rpcbind/tests/addrmerge_test.c diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist index ad764021857c..ff3232417cf7 100644 --- a/etc/mtree/BSD.tests.dist +++ b/etc/mtree/BSD.tests.dist @@ -622,6 +622,8 @@ .. pw .. + rpcbind + .. sa .. .. 
diff --git a/usr.sbin/rpcbind/Makefile b/usr.sbin/rpcbind/Makefile index 2b679f416277..b3282601efc7 100644 --- a/usr.sbin/rpcbind/Makefile +++ b/usr.sbin/rpcbind/Makefile @@ -14,6 +14,10 @@ CFLAGS+= -DPORTMAP -DLIBWRAP CFLAGS+= -DINET6 .endif +.if ${MK_TESTS} != "no" +SUBDIR+= tests +.endif + WARNS?= 1 LIBADD= wrap diff --git a/usr.sbin/rpcbind/check_bound.c b/usr.sbin/rpcbind/check_bound.c index 3691f5f1a2f0..64b73c741d81 100644 --- a/usr.sbin/rpcbind/check_bound.c +++ b/usr.sbin/rpcbind/check_bound.c @@ -50,6 +50,7 @@ static char sccsid[] = "@(#)check_bound.c 1.11 89/04/21 Copyr 1989 Sun Micro"; #include #include #include +#include #include #include #include @@ -159,6 +160,7 @@ char * mergeaddr(SVCXPRT *xprt, char *netid, char *uaddr, char *saddr) { struct fdlist *fdl; + struct svc_dg_data *dg_data; char *c_uaddr, *s_uaddr, *m_uaddr, *allocated_uaddr = NULL; for (fdl = fdhead; fdl; fdl = fdl->next) @@ -170,11 +172,20 @@ mergeaddr(SVCXPRT *xprt, char *netid, char *uaddr, char *saddr) /* that server died */ return (nullstring); /* + * Try to determine the local address on which the client contacted us, + * so we can send a reply from the same address. If it's unknown, then + * try to determine which address the client used, and pick a nearby + * local address. + * * If saddr is not NULL, the remote client may have included the * address by which it contacted us. Use that for the "client" uaddr, * otherwise use the info from the SVCXPRT. */ - if (saddr != NULL) { + dg_data = (struct svc_dg_data*)xprt->xp_p2; + if (dg_data != NULL && dg_data->su_srcaddr.buf != NULL) { + c_uaddr = taddr2uaddr(fdl->nconf, &dg_data->su_srcaddr); + } + else if (saddr != NULL) { c_uaddr = saddr; } else { c_uaddr = taddr2uaddr(fdl->nconf, svc_getrpccaller(xprt)); @@ -217,7 +228,7 @@ mergeaddr(SVCXPRT *xprt, char *netid, char *uaddr, char *saddr) * structure should not be freed. 
*/ struct netconfig * -rpcbind_get_conf(char *netid) +rpcbind_get_conf(const char *netid) { struct fdlist *fdl; diff --git a/usr.sbin/rpcbind/rpcbind.h b/usr.sbin/rpcbind/rpcbind.h index 4aba42042a6b..309bc0ba77da 100644 --- a/usr.sbin/rpcbind/rpcbind.h +++ b/usr.sbin/rpcbind/rpcbind.h @@ -85,7 +85,7 @@ extern char *tcp_uaddr; /* Universal TCP address */ int add_bndlist(struct netconfig *, struct netbuf *); bool_t is_bound(char *, char *); char *mergeaddr(SVCXPRT *, char *, char *, char *); -struct netconfig *rpcbind_get_conf(char *); +struct netconfig *rpcbind_get_conf(const char *); void rpcbs_init(void); void rpcbs_procinfo(rpcvers_t, rpcproc_t); @@ -134,8 +134,8 @@ extern void pmap_service(struct svc_req *, SVCXPRT *); void write_warmstart(void); void read_warmstart(void); -char *addrmerge(struct netbuf *caller, char *serv_uaddr, char *clnt_uaddr, - char *netid); +char *addrmerge(struct netbuf *caller, const char *serv_uaddr, + const char *clnt_uaddr, char const *netid); int listen_addr(const struct sockaddr *sa); void network_init(void); struct sockaddr *local_sa(int); diff --git a/usr.sbin/rpcbind/tests/Makefile b/usr.sbin/rpcbind/tests/Makefile new file mode 100644 index 000000000000..4b0cf15f13da --- /dev/null +++ b/usr.sbin/rpcbind/tests/Makefile @@ -0,0 +1,17 @@ +# $FreeBSD$ + +.include + +.PATH: ${.CURDIR}/.. + +ATF_TESTS_C= addrmerge_test +CFLAGS+= -I${.CURDIR}/.. -Wno-cast-qual +SRCS.addrmerge_test= addrmerge_test.c util.c + +.if ${MK_INET6_SUPPORT} != "no" +CFLAGS+= -DINET6 +.endif + +WARNS?= 3 + +.include diff --git a/usr.sbin/rpcbind/tests/addrmerge_test.c b/usr.sbin/rpcbind/tests/addrmerge_test.c new file mode 100644 index 000000000000..357354af1c70 --- /dev/null +++ b/usr.sbin/rpcbind/tests/addrmerge_test.c @@ -0,0 +1,849 @@ +/*- + * Copyright (c) 2014 Spectra Logic Corporation + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include "rpcbind.h" + +#define MAX_IFADDRS 16 + +int debugging = false; + +/* Data for mocking getifaddrs */ +struct ifaddr_storage { + struct ifaddrs ifaddr; + struct sockaddr_storage addr; + struct sockaddr_storage mask; + struct sockaddr_storage bcast; +} mock_ifaddr_storage[MAX_IFADDRS]; +struct ifaddrs *mock_ifaddrs = NULL; +int ifaddr_count = 0; + +/* Data for mocking listen_addr */ +int bind_address_count = 0; +struct sockaddr* bind_addresses[MAX_IFADDRS]; + +/* Stub library functions */ +void +freeifaddrs(struct ifaddrs *ifp __unused) +{ + return ; +} + +int +getifaddrs(struct ifaddrs **ifap) +{ + *ifap = mock_ifaddrs; + return (0); +} + +static void +mock_ifaddr4(const char* name, const char* addr, const char* mask, + const char* bcast, unsigned int flags, bool bind) +{ + struct ifaddrs *ifaddr = &mock_ifaddr_storage[ifaddr_count].ifaddr; + struct sockaddr_in *in = (struct sockaddr_in*) + &mock_ifaddr_storage[ifaddr_count].addr; + struct sockaddr_in *mask_in = (struct sockaddr_in*) + &mock_ifaddr_storage[ifaddr_count].mask; + struct sockaddr_in *bcast_in = (struct sockaddr_in*) + &mock_ifaddr_storage[ifaddr_count].bcast; + + in->sin_family = AF_INET; + in->sin_port = 0; + in->sin_len = sizeof(*in); /* size of the sockaddr, not of the pointer */ + in->sin_addr.s_addr = inet_addr(addr); + mask_in->sin_family = AF_INET; + mask_in->sin_port = 0; + mask_in->sin_len = sizeof(*mask_in); + mask_in->sin_addr.s_addr = inet_addr(mask); + bcast_in->sin_family = AF_INET; + bcast_in->sin_port = 0; + bcast_in->sin_len = sizeof(*bcast_in); + bcast_in->sin_addr.s_addr = inet_addr(bcast); + *ifaddr = (struct ifaddrs) { + .ifa_next = NULL, + .ifa_name = (char*) name, + .ifa_flags = flags, + .ifa_addr = (struct sockaddr*) in, + .ifa_netmask = (struct sockaddr*) mask_in, + .ifa_broadaddr = (struct sockaddr*) bcast_in, + .ifa_data = NULL, /* addrmerge doesn't care*/ + }; + + if (ifaddr_count > 
0) + mock_ifaddr_storage[ifaddr_count - 1].ifaddr.ifa_next = ifaddr; + ifaddr_count++; + mock_ifaddrs = &mock_ifaddr_storage[0].ifaddr; + + /* Optionally simulate binding an ip ala "rpcbind -h foo" */ + if (bind) { + bind_addresses[bind_address_count] = (struct sockaddr*)in; + bind_address_count++; + } +} + +#ifdef INET6 +static void +mock_ifaddr6(const char* name, const char* addr, const char* mask, + const char* bcast, unsigned int flags, uint32_t scope_id, bool bind) +{ + struct ifaddrs *ifaddr = &mock_ifaddr_storage[ifaddr_count].ifaddr; + struct sockaddr_in6 *in6 = (struct sockaddr_in6*) + &mock_ifaddr_storage[ifaddr_count].addr; + struct sockaddr_in6 *mask_in6 = (struct sockaddr_in6*) + &mock_ifaddr_storage[ifaddr_count].mask; + struct sockaddr_in6 *bcast_in6 = (struct sockaddr_in6*) + &mock_ifaddr_storage[ifaddr_count].bcast; + + in6->sin6_family = AF_INET6; + in6->sin6_port = 0; + in6->sin6_len = sizeof(*in6); + in6->sin6_scope_id = scope_id; + ATF_REQUIRE_EQ(1, inet_pton(AF_INET6, addr, (void*)&in6->sin6_addr)); + mask_in6->sin6_family = AF_INET6; + mask_in6->sin6_port = 0; + mask_in6->sin6_len = sizeof(*mask_in6); + mask_in6->sin6_scope_id = scope_id; + ATF_REQUIRE_EQ(1, inet_pton(AF_INET6, mask, + (void*)&mask_in6->sin6_addr)); + bcast_in6->sin6_family = AF_INET6; + bcast_in6->sin6_port = 0; + bcast_in6->sin6_len = sizeof(*bcast_in6); + bcast_in6->sin6_scope_id = scope_id; + ATF_REQUIRE_EQ(1, inet_pton(AF_INET6, bcast, + (void*)&bcast_in6->sin6_addr)); + *ifaddr = (struct ifaddrs) { + .ifa_next = NULL, + .ifa_name = (char*) name, + .ifa_flags = flags, + .ifa_addr = (struct sockaddr*) in6, + .ifa_netmask = (struct sockaddr*) mask_in6, + .ifa_broadaddr = (struct sockaddr*) bcast_in6, + .ifa_data = NULL, /* addrmerge doesn't care*/ + }; + + if (ifaddr_count > 0) + mock_ifaddr_storage[ifaddr_count - 1].ifaddr.ifa_next = ifaddr; + ifaddr_count++; + mock_ifaddrs = &mock_ifaddr_storage[0].ifaddr; + + /* Optionally simulate binding an ip ala "rpcbind -h foo" */ 
+ if (bind) { + bind_addresses[bind_address_count] = (struct sockaddr*)in6; + bind_address_count++; + } +} +#else +static void +mock_ifaddr6(const char* name __unused, const char* addr __unused, + const char* mask __unused, const char* bcast __unused, + unsigned int flags __unused, uint32_t scope_id __unused, bool bind __unused) +{ +} +#endif /*INET6 */ + +static void +mock_lo0(void) +{ + /* + * This broadcast address looks wrong, but it's what getifaddrs(2) + * actually returns. It's invalid because IFF_BROADCAST is not set + */ + mock_ifaddr4("lo0", "127.0.0.1", "255.0.0.0", "127.0.0.1", + IFF_LOOPBACK | IFF_UP | IFF_RUNNING | IFF_MULTICAST, false); + mock_ifaddr6("lo0", "::1", "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", + "::1", + IFF_LOOPBACK | IFF_UP | IFF_RUNNING | IFF_MULTICAST, 0, false); +} + +static void +mock_igb0(void) +{ + mock_ifaddr4("igb0", "192.0.2.2", "255.255.255.128", "192.0.2.127", + IFF_UP | IFF_BROADCAST | IFF_RUNNING | IFF_SIMPLEX | IFF_MULTICAST, + false); + mock_ifaddr6("igb0", "2001:db8::2", "ffff:ffff:ffff:ffff::", + "2001:db8::ffff:ffff:ffff:ffff", + IFF_UP | IFF_BROADCAST | IFF_RUNNING | IFF_SIMPLEX | IFF_MULTICAST, + 0, false); + /* Link local address */ + mock_ifaddr6("igb0", "fe80::2", "ffff:ffff:ffff:ffff::", + "fe80::ffff:ffff:ffff:ffff", + IFF_UP | IFF_BROADCAST | IFF_RUNNING | IFF_SIMPLEX | IFF_MULTICAST, + 2, false); +} + +/* On the same subnet as igb0 */ +static void +mock_igb1(bool bind) +{ + mock_ifaddr4("igb1", "192.0.2.3", "255.255.255.128", "192.0.2.127", + IFF_UP | IFF_BROADCAST | IFF_RUNNING | IFF_SIMPLEX | IFF_MULTICAST, + bind); + mock_ifaddr6("igb1", "2001:db8::3", "ffff:ffff:ffff:ffff::", + "2001:db8::ffff:ffff:ffff:ffff", + IFF_UP | IFF_BROADCAST | IFF_RUNNING | IFF_SIMPLEX | IFF_MULTICAST, + 0, bind); + /* Link local address */ + mock_ifaddr6("igb1", "fe80::3", "ffff:ffff:ffff:ffff::", + "fe80::ffff:ffff:ffff:ffff", + IFF_UP | IFF_BROADCAST | IFF_RUNNING | IFF_SIMPLEX | IFF_MULTICAST, + 3, bind); +} + +/* igb2 is 
on a different subnet than igb0 */ +static void +mock_igb2(void) +{ + mock_ifaddr4("igb2", "192.0.2.130", "255.255.255.128", "192.0.2.255", + IFF_UP | IFF_BROADCAST | IFF_RUNNING | IFF_SIMPLEX | IFF_MULTICAST, + false); + mock_ifaddr6("igb2", "2001:db8:1::2", "ffff:ffff:ffff:ffff::", + "2001:db8:1:0:ffff:ffff:ffff:ffff", + IFF_UP | IFF_BROADCAST | IFF_RUNNING | IFF_SIMPLEX | IFF_MULTICAST, + 0, false); +} + +/* tun0 is a P2P interface */ +static void +mock_tun0(void) +{ + mock_ifaddr4("tun0", "192.0.2.5", "255.255.255.255", "192.0.2.6", + IFF_UP | IFF_RUNNING | IFF_POINTOPOINT | IFF_MULTICAST, false); + mock_ifaddr6("tun0", "2001:db8::5", + "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", + "2001:db8::6", + IFF_UP | IFF_RUNNING | IFF_POINTOPOINT | IFF_MULTICAST, 0, false); +} + + +/* Stub rpcbind functions */ +int +listen_addr(const struct sockaddr *sa) +{ + int i; + + if (bind_address_count == 0) + return (1); + + for (i = 0; i < bind_address_count; i++) { + if (bind_addresses[i]->sa_family != sa->sa_family) + continue; + + if (0 == memcmp(bind_addresses[i]->sa_data, sa->sa_data, + sa->sa_len)) + return (1); + } + return (0); +} + +struct netconfig* +rpcbind_get_conf(const char* netid __unused) +{ + /* Use static variables so we can return pointers to them */ + static char* lookups = NULL; + static struct netconfig nconf_udp; +#ifdef INET6 + static struct netconfig nconf_udp6; +#endif /* INET6 */ + + nconf_udp.nc_netid = "udp"; //netid_storage; + nconf_udp.nc_semantics = NC_TPI_CLTS; + nconf_udp.nc_flag = NC_VISIBLE; + nconf_udp.nc_protofmly = (char*)"inet"; + nconf_udp.nc_proto = (char*)"udp"; + nconf_udp.nc_device = (char*)"-"; + nconf_udp.nc_nlookups = 0; + nconf_udp.nc_lookups = &lookups; + +#ifdef INET6 + nconf_udp6.nc_netid = "udp6"; //netid_storage; + nconf_udp6.nc_semantics = NC_TPI_CLTS; + nconf_udp6.nc_flag = NC_VISIBLE; + nconf_udp6.nc_protofmly = (char*)"inet6"; + nconf_udp6.nc_proto = (char*)"udp6"; + nconf_udp6.nc_device = (char*)"-"; + 
nconf_udp6.nc_nlookups = 0; + nconf_udp6.nc_lookups = &lookups; +#endif /* INET6 */ + + if (0 == strncmp("udp", netid, sizeof("udp"))) + return (&nconf_udp); +#ifdef INET6 + else if (0 == strncmp("udp6", netid, sizeof("udp6"))) + return (&nconf_udp6); +#endif /* INET6 */ + else + return (NULL); +} + +/* + * Helper function used by most test cases + * param recvdstaddr If non-null, the uaddr on which the request was received + */ +static char* +do_addrmerge4(const char* recvdstaddr) +{ + struct netbuf caller; + struct sockaddr_in caller_in; + const char *serv_uaddr, *clnt_uaddr, *netid; + + /* caller contains the client's IP address */ + caller.maxlen = sizeof(struct sockaddr_storage); + caller.len = sizeof(caller_in); + caller_in.sin_family = AF_INET; + caller_in.sin_len = sizeof(caller_in); + caller_in.sin_port = 1234; + caller_in.sin_addr.s_addr = inet_addr("192.0.2.1"); + caller.buf = (void*)&caller_in; + if (recvdstaddr != NULL) + clnt_uaddr = recvdstaddr; + else + clnt_uaddr = "192.0.2.1.3.46"; + + /* assume server is bound in INADDR_ANY port 814 */ + serv_uaddr = "0.0.0.0.3.46"; + + netid = "udp"; + return (addrmerge(&caller, serv_uaddr, clnt_uaddr, netid)); +} + +#ifdef INET6 +/* + * Variant of do_addrmerge4 where the caller has an IPv6 address + * param recvdstaddr If non-null, the uaddr on which the request was received + */ +static char* +do_addrmerge6(const char* recvdstaddr) +{ + struct netbuf caller; + struct sockaddr_in6 caller_in6; + const char *serv_uaddr, *clnt_uaddr, *netid; + + /* caller contains the client's IP address */ + caller.maxlen = sizeof(struct sockaddr_storage); + caller.len = sizeof(caller_in6); + caller_in6.sin6_family = AF_INET6; + caller_in6.sin6_len = sizeof(caller_in6); + caller_in6.sin6_port = 1234; + ATF_REQUIRE_EQ(1, inet_pton(AF_INET6, "2001:db8::1", + (void*)&caller_in6.sin6_addr)); + caller.buf = (void*)&caller_in6; + if (recvdstaddr != NULL) + clnt_uaddr = recvdstaddr; + else + clnt_uaddr = "2001:db8::1.3.46"; + + /* 
assume server is bound in INADDR_ANY port 814 */ + serv_uaddr = "::1.3.46"; + + netid = "udp6"; + return (addrmerge(&caller, serv_uaddr, clnt_uaddr, netid)); +} + +/* Variant of do_addrmerge6 where the caller uses a link local address */ +static char* +do_addrmerge6_ll(void) +{ + struct netbuf caller; + struct sockaddr_in6 caller_in6; + const char *serv_uaddr, *clnt_uaddr, *netid; + + /* caller contains the client's IP address */ + caller.maxlen = sizeof(struct sockaddr_storage); + caller.len = sizeof(caller_in6); + caller_in6.sin6_family = AF_INET6; + caller_in6.sin6_len = sizeof(caller_in6); + caller_in6.sin6_port = 1234; + caller_in6.sin6_scope_id = 2; /* same as igb0 */ + ATF_REQUIRE_EQ(1, inet_pton(AF_INET6, "fe80::beef", + (void*)&caller_in6.sin6_addr)); + caller.buf = (void*)&caller_in6; + clnt_uaddr = "fe80::beef.3.46"; + + /* assume server is bound in INADDR_ANY port 814 */ + serv_uaddr = "::1.3.46"; + + netid = "udp6"; + return (addrmerge(&caller, serv_uaddr, clnt_uaddr, netid)); +} +#endif /* INET6 */ + +ATF_TC_WITHOUT_HEAD(addrmerge_noifaddrs); +ATF_TC_BODY(addrmerge_noifaddrs, tc) +{ + char* maddr; + + maddr = do_addrmerge4(NULL); + + /* Since getifaddrs returns null, addrmerge must too */ + ATF_CHECK_EQ(NULL, maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_localhost_only); +ATF_TC_BODY(addrmerge_localhost_only, tc) +{ + char *maddr; + + /* getifaddrs will return localhost only */ + mock_lo0(); + + maddr = do_addrmerge4(NULL); + + /* We must return localhost if there is nothing better */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("127.0.0.1.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_singlehomed); +ATF_TC_BODY(addrmerge_singlehomed, tc) +{ + char *maddr; + + /* getifaddrs will return one public address */ + mock_lo0(); + mock_igb0(); + + maddr = do_addrmerge4(NULL); + + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_one_addr_on_each_subnet); 
+ATF_TC_BODY(addrmerge_one_addr_on_each_subnet, tc) +{ + char *maddr; + + mock_lo0(); + mock_igb0(); + mock_igb2(); + + maddr = do_addrmerge4(NULL); + + /* We must return the address on the caller's subnet */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.2.3.46", maddr); +} + + +/* + * Like addrmerge_one_addr_on_each_subnet, but getifaddrs returns a different + * order + */ +ATF_TC_WITHOUT_HEAD(addrmerge_one_addr_on_each_subnet_rev); +ATF_TC_BODY(addrmerge_one_addr_on_each_subnet_rev, tc) +{ + char *maddr; + + /* getifaddrs will return one public address on each of two subnets */ + mock_igb2(); + mock_igb0(); + mock_lo0(); + + maddr = do_addrmerge4(NULL); + + /* We must return the address on the caller's subnet */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_point2point); +ATF_TC_BODY(addrmerge_point2point, tc) +{ + char *maddr; + + /* getifaddrs will return one normal and one p2p address */ + mock_lo0(); + mock_igb2(); + mock_tun0(); + + maddr = do_addrmerge4(NULL); + + /* addrmerge should disprefer P2P interfaces */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.130.3.46", maddr); +} + +/* Like addrmerge_point2point, but getifaddrs returns a different order */ +ATF_TC_WITHOUT_HEAD(addrmerge_point2point_rev); +ATF_TC_BODY(addrmerge_point2point_rev, tc) +{ + char *maddr; + + /* getifaddrs will return one normal and one p2p address */ + mock_tun0(); + mock_igb2(); + mock_lo0(); + + maddr = do_addrmerge4(NULL); + + /* addrmerge should disprefer P2P interfaces */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.130.3.46", maddr); +} + +/* + * Simulate using rpcbind -h to select just one ip when the subnet has + * multiple + */ +ATF_TC_WITHOUT_HEAD(addrmerge_bindip); +ATF_TC_BODY(addrmerge_bindip, tc) +{ + char *maddr; + + /* getifaddrs will return one public address on each of two subnets */ + mock_lo0(); + mock_igb0(); + mock_igb1(true); + + maddr = do_addrmerge4(NULL); + + 
/* We must return the address to which we are bound */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.3.3.46", maddr); +} + +/* Like addrmerge_bindip, but getifaddrs returns a different order */ +ATF_TC_WITHOUT_HEAD(addrmerge_bindip_rev); +ATF_TC_BODY(addrmerge_bindip_rev, tc) +{ + char *maddr; + + /* getifaddrs will return one public address on each of two subnets */ + mock_igb1(true); + mock_igb0(); + mock_lo0(); + + maddr = do_addrmerge4(NULL); + + /* We must return the address to which we are bound */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.3.3.46", maddr); +} + +/* + * The address on which the request was received is known, and is provided as + * the hint. + */ +ATF_TC_WITHOUT_HEAD(addrmerge_recvdstaddr); +ATF_TC_BODY(addrmerge_recvdstaddr, tc) +{ + char *maddr; + + mock_lo0(); + mock_igb0(); + mock_igb1(false); + + maddr = do_addrmerge4("192.0.2.2.3.46"); + + /* We must return the address on which the request was received */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_recvdstaddr_rev); +ATF_TC_BODY(addrmerge_recvdstaddr_rev, tc) +{ + char *maddr; + + mock_igb1(false); + mock_igb0(); + mock_lo0(); + + maddr = do_addrmerge4("192.0.2.2.3.46"); + + /* We must return the address on which the request was received */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("192.0.2.2.3.46", maddr); +} + +#ifdef INET6 +ATF_TC_WITHOUT_HEAD(addrmerge_localhost_only6); +ATF_TC_BODY(addrmerge_localhost_only6, tc) +{ + char *maddr; + + /* getifaddrs will return localhost only */ + mock_lo0(); + + maddr = do_addrmerge6(NULL); + + /* We must return localhost if there is nothing better */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("::1.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_singlehomed6); +ATF_TC_BODY(addrmerge_singlehomed6, tc) +{ + char *maddr; + + /* getifaddrs will return one public address */ + mock_lo0(); + mock_igb0(); + + maddr = do_addrmerge6(NULL); + + 
ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8::2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_one_addr_on_each_subnet6); +ATF_TC_BODY(addrmerge_one_addr_on_each_subnet6, tc) +{ + char *maddr; + + mock_lo0(); + mock_igb0(); + mock_igb2(); + + maddr = do_addrmerge6(NULL); + + /* We must return the address on the caller's subnet */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8::2.3.46", maddr); +} + + +/* + * Like addrmerge_one_addr_on_each_subnet6, but getifaddrs returns a different + * order + */ +ATF_TC_WITHOUT_HEAD(addrmerge_one_addr_on_each_subnet6_rev); +ATF_TC_BODY(addrmerge_one_addr_on_each_subnet6_rev, tc) +{ + char *maddr; + + /* getifaddrs will return one public address on each of two subnets */ + mock_igb2(); + mock_igb0(); + mock_lo0(); + + maddr = do_addrmerge6(NULL); + + /* We must return the address on the caller's subnet */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8::2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_point2point6); +ATF_TC_BODY(addrmerge_point2point6, tc) +{ + char *maddr; + + /* getifaddrs will return one normal and one p2p address */ + mock_lo0(); + mock_igb2(); + mock_tun0(); + + maddr = do_addrmerge6(NULL); + + /* addrmerge should disprefer P2P interfaces */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8:1::2.3.46", maddr); +} + +/* Like addrmerge_point2point6, but getifaddrs returns a different order */ +ATF_TC_WITHOUT_HEAD(addrmerge_point2point6_rev); +ATF_TC_BODY(addrmerge_point2point6_rev, tc) +{ + char *maddr; + + /* getifaddrs will return one normal and one p2p address */ + mock_tun0(); + mock_igb2(); + mock_lo0(); + + maddr = do_addrmerge6(NULL); + + /* addrmerge should disprefer P2P interfaces */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8:1::2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_bindip6); +ATF_TC_BODY(addrmerge_bindip6, tc) +{ + char *maddr; + + /* getifaddrs will return one public address on each of two subnets */ + mock_lo0(); + 
mock_igb0(); + mock_igb1(true); + + maddr = do_addrmerge6(NULL); + + /* We must return the address to which we are bound */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8::3.3.46", maddr); +} + +/* Like addrmerge_bindip6, but getifaddrs returns a different order */ +ATF_TC_WITHOUT_HEAD(addrmerge_bindip6_rev); +ATF_TC_BODY(addrmerge_bindip6_rev, tc) +{ + char *maddr; + + /* getifaddrs will return one public address on each of two subnets */ + mock_igb1(true); + mock_igb0(); + mock_lo0(); + + maddr = do_addrmerge6(NULL); + + /* We must return the address to which we are bound */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8::3.3.46", maddr); +} + +/* + * IPv6 Link Local addresses with the same scope id as the caller, if the caller + * is also a link local address, should be preferred + */ +ATF_TC_WITHOUT_HEAD(addrmerge_ipv6_linklocal); +ATF_TC_BODY(addrmerge_ipv6_linklocal, tc) +{ + char *maddr; + + /* + * getifaddrs will return two link local addresses with the same netmask + * and prefix but different scope IDs + */ + mock_igb1(false); + mock_igb0(); + mock_lo0(); + + maddr = do_addrmerge6_ll(); + + /* We must return the link local address whose scope ID matches the caller's */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("fe80::2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_ipv6_linklocal_rev); +ATF_TC_BODY(addrmerge_ipv6_linklocal_rev, tc) +{ + char *maddr; + + /* + * getifaddrs will return two link local addresses with the same netmask + * and prefix but different scope IDs + */ + mock_lo0(); + mock_igb0(); + mock_igb1(false); + + maddr = do_addrmerge6_ll(); + + /* We must return the link local address whose scope ID matches the caller's */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("fe80::2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_recvdstaddr6); +ATF_TC_BODY(addrmerge_recvdstaddr6, tc) +{ + char *maddr; + + mock_lo0(); + mock_igb0(); + mock_igb1(false); + + maddr = do_addrmerge6("2001:db8::2.3.46"); + + /* We must return the address on which the request was received 
*/ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8::2.3.46", maddr); +} + +ATF_TC_WITHOUT_HEAD(addrmerge_recvdstaddr6_rev); +ATF_TC_BODY(addrmerge_recvdstaddr6_rev, tc) +{ + char *maddr; + + mock_igb1(false); + mock_igb0(); + mock_lo0(); + + maddr = do_addrmerge6("2001:db8::2.3.46"); + + /* We must return the address on which the request was received */ + ATF_REQUIRE(maddr != NULL); + ATF_CHECK_STREQ("2001:db8::2.3.46", maddr); +} +#endif /* INET6 */ + + +ATF_TP_ADD_TCS(tp) +{ + ATF_TP_ADD_TC(tp, addrmerge_noifaddrs); + ATF_TP_ADD_TC(tp, addrmerge_localhost_only); + ATF_TP_ADD_TC(tp, addrmerge_singlehomed); + ATF_TP_ADD_TC(tp, addrmerge_one_addr_on_each_subnet); + ATF_TP_ADD_TC(tp, addrmerge_one_addr_on_each_subnet_rev); + ATF_TP_ADD_TC(tp, addrmerge_point2point); + ATF_TP_ADD_TC(tp, addrmerge_point2point_rev); + ATF_TP_ADD_TC(tp, addrmerge_bindip); + ATF_TP_ADD_TC(tp, addrmerge_bindip_rev); + ATF_TP_ADD_TC(tp, addrmerge_recvdstaddr); + ATF_TP_ADD_TC(tp, addrmerge_recvdstaddr_rev); +#ifdef INET6 + ATF_TP_ADD_TC(tp, addrmerge_localhost_only6); + ATF_TP_ADD_TC(tp, addrmerge_singlehomed6); + ATF_TP_ADD_TC(tp, addrmerge_one_addr_on_each_subnet6); + ATF_TP_ADD_TC(tp, addrmerge_one_addr_on_each_subnet6_rev); + ATF_TP_ADD_TC(tp, addrmerge_point2point6); + ATF_TP_ADD_TC(tp, addrmerge_point2point6_rev); + ATF_TP_ADD_TC(tp, addrmerge_bindip6); + ATF_TP_ADD_TC(tp, addrmerge_bindip6_rev); + ATF_TP_ADD_TC(tp, addrmerge_ipv6_linklocal); + ATF_TP_ADD_TC(tp, addrmerge_ipv6_linklocal_rev); + ATF_TP_ADD_TC(tp, addrmerge_recvdstaddr6); + ATF_TP_ADD_TC(tp, addrmerge_recvdstaddr6_rev); +#endif + + return (atf_no_error()); +} diff --git a/usr.sbin/rpcbind/util.c b/usr.sbin/rpcbind/util.c index 8ddb13eec68d..da6a5abeea75 100644 --- a/usr.sbin/rpcbind/util.c +++ b/usr.sbin/rpcbind/util.c @@ -56,7 +56,7 @@ static struct sockaddr_in *local_in4; static struct sockaddr_in6 *local_in6; #endif -static int bitmaskcmp(void *, void *, void *, int); +static int bitmaskcmp(struct sockaddr 
*, struct sockaddr *, struct sockaddr *); /* * For all bits set in "mask", compare the corresponding bits in @@ -64,10 +64,34 @@ static int bitmaskcmp(void *, void *, void *, int); * match. */ static int -bitmaskcmp(void *dst, void *src, void *mask, int bytelen) +bitmaskcmp(struct sockaddr *dst, struct sockaddr *src, struct sockaddr *mask) { int i; - u_int8_t *p1 = dst, *p2 = src, *netmask = mask; + u_int8_t *p1, *p2, *netmask; + int bytelen; + + if (dst->sa_family != src->sa_family || + dst->sa_family != mask->sa_family) + return (1); + + switch (dst->sa_family) { + case AF_INET: + p1 = (uint8_t*) &SA2SINADDR(dst); + p2 = (uint8_t*) &SA2SINADDR(src); + netmask = (uint8_t*) &SA2SINADDR(mask); + bytelen = sizeof(struct in_addr); + break; +#ifdef INET6 + case AF_INET6: + p1 = (uint8_t*) &SA2SIN6ADDR(dst); + p2 = (uint8_t*) &SA2SIN6ADDR(src); + netmask = (uint8_t*) &SA2SIN6ADDR(mask); + bytelen = sizeof(struct in6_addr); + break; +#endif + default: + return (1); + } for (i = 0; i < bytelen; i++) if ((p1[i] & netmask[i]) != (p2[i] & netmask[i])) @@ -86,16 +110,18 @@ bitmaskcmp(void *dst, void *src, void *mask, int bytelen) * string which should be freed by the caller. On error, returns NULL. */ char * -addrmerge(struct netbuf *caller, char *serv_uaddr, char *clnt_uaddr, - char *netid) +addrmerge(struct netbuf *caller, const char *serv_uaddr, const char *clnt_uaddr, + const char *netid) { struct ifaddrs *ifap, *ifp = NULL, *bestif; struct netbuf *serv_nbp = NULL, *hint_nbp = NULL, tbuf; struct sockaddr *caller_sa, *hint_sa, *ifsa, *ifmasksa, *serv_sa; struct sockaddr_storage ss; struct netconfig *nconf; - char *caller_uaddr = NULL, *hint_uaddr = NULL; + char *caller_uaddr = NULL; + const char *hint_uaddr = NULL; char *ret = NULL; + int bestif_goodness; #ifdef ND_DEBUG if (debugging) @@ -139,19 +165,29 @@ addrmerge(struct netbuf *caller, char *serv_uaddr, char *clnt_uaddr, goto freeit; /* - * Loop through all interfaces. 
For each interface, see if it - * is either the loopback interface (which we always listen - * on) or is one of the addresses the program bound to (the - * wildcard by default, or a subset if -h is specified) and - * the network portion of its address is equal to that of the - * client. If so, we have found the interface that we want to - * use. + * Loop through all interface addresses. We are listening to an address + * if any of the following are true: + * a) It's a loopback address + * b) It was specified with the -h command line option + * c) There were no -h command line options. + * + * Among addresses on which we are listening, choose in order of + * preference an address that is: + * + * a) Equal to the hint + * b) A link local address with the same scope ID as the client's + * address, if the client's address is also link local + * c) An address on the same subnet as the client's address + * d) A non-localhost, non-p2p address + * e) Any usable address */ bestif = NULL; + bestif_goodness = 0; for (ifap = ifp; ifap != NULL; ifap = ifap->ifa_next) { ifsa = ifap->ifa_addr; ifmasksa = ifap->ifa_netmask; + /* Skip addresses where we don't listen */ if (ifsa == NULL || ifsa->sa_family != hint_sa->sa_family || !(ifap->ifa_flags & IFF_UP)) continue; @@ -159,21 +195,29 @@ addrmerge(struct netbuf *caller, char *serv_uaddr, char *clnt_uaddr, if (!(ifap->ifa_flags & IFF_LOOPBACK) && !listen_addr(ifsa)) continue; - switch (hint_sa->sa_family) { - case AF_INET: - /* - * If the hint address matches this interface - * address/netmask, then we're done. 
- */ - if (!bitmaskcmp(&SA2SINADDR(ifsa), - &SA2SINADDR(hint_sa), &SA2SINADDR(ifmasksa), - sizeof(struct in_addr))) { - bestif = ifap; - goto found; - } - break; + if ((hint_sa->sa_family == AF_INET) && + ((((struct sockaddr_in*)hint_sa)->sin_addr.s_addr == + ((struct sockaddr_in*)ifsa)->sin_addr.s_addr))) { + const int goodness = 4; + + bestif_goodness = goodness; + bestif = ifap; + goto found; + } #ifdef INET6 - case AF_INET6: + if ((hint_sa->sa_family == AF_INET6) && + (0 == memcmp(&((struct sockaddr_in6*)hint_sa)->sin6_addr, + &((struct sockaddr_in6*)ifsa)->sin6_addr, + sizeof(struct in6_addr))) && + (((struct sockaddr_in6*)hint_sa)->sin6_scope_id == + (((struct sockaddr_in6*)ifsa)->sin6_scope_id))) { + const int goodness = 4; + + bestif_goodness = goodness; + bestif = ifap; + goto found; + } + if (hint_sa->sa_family == AF_INET6) { /* * For v6 link local addresses, if the caller is on * a link-local address then use the scope id to see @@ -184,28 +228,33 @@ addrmerge(struct netbuf *caller, char *serv_uaddr, char *clnt_uaddr, IN6_IS_ADDR_LINKLOCAL(&SA2SIN6ADDR(hint_sa))) { if (SA2SIN6(ifsa)->sin6_scope_id == SA2SIN6(caller_sa)->sin6_scope_id) { - bestif = ifap; - goto found; - } - } else if (!bitmaskcmp(&SA2SIN6ADDR(ifsa), - &SA2SIN6ADDR(hint_sa), &SA2SIN6ADDR(ifmasksa), - sizeof(struct in6_addr))) { - bestif = ifap; - goto found; - } - break; -#endif - default: - continue; - } + const int goodness = 3; - /* - * Remember the first possibly useful interface, preferring - * "normal" to point-to-point and loopback ones. 
- */ - if (bestif == NULL || - (!(ifap->ifa_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) && - (bestif->ifa_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)))) + if (bestif_goodness < goodness) { + bestif = ifap; + bestif_goodness = goodness; + } + } + } + } +#endif /* INET6 */ + if (0 == bitmaskcmp(hint_sa, ifsa, ifmasksa)) { + const int goodness = 2; + + if (bestif_goodness < goodness) { + bestif = ifap; + bestif_goodness = goodness; + } + } + if (!(ifap->ifa_flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) { + const int goodness = 1; + + if (bestif_goodness < goodness) { + bestif = ifap; + bestif_goodness = goodness; + } + } + if (bestif == NULL) bestif = ifap; } if (bestif == NULL) From fdbd473ce57080845c13750e72bb82fa0ca32e33 Mon Sep 17 00:00:00 2001 From: Glen Barber Date: Wed, 6 Jan 2016 05:23:25 +0000 Subject: [PATCH 04/48] Add a new target to touch the ${.OBJDIR}/release file, which indicates the 'release' target has run (in order to prevent subsequent invocations that may clobber original build output). As is, the 'release' target is a dummy target that does nothing more than depend on subsequent targets. Unless 'make obj' is invoked prior to 'make release', .OBJDIR and .CURDIR will always be '/usr/src/release' (or wherever /usr/src is located). When 'make release' invokes 'make real-release' (and subsequent targets), .OBJDIR is not updated, which still leads to src/ tree pollution. While arguably a hack, 'make release' will invoke the original dummy targets as originally intended, but instead of touching an empty file (or returning @true), will call a 'release-done' target that will trigger the behavior that was intended to prevent a subsequent invocation. 
Discussed with: hrs MFC after: 3 days X-MFC-With: r293173 Sponsored by: The FreeBSD Foundation --- release/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/release/Makefile b/release/Makefile index 07b8048de483..ba1ca80c050c 100644 --- a/release/Makefile +++ b/release/Makefile @@ -281,7 +281,11 @@ ftp: packagesystem cp *.txz MANIFEST ftp release: real-release vm-release cloudware-release - touch ${.OBJDIR}/${.TARGET} + ${MAKE} -C ${.CURDIR} ${.MAKEFLAGS} release-done + true + +release-done: + touch release real-release: ${MAKE} -C ${.CURDIR} ${.MAKEFLAGS} obj From a93236cfcc168f958e05d557ae21e485d0e81b68 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 6 Jan 2016 15:38:39 +0000 Subject: [PATCH 05/48] loader.efi: add terminal emulation support This is based on the vidconsole implementation. Submitted by: Toomas Soome Reviewed by: adrian MFC after: 2 weeks Relnotes: Yes Differential Revision: https://reviews.freebsd.org/D4797 --- sys/boot/efi/libefi/Makefile | 1 + sys/boot/efi/libefi/efi_console.c | 424 ++++++++++++++++++++++++++++-- sys/boot/efi/loader/main.c | 3 +- sys/boot/ficl/amd64/sysdep.c | 2 +- 4 files changed, 408 insertions(+), 22 deletions(-) diff --git a/sys/boot/efi/libefi/Makefile b/sys/boot/efi/libefi/Makefile index d248927c2b90..25251c5319c2 100644 --- a/sys/boot/efi/libefi/Makefile +++ b/sys/boot/efi/libefi/Makefile @@ -21,5 +21,6 @@ CFLAGS+= -I${.CURDIR}/../../common # Handle FreeBSD specific %b and %D printf format specifiers CFLAGS+= ${FORMAT_EXTENSIONS} +CFLAGS+= -DTERM_EMU .include diff --git a/sys/boot/efi/libefi/efi_console.c b/sys/boot/efi/libefi/efi_console.c index 3538994ac104..52a372582898 100644 --- a/sys/boot/efi/libefi/efi_console.c +++ b/sys/boot/efi/libefi/efi_console.c @@ -35,6 +35,69 @@ __FBSDID("$FreeBSD$"); static SIMPLE_TEXT_OUTPUT_INTERFACE *conout; static SIMPLE_INPUT_INTERFACE *conin; +#ifdef TERM_EMU +#define DEFAULT_FGCOLOR EFI_LIGHTGRAY +#define DEFAULT_BGCOLOR EFI_BLACK + +#define MAXARGS 8 
+static int args[MAXARGS], argc; +static int fg_c, bg_c, curx, cury; +static int esc; + +void get_pos(int *x, int *y); +void curs_move(int *_x, int *_y, int x, int y); +static void CL(int); +#endif + +static void efi_cons_probe(struct console *); +static int efi_cons_init(int); +void efi_cons_putchar(int); +int efi_cons_getchar(void); +void efi_cons_efiputchar(int); +int efi_cons_poll(void); + +struct console efi_console = { + "efi", + "EFI console", + 0, + efi_cons_probe, + efi_cons_init, + efi_cons_putchar, + efi_cons_getchar, + efi_cons_poll +}; + +#ifdef TERM_EMU + +/* Get cursor position. */ +void +get_pos(int *x, int *y) +{ + *x = conout->Mode->CursorColumn; + *y = conout->Mode->CursorRow; +} + +/* Move cursor to x rows and y cols (0-based). */ +void +curs_move(int *_x, int *_y, int x, int y) +{ + conout->SetCursorPosition(conout, x, y); + if (_x != NULL) + *_x = conout->Mode->CursorColumn; + if (_y != NULL) + *_y = conout->Mode->CursorRow; +} + +/* Clear internal state of the terminal emulation code. 
*/ +void +end_term(void) +{ + esc = 0; + argc = -1; +} + +#endif + static void efi_cons_probe(struct console *cp) { @@ -46,22 +109,314 @@ efi_cons_probe(struct console *cp) static int efi_cons_init(int arg) { - conout->SetAttribute(conout, EFI_TEXT_ATTR(EFI_LIGHTGRAY, EFI_BLACK)); + conout->SetAttribute(conout, EFI_TEXT_ATTR(DEFAULT_FGCOLOR, + DEFAULT_BGCOLOR)); +#ifdef TERM_EMU + end_term(); + get_pos(&curx, &cury); + curs_move(&curx, &cury, curx, cury); + fg_c = DEFAULT_FGCOLOR; + bg_c = DEFAULT_BGCOLOR; +#endif + conout->EnableCursor(conout, TRUE); return 0; } +static void +efi_cons_rawputchar(int c) +{ + int i; + UINTN x, y; + conout->QueryMode(conout, conout->Mode->Mode, &x, &y); + + if (c == '\t') + /* XXX lame tab expansion */ + for (i = 0; i < 8; i++) + efi_cons_rawputchar(' '); + else { +#ifndef TERM_EMU + if (c == '\n') + efi_cons_efiputchar('\r'); + else + efi_cons_efiputchar(c); +#else + switch (c) { + case '\r': + curx = 0; + curs_move(&curx, &cury, curx, cury); + return; + case '\n': + cury++; + if (cury >= y) { + efi_cons_efiputchar('\n'); + cury--; + } else + curs_move(&curx, &cury, curx, cury); + return; + case '\b': + if (curx > 0) { + curx--; + curs_move(&curx, &cury, curx, cury); + } + return; + default: + efi_cons_efiputchar(c); + curx++; + if (curx > x-1) { + curx = 0; + cury++; + } + if (cury > y-1) { + curx = 0; + cury--; + } + } + curs_move(&curx, &cury, curx, cury); +#endif + } +} + +/* Gracefully exit ESC-sequence processing in case of misunderstanding. */ +static void +bail_out(int c) +{ + char buf[16], *ch; + int i; + + if (esc) { + efi_cons_rawputchar('\033'); + if (esc != '\033') + efi_cons_rawputchar(esc); + for (i = 0; i <= argc; ++i) { + sprintf(buf, "%d", args[i]); + ch = buf; + while (*ch) + efi_cons_rawputchar(*ch++); + } + } + efi_cons_rawputchar(c); + end_term(); +} + +/* Clear display from current position to end of screen. 
*/ +static void +CD(void) { + int i; + UINTN x, y; + + get_pos(&curx, &cury); + if (curx == 0 && cury == 0) { + conout->ClearScreen(conout); + end_term(); + return; + } + + conout->QueryMode(conout, conout->Mode->Mode, &x, &y); + CL(0); /* clear current line from cursor to end */ + for (i = cury + 1; i < y-1; i++) { + curs_move(NULL, NULL, 0, i); + CL(0); + } + curs_move(NULL, NULL, curx, cury); + end_term(); +} + +/* + * Absolute cursor move to args[0] rows and args[1] columns + * (the coordinates are 1-based). + */ +static void +CM(void) +{ + if (args[0] > 0) + args[0]--; + if (args[1] > 0) + args[1]--; + curs_move(&curx, &cury, args[1], args[0]); + end_term(); +} + +/* Home cursor (left top corner), also called from mode command. */ +void +HO(void) +{ + argc = 1; + args[0] = args[1] = 1; + CM(); +} + +/* Clear line from current position to end of line */ +static void +CL(int direction) +{ + int i, len; + UINTN x, y; + CHAR16 *line; + + conout->QueryMode(conout, conout->Mode->Mode, &x, &y); + switch (direction) { + case 0: /* from cursor to end */ + len = x - curx + 1; + break; + case 1: /* from beginning to cursor */ + len = curx; + break; + case 2: /* entire line */ + len = x; + break; + } + + if (cury == y - 1) + len--; + + line = malloc(len * sizeof (CHAR16)); + if (line == NULL) { + printf("out of memory\n"); + return; + } + for (i = 0; i < len; i++) + line[i] = ' '; + line[len-1] = 0; + + if (direction != 0) + curs_move(NULL, NULL, 0, cury); + + conout->OutputString(conout, line); + /* restore cursor position */ + curs_move(NULL, NULL, curx, cury); + free(line); + end_term(); +} + +static void +get_arg(int c) +{ + if (argc < 0) + argc = 0; + args[argc] *= 10; + args[argc] += c - '0'; +} + +/* Emulate basic capabilities of cons25 terminal */ +static void +efi_term_emu(int c) +{ + static int ansi_col[] = { + 0, 4, 2, 6, 1, 5, 3, 7 + }; + int t, i; + + switch (esc) { + case 0: + switch (c) { + case '\033': + esc = c; + break; + default: + 
efi_cons_rawputchar(c); + break; + } + break; + case '\033': + switch (c) { + case '[': + esc = c; + args[0] = 0; + argc = -1; + break; + default: + bail_out(c); + break; + } + break; + case '[': + switch (c) { + case ';': + if (argc < 0) + argc = 0; + else if (argc + 1 >= MAXARGS) + bail_out(c); + else + args[++argc] = 0; + break; + case 'H': /* ho = \E[H */ + if (argc < 0) + HO(); + else if (argc == 1) + CM(); + else + bail_out(c); + break; + case 'J': /* cd = \E[J */ + if (argc < 0) + CD(); + else + bail_out(c); + break; + case 'm': + if (argc < 0) { + fg_c = DEFAULT_FGCOLOR; + bg_c = DEFAULT_BGCOLOR; + } + for (i = 0; i <= argc; ++i) { + switch (args[i]) { + case 0: /* back to normal */ + fg_c = DEFAULT_FGCOLOR; + bg_c = DEFAULT_BGCOLOR; + break; + case 1: /* bold */ + fg_c |= 0x8; + break; + case 4: /* underline */ + case 5: /* blink */ + bg_c |= 0x8; + break; + case 7: /* reverse */ + t = fg_c; + fg_c = bg_c; + bg_c = t; + break; + case 30: case 31: case 32: case 33: + case 34: case 35: case 36: case 37: + fg_c = ansi_col[args[i] - 30]; + break; + case 39: /* normal */ + fg_c = DEFAULT_FGCOLOR; + break; + case 40: case 41: case 42: case 43: + case 44: case 45: case 46: case 47: + bg_c = ansi_col[args[i] - 40]; + break; + case 49: /* normal */ + bg_c = DEFAULT_BGCOLOR; + break; + } + } + conout->SetAttribute(conout, EFI_TEXT_ATTR(fg_c, bg_c)); + end_term(); + break; + default: + if (isdigit(c)) + get_arg(c); + else + bail_out(c); + break; + } + break; + default: + bail_out(c); + break; + } +} + void efi_cons_putchar(int c) { - CHAR16 buf[2]; - - if (c == '\n') - efi_cons_putchar('\r'); - - buf[0] = c; - buf[1] = 0; - - conout->OutputString(conout, buf); +#ifdef TERM_EMU + efi_term_emu(c); +#else + efi_cons_rawputchar(c); +#endif } int @@ -77,6 +432,12 @@ efi_cons_getchar() BS->WaitForEvent(1, &conin->WaitForKey, &junk); status = conin->ReadKeyStroke(conin, &key); } + switch (key.ScanCode) { + case 0x17: /* ESC */ + return (0x1b); /* esc */ + } + + /* this can 
return */ return (key.UnicodeChar); } @@ -87,13 +448,36 @@ efi_cons_poll() return (BS->CheckEvent(conin->WaitForKey) == EFI_SUCCESS); } -struct console efi_console = { - "efi", - "EFI console", - 0, - efi_cons_probe, - efi_cons_init, - efi_cons_putchar, - efi_cons_getchar, - efi_cons_poll -}; +/* Plain direct access to EFI OutputString(). */ +void +efi_cons_efiputchar(int c) +{ + CHAR16 buf[2]; + + /* + * translate box chars to unicode + */ + switch (c) { + /* single frame */ + case 0xb3: buf[0] = BOXDRAW_VERTICAL; break; + case 0xbf: buf[0] = BOXDRAW_DOWN_LEFT; break; + case 0xc0: buf[0] = BOXDRAW_UP_RIGHT; break; + case 0xc4: buf[0] = BOXDRAW_HORIZONTAL; break; + case 0xda: buf[0] = BOXDRAW_DOWN_RIGHT; break; + case 0xd9: buf[0] = BOXDRAW_UP_LEFT; break; + + /* double frame */ + case 0xba: buf[0] = BOXDRAW_DOUBLE_VERTICAL; break; + case 0xbb: buf[0] = BOXDRAW_DOUBLE_DOWN_LEFT; break; + case 0xbc: buf[0] = BOXDRAW_DOUBLE_UP_LEFT; break; + case 0xc8: buf[0] = BOXDRAW_DOUBLE_UP_RIGHT; break; + case 0xc9: buf[0] = BOXDRAW_DOUBLE_DOWN_RIGHT; break; + case 0xcd: buf[0] = BOXDRAW_DOUBLE_HORIZONTAL; break; + + default: + buf[0] = c; + } + buf[1] = 0; /* terminate string */ + + conout->OutputString(conout, buf); +} diff --git a/sys/boot/efi/loader/main.c b/sys/boot/efi/loader/main.c index 7a407094e0fe..01123b3a6715 100644 --- a/sys/boot/efi/loader/main.c +++ b/sys/boot/efi/loader/main.c @@ -334,6 +334,7 @@ command_mode(int argc, char *argv[]) char rowenv[8]; EFI_STATUS status; SIMPLE_TEXT_OUTPUT_INTERFACE *conout; + extern void HO(void); conout = ST->ConOut; @@ -355,7 +356,7 @@ command_mode(int argc, char *argv[]) } sprintf(rowenv, "%u", (unsigned)rows); setenv("LINES", rowenv, 1); - + HO(); /* set cursor */ return (CMD_OK); } diff --git a/sys/boot/ficl/amd64/sysdep.c b/sys/boot/ficl/amd64/sysdep.c index ad38660843cd..5957b71e461a 100644 --- a/sys/boot/ficl/amd64/sysdep.c +++ b/sys/boot/ficl/amd64/sysdep.c @@ -55,7 +55,7 @@ void ficlTextOut(FICL_VM *pVM, char *msg, int 
fNewline) IGNORE(pVM); while(*msg != 0) - putchar(*(msg++)); + putchar((unsigned char)*(msg++)); if (fNewline) putchar('\n'); From 353e5e6e3045457deaeb091d5910a03cce3f2ac4 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 6 Jan 2016 15:50:21 +0000 Subject: [PATCH 06/48] Enable the beastie menu for the UEFI console As of r293233 the UEFI console includes basic terminal emulator support. MFC after: 2 weeks Relnotes: Yes --- sys/boot/forth/beastie.4th | 5 ----- sys/boot/forth/beastie.4th.8 | 5 ++--- sys/boot/forth/loader.conf.5 | 5 ++--- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/sys/boot/forth/beastie.4th b/sys/boot/forth/beastie.4th index 52c403f6d1fa..752cce22a4ff 100644 --- a/sys/boot/forth/beastie.4th +++ b/sys/boot/forth/beastie.4th @@ -85,11 +85,6 @@ variable logoY also support-functions : beastie-start ( -- ) \ starts the menu - s" console" getenv dup -1 <> if - s" efi" 2swap contains? if - s" set beastie_disable=YES" evaluate - then - else drop then s" beastie_disable" getenv dup -1 <> if s" YES" compare-insensitive 0= if any_conf_read? if diff --git a/sys/boot/forth/beastie.4th.8 b/sys/boot/forth/beastie.4th.8 index 534a60ce6bd9..9f77d5db2977 100644 --- a/sys/boot/forth/beastie.4th.8 +++ b/sys/boot/forth/beastie.4th.8 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd April 27, 2014 +.Dd January 6, 2016 .Dt BEASTIE.4TH 8 .Os .Sh NAME @@ -119,8 +119,7 @@ Sets the desired row position of the logo. Default is 4. If set to .Dq YES , the beastie boot menu will be skipped. -The beastie boot menu is always skipped if booting UEFI or running non-x86 -hardware. +The beastie boot menu is always skipped if running non-x86 hardware. .It Va loader_delay If set to a number higher than zero, introduces a delay before starting the beastie boot menu. 
During the delay the user can press either Ctrl-C to skip diff --git a/sys/boot/forth/loader.conf.5 b/sys/boot/forth/loader.conf.5 index 37f10bfea5d2..0320e988cdfd 100644 --- a/sys/boot/forth/loader.conf.5 +++ b/sys/boot/forth/loader.conf.5 @@ -23,7 +23,7 @@ .\" SUCH DAMAGE. .\" .\" $FreeBSD$ -.Dd April 27, 2014 +.Dd January 6, 2016 .Dt LOADER.CONF 5 .Os .Sh NAME @@ -236,8 +236,7 @@ be displayed. If set to .Dq YES , the beastie boot menu will be skipped. -The beastie boot menu is always skipped if booting UEFI or running non-x86 -hardware. +The beastie boot menu is always skipped if running non-x86 hardware. .It Va loader_logo Pq Dq Li orbbw Selects a desired logo in the beastie boot menu. Possible values are: From 3e972f44093c9fa5f585388920bb1caccbe6ff15 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Wed, 6 Jan 2016 17:13:40 +0000 Subject: [PATCH 07/48] Try a little harder to remove firstboot and firstboot-reboot files in case they accidentally get created as directories or with flags that prevent their removal. While I wouldn't normally go the extra mile here and let the normal unix rules prevail, the effects of failure are large enough that extra care is warranted. --- etc/rc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/etc/rc b/etc/rc index 2127d7895925..576ddf937af4 100644 --- a/etc/rc +++ b/etc/rc @@ -130,11 +130,17 @@ for _rc_elem in ${files}; do done # Remove the firstboot sentinel, and reboot if it was requested. +# Be a bit paranoid about removing it to handle the common failure +# modes since the consequence of failure can be big. +# Note: this assumes firstboot_sentinel is on / when we have +# a read-only /, or that it is on media that's writable. 
if [ -e ${firstboot_sentinel} ]; then [ ${root_rw_mount} = "yes" ] || mount -uw / - /bin/rm -f ${firstboot_sentinel} + chflags -R 0 ${firstboot_sentinel} + rm -rf ${firstboot_sentinel} if [ -e ${firstboot_sentinel}-reboot ]; then - /bin/rm -f ${firstboot_sentinel}-reboot + chflags -R 0 ${firstboot_sentinel}-reboot + rm -rf ${firstboot_sentinel}-reboot [ ${root_rw_mount} = "yes" ] || mount -ur / kill -INT 1 fi From 4053652b9250470dee4681c12cdd478dd850cc50 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 6 Jan 2016 17:33:32 +0000 Subject: [PATCH 08/48] Add fls() to libstand Although we don't use it in tree yet libstand is installed as user-facing /usr/lib/libstand.a, and some work in progress makes use of it. Instead of conflicting with ongoing libstand Makefile deduplication, just add it now. --- lib/libstand/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/libstand/Makefile b/lib/libstand/Makefile index 9d1f6bcd44a0..ee8087be3ea6 100644 --- a/lib/libstand/Makefile +++ b/lib/libstand/Makefile @@ -38,8 +38,9 @@ SRCS+= ntoh.c # string functions from libc .PATH: ${LIBC_SRC}/string -SRCS+= bcmp.c bcopy.c bzero.c ffs.c memccpy.c memchr.c memcmp.c memcpy.c \ - memmove.c memset.c qdivrem.c strcat.c strchr.c strcmp.c strcpy.c \ +SRCS+= bcmp.c bcopy.c bzero.c ffs.c fls.c \ + memccpy.c memchr.c memcmp.c memcpy.c memmove.c memset.c \ + qdivrem.c strcat.c strchr.c strcmp.c strcpy.c \ strcspn.c strlcat.c strlcpy.c strlen.c strncat.c strncmp.c strncpy.c \ strpbrk.c strrchr.c strsep.c strspn.c strstr.c strtok.c swab.c .if ${MACHINE_CPUARCH} == "arm" From c82e181b745260e6bc32d69011c73201eb1665e3 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 6 Jan 2016 19:15:16 +0000 Subject: [PATCH 09/48] Introduce and use new EFI_ERROR_CODE macro for EFI errors Submitted by: smh MFC after: 1 week --- sys/boot/efi/boot1/boot1.c | 6 +++--- sys/boot/efi/include/efierr.h | 3 ++- sys/boot/efi/loader/arch/amd64/framebuffer.c | 8 ++++---- 
sys/boot/efi/loader/bootinfo.c | 7 +++---- sys/boot/efi/loader/copy.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sys/boot/efi/boot1/boot1.c b/sys/boot/efi/boot1/boot1.c index be5999328636..febc5f421e15 100644 --- a/sys/boot/efi/boot1/boot1.c +++ b/sys/boot/efi/boot1/boot1.c @@ -331,20 +331,20 @@ load(const char *fname) buffer, bufsize, &loaderhandle); if (EFI_ERROR(status)) printf("LoadImage failed with error %lu\n", - status & ~EFI_ERROR_MASK); + EFI_ERROR_CODE(status)); status = systab->BootServices->HandleProtocol(loaderhandle, &LoadedImageGUID, (VOID**)&loaded_image); if (EFI_ERROR(status)) printf("HandleProtocol failed with error %lu\n", - status & ~EFI_ERROR_MASK); + EFI_ERROR_CODE(status)); loaded_image->DeviceHandle = bootdevhandle; status = systab->BootServices->StartImage(loaderhandle, NULL, NULL); if (EFI_ERROR(status)) printf("StartImage failed with error %lu\n", - status & ~EFI_ERROR_MASK); + EFI_ERROR_CODE(status)); } static void diff --git a/sys/boot/efi/include/efierr.h b/sys/boot/efi/include/efierr.h index dc57f0ed4333..921b297ed4fb 100644 --- a/sys/boot/efi/include/efierr.h +++ b/sys/boot/efi/include/efierr.h @@ -30,7 +30,8 @@ Revision History #define EFIWARN(a) (a) -#define EFI_ERROR(a) (((INTN) a) < 0) +#define EFI_ERROR(a) (((INTN) a) < 0) +#define EFI_ERROR_CODE(a) (a & ~EFI_ERROR_MASK) #define EFI_SUCCESS 0 diff --git a/sys/boot/efi/loader/arch/amd64/framebuffer.c b/sys/boot/efi/loader/arch/amd64/framebuffer.c index eb78f7b7ace9..04b880424f22 100644 --- a/sys/boot/efi/loader/arch/amd64/framebuffer.c +++ b/sys/boot/efi/loader/arch/amd64/framebuffer.c @@ -178,7 +178,7 @@ efifb_uga_find_pixel(EFI_UGA_DRAW_PROTOCOL *uga, u_int line, printf("No change detected in frame buffer"); fail: - printf(" -- error %lu\n", status & ~EFI_ERROR_MASK); + printf(" -- error %lu\n", EFI_ERROR_CODE(status)); free(data1); return (-1); } @@ -473,7 +473,7 @@ command_gop(int argc, char *argv[]) status = BS->LocateProtocol(&gop_guid, NULL, 
(VOID **)&gop); if (EFI_ERROR(status)) { sprintf(command_errbuf, "%s: Graphics Output Protocol not " - "present (error=%lu)", argv[0], status & ~EFI_ERROR_MASK); + "present (error=%lu)", argv[0], EFI_ERROR_CODE(status)); return (CMD_ERROR); } @@ -494,7 +494,7 @@ command_gop(int argc, char *argv[]) if (EFI_ERROR(status)) { sprintf(command_errbuf, "%s: Unable to set mode to " "%u (error=%lu)", argv[0], mode, - status & ~EFI_ERROR_MASK); + EFI_ERROR_CODE(status)); return (CMD_ERROR); } } else if (!strcmp(argv[1], "get")) { @@ -541,7 +541,7 @@ command_uga(int argc, char *argv[]) status = BS->LocateProtocol(&uga_guid, NULL, (VOID **)&uga); if (EFI_ERROR(status)) { sprintf(command_errbuf, "%s: UGA Protocol not present " - "(error=%lu)", argv[0], status & ~EFI_ERROR_MASK); + "(error=%lu)", argv[0], EFI_ERROR_CODE(status)); return (CMD_ERROR); } diff --git a/sys/boot/efi/loader/bootinfo.c b/sys/boot/efi/loader/bootinfo.c index 622f4c61dadf..ac665b200d91 100644 --- a/sys/boot/efi/loader/bootinfo.c +++ b/sys/boot/efi/loader/bootinfo.c @@ -290,7 +290,7 @@ bi_load_efi_data(struct preloaded_file *kfp) pages, &addr); if (EFI_ERROR(status)) { printf("%s: AllocatePages error %lu\n", __func__, - (unsigned long)(status & ~EFI_ERROR_MASK)); + EFI_ERROR_CODE(status)); return (ENOMEM); } @@ -306,7 +306,7 @@ bi_load_efi_data(struct preloaded_file *kfp) status = BS->GetMemoryMap(&sz, mm, &efi_mapkey, &mmsz, &mmver); if (EFI_ERROR(status)) { printf("%s: GetMemoryMap error %lu\n", __func__, - (unsigned long)(status & ~EFI_ERROR_MASK)); + EFI_ERROR_CODE(status)); return (EINVAL); } status = BS->ExitBootServices(IH, efi_mapkey); @@ -320,8 +320,7 @@ bi_load_efi_data(struct preloaded_file *kfp) } BS->FreePages(addr, pages); } - printf("ExitBootServices error %lu\n", - (unsigned long)(status & ~EFI_ERROR_MASK)); + printf("ExitBootServices error %lu\n", EFI_ERROR_CODE(status)); return (EINVAL); } diff --git a/sys/boot/efi/loader/copy.c b/sys/boot/efi/loader/copy.c index 
716e9ea328a2..8714786c3471 100644 --- a/sys/boot/efi/loader/copy.c +++ b/sys/boot/efi/loader/copy.c @@ -56,7 +56,7 @@ efi_copy_init(void) STAGE_PAGES, &staging); if (EFI_ERROR(status)) { printf("failed to allocate staging area: %lu\n", - (unsigned long)(status & EFI_ERROR_MASK)); + EFI_ERROR_CODE(status)); return (status); } staging_end = staging + STAGE_PAGES * EFI_PAGE_SIZE; From f548a62da9522cf2fa7c215a4381b6d765797545 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 6 Jan 2016 19:18:43 +0000 Subject: [PATCH 10/48] loader.efi style(9) cleanup Submitted by: smh --- sys/boot/efi/loader/main.c | 64 +++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/sys/boot/efi/loader/main.c b/sys/boot/efi/loader/main.c index 01123b3a6715..1fa031f29afe 100644 --- a/sys/boot/efi/loader/main.c +++ b/sys/boot/efi/loader/main.c @@ -227,50 +227,47 @@ command_memmap(int argc, char *argv[]) status = BS->GetMemoryMap(&sz, 0, &key, &dsz, &dver); if (status != EFI_BUFFER_TOO_SMALL) { printf("Can't determine memory map size\n"); - return CMD_ERROR; + return (CMD_ERROR); } map = malloc(sz); status = BS->GetMemoryMap(&sz, map, &key, &dsz, &dver); if (EFI_ERROR(status)) { printf("Can't read memory map\n"); - return CMD_ERROR; + return (CMD_ERROR); } ndesc = sz / dsz; printf("%23s %12s %12s %8s %4s\n", - "Type", "Physical", "Virtual", "#Pages", "Attr"); + "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = NextMemoryDescriptor(p, dsz)) { - printf("%23s %012lx %012lx %08lx ", - types[p->Type], - p->PhysicalStart, - p->VirtualStart, - p->NumberOfPages); - if (p->Attribute & EFI_MEMORY_UC) - printf("UC "); - if (p->Attribute & EFI_MEMORY_WC) - printf("WC "); - if (p->Attribute & EFI_MEMORY_WT) - printf("WT "); - if (p->Attribute & EFI_MEMORY_WB) - printf("WB "); - if (p->Attribute & EFI_MEMORY_UCE) - printf("UCE "); - if (p->Attribute & EFI_MEMORY_WP) - printf("WP "); - if (p->Attribute & EFI_MEMORY_RP) - 
printf("RP "); - if (p->Attribute & EFI_MEMORY_XP) - printf("XP "); - printf("\n"); + printf("%23s %012lx %012lx %08lx ", types[p->Type], + p->PhysicalStart, p->VirtualStart, p->NumberOfPages); + if (p->Attribute & EFI_MEMORY_UC) + printf("UC "); + if (p->Attribute & EFI_MEMORY_WC) + printf("WC "); + if (p->Attribute & EFI_MEMORY_WT) + printf("WT "); + if (p->Attribute & EFI_MEMORY_WB) + printf("WB "); + if (p->Attribute & EFI_MEMORY_UCE) + printf("UCE "); + if (p->Attribute & EFI_MEMORY_WP) + printf("WP "); + if (p->Attribute & EFI_MEMORY_RP) + printf("RP "); + if (p->Attribute & EFI_MEMORY_XP) + printf("XP "); + printf("\n"); } - return CMD_OK; + return (CMD_OK); } -COMMAND_SET(configuration, "configuration", - "print configuration tables", command_configuration); +COMMAND_SET(configuration, "configuration", "print configuration tables", + command_configuration); static const char * guid_to_string(EFI_GUID *guid) @@ -318,7 +315,7 @@ command_configuration(int argc, char *argv[]) printf(" at %p\n", ST->ConfigurationTable[i].VendorTable); } - return CMD_OK; + return (CMD_OK); } @@ -395,20 +392,17 @@ command_nvram(int argc, char *argv[]) status = RS->GetNextVariableName(&varsz, NULL, NULL); for (; status != EFI_NOT_FOUND; ) { - status = RS->GetNextVariableName(&varsz, var, - &varguid); + status = RS->GetNextVariableName(&varsz, var, &varguid); //if (EFI_ERROR(status)) //break; conout->OutputString(conout, var); printf("="); datasz = 0; - status = RS->GetVariable(var, &varguid, NULL, &datasz, - NULL); + status = RS->GetVariable(var, &varguid, NULL, &datasz, NULL); /* XXX: check status */ data = malloc(datasz); - status = RS->GetVariable(var, &varguid, NULL, &datasz, - data); + status = RS->GetVariable(var, &varguid, NULL, &datasz, data); if (EFI_ERROR(status)) printf(""); else { From 0d7911c03bee7e46db416b105e21b53e9df00598 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 6 Jan 2016 19:41:06 +0000 Subject: [PATCH 11/48] libunwind: Include header for 
dl_unwind_find_exidx for ARM EHABI --- contrib/llvm/projects/libunwind/src/AddressSpace.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/llvm/projects/libunwind/src/AddressSpace.hpp b/contrib/llvm/projects/libunwind/src/AddressSpace.hpp index 73013c73ff71..55828df2856d 100644 --- a/contrib/llvm/projects/libunwind/src/AddressSpace.hpp +++ b/contrib/llvm/projects/libunwind/src/AddressSpace.hpp @@ -37,6 +37,7 @@ namespace libunwind { #if _LIBUNWIND_ARM_EHABI #if defined(__FreeBSD__) +#include typedef void *_Unwind_Ptr; #elif defined(__linux__) From d446abfaec2f19ec107f485615f80de47c8f8c64 Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Wed, 6 Jan 2016 20:22:28 +0000 Subject: [PATCH 12/48] Fix _MSC_EXTENSIONS checks Use #ifdef instead of #if checks to prevent warnings generated by checks to be enabled shortly. MFC after: 2 weeks Sponsored by: Multiplay --- sys/boot/efi/include/amd64/efibind.h | 6 +++--- sys/boot/efi/include/arm64/efibind.h | 4 ++-- sys/boot/efi/include/i386/efibind.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sys/boot/efi/include/amd64/efibind.h b/sys/boot/efi/include/amd64/efibind.h index 3d70b58a6271..d7a8dc04d601 100644 --- a/sys/boot/efi/include/amd64/efibind.h +++ b/sys/boot/efi/include/amd64/efibind.h @@ -39,7 +39,7 @@ Revision History // No ANSI C 1999/2000 stdint.h integer width declarations - #if _MSC_EXTENSIONS + #ifdef _MSC_EXTENSIONS // Use Microsoft C compiler integer width declarations @@ -164,7 +164,7 @@ typedef uint64_t UINTN; #endif #ifndef EFIAPI // Forces EFI calling conventions reguardless of compiler options - #if _MSC_EXTENSIONS + #ifdef _MSC_EXTENSIONS #define EFIAPI __cdecl // Force C calling convention for Microsoft C compiler #else #define EFIAPI // Substitute expresion to force C calling convention @@ -265,7 +265,7 @@ typedef uint64_t UINTN; #endif #endif /* __FreeBSD__ */ -#if _MSC_EXTENSIONS +#ifdef _MSC_EXTENSIONS #pragma warning ( disable : 4731 ) // Suppress warnings about 
modification of EBP #endif diff --git a/sys/boot/efi/include/arm64/efibind.h b/sys/boot/efi/include/arm64/efibind.h index 21f0d25d70e7..6569f96fcf84 100644 --- a/sys/boot/efi/include/arm64/efibind.h +++ b/sys/boot/efi/include/arm64/efibind.h @@ -39,7 +39,7 @@ Revision History // No ANSI C 1999/2000 stdint.h integer width declarations - #if _MSC_EXTENSIONS + #ifdef _MSC_EXTENSIONS // Use Microsoft C compiler integer width declarations @@ -159,7 +159,7 @@ typedef uint64_t UINTN; // #ifndef EFIAPI // Forces EFI calling conventions reguardless of compiler options - #if _MSC_EXTENSIONS + #ifdef _MSC_EXTENSIONS #define EFIAPI __cdecl // Force C calling convention for Microsoft C compiler #else #define EFIAPI // Substitute expresion to force C calling convention diff --git a/sys/boot/efi/include/i386/efibind.h b/sys/boot/efi/include/i386/efibind.h index de3658fb95e0..6e5a7163a97c 100644 --- a/sys/boot/efi/include/i386/efibind.h +++ b/sys/boot/efi/include/i386/efibind.h @@ -39,7 +39,7 @@ Revision History // No ANSI C 1999/2000 stdint.h integer width declarations - #if _MSC_EXTENSIONS + #ifdef _MSC_EXTENSIONS // Use Microsoft C compiler integer width declarations @@ -160,7 +160,7 @@ typedef uint32_t UINTN; // #ifndef EFIAPI // Forces EFI calling conventions reguardless of compiler options - #if _MSC_EXTENSIONS + #ifdef _MSC_EXTENSIONS #define EFIAPI __cdecl // Force C calling convention for Microsoft C compiler #else #define EFIAPI // Substitute expresion to force C calling convention @@ -261,7 +261,7 @@ typedef uint32_t UINTN; #endif #endif /* __FreeBSD__ */ -#if _MSC_EXTENSIONS +#ifdef _MSC_EXTENSIONS #pragma warning ( disable : 4731 ) // Suppress warnings about modification of EBP #endif From 5d07d143e6f8b100ab0b711b4297160d8717bb01 Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Wed, 6 Jan 2016 20:25:41 +0000 Subject: [PATCH 13/48] Fix return from zfs_probe_dev Ensure zfs_probe_dev returns the correct value. Also fix a style(9) trailing whitespace issue while here. 
MFC after: 2 weeks X-MFC-With: r293268 Sponsored by: Multiplay --- sys/boot/zfs/zfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c index 0e15ac48ff6a..fdb79bb21c1e 100644 --- a/sys/boot/zfs/zfs.c +++ b/sys/boot/zfs/zfs.c @@ -154,7 +154,7 @@ zfs_read(struct open_file *f, void *start, size_t size, size_t *resid /* out */) n = size; if (fp->f_seekp + n > sb.st_size) n = sb.st_size - fp->f_seekp; - + rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n); if (rc) return (rc); @@ -507,7 +507,7 @@ zfs_probe_dev(const char *devname, uint64_t *pool_guid) } } close(pa.fd); - return (0); + return (ret); } /* From 9697b154f2ee4f53f68d6388d9c00ad4cf9a7073 Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Wed, 6 Jan 2016 20:28:09 +0000 Subject: [PATCH 14/48] Fix const conversion warning in lz4_decompress Fix const conversion warning in lz4_decompress which shows when warnings are enabled (to be done later). MFC after: 2 weeks X-MFC-With: r293268 Sponsored by: Multiplay --- sys/cddl/boot/zfs/lz4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/cddl/boot/zfs/lz4.c b/sys/cddl/boot/zfs/lz4.c index 055bd627fd10..c29f8614155d 100644 --- a/sys/cddl/boot/zfs/lz4.c +++ b/sys/cddl/boot/zfs/lz4.c @@ -52,7 +52,7 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int dum * Returns 0 on success (decompression function returned non-negative) * and non-zero on failure (decompression function returned negative). */ - return (LZ4_uncompress_unknownOutputSize(s_start + 4, d_start, bufsiz, + return (LZ4_uncompress_unknownOutputSize((const char *)s_start + 4, d_start, bufsiz, d_len) < 0); } From 5fc656f1876fd2ec306182f830049874b23fbfa3 Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Wed, 6 Jan 2016 20:48:29 +0000 Subject: [PATCH 15/48] style(9) fixes for EFI boot Fix some style(9) nits for EFI boot code, no functional changes. 
MFC after: 2 weeks X-MFC-With: r293268 Sponsored by: Multiplay --- sys/boot/efi/boot1/boot1.c | 3 +-- sys/boot/efi/loader/devicename.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sys/boot/efi/boot1/boot1.c b/sys/boot/efi/boot1/boot1.c index febc5f421e15..e2e4c907dd01 100644 --- a/sys/boot/efi/boot1/boot1.c +++ b/sys/boot/efi/boot1/boot1.c @@ -132,8 +132,7 @@ EFI_STATUS efi_main(EFI_HANDLE Ximage, EFI_SYSTEM_TABLE* Xsystab) conout->Reset(conout, TRUE); max_dim = best_mode = 0; for (i = 0; ; i++) { - status = conout->QueryMode(conout, i, - &cols, &rows); + status = conout->QueryMode(conout, i, &cols, &rows); if (EFI_ERROR(status)) break; if (cols * rows > max_dim) { diff --git a/sys/boot/efi/loader/devicename.c b/sys/boot/efi/loader/devicename.c index 89f994112dda..1ba33e8f7ad0 100644 --- a/sys/boot/efi/loader/devicename.c +++ b/sys/boot/efi/loader/devicename.c @@ -147,7 +147,7 @@ efi_fmtdev(void *vdev) break; } - return(buf); + return (buf); } /* @@ -161,7 +161,7 @@ efi_setcurrdev(struct env_var *ev, int flags, const void *value) rv = efi_parsedev(&ncurr, value, NULL); if (rv != 0) - return(rv); + return (rv); free(ncurr); env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL); From 2332dd949202c2e8f97a20f870bc7d177f6e1302 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Wed, 6 Jan 2016 21:47:49 +0000 Subject: [PATCH 16/48] Use standard name for ASCII LF and FF control codes PR: 205778 MFC after: 2 weeks --- share/man/man7/ascii.7 | 8 ++++---- share/misc/ascii | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/share/man/man7/ascii.7 b/share/man/man7/ascii.7 index a9c75ad9031b..aa02af9a2832 100644 --- a/share/man/man7/ascii.7 +++ b/share/man/man7/ascii.7 @@ -28,7 +28,7 @@ .\" @(#)ascii.7 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd June 5, 1993 +.Dd January 6, 2016 .Dt ASCII 7 .Os .Sh NAME @@ -42,7 +42,7 @@ The set: .Bd -literal -offset left 000 NUL 001 SOH 002 STX 003 ETX 004 EOT 005 ENQ 006 ACK 007 BEL -010 BS 
011 HT 012 NL 013 VT 014 NP 015 CR 016 SO 017 SI +010 BS 011 HT 012 LF 013 VT 014 FF 015 CR 016 SO 017 SI 020 DLE 021 DC1 022 DC2 023 DC3 024 DC4 025 NAK 026 SYN 027 ETB 030 CAN 031 EM 032 SUB 033 ESC 034 FS 035 GS 036 RS 037 US 040 SP 041 ! 042 " 043 # 044 $ 045 % 046 & 047 ' @@ -64,7 +64,7 @@ The set: .Bd -literal -offset left 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL -08 BS 09 HT 0A NL 0B VT 0C NP 0D CR 0E SO 0F SI +08 BS 09 HT 0A LF 0B VT 0C FF 0D CR 0E SO 0F SI 10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB 18 CAN 19 EM 1A SUB 1B ESC 1C FS 1D GS 1E RS 1F US 20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 ' @@ -86,7 +86,7 @@ The set: .Bd -literal -offset left 0 NUL 1 SOH 2 STX 3 ETX 4 EOT 5 ENQ 6 ACK 7 BEL - 8 BS 9 HT 10 NL 11 VT 12 NP 13 CR 14 SO 15 SI + 8 BS 9 HT 10 LF 11 VT 12 FF 13 CR 14 SO 15 SI 16 DLE 17 DC1 18 DC2 19 DC3 20 DC4 21 NAK 22 SYN 23 ETB 24 CAN 25 EM 26 SUB 27 ESC 28 FS 29 GS 30 RS 31 US 32 SP 33 ! 34 " 35 # 36 $ 37 % 38 & 39 ' diff --git a/share/misc/ascii b/share/misc/ascii index b7bcef5c467a..2a71a4305dd8 100644 --- a/share/misc/ascii +++ b/share/misc/ascii @@ -1,5 +1,5 @@ |000 nul|001 soh|002 stx|003 etx|004 eot|005 enq|006 ack|007 bel| -|010 bs |011 ht |012 nl |013 vt |014 np |015 cr |016 so |017 si | +|010 bs |011 ht |012 lf |013 vt |014 ff |015 cr |016 so |017 si | |020 dle|021 dc1|022 dc2|023 dc3|024 dc4|025 nak|026 syn|027 etb| |030 can|031 em |032 sub|033 esc|034 fs |035 gs |036 rs |037 us | |040 sp |041 ! |042 " |043 # |044 $ |045 % |046 & |047 ' | @@ -16,7 +16,7 @@ |170 x |171 y |172 z |173 { |174 | |175 } |176 ~ |177 del| | 00 nul| 01 soh| 02 stx| 03 etx| 04 eot| 05 enq| 06 ack| 07 bel| -| 08 bs | 09 ht | 0a nl | 0b vt | 0c np | 0d cr | 0e so | 0f si | +| 08 bs | 09 ht | 0a lf | 0b vt | 0c ff | 0d cr | 0e so | 0f si | | 10 dle| 11 dc1| 12 dc2| 13 dc3| 14 dc4| 15 nak| 16 syn| 17 etb| | 18 can| 19 em | 1a sub| 1b esc| 1c fs | 1d gs | 1e rs | 1f us | | 20 sp | 21 ! 
| 22 " | 23 # | 24 $ | 25 % | 26 & | 27 ' | @@ -33,7 +33,7 @@ | 78 x | 79 y | 7a z | 7b { | 7c | | 7d } | 7e ~ | 7f del| | 0 nul| 1 soh| 2 stx| 3 etx| 4 eot| 5 enq| 6 ack| 7 bel| -| 8 bs | 9 ht | 10 nl | 11 vt | 12 np | 13 cr | 14 so | 15 si | +| 8 bs | 9 ht | 10 lf | 11 vt | 12 ff | 13 cr | 14 so | 15 si | | 16 dle| 17 dc1| 18 dc2| 19 dc3| 20 dc4| 21 nak| 22 syn| 23 etb| | 24 can| 25 em | 26 sub| 27 esc| 28 fs | 29 gs | 30 rs | 31 us | | 32 sp | 33 ! | 34 " | 35 # | 36 $ | 37 % | 38 & | 39 ' | From b46e917554ee224fb1035c66bf4f7bbfccb3a922 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 6 Jan 2016 21:58:45 +0000 Subject: [PATCH 17/48] Provide knob NO_INSTALLEXTRAKERNELS. If defined, extra kernels in KERNCONF won't be installed, only the first one would. --- Makefile.inc1 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.inc1 b/Makefile.inc1 index 5102f9d0ec3f..f56fc71ddcf5 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -1230,7 +1230,7 @@ reinstallkernel reinstallkernel.debug: _installcheck_kernel ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME} ${.TARGET:S/kernel//} .endif -.if ${BUILDKERNELS:[#]} > 1 +.if ${BUILDKERNELS:[#]} > 1 && !defined(NO_INSTALLEXTRAKERNELS) .for _kernel in ${BUILDKERNELS:[2..-1]} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${_kernel}" @@ -1261,7 +1261,7 @@ distributekernel distributekernel.debug: ${DESTDIR}/${DISTDIR}/kernel.meta .endif .endif -.if ${BUILDKERNELS:[#]} > 1 +.if ${BUILDKERNELS:[#]} > 1 && !defined(NO_INSTALLEXTRAKERNELS) .for _kernel in ${BUILDKERNELS:[2..-1]} .if defined(NO_ROOT) echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta @@ -1292,7 +1292,7 @@ packagekernel: tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz -.if ${BUILDKERNELS:[#]} > 1 +.if ${BUILDKERNELS:[#]} > 1 && 
!defined(NO_INSTALLEXTRAKERNELS) .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' \ @@ -1313,7 +1313,7 @@ packagekernel: cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz -.if ${BUILDKERNELS:[#]} > 1 +.if ${BUILDKERNELS:[#]} > 1 && !defined(NO_INSTALLEXTRAKERNELS) .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' . | \ From 0c39d38d215df8423974f87134838fafc0170552 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Thu, 7 Jan 2016 00:14:42 +0000 Subject: [PATCH 18/48] Historically we have two fields in tcpcb to describe sender MSS: t_maxopd, and t_maxseg. This dualism emerged with T/TCP, but was not properly cleaned up after T/TCP removal. After all permutations over the years the result is that t_maxopd stores a minimum of peer offered MSS and MTU reduced by minimum protocol header. And t_maxseg stores (t_maxopd - TCPOLEN_TSTAMP_APPA) if timestamps are in action, or is equal to t_maxopd otherwise. That's a very rough estimate of MSS reduced by options length. Throughout the code it was used in places, where preciseness was not important, like cwnd or ssthresh calculations. With this change: - t_maxopd goes away. - t_maxseg now stores MSS not adjusted by options. - new function tcp_maxseg() is provided, that calculates MSS reduced by options length. The function gives a better estimate, since it takes into account SACK state as well. 
Reviewed by: jtl Differential Revision: https://reviews.freebsd.org/D3593 --- sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c | 3 +- sys/dev/cxgbe/tom/t4_cpl_io.c | 3 +- sys/netinet/tcp_input.c | 102 +++++++++++++---------------- sys/netinet/tcp_output.c | 16 ++--- sys/netinet/tcp_subr.c | 57 +++++++++++++++- sys/netinet/tcp_timer.c | 21 +++--- sys/netinet/tcp_usrreq.c | 10 ++- sys/netinet/tcp_var.h | 5 +- 8 files changed, 123 insertions(+), 94 deletions(-) diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c index 985306c3c6e8..d896020abf39 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c @@ -1536,14 +1536,13 @@ assign_rxopt(struct tcpcb *tp, uint16_t tcpopt) struct toepcb *toep = tp->t_toe; struct adapter *sc = toep->tp_tod->tod_softc; - tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40; + tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40; if (G_TCPOPT_TSTAMP(tcpopt)) { tp->t_flags |= TF_RCVD_TSTMP; tp->t_flags |= TF_REQ_TSTMP; /* forcibly set */ tp->ts_recent = 0; /* XXX */ tp->ts_recent_age = tcp_ts_getticks(); - tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; } if (G_TCPOPT_SACK(tcpopt)) diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index d58592e85b49..f18f115c3202 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -221,7 +221,7 @@ assign_rxopt(struct tcpcb *tp, unsigned int opt) n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else n = sizeof(struct ip) + sizeof(struct tcphdr); - tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - n; + tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n; CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid, G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]); @@ -230,7 +230,6 @@ assign_rxopt(struct tcpcb *tp, unsigned int opt) tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ tp->ts_recent = 0; /* hmmm */ tp->ts_recent_age = tcp_ts_getticks(); - 
tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; } if (G_TCPOPT_SACK(opt)) diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index a763e464ba48..3a979a0c42d7 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -290,7 +290,7 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) if (type == CC_ACK) { if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, - V_tcp_abc_l_var * tp->t_maxseg); + V_tcp_abc_l_var * tcp_maxseg(tp)); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; @@ -313,11 +313,13 @@ cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; + u_int maxseg; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); + maxseg = tcp_maxseg(tp); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; @@ -342,7 +344,7 @@ cc_conn_init(struct tcpcb *tp) * the slow start threshhold, but set the * threshold to no less than 2*mss. */ - tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); + tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } @@ -359,21 +361,20 @@ cc_conn_init(struct tcpcb *tp) * requiring us to be cautious. 
*/ if (tp->snd_cwnd == 1) - tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ + tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ else if (V_tcp_initcwnd_segments) - tp->snd_cwnd = min(V_tcp_initcwnd_segments * tp->t_maxseg, - max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460)); + tp->snd_cwnd = min(V_tcp_initcwnd_segments * maxseg, + max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); else if (V_tcp_do_rfc3390) - tp->snd_cwnd = min(4 * tp->t_maxseg, - max(2 * tp->t_maxseg, 4380)); + tp->snd_cwnd = min(4 * maxseg, max(2 * maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ - if (tp->t_maxseg > 2190) - tp->snd_cwnd = 2 * tp->t_maxseg; - else if (tp->t_maxseg > 1095) - tp->snd_cwnd = 3 * tp->t_maxseg; + if (maxseg > 2190) + tp->snd_cwnd = 2 * maxseg; + else if (maxseg > 1095) + tp->snd_cwnd = 3 * maxseg; else - tp->snd_cwnd = 4 * tp->t_maxseg; + tp->snd_cwnd = 4 * maxseg; } if (CC_ALGO(tp)->conn_init != NULL) @@ -383,6 +384,8 @@ cc_conn_init(struct tcpcb *tp) void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { + u_int maxseg; + INP_WLOCK_ASSERT(tp->t_inpcb); switch(type) { @@ -402,12 +405,13 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) } break; case CC_RTO: + maxseg = tcp_maxseg(tp); tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / - tp->t_maxseg) * tp->t_maxseg; - tp->snd_cwnd = tp->t_maxseg; + maxseg) * maxseg; + tp->snd_cwnd = maxseg; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); @@ -469,13 +473,11 @@ tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. - * - Delayed acks are enabled or this is a half-synchronized T/TCP - * connection. 
*/ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tlen <= tp->t_maxopd) && \ + (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) static void inline @@ -2481,6 +2483,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, hhook_run_tcp_est_in(tp, th, &to); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + u_int maxseg; + + maxseg = tcp_maxseg(tp); if (tlen == 0 && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { @@ -2560,12 +2565,12 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { - tp->snd_cwnd += tp->t_maxseg; + tp->snd_cwnd += maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else - tp->snd_cwnd += tp->t_maxseg; + tp->snd_cwnd += maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { @@ -2599,18 +2604,18 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; - tp->snd_cwnd = tp->t_maxseg; + tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; - tp->snd_cwnd = tp->t_maxseg; + tp->snd_cwnd = maxseg; (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + - tp->t_maxseg * + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; @@ -2641,7 +2646,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * - tp->t_maxseg; + maxseg; /* * Only call tcp_output when there * is new data available to be sent. 
@@ -2654,10 +2659,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if (avail > 0) (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; - if (sent > tp->t_maxseg) { + if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || - (sent == tp->t_maxseg + 1 && + (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); @@ -3510,11 +3515,9 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt) * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * - * Also take into account the space needed for options that we - * send regularly. Make maxseg shorter by that amount to assure - * that we can send maxseg amount of data even when the options - * are present. Store the upper limit of the length of options plus - * data in maxopd. + * NOTE that resulting t_maxseg doesn't include space for TCP options or + * IP options, e.g. IPSEC data, since length of this data may vary, and + * thus it is calculated for every segment separately in tcp_output(). * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS @@ -3528,7 +3531,6 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, u_long maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; - int origoffer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? @@ -3544,13 +3546,12 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } - origoffer = offer; /* Initialize. 
*/ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); - tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; + tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) @@ -3559,7 +3560,7 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); - tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; + tp->t_maxseg = V_tcp_mssdflt; } #endif @@ -3583,9 +3584,9 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as - * already assigned to t_maxopd above. + * already assigned to t_maxseg above. */ - offer = tp->t_maxopd; + offer = tp->t_maxseg; break; case -1: @@ -3657,31 +3658,15 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, mss = min(mss, offer); /* - * Sanity check: make sure that maxopd will be large + * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. + * + * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ mss = max(mss, 64); - /* - * maxopd stores the maximum length of data AND options - * in a segment; maxseg is the amount of data in a normal - * segment. We need to store this value (maxopd) apart - * from maxseg, because now every segment carries options - * and thus we normally have somewhat less data in segments. - */ - tp->t_maxopd = mss; - - /* - * origoffer==-1 indicates that no segments were received yet. - * In this case we just guess. 
- */ - if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && - (origoffer == -1 || - (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) - mss -= TCPOLEN_TSTAMP_APPA; - tp->t_maxseg = mss; } @@ -3804,7 +3789,8 @@ void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; - u_long ocwnd = tp->snd_cwnd; + u_long ocwnd = tp->snd_cwnd; + u_int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tp->t_inpcb); @@ -3815,7 +3801,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ - tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); + tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); tp->snd_cwnd = ocwnd; @@ -3829,7 +3815,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; - tp->snd_cwnd += tp->t_maxseg; + tp->snd_cwnd += maxseg; } int diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 482ead5a329b..3c32d77c377e 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -830,11 +830,11 @@ tcp_output(struct tcpcb *tp) /* * Adjust data length if insertion of options will - * bump the packet length beyond the t_maxopd length. + * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. 
*/ - if (len + optlen + ipoptlen > tp->t_maxopd) { + if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { @@ -937,7 +937,7 @@ tcp_output(struct tcpcb *tp) * fractional unless the send sockbuf can be * emptied: */ - max_len = (tp->t_maxopd - optlen); + max_len = (tp->t_maxseg - optlen); if ((off + len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { @@ -967,7 +967,7 @@ tcp_output(struct tcpcb *tp) sendalot = 1; } else { - len = tp->t_maxopd - optlen - ipoptlen; + len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else @@ -1277,10 +1277,10 @@ tcp_output(struct tcpcb *tp) * The TCP pseudo header checksum is always provided. */ if (tso) { - KASSERT(len > tp->t_maxopd - optlen, + KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; - m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; + m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } #ifdef IPSEC @@ -1348,7 +1348,7 @@ tcp_output(struct tcpcb *tp) */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); - if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; @@ -1394,7 +1394,7 @@ tcp_output(struct tcpcb *tp) * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ - if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) { + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { ip->ip_off |= htons(IP_DF); tp->t_flags2 |= TF2_PLPMTU_PMTUD; } else { diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index c2e0696394a5..9f21f116f164 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1087,7 +1087,7 @@ tcp_newtcpcb(struct inpcb *inp) #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ - tp->t_maxseg = tp->t_maxopd = + tp->t_maxseg = #ifdef INET6 isipv6 ? 
V_tcp_v6mssdflt : #endif /* INET6 */ @@ -1901,7 +1901,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) * Only process the offered MTU if it * is smaller than the current one. */ - if (mtu < tp->t_maxopd + + if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = faddr; @@ -2283,6 +2283,59 @@ tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) } #endif /* INET6 */ +/* + * Calculate effective SMSS per RFC5681 definition for a given TCP + * connection at its current state, taking into account SACK and etc. + */ +u_int +tcp_maxseg(const struct tcpcb *tp) +{ + u_int optlen; + + if (tp->t_flags & TF_NOOPT) + return (tp->t_maxseg); + + /* + * Here we have a simplified code from tcp_addoptions(), + * without a proper loop, and having most of paddings hardcoded. + * We might make mistakes with padding here in some edge cases, + * but this is harmless, since result of tcp_maxseg() is used + * only in cwnd and ssthresh estimations. + */ +#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) + if (TCPS_HAVEESTABLISHED(tp->t_state)) { + if (tp->t_flags & TF_RCVD_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = 0; +#ifdef TCP_SIGNATURE + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { + optlen += TCPOLEN_SACKHDR; + optlen += tp->rcv_numsacks * TCPOLEN_SACK; + optlen = PAD(optlen); + } + } else { + if (tp->t_flags & TF_REQ_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = PAD(TCPOLEN_MAXSEG); + if (tp->t_flags & TF_REQ_SCALE) + optlen += PAD(TCPOLEN_WINDOW); +#ifdef TCP_SIGNATURE + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if (tp->t_flags & TF_SACK_PERMIT) + optlen += PAD(TCPOLEN_SACK_PERMITTED); + } +#undef PAD + optlen = min(optlen, TCP_MAXOLEN); + return (tp->t_maxseg - optlen); +} + #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. 
*/ size_t diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 9767eb709056..fb4ff11a4a72 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -660,7 +660,6 @@ tcp_timer_rexmt(void * xtp) */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { - int optlen; #ifdef INET6 int isipv6; #endif @@ -684,8 +683,7 @@ tcp_timer_rexmt(void * xtp) tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ - optlen = tp->t_maxopd - tp->t_maxseg; - tp->t_pmtud_saved_maxopd = tp->t_maxopd; + tp->t_pmtud_saved_maxseg = tp->t_maxseg; /* * Reduce the MSS to blackhole value or to the default @@ -694,13 +692,13 @@ tcp_timer_rexmt(void * xtp) #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && - tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { + tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ - tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; + tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else if (isipv6) { /* Use the default MSS. */ - tp->t_maxopd = V_tcp_v6mssdflt; + tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. @@ -713,13 +711,13 @@ tcp_timer_rexmt(void * xtp) else #endif #ifdef INET - if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { + if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ - tp->t_maxopd = V_tcp_pmtud_blackhole_mss; + tp->t_maxseg = V_tcp_pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else { /* Use the default MSS. */ - tp->t_maxopd = V_tcp_mssdflt; + tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. @@ -728,7 +726,6 @@ tcp_timer_rexmt(void * xtp) V_tcp_pmtud_blackhole_activated_min_mss++; } #endif - tp->t_maxseg = tp->t_maxopd - optlen; /* * Reset the slow-start flight size * as it may depend on the new MSS. 
@@ -748,9 +745,7 @@ tcp_timer_rexmt(void * xtp) (tp->t_rxtshift > 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; - optlen = tp->t_maxopd - tp->t_maxseg; - tp->t_maxopd = tp->t_pmtud_saved_maxopd; - tp->t_maxseg = tp->t_maxopd - optlen; + tp->t_maxseg = tp->t_pmtud_saved_maxseg; V_tcp_pmtud_blackhole_failed++; /* * Reset the slow-start flight size as it diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 3435668dd8cb..76bc8aac0d99 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -904,8 +904,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, /* * Do implied connect if not yet connected, * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. + * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) @@ -964,8 +963,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, /* * Do implied connect if not yet connected, * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. + * initialize maxseg using peer's cached MSS. 
*/ #ifdef INET6 if (isipv6) @@ -2208,8 +2206,8 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); - db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n", - tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); + db_printf("t_rcvtime: %u t_startime: %u\n", + tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %u t_rtsq: 0x%08x\n", diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 8d76912275a1..6cd4cf05874a 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -180,8 +180,6 @@ struct tcpcb { u_long snd_spare2; /* unused */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ - u_int t_maxopd; /* mss plus options */ - u_int t_rcvtime; /* inactivity time */ u_int t_starttime; /* time connection was established */ u_int t_rtttime; /* RTT measurement start time */ @@ -192,6 +190,7 @@ struct tcpcb { int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ + u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ @@ -251,7 +250,6 @@ struct tcpcb { u_int t_tsomax; /* TSO total burst length limit in bytes */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ - u_int t_pmtud_saved_maxopd; /* pre-blackhole MSS */ u_int t_flags2; /* More tcpcb flags storage */ #if defined(_KERNEL) && defined(TCP_RFC7413) uint32_t t_ispare[6]; /* 5 UTO, 1 TBD */ @@ -775,6 +773,7 @@ int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); +u_int tcp_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); From 
dbecd1bfdf736d7fff457b6f6e28f43365cb1dfa Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Thu, 7 Jan 2016 00:15:02 +0000 Subject: [PATCH 19/48] Switch GNU ld to be installed as ld.bfd and linked as ld We intend to replace GNU ld with LLVM's lld, and on the path to there we'll experiment with having lld installed or linked as /usr/bin/ld. Thus, make ld.bfd the primary install target for GNU ld, to later facilitate making the ld link optional. Reviewed by: davide, dim Differential Revision: https://reviews.freebsd.org/D4790 --- gnu/usr.bin/binutils/ld/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gnu/usr.bin/binutils/ld/Makefile b/gnu/usr.bin/binutils/ld/Makefile index 5058c0a8e3e8..5bc3846a1434 100644 --- a/gnu/usr.bin/binutils/ld/Makefile +++ b/gnu/usr.bin/binutils/ld/Makefile @@ -6,7 +6,8 @@ ELF_SCR_EXT= x xbn xc xd xdc xdw xn xr xs xsc xsw xu xw .PATH: ${SRCDIR}/ld -PROG= ld +PROG= ld.bfd +MAN= ld.1 SCRIPTDIR= /usr/libdata/ldscripts SRCS+= ldcref.c \ ldctor.c \ @@ -48,7 +49,7 @@ CLEANFILES+= ldemul-list.h stringify.sed FILES= ${LDSCRIPTS:S|^|ldscripts/|} FILESDIR= ${SCRIPTDIR} -LINKS= ${BINDIR}/ld ${BINDIR}/ld.bfd +LINKS= ${BINDIR}/ld.bfd ${BINDIR}/ld HOST= ${TARGET_TUPLE} LIBSEARCHPATH= \"=/lib\":\"=/usr/lib\" From 3ba0785a742f0a29c3869f52bd99e6b5b8b9c0e0 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 7 Jan 2016 00:19:03 +0000 Subject: [PATCH 20/48] Move the MAKEOBJDIRPREFIX value guard to sys.mk and expand to MAKEOBJDIR. This will ensure that the variable was not set as a make override, in make.conf, src.conf or src-env.conf. It allows setting the value in src-env.conf when using WITH_AUTO_OBJ since that case properly handles changing .OBJDIR (except if MAKEOBJDIRPREFIX does not yet exist which is being discussed to be changed). This change allows setting a default MAKEOBJDIRPREFIX via local.sys.env.mk. 
Sponsored by: EMC / Isilon Storage Division --- Makefile | 7 ------- share/mk/sys.mk | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 836097b42389..c9b8a0a11bc0 100644 --- a/Makefile +++ b/Makefile @@ -143,13 +143,6 @@ TGTS+= ${BITGTS} PATH= /sbin:/bin:/usr/sbin:/usr/bin MAKEOBJDIRPREFIX?= /usr/obj -_MAKEOBJDIRPREFIX!= /usr/bin/env -i PATH=${PATH} ${MAKE} \ - ${.MAKEFLAGS:MMAKEOBJDIRPREFIX=*} __MAKE_CONF=${__MAKE_CONF} \ - -f /dev/null -V MAKEOBJDIRPREFIX dummy -.if !empty(_MAKEOBJDIRPREFIX) -.error MAKEOBJDIRPREFIX can only be set in environment, not as a global\ - (in make.conf(5)) or command-line variable. -.endif # We often need to use the tree's version of make to build it. # Choices add to complexity though. diff --git a/share/mk/sys.mk b/share/mk/sys.mk index 51f2818480d7..d8fc057b12db 100644 --- a/share/mk/sys.mk +++ b/share/mk/sys.mk @@ -33,6 +33,13 @@ __ENV_ONLY_OPTIONS:= \ ${__DEFAULT_YES_OPTIONS} \ ${__DEFAULT_DEPENDENT_OPTIONS:H} +__ENV_ONLY_VARS= \ + MAKEOBJDIR \ + MAKEOBJDIRPREFIX +.for _var in ${__ENV_ONLY_VARS} +_pre_includes_${_var:tl}:= ${${_var}:U__null} +.endfor + # early include for customization # see local.sys.mk below # Not included when building in fmake compatibility mode (still needed @@ -50,6 +57,9 @@ __ENV_ONLY_OPTIONS:= \ .endif .endif .if ${MK_AUTO_OBJ} == "yes" +# Reset, since it is allowed to be set from src-env.conf included before this. +_pre_includes_makeobjdirprefix:= ${MAKEOBJDIRPREFIX:U__null} +_pre_includes_makeobjdir:= ${MAKEOBJDIR:U__null} # This needs to be done early - before .PATH is computed # Don't do this for 'make showconfig' as it enables all options where meta mode # is not expected. @@ -410,6 +420,15 @@ __MAKE_SHELL?=/bin/sh path=${__MAKE_SHELL} .endif +# Ensure MAKEOBJDIRPREFIX was not incorrectly set. 
+.for _var in ${__ENV_ONLY_VARS} +.if ${.MAKEOVERRIDES:M${_var}} || (defined(${_var}) && \ + ${${_var}} != ${_pre_includes_${_var:tl}}) +.error ${_var} can only be set in environment, not as a global (in make.conf(5)) or command-line variable. +.endif +.undef _pre_includes_${_var:tl} +.endfor + # Hack for ports compatibility. Historically, ports makefiles have # assumed they can examine MACHINE_CPU without including anything # because this was automatically included in sys.mk. For /usr/src, From f99c0d1382b3a1ec51248c04d2bc19ddaac93221 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 7 Jan 2016 00:19:30 +0000 Subject: [PATCH 21/48] Add in DIRDEPS_BUILD support. Sponsored by: EMC / Isilon Storage Division --- lib/libsysdecode/Makefile.depend | 22 +++++++++++++++++++++ targets/pseudo/userland/lib/Makefile.depend | 1 + 2 files changed, 23 insertions(+) create mode 100644 lib/libsysdecode/Makefile.depend diff --git a/lib/libsysdecode/Makefile.depend b/lib/libsysdecode/Makefile.depend new file mode 100644 index 000000000000..653f52664dc0 --- /dev/null +++ b/lib/libsysdecode/Makefile.depend @@ -0,0 +1,22 @@ +# $FreeBSD$ +# Autogenerated - do NOT edit! 
+ +DIRDEPS = \ + gnu/lib/csu \ + gnu/lib/libgcc \ + include \ + include/rpc \ + include/xlocale \ + lib/${CSU_DIR} \ + lib/libc \ + lib/libcompiler_rt \ + + +.include <dirdeps.mk> + +.if ${DEP_RELDIR} == ${_DEP_RELDIR} +# local dependencies - needed for -jN in clean tree +ioctl.So: ioctl.c +ioctl.o: ioctl.c +ioctl.po: ioctl.c +.endif diff --git a/targets/pseudo/userland/lib/Makefile.depend b/targets/pseudo/userland/lib/Makefile.depend index c5e2407ddc18..e57164a7d917 100644 --- a/targets/pseudo/userland/lib/Makefile.depend +++ b/targets/pseudo/userland/lib/Makefile.depend @@ -136,6 +136,7 @@ DIRDEPS = \ lib/libstand \ lib/libstdbuf \ lib/libstdthreads \ + lib/libsysdecode \ lib/libtacplus \ lib/libtelnet \ lib/libthr \ From 0e87e3cb6fc5d22df0d6cd87bc9cd90d65d811f3 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 7 Jan 2016 00:20:47 +0000 Subject: [PATCH 22/48] Revert r293286. It was not intended to come in yet. --- Makefile | 7 +++++++ share/mk/sys.mk | 19 ------------------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index c9b8a0a11bc0..836097b42389 100644 --- a/Makefile +++ b/Makefile @@ -143,6 +143,13 @@ TGTS+= ${BITGTS} PATH= /sbin:/bin:/usr/sbin:/usr/bin MAKEOBJDIRPREFIX?= /usr/obj +_MAKEOBJDIRPREFIX!= /usr/bin/env -i PATH=${PATH} ${MAKE} \ + ${.MAKEFLAGS:MMAKEOBJDIRPREFIX=*} __MAKE_CONF=${__MAKE_CONF} \ + -f /dev/null -V MAKEOBJDIRPREFIX dummy +.if !empty(_MAKEOBJDIRPREFIX) +.error MAKEOBJDIRPREFIX can only be set in environment, not as a global\ + (in make.conf(5)) or command-line variable. +.endif # We often need to use the tree's version of make to build it. # Choices add to complexity though. 
diff --git a/share/mk/sys.mk b/share/mk/sys.mk index d8fc057b12db..51f2818480d7 100644 --- a/share/mk/sys.mk +++ b/share/mk/sys.mk @@ -33,13 +33,6 @@ __ENV_ONLY_OPTIONS:= \ ${__DEFAULT_YES_OPTIONS} \ ${__DEFAULT_DEPENDENT_OPTIONS:H} -__ENV_ONLY_VARS= \ - MAKEOBJDIR \ - MAKEOBJDIRPREFIX -.for _var in ${__ENV_ONLY_VARS} -_pre_includes_${_var:tl}:= ${${_var}:U__null} -.endfor - # early include for customization # see local.sys.mk below # Not included when building in fmake compatibility mode (still needed @@ -57,9 +50,6 @@ _pre_includes_${_var:tl}:= ${${_var}:U__null} .endif .endif .if ${MK_AUTO_OBJ} == "yes" -# Reset, since it is allowed to be set from src-env.conf included before this. -_pre_includes_makeobjdirprefix:= ${MAKEOBJDIRPREFIX:U__null} -_pre_includes_makeobjdir:= ${MAKEOBJDIR:U__null} # This needs to be done early - before .PATH is computed # Don't do this for 'make showconfig' as it enables all options where meta mode # is not expected. @@ -420,15 +410,6 @@ __MAKE_SHELL?=/bin/sh path=${__MAKE_SHELL} .endif -# Ensure MAKEOBJDIRPREFIX was not incorrectly set. -.for _var in ${__ENV_ONLY_VARS} -.if ${.MAKEOVERRIDES:M${_var}} || (defined(${_var}) && \ - ${${_var}} != ${_pre_includes_${_var:tl}}) -.error ${_var} can only be set in environment, not as a global (in make.conf(5)) or command-line variable. -.endif -.undef _pre_includes_${_var:tl} -.endfor - # Hack for ports compatibility. Historically, ports makefiles have # assumed they can examine MACHINE_CPU without including anything # because this was automatically included in sys.mk. For /usr/src, From b8e3d9cc79f70c21c09717b75766498d208577e3 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 7 Jan 2016 00:32:40 +0000 Subject: [PATCH 23/48] Update dependencies after r292622 moved the ioctl script. 
Sponsored by: EMC / Isilon Storage Division --- usr.bin/truss/Makefile.depend.amd64 | 3 --- 1 file changed, 3 deletions(-) diff --git a/usr.bin/truss/Makefile.depend.amd64 b/usr.bin/truss/Makefile.depend.amd64 index ad9c8b3d5906..334bb3a8d446 100644 --- a/usr.bin/truss/Makefile.depend.amd64 +++ b/usr.bin/truss/Makefile.depend.amd64 @@ -6,7 +6,6 @@ DIRDEPS = \ gnu/lib/libgcc \ include \ include/arpa \ - include/rpc \ include/xlocale \ lib/${CSU_DIR} \ lib/libc \ @@ -26,6 +25,4 @@ amd64-freebsd32.o: freebsd32_syscalls.h amd64-freebsd32.po: freebsd32_syscalls.h amd64-linux32.o: amd64-linux32_syscalls.h amd64-linux32.po: amd64-linux32_syscalls.h -ioctl.o: ioctl.c -ioctl.po: ioctl.c .endif From 6b53d1bc6f718a2c7b45cf992dfeaeb940f63d9c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 7 Jan 2016 02:04:17 +0000 Subject: [PATCH 24/48] cache: ansify functions and fix some style issues No functional changes. --- sys/kern/vfs_cache.c | 55 ++++++++++++++------------------------------ 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index ff3736d2dffa..f1bd82122372 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -288,7 +288,7 @@ static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss, "Number of cache misses"); static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap, "Number of cache misses we do not want to cache"); -static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps, +static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps, "Number of cache hits (positive) we do not want to cache"); static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits, "Number of cache hits (positive)"); @@ -303,8 +303,6 @@ SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE, &nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics"); - - static void cache_zap(struct namecache *ncp); static int vn_vptocnp_locked(struct 
vnode **vp, struct ucred *cred, char *buf, u_int *buflen); @@ -410,8 +408,7 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| * pointer to a vnode or if it is just a negative cache entry. */ static void -cache_zap(ncp) - struct namecache *ncp; +cache_zap(struct namecache *ncp) { struct vnode *vp; @@ -446,7 +443,7 @@ cache_zap(ncp) } numcache--; cache_free(ncp); - if (vp) + if (vp != NULL) vdrop(vp); } @@ -468,12 +465,8 @@ cache_zap(ncp) */ int -cache_lookup(dvp, vpp, cnp, tsp, ticksp) - struct vnode *dvp; - struct vnode **vpp; - struct componentname *cnp; - struct timespec *tsp; - int *ticksp; +cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, + struct timespec *tsp, int *ticksp) { struct namecache *ncp; uint32_t hash; @@ -701,12 +694,8 @@ cache_lookup(dvp, vpp, cnp, tsp, ticksp) * Add an entry to the cache. */ void -cache_enter_time(dvp, vp, cnp, tsp, dtsp) - struct vnode *dvp; - struct vnode *vp; - struct componentname *cnp; - struct timespec *tsp; - struct timespec *dtsp; +cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, + struct timespec *tsp, struct timespec *dtsp) { struct namecache *ncp, *n2; struct namecache_ts *n3; @@ -836,9 +825,9 @@ cache_enter_time(dvp, vp, cnp, tsp, dtsp) * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) { - CACHE_WUNLOCK(); - cache_free(ncp); - return; + CACHE_WUNLOCK(); + cache_free(ncp); + return; } KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); @@ -846,7 +835,7 @@ cache_enter_time(dvp, vp, cnp, tsp, dtsp) } numcache++; - if (!vp) { + if (vp == NULL) { numneg++; if (cnp->cn_flags & ISWHITEOUT) ncp->nc_flag |= NCF_WHITE; @@ -884,7 +873,7 @@ cache_enter_time(dvp, vp, cnp, tsp, dtsp) * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. 
*/ - if (vp) { + if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp), vp); @@ -975,8 +964,7 @@ cache_changesize(int newmaxvnodes) * Invalidate all entries to a particular vnode. */ void -cache_purge(vp) - struct vnode *vp; +cache_purge(struct vnode *vp) { CTR1(KTR_VFS, "cache_purge(%p)", vp); @@ -999,8 +987,7 @@ cache_purge(vp) * Invalidate all negative entries for a particular directory vnode. */ void -cache_purge_negative(vp) - struct vnode *vp; +cache_purge_negative(struct vnode *vp) { struct namecache *cp, *ncp; @@ -1018,8 +1005,7 @@ cache_purge_negative(vp) * Flush all entries referencing a particular filesystem. */ void -cache_purgevfs(mp) - struct mount *mp; +cache_purgevfs(struct mount *mp) { struct nchashhead *ncpp; struct namecache *ncp, *nnp; @@ -1042,12 +1028,7 @@ cache_purgevfs(mp) */ int -vfs_cache_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; +vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; @@ -1088,9 +1069,7 @@ SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, /* Implementation of the getcwd syscall. */ int -sys___getcwd(td, uap) - struct thread *td; - struct __getcwd_args *uap; +sys___getcwd(struct thread *td, struct __getcwd_args *uap) { return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, From b875c2e96d79d6174c20232829d7b84c0c3c8ca0 Mon Sep 17 00:00:00 2001 From: Josh Paetzel Date: Thu, 7 Jan 2016 05:34:39 +0000 Subject: [PATCH 25/48] Allow /etc/exports to contain usernames/groups with spaces in them. If you are getting your users/groups from a directory service such as LDAP or AD it's possible for those usernames or groupnames to contain spaces. Submitted by: Sean E. 
Fagan Reviewed by: rmacklem MFC after: 1 week Sponsored by: iXsystems --- usr.sbin/mountd/exports.5 | 2 ++ usr.sbin/mountd/mountd.c | 71 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/usr.sbin/mountd/exports.5 b/usr.sbin/mountd/exports.5 index 88e2219ab46d..018a865015ab 100644 --- a/usr.sbin/mountd/exports.5 +++ b/usr.sbin/mountd/exports.5 @@ -131,6 +131,7 @@ The credential includes all the groups to which the user is a member on the local machine (see .Xr id 1 ) . The user may be specified by name or number. +The user string may be quoted, or use backslash escaping. .Pp .Sm off .Fl maproot Li = Sy user:group1:group2:... @@ -140,6 +141,7 @@ to be used for remote access by root. The elements of the list may be either names or numbers. Note that user: should be used to distinguish a credential containing no groups from a complete credential for that user. +The group names may be quoted, or use backslash escaping. .Pp .Sm off .Fl mapall Li = Sy user diff --git a/usr.sbin/mountd/mountd.c b/usr.sbin/mountd/mountd.c index 535a3f718197..d6da2bc9c24c 100644 --- a/usr.sbin/mountd/mountd.c +++ b/usr.sbin/mountd/mountd.c @@ -174,6 +174,7 @@ static int check_options(struct dirlist *); static int checkmask(struct sockaddr *sa); static int chk_host(struct dirlist *, struct sockaddr *, int *, int *, int *, int **); +static char *strsep_quote(char **stringp, const char *delim); static int create_service(struct netconfig *nconf); static void complete_service(struct netconfig *nconf, char *port_str); static void clearout_service(void); @@ -277,6 +278,73 @@ static void SYSLOG(int, const char *, ...) __printflike(2, 3); static int debug = 0; #endif +/* + * Similar to strsep(), but it allows for quoted strings + * and escaped characters. + * + * It returns the string (or NULL, if *stringp is NULL), + * which is a de-quoted version of the string if necessary. + * + * It modifies *stringp in place. 
+ */ +static char * +strsep_quote(char **stringp, const char *delim) +{ + char *srcptr, *dstptr, *retval; + char quot = 0; + + if (stringp == NULL || *stringp == NULL) + return (NULL); + + srcptr = dstptr = retval = *stringp; + + while (*srcptr) { + /* + * We're looking for several edge cases here. + * First: if we're in quote state (quot != 0), + * then we ignore the delim characters, but otherwise + * process as normal, unless it is the quote character. + * Second: if the current character is a backslash, + * we take the next character as-is, without checking + * for delim, quote, or backslash. Exception: if the + * next character is a NUL, that's the end of the string. + * Third: if the character is a quote character, we toggle + * quote state. + * Otherwise: check the current character for NUL, or + * being in delim, and end the string if either is true. + */ + if (*srcptr == '\\') { + srcptr++; + /* + * The edge case here is if the next character + * is NUL, we want to stop processing. But if + * it's not NUL, then we simply want to copy it. + */ + if (*srcptr) { + *dstptr++ = *srcptr++; + } + continue; + } + if (quot == 0 && (*srcptr == '\'' || *srcptr == '"')) { + quot = *srcptr++; + continue; + } + if (quot && *srcptr == quot) { + /* End of the quoted part */ + quot = 0; + srcptr++; + continue; + } + if (!quot && strchr(delim, *srcptr)) + break; + *dstptr++ = *srcptr++; + } + + *dstptr = 0; /* Terminate the string */ + *stringp = (*srcptr == '\0') ? NULL : srcptr + 1; + return (retval); +} + /* * Mountd server for NFS mount protocol as described in: * NFS: Network File System Protocol Specification, RFC1094, Appendix A @@ -2831,8 +2899,9 @@ parsecred(char *namelist, struct xucred *cr) /* * Get the user's password table entry. */ - names = strsep(&namelist, " \t\n"); + names = strsep_quote(&namelist, " \t\n"); name = strsep(&names, ":"); + /* Bug? 
name could be NULL here */ if (isdigit(*name) || *name == '-') pw = getpwuid(atoi(name)); else From 4332feca4b8144d1842c7ecf98ac4f17bac3c1c6 Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Thu, 7 Jan 2016 05:47:34 +0000 Subject: [PATCH 26/48] Make additional parts of sys/geom/eli more usable in userspace The upcoming GELI support in the loader reuses parts of this code. Some ifdefs are added, and some code is moved outside of existing ifdefs. The HMAC parts of GELI are broken out into their own file, to separate them from the kernel crypto/openssl dependent parts that are replaced in the boot code. Passed the GELI regression suite (tools/regression/geom/eli) Files=20 Tests=14996 Result: PASS Reviewed by: pjd, delphij MFC after: 1 week Sponsored by: ScaleEngine Inc. Differential Revision: https://reviews.freebsd.org/D4699 --- sbin/geom/class/eli/Makefile | 1 + sys/conf/files | 1 + sys/geom/eli/g_eli.c | 83 +--------------- sys/geom/eli/g_eli.h | 96 ++++++++++++++++-- sys/geom/eli/g_eli_crypto.c | 72 -------------- sys/geom/eli/g_eli_hmac.c | 150 +++++++++++++++++++++++++++++ sys/geom/eli/g_eli_key_cache.c | 32 +++--- sys/geom/eli/pkcs5v2.c | 2 + sys/modules/geom/geom_eli/Makefile | 1 + 9 files changed, 254 insertions(+), 184 deletions(-) create mode 100644 sys/geom/eli/g_eli_hmac.c diff --git a/sbin/geom/class/eli/Makefile b/sbin/geom/class/eli/Makefile index 50de65117c62..5eff32c50689 100644 --- a/sbin/geom/class/eli/Makefile +++ b/sbin/geom/class/eli/Makefile @@ -4,6 +4,7 @@ GEOM_CLASS= eli SRCS= g_eli_crypto.c +SRCS+= g_eli_hmac.c SRCS+= g_eli_key.c SRCS+= pkcs5v2.c SRCS+= sha256c.c diff --git a/sys/conf/files b/sys/conf/files index feac6c0e1de9..3652b3b9f7d9 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2994,6 +2994,7 @@ geom/concat/g_concat.c optional geom_concat geom/eli/g_eli.c optional geom_eli geom/eli/g_eli_crypto.c optional geom_eli geom/eli/g_eli_ctl.c optional geom_eli +geom/eli/g_eli_hmac.c optional geom_eli geom/eli/g_eli_integrity.c optional 
geom_eli geom/eli/g_eli_key.c optional geom_eli geom/eli/g_eli_key_cache.c optional geom_eli diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c index 57adc40addc8..a2b4e6517ee4 100644 --- a/sys/geom/eli/g_eli.c +++ b/sys/geom/eli/g_eli.c @@ -571,40 +571,6 @@ g_eli_worker(void *arg) } } -/* - * Here we generate IV. It is unique for every sector. - */ -void -g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv, - size_t size) -{ - uint8_t off[8]; - - if ((sc->sc_flags & G_ELI_FLAG_NATIVE_BYTE_ORDER) != 0) - bcopy(&offset, off, sizeof(off)); - else - le64enc(off, (uint64_t)offset); - - switch (sc->sc_ealgo) { - case CRYPTO_AES_XTS: - bcopy(off, iv, sizeof(off)); - bzero(iv + sizeof(off), size - sizeof(off)); - break; - default: - { - u_char hash[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; - - /* Copy precalculated SHA256 context for IV-Key. */ - bcopy(&sc->sc_ivctx, &ctx, sizeof(ctx)); - SHA256_Update(&ctx, off, sizeof(off)); - SHA256_Final(hash, &ctx); - bcopy(hash, iv, MIN(sizeof(hash), size)); - break; - } - } -} - int g_eli_read_metadata(struct g_class *mp, struct g_provider *pp, struct g_eli_metadata *md) @@ -751,44 +717,9 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, else gp->access = g_std_access; - sc->sc_version = md->md_version; - sc->sc_inflight = 0; - sc->sc_crypto = G_ELI_CRYPTO_UNKNOWN; - sc->sc_flags = md->md_flags; - /* Backward compatibility. 
*/ - if (md->md_version < G_ELI_VERSION_04) - sc->sc_flags |= G_ELI_FLAG_NATIVE_BYTE_ORDER; - if (md->md_version < G_ELI_VERSION_05) - sc->sc_flags |= G_ELI_FLAG_SINGLE_KEY; - if (md->md_version < G_ELI_VERSION_06 && - (sc->sc_flags & G_ELI_FLAG_AUTH) != 0) { - sc->sc_flags |= G_ELI_FLAG_FIRST_KEY; - } - if (md->md_version < G_ELI_VERSION_07) - sc->sc_flags |= G_ELI_FLAG_ENC_IVKEY; - sc->sc_ealgo = md->md_ealgo; + eli_metadata_softc(sc, md, bpp->sectorsize, bpp->mediasize); sc->sc_nkey = nkey; - if (sc->sc_flags & G_ELI_FLAG_AUTH) { - sc->sc_akeylen = sizeof(sc->sc_akey) * 8; - sc->sc_aalgo = md->md_aalgo; - sc->sc_alen = g_eli_hashlen(sc->sc_aalgo); - - sc->sc_data_per_sector = bpp->sectorsize - sc->sc_alen; - /* - * Some hash functions (like SHA1 and RIPEMD160) generates hash - * which length is not multiple of 128 bits, but we want data - * length to be multiple of 128, so we can encrypt without - * padding. The line below rounds down data length to multiple - * of 128 bits. - */ - sc->sc_data_per_sector -= sc->sc_data_per_sector % 16; - - sc->sc_bytes_per_sector = - (md->md_sectorsize - 1) / sc->sc_data_per_sector + 1; - sc->sc_bytes_per_sector *= bpp->sectorsize; - } - gp->softc = sc; sc->sc_geom = gp; @@ -831,22 +762,10 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, goto failed; } - sc->sc_sectorsize = md->md_sectorsize; - sc->sc_mediasize = bpp->mediasize; - if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) - sc->sc_mediasize -= bpp->sectorsize; - if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) - sc->sc_mediasize -= (sc->sc_mediasize % sc->sc_sectorsize); - else { - sc->sc_mediasize /= sc->sc_bytes_per_sector; - sc->sc_mediasize *= sc->sc_sectorsize; - } - /* * Remember the keys in our softc structure. 
*/ g_eli_mkey_propagate(sc, mkey); - sc->sc_ekeylen = md->md_keylen; LIST_INIT(&sc->sc_workers); diff --git a/sys/geom/eli/g_eli.h b/sys/geom/eli/g_eli.h index e4dbee6819ab..3deb865c4af7 100644 --- a/sys/geom/eli/g_eli.h +++ b/sys/geom/eli/g_eli.h @@ -40,8 +40,6 @@ #include #include #include -#include -#include #include #else #include @@ -49,6 +47,8 @@ #include #include #endif +#include +#include #ifndef _OpenSSL_ #include #endif @@ -132,15 +132,15 @@ /* Switch data encryption key every 2^20 blocks. */ #define G_ELI_KEY_SHIFT 20 +#define G_ELI_CRYPTO_UNKNOWN 0 +#define G_ELI_CRYPTO_HW 1 +#define G_ELI_CRYPTO_SW 2 + #ifdef _KERNEL extern int g_eli_debug; extern u_int g_eli_overwrites; extern u_int g_eli_batch; -#define G_ELI_CRYPTO_UNKNOWN 0 -#define G_ELI_CRYPTO_HW 1 -#define G_ELI_CRYPTO_SW 2 - #define G_ELI_DEBUG(lvl, ...) do { \ if (g_eli_debug >= (lvl)) { \ printf("GEOM_ELI"); \ @@ -173,6 +173,8 @@ struct g_eli_worker { LIST_ENTRY(g_eli_worker) w_next; }; +#endif /* _KERNEL */ + struct g_eli_softc { struct g_geom *sc_geom; u_int sc_version; @@ -200,15 +202,35 @@ struct g_eli_softc { size_t sc_sectorsize; u_int sc_bytes_per_sector; u_int sc_data_per_sector; +#ifndef _KERNEL + int sc_cpubind; +#else /* _KERNEL */ boolean_t sc_cpubind; /* Only for software cryptography. */ struct bio_queue_head sc_queue; struct mtx sc_queue_mtx; LIST_HEAD(, g_eli_worker) sc_workers; +#endif /* _KERNEL */ }; #define sc_name sc_geom->name -#endif /* _KERNEL */ + +#define G_ELI_KEY_MAGIC 0xe11341c + +struct g_eli_key { + /* Key value, must be first in the structure. */ + uint8_t gek_key[G_ELI_DATAKEYLEN]; + /* Magic. */ + int gek_magic; + /* Key number. */ + uint64_t gek_keyno; + /* Reference counter. */ + int gek_count; + /* Keeps keys sorted by most recent use. */ + TAILQ_ENTRY(g_eli_key) gek_next; + /* Keeps keys sorted by number. */ + RB_ENTRY(g_eli_key) gek_link; +}; struct g_eli_metadata { char md_magic[16]; /* Magic value. 
*/ @@ -569,6 +591,60 @@ g_eli_hashlen(u_int algo) return (0); } +static __inline void +eli_metadata_softc(struct g_eli_softc *sc, const struct g_eli_metadata *md, + u_int sectorsize, off_t mediasize) +{ + + sc->sc_version = md->md_version; + sc->sc_inflight = 0; + sc->sc_crypto = G_ELI_CRYPTO_UNKNOWN; + sc->sc_flags = md->md_flags; + /* Backward compatibility. */ + if (md->md_version < G_ELI_VERSION_04) + sc->sc_flags |= G_ELI_FLAG_NATIVE_BYTE_ORDER; + if (md->md_version < G_ELI_VERSION_05) + sc->sc_flags |= G_ELI_FLAG_SINGLE_KEY; + if (md->md_version < G_ELI_VERSION_06 && + (sc->sc_flags & G_ELI_FLAG_AUTH) != 0) { + sc->sc_flags |= G_ELI_FLAG_FIRST_KEY; + } + if (md->md_version < G_ELI_VERSION_07) + sc->sc_flags |= G_ELI_FLAG_ENC_IVKEY; + sc->sc_ealgo = md->md_ealgo; + + if (sc->sc_flags & G_ELI_FLAG_AUTH) { + sc->sc_akeylen = sizeof(sc->sc_akey) * 8; + sc->sc_aalgo = md->md_aalgo; + sc->sc_alen = g_eli_hashlen(sc->sc_aalgo); + + sc->sc_data_per_sector = sectorsize - sc->sc_alen; + /* + * Some hash functions (like SHA1 and RIPEMD160) generates hash + * which length is not multiple of 128 bits, but we want data + * length to be multiple of 128, so we can encrypt without + * padding. The line below rounds down data length to multiple + * of 128 bits. 
+ */ + sc->sc_data_per_sector -= sc->sc_data_per_sector % 16; + + sc->sc_bytes_per_sector = + (md->md_sectorsize - 1) / sc->sc_data_per_sector + 1; + sc->sc_bytes_per_sector *= sectorsize; + } + sc->sc_sectorsize = md->md_sectorsize; + sc->sc_mediasize = mediasize; + if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) + sc->sc_mediasize -= sectorsize; + if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) + sc->sc_mediasize -= (sc->sc_mediasize % sc->sc_sectorsize); + else { + sc->sc_mediasize /= sc->sc_bytes_per_sector; + sc->sc_mediasize *= sc->sc_sectorsize; + } + sc->sc_ekeylen = md->md_keylen; +} + #ifdef _KERNEL int g_eli_read_metadata(struct g_class *mp, struct g_provider *pp, struct g_eli_metadata *md); @@ -583,8 +659,6 @@ void g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb); void g_eli_read_done(struct bio *bp); void g_eli_write_done(struct bio *bp); int g_eli_crypto_rerun(struct cryptop *crp); -void g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv, - size_t size); void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker); void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp); @@ -592,6 +666,8 @@ void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp); void g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp); void g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp); #endif +void g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv, + size_t size); void g_eli_mkey_hmac(unsigned char *mkey, const unsigned char *key); int g_eli_mkey_decrypt(const struct g_eli_metadata *md, @@ -620,6 +696,8 @@ void g_eli_crypto_hmac_final(struct hmac_ctx *ctx, uint8_t *md, size_t mdsize); void g_eli_crypto_hmac(const uint8_t *hkey, size_t hkeysize, const uint8_t *data, size_t datasize, uint8_t *md, size_t mdsize); +void g_eli_key_fill(struct g_eli_softc *sc, struct g_eli_key *key, + uint64_t keyno); #ifdef _KERNEL void g_eli_key_init(struct g_eli_softc *sc); void g_eli_key_destroy(struct 
g_eli_softc *sc); diff --git a/sys/geom/eli/g_eli_crypto.c b/sys/geom/eli/g_eli_crypto.c index 43eabf49e3db..2d145fd80c5a 100644 --- a/sys/geom/eli/g_eli_crypto.c +++ b/sys/geom/eli/g_eli_crypto.c @@ -221,75 +221,3 @@ g_eli_crypto_decrypt(u_int algo, u_char *data, size_t datasize, return (g_eli_crypto_cipher(algo, 0, data, datasize, key, keysize)); } - -void -g_eli_crypto_hmac_init(struct hmac_ctx *ctx, const uint8_t *hkey, - size_t hkeylen) -{ - u_char k_ipad[128], key[128]; - SHA512_CTX lctx; - u_int i; - - bzero(key, sizeof(key)); - if (hkeylen == 0) - ; /* do nothing */ - else if (hkeylen <= 128) - bcopy(hkey, key, hkeylen); - else { - /* If key is longer than 128 bytes reset it to key = SHA512(key). */ - SHA512_Init(&lctx); - SHA512_Update(&lctx, hkey, hkeylen); - SHA512_Final(key, &lctx); - } - - /* XOR key with ipad and opad values. */ - for (i = 0; i < sizeof(key); i++) { - k_ipad[i] = key[i] ^ 0x36; - ctx->k_opad[i] = key[i] ^ 0x5c; - } - bzero(key, sizeof(key)); - /* Perform inner SHA512. */ - SHA512_Init(&ctx->shactx); - SHA512_Update(&ctx->shactx, k_ipad, sizeof(k_ipad)); - bzero(k_ipad, sizeof(k_ipad)); -} - -void -g_eli_crypto_hmac_update(struct hmac_ctx *ctx, const uint8_t *data, - size_t datasize) -{ - - SHA512_Update(&ctx->shactx, data, datasize); -} - -void -g_eli_crypto_hmac_final(struct hmac_ctx *ctx, uint8_t *md, size_t mdsize) -{ - u_char digest[SHA512_MDLEN]; - SHA512_CTX lctx; - - SHA512_Final(digest, &ctx->shactx); - /* Perform outer SHA512. */ - SHA512_Init(&lctx); - SHA512_Update(&lctx, ctx->k_opad, sizeof(ctx->k_opad)); - bzero(ctx, sizeof(*ctx)); - SHA512_Update(&lctx, digest, sizeof(digest)); - SHA512_Final(digest, &lctx); - bzero(&lctx, sizeof(lctx)); - /* mdsize == 0 means "Give me the whole hash!" 
*/ - if (mdsize == 0) - mdsize = SHA512_MDLEN; - bcopy(digest, md, mdsize); - bzero(digest, sizeof(digest)); -} - -void -g_eli_crypto_hmac(const uint8_t *hkey, size_t hkeysize, const uint8_t *data, - size_t datasize, uint8_t *md, size_t mdsize) -{ - struct hmac_ctx ctx; - - g_eli_crypto_hmac_init(&ctx, hkey, hkeysize); - g_eli_crypto_hmac_update(&ctx, data, datasize); - g_eli_crypto_hmac_final(&ctx, md, mdsize); -} diff --git a/sys/geom/eli/g_eli_hmac.c b/sys/geom/eli/g_eli_hmac.c new file mode 100644 index 000000000000..36b76deb9fda --- /dev/null +++ b/sys/geom/eli/g_eli_hmac.c @@ -0,0 +1,150 @@ +/*- + * Copyright (c) 2005-2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#define _OpenSSL_ +#endif +#include + +void +g_eli_crypto_hmac_init(struct hmac_ctx *ctx, const uint8_t *hkey, + size_t hkeylen) +{ + u_char k_ipad[128], key[128]; + SHA512_CTX lctx; + u_int i; + + bzero(key, sizeof(key)); + if (hkeylen == 0) + ; /* do nothing */ + else if (hkeylen <= 128) + bcopy(hkey, key, hkeylen); + else { + /* If key is longer than 128 bytes reset it to key = SHA512(key). */ + SHA512_Init(&lctx); + SHA512_Update(&lctx, hkey, hkeylen); + SHA512_Final(key, &lctx); + } + + /* XOR key with ipad and opad values. */ + for (i = 0; i < sizeof(key); i++) { + k_ipad[i] = key[i] ^ 0x36; + ctx->k_opad[i] = key[i] ^ 0x5c; + } + bzero(key, sizeof(key)); + /* Perform inner SHA512. */ + SHA512_Init(&ctx->shactx); + SHA512_Update(&ctx->shactx, k_ipad, sizeof(k_ipad)); + bzero(k_ipad, sizeof(k_ipad)); +} + +void +g_eli_crypto_hmac_update(struct hmac_ctx *ctx, const uint8_t *data, + size_t datasize) +{ + + SHA512_Update(&ctx->shactx, data, datasize); +} + +void +g_eli_crypto_hmac_final(struct hmac_ctx *ctx, uint8_t *md, size_t mdsize) +{ + u_char digest[SHA512_MDLEN]; + SHA512_CTX lctx; + + SHA512_Final(digest, &ctx->shactx); + /* Perform outer SHA512. */ + SHA512_Init(&lctx); + SHA512_Update(&lctx, ctx->k_opad, sizeof(ctx->k_opad)); + bzero(ctx, sizeof(*ctx)); + SHA512_Update(&lctx, digest, sizeof(digest)); + SHA512_Final(digest, &lctx); + bzero(&lctx, sizeof(lctx)); + /* mdsize == 0 means "Give me the whole hash!" 
*/ + if (mdsize == 0) + mdsize = SHA512_MDLEN; + bcopy(digest, md, mdsize); + bzero(digest, sizeof(digest)); +} + +void +g_eli_crypto_hmac(const uint8_t *hkey, size_t hkeysize, const uint8_t *data, + size_t datasize, uint8_t *md, size_t mdsize) +{ + struct hmac_ctx ctx; + + g_eli_crypto_hmac_init(&ctx, hkey, hkeysize); + g_eli_crypto_hmac_update(&ctx, data, datasize); + g_eli_crypto_hmac_final(&ctx, md, mdsize); +} + +/* + * Here we generate IV. It is unique for every sector. + */ +void +g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv, + size_t size) +{ + uint8_t off[8]; + + if ((sc->sc_flags & G_ELI_FLAG_NATIVE_BYTE_ORDER) != 0) + bcopy(&offset, off, sizeof(off)); + else + le64enc(off, (uint64_t)offset); + + switch (sc->sc_ealgo) { + case CRYPTO_AES_XTS: + bcopy(off, iv, sizeof(off)); + bzero(iv + sizeof(off), size - sizeof(off)); + break; + default: + { + u_char hash[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + + /* Copy precalculated SHA256 context for IV-Key. */ + bcopy(&sc->sc_ivctx, &ctx, sizeof(ctx)); + SHA256_Update(&ctx, off, sizeof(off)); + SHA256_Final(hash, &ctx); + bcopy(hash, iv, MIN(sizeof(hash), size)); + break; + } + } +} diff --git a/sys/geom/eli/g_eli_key_cache.c b/sys/geom/eli/g_eli_key_cache.c index cb867166cee3..0b909bef55ee 100644 --- a/sys/geom/eli/g_eli_key_cache.c +++ b/sys/geom/eli/g_eli_key_cache.c @@ -28,17 +28,20 @@ __FBSDID("$FreeBSD$"); #include +#ifdef _KERNEL #include #include -#include #include #include +#endif /* _KERNEL */ +#include #include #include #include +#ifdef _KERNEL MALLOC_DECLARE(M_ELI); SYSCTL_DECL(_kern_geom_eli); @@ -56,22 +59,7 @@ static uint64_t g_eli_key_cache_misses; SYSCTL_UQUAD(_kern_geom_eli, OID_AUTO, key_cache_misses, CTLFLAG_RW, &g_eli_key_cache_misses, 0, "Key cache misses"); -#define G_ELI_KEY_MAGIC 0xe11341c - -struct g_eli_key { - /* Key value, must be first in the structure. */ - uint8_t gek_key[G_ELI_DATAKEYLEN]; - /* Magic. */ - int gek_magic; - /* Key number. 
*/ - uint64_t gek_keyno; - /* Reference counter. */ - int gek_count; - /* Keeps keys sorted by most recent use. */ - TAILQ_ENTRY(g_eli_key) gek_next; - /* Keeps keys sorted by number. */ - RB_ENTRY(g_eli_key) gek_link; -}; +#endif /* _KERNEL */ static int g_eli_key_cmp(const struct g_eli_key *a, const struct g_eli_key *b) @@ -84,10 +72,7 @@ g_eli_key_cmp(const struct g_eli_key *a, const struct g_eli_key *b) return (0); } -RB_PROTOTYPE(g_eli_key_tree, g_eli_key, gek_link, g_eli_key_cmp); -RB_GENERATE(g_eli_key_tree, g_eli_key, gek_link, g_eli_key_cmp); - -static void +void g_eli_key_fill(struct g_eli_softc *sc, struct g_eli_key *key, uint64_t keyno) { const uint8_t *ekey; @@ -110,6 +95,10 @@ g_eli_key_fill(struct g_eli_softc *sc, struct g_eli_key *key, uint64_t keyno) key->gek_magic = G_ELI_KEY_MAGIC; } +#ifdef _KERNEL +RB_PROTOTYPE(g_eli_key_tree, g_eli_key, gek_link, g_eli_key_cmp); +RB_GENERATE(g_eli_key_tree, g_eli_key, gek_link, g_eli_key_cmp); + static struct g_eli_key * g_eli_key_allocate(struct g_eli_softc *sc, uint64_t keyno) { @@ -350,3 +339,4 @@ g_eli_key_drop(struct g_eli_softc *sc, uint8_t *rawkey) } mtx_unlock(&sc->sc_ekeys_lock); } +#endif /* _KERNEL */ diff --git a/sys/geom/eli/pkcs5v2.c b/sys/geom/eli/pkcs5v2.c index 05677c1b76f1..6992801958ce 100644 --- a/sys/geom/eli/pkcs5v2.c +++ b/sys/geom/eli/pkcs5v2.c @@ -83,6 +83,7 @@ pkcs5v2_genkey(uint8_t *key, unsigned keylen, const uint8_t *salt, } #ifndef _KERNEL +#ifndef _STAND /* * Return the number of microseconds needed for 'interations' iterations. 
*/ @@ -120,4 +121,5 @@ pkcs5v2_calculate(int usecs) } return (((intmax_t)iterations * (intmax_t)usecs) / v); } +#endif /* !_STAND */ #endif /* !_KERNEL */ diff --git a/sys/modules/geom/geom_eli/Makefile b/sys/modules/geom/geom_eli/Makefile index 51d821a64f62..c42ccf19f737 100644 --- a/sys/modules/geom/geom_eli/Makefile +++ b/sys/modules/geom/geom_eli/Makefile @@ -6,6 +6,7 @@ KMOD= geom_eli SRCS= g_eli.c SRCS+= g_eli_crypto.c SRCS+= g_eli_ctl.c +SRCS+= g_eli_hmac.c SRCS+= g_eli_integrity.c SRCS+= g_eli_key.c SRCS+= g_eli_key_cache.c From 8a9f7532b0742c8f48985f2d546795d833524bcd Mon Sep 17 00:00:00 2001 From: "Alexander V. Chernikov" Date: Thu, 7 Jan 2016 08:07:17 +0000 Subject: [PATCH 27/48] Convert cxgb/cxgbe to the new routing API. Discussed with: np --- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c | 37 ++++++++++------------ sys/dev/cxgb/ulp/tom/cxgb_listen.c | 17 ++++------- sys/dev/cxgbe/iw_cxgbe/cm.c | 41 ++++++++++++------------- sys/dev/cxgbe/tom/t4_listen.c | 44 +++++++++++++-------------- 4 files changed, 64 insertions(+), 75 deletions(-) diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c index 7b435607df0c..b98caaedf21f 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -264,20 +265,14 @@ void __free_ep(struct iwch_ep_common *epc) free(epc, M_DEVBUF); } -static struct rtentry * +static int find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, - __be16 peer_port, u8 tos) + __be16 peer_port, u8 tos, struct nhop4_extended *pnh4) { - struct route iproute; - struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst; - - bzero(&iproute, sizeof iproute); - dst->sin_family = AF_INET; - dst->sin_len = sizeof *dst; - dst->sin_addr.s_addr = peer_ip; - - rtalloc(&iproute); - return iproute.ro_rt; + struct in_addr addr; + + addr.s_addr = peer_ip; + return 
(fib4_lookup_nh_ext(RT_DEFAULT_FIB, addr, NHR_REF, 0, pnh4)); } static void @@ -1293,7 +1288,7 @@ iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) int err = 0; struct iwch_dev *h = to_iwch_dev(cm_id->device); struct iwch_ep *ep; - struct rtentry *rt; + struct nhop4_extended nh4; struct toedev *tdev; if (is_loopback_dst(cm_id)) { @@ -1329,28 +1324,28 @@ iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) goto fail2; /* find a route */ - rt = find_route(cm_id->local_addr.sin_addr.s_addr, + err = find_route(cm_id->local_addr.sin_addr.s_addr, cm_id->remote_addr.sin_addr.s_addr, cm_id->local_addr.sin_port, - cm_id->remote_addr.sin_port, IPTOS_LOWDELAY); - if (!rt) { + cm_id->remote_addr.sin_port, IPTOS_LOWDELAY, &nh4); + if (err) { printf("%s - cannot find route.\n", __FUNCTION__); err = EHOSTUNREACH; goto fail2; } - if (!(rt->rt_ifp->if_flags & IFCAP_TOE)) { + if (!(nh4.nh_ifp->if_flags & IFCAP_TOE)) { printf("%s - interface not TOE capable.\n", __FUNCTION__); - RTFREE(rt); + fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); goto fail2; } - tdev = TOEDEV(rt->rt_ifp); + tdev = TOEDEV(nh4.nh_ifp); if (tdev == NULL) { printf("%s - No toedev for interface.\n", __FUNCTION__); - RTFREE(rt); + fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); goto fail2; } - RTFREE(rt); + fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); state_set(&ep->com, CONNECTING); ep->com.local_addr = cm_id->local_addr; diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c index 933a83c2aaa7..b7d69df6cd21 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -480,8 +481,8 @@ do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) unsigned int tid = GET_TID(req); struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid); struct l2t_entry *e = NULL; + struct nhop4_basic nh4; struct sockaddr_in 
nam; - struct rtentry *rt; struct inpcb *inp; struct socket *so; struct port_info *pi; @@ -525,18 +526,12 @@ do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) nam.sin_len = sizeof(nam); nam.sin_family = AF_INET; nam.sin_addr = inc.inc_faddr; - rt = rtalloc1((struct sockaddr *)&nam, 0, 0); - if (rt == NULL) + if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, nam.sin_addr, 0, 0, &nh4) != 0) REJECT_PASS_ACCEPT(); else { - struct sockaddr *nexthop; - - RT_UNLOCK(rt); - nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : - (struct sockaddr *)&nam; - if (rt->rt_ifp == ifp) - e = t3_l2t_get(pi, rt->rt_ifp, nexthop); - RTFREE(rt); + nam.sin_addr = nh4.nh_addr; + if (nh4.nh_ifp == ifp) + e = t3_l2t_get(pi, ifp, (struct sockaddr *)&nam); if (e == NULL) REJECT_PASS_ACCEPT(); /* no l2te, or ifp mismatch */ } diff --git a/sys/dev/cxgbe/iw_cxgbe/cm.c b/sys/dev/cxgbe/iw_cxgbe/cm.c index cb4be71e5615..8af7df5f4ebd 100644 --- a/sys/dev/cxgbe/iw_cxgbe/cm.c +++ b/sys/dev/cxgbe/iw_cxgbe/cm.c @@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -86,8 +87,8 @@ static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state tostate); static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state tostate); static void *alloc_ep(int size, gfp_t flags); void __free_ep(struct c4iw_ep_common *epc); -static struct rtentry * find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, - __be16 peer_port, u8 tos); +static int find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos, struct nhop4_extended *pnh4); static int close_socket(struct c4iw_ep_common *epc, int close); static int shutdown_socket(struct c4iw_ep_common *epc); static void abort_socket(struct c4iw_ep *ep); @@ -201,23 +202,21 @@ set_tcpinfo(struct c4iw_ep *ep) } -static struct rtentry * +static int find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, - __be16 peer_port, u8 tos) + __be16 
peer_port, u8 tos, struct nhop4_extended *pnh4) { - struct route iproute; - struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst; + struct in_addr addr; + int err; CTR5(KTR_IW_CXGBE, "%s:frtB %x, %x, %d, %d", __func__, local_ip, peer_ip, ntohs(local_port), ntohs(peer_port)); - bzero(&iproute, sizeof iproute); - dst->sin_family = AF_INET; - dst->sin_len = sizeof *dst; - dst->sin_addr.s_addr = peer_ip; - rtalloc(&iproute); - CTR2(KTR_IW_CXGBE, "%s:frtE %p", __func__, (uint64_t)iproute.ro_rt); - return iproute.ro_rt; + addr.s_addr = peer_ip; + err = fib4_lookup_nh_ext(RT_DEFAULT_FIB, addr, NHR_REF, 0, pnh4); + + CTR2(KTR_IW_CXGBE, "%s:frtE %d", __func__, err); + return err; } static int @@ -2012,7 +2011,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) int err = 0; struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_ep *ep = NULL; - struct rtentry *rt; + struct nhop4_extended nh4; struct toedev *tdev; CTR2(KTR_IW_CXGBE, "%s:ccB %p", __func__, cm_id); @@ -2068,13 +2067,13 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) init_sock(&ep->com); /* find a route */ - rt = find_route( + err = find_route( cm_id->local_addr.sin_addr.s_addr, cm_id->remote_addr.sin_addr.s_addr, cm_id->local_addr.sin_port, - cm_id->remote_addr.sin_port, 0); + cm_id->remote_addr.sin_port, 0, &nh4); - if (!rt) { + if (err) { CTR2(KTR_IW_CXGBE, "%s:cc7 %p", __func__, ep); printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); @@ -2082,7 +2081,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) goto fail2; } - if (!(rt->rt_ifp->if_capenable & IFCAP_TOE)) { + if (!(nh4.nh_ifp->if_capenable & IFCAP_TOE)) { CTR2(KTR_IW_CXGBE, "%s:cc8 %p", __func__, ep); printf("%s - interface not TOE capable.\n", __func__); @@ -2090,7 +2089,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) err = -ENOPROTOOPT; goto fail3; } - tdev = TOEDEV(rt->rt_ifp); + tdev = 
TOEDEV(nh4.nh_ifp); if (tdev == NULL) { @@ -2098,7 +2097,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) printf("%s - No toedev for interface.\n", __func__); goto fail3; } - RTFREE(rt); + fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); state_set(&ep->com, CONNECTING); ep->tos = 0; @@ -2117,7 +2116,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) fail3: CTR2(KTR_IW_CXGBE, "%s:ccb %p", __func__, ep); - RTFREE(rt); + fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); fail2: cm_id->rem_ref(cm_id); c4iw_put_ep(&ep->com); diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 187a9f879f22..5af9260371f8 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -49,9 +49,11 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include +#include #include #include #include @@ -1095,46 +1097,44 @@ static struct l2t_entry * get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp, struct in_conninfo *inc) { - struct rtentry *rt; struct l2t_entry *e; struct sockaddr_in6 sin6; struct sockaddr *dst = (void *)&sin6; if (inc->inc_flags & INC_ISIPV6) { + struct nhop6_basic nh6; + + bzero(dst, sizeof(struct sockaddr_in6)); dst->sa_len = sizeof(struct sockaddr_in6); dst->sa_family = AF_INET6; - ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { /* no need for route lookup */ e = t4_l2t_get(pi, ifp, dst); return (e); } + + if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr, + 0, 0, 0, &nh6) != 0) + return (NULL); + if (nh6.nh_ifp != ifp) + return (NULL); + ((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr; } else { + struct nhop4_basic nh4; + dst->sa_len = sizeof(struct sockaddr_in); dst->sa_family = AF_INET; - ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; - } - - rt = rtalloc1(dst, 0, 0); - if (rt == NULL) - return (NULL); - else { - struct sockaddr *nexthop; - - RT_UNLOCK(rt); - 
if (rt->rt_ifp != ifp) - e = NULL; - else { - if (rt->rt_flags & RTF_GATEWAY) - nexthop = rt->rt_gateway; - else - nexthop = dst; - e = t4_l2t_get(pi, ifp, nexthop); - } - RTFREE(rt); + + if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0, + &nh4) != 0) + return (NULL); + if (nh4.nh_ifp != ifp) + return (NULL); + ((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr; } + e = t4_l2t_get(pi, ifp, dst); return (e); } From b9320e2a6a17b3d22b107fb0b968f94952ff33b3 Mon Sep 17 00:00:00 2001 From: Hans Petter Selasky Date: Thu, 7 Jan 2016 09:40:19 +0000 Subject: [PATCH 28/48] Remove unused file. --- sys/ofed/include/rdma/Kbuild | 1 - 1 file changed, 1 deletion(-) delete mode 100644 sys/ofed/include/rdma/Kbuild diff --git a/sys/ofed/include/rdma/Kbuild b/sys/ofed/include/rdma/Kbuild deleted file mode 100644 index e7c043216558..000000000000 --- a/sys/ofed/include/rdma/Kbuild +++ /dev/null @@ -1 +0,0 @@ -header-y += ib_user_mad.h From 460a5b502f0f3cac5f48f1328144fdedca6dd415 Mon Sep 17 00:00:00 2001 From: "Alexander V. Chernikov" Date: Thu, 7 Jan 2016 10:20:03 +0000 Subject: [PATCH 29/48] Convert pf(4) to the new routing API. 
Differential Revision: https://reviews.freebsd.org/D4763 --- sys/netpfil/pf/pf.c | 131 ++++++++++++++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 42 deletions(-) diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c index 57e78e8498d6..1b7715309ba7 100644 --- a/sys/netpfil/pf/pf.c +++ b/sys/netpfil/pf/pf.c @@ -73,6 +73,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -94,6 +95,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #endif /* INET6 */ #include @@ -2985,49 +2988,35 @@ static u_int16_t pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer) { #ifdef INET - struct sockaddr_in *dst; - struct route ro; + struct nhop4_basic nh4; #endif /* INET */ #ifdef INET6 - struct sockaddr_in6 *dst6; - struct route_in6 ro6; + struct nhop6_basic nh6; + struct in6_addr dst6; + uint32_t scopeid; #endif /* INET6 */ - struct rtentry *rt = NULL; int hlen = 0; - u_int16_t mss = V_tcp_mssdflt; + uint16_t mss = 0; switch (af) { #ifdef INET case AF_INET: hlen = sizeof(struct ip); - bzero(&ro, sizeof(ro)); - dst = (struct sockaddr_in *)&ro.ro_dst; - dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = addr->v4; - in_rtalloc_ign(&ro, 0, rtableid); - rt = ro.ro_rt; + if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) == 0) + mss = nh4.nh_mtu - hlen - sizeof(struct tcphdr); break; #endif /* INET */ #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); - bzero(&ro6, sizeof(ro6)); - dst6 = (struct sockaddr_in6 *)&ro6.ro_dst; - dst6->sin6_family = AF_INET6; - dst6->sin6_len = sizeof(*dst6); - dst6->sin6_addr = addr->v6; - in6_rtalloc_ign(&ro6, 0, rtableid); - rt = ro6.ro_rt; + in6_splitscope(&addr->v6, &dst6, &scopeid); + if (fib6_lookup_nh_basic(rtableid, &dst6, scopeid, 0,0,&nh6)==0) + mss = nh6.nh_mtu - hlen - sizeof(struct tcphdr); break; #endif /* INET6 */ } - if (rt && rt->rt_ifp) { - mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr); - 
mss = max(V_tcp_mssdflt, mss); - RTFREE(rt); - } + mss = max(V_tcp_mssdflt, mss); mss = min(mss, offer); mss = max(mss, 64); /* sanity - at least max opt space */ return (mss); @@ -5194,13 +5183,12 @@ pf_pull_hdr(struct mbuf *m, int off, void *p, int len, return (p); } -int -pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, +#ifdef RADIX_MPATH +static int +pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, int rtableid) { -#ifdef RADIX_MPATH struct radix_node_head *rnh; -#endif struct sockaddr_in *dst; int ret = 1; int check_mpath; @@ -5215,12 +5203,10 @@ pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, struct ifnet *ifp; check_mpath = 0; -#ifdef RADIX_MPATH /* XXX: stick to table 0 for now */ rnh = rt_tables_get_rnh(0, af); if (rnh != NULL && rn_mpath_capable(rnh)) check_mpath = 1; -#endif bzero(&ro, sizeof(ro)); switch (af) { case AF_INET: @@ -5283,9 +5269,7 @@ pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, if (kif->pfik_ifp == ifp) ret = 1; -#ifdef RADIX_MPATH rn = rn_mpath_next(rn); -#endif } while (check_mpath == 1 && rn != NULL && ret == 0); } else ret = 0; @@ -5294,6 +5278,72 @@ pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, RTFREE(ro.ro_rt); return (ret); } +#endif + +int +pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, + int rtableid) +{ +#ifdef INET + struct nhop4_basic nh4; +#endif +#ifdef INET6 + struct nhop6_basic nh6; +#endif + struct ifnet *ifp; +#ifdef RADIX_MPATH + struct radix_node_head *rnh; + + /* XXX: stick to table 0 for now */ + rnh = rt_tables_get_rnh(0, af); + if (rnh != NULL && rn_mpath_capable(rnh)) + return (pf_routable_oldmpath(addr, af, kif, rtableid)); +#endif + /* + * Skip check for addresses with embedded interface scope, + * as they would always match anyway. 
+ */ + if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6)) + return (1); + + if (af != AF_INET && af != AF_INET6) + return (0); + + /* Skip checks for ipsec interfaces */ + if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) + return (1); + + ifp = NULL; + + switch (af) { +#ifdef INET6 + case AF_INET6: + if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0, &nh6)!=0) + return (0); + ifp = nh6.nh_ifp; + break; +#endif +#ifdef INET + case AF_INET: + if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0) + return (0); + ifp = nh4.nh_ifp; + break; +#endif + } + + /* No interface given, this is a no-route check */ + if (kif == NULL) + return (1); + + if (kif->pfik_ifp == NULL) + return (0); + + /* Perform uRPF check if passed input interface */ + if (kif->pfik_ifp == ifp) + return (1); + return (0); +} #ifdef INET static void @@ -5344,23 +5394,20 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, dst.sin_addr = ip->ip_dst; if (r->rt == PF_FASTROUTE) { - struct rtentry *rt; + struct nhop4_basic nh4; if (s) PF_STATE_UNLOCK(s); - rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0)); - if (rt == NULL) { + + if (fib4_lookup_nh_basic(M_GETFIB(m0), ip->ip_dst, 0, + m0->m_pkthdr.flowid, &nh4) != 0) { KMOD_IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } - ifp = rt->rt_ifp; - counter_u64_add(rt->rt_pksent, 1); - - if (rt->rt_flags & RTF_GATEWAY) - bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst)); - RTFREE_LOCKED(rt); + ifp = nh4.nh_ifp; + dst.sin_addr = nh4.nh_addr; } else { if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, From 3aefe8928aaa4b3d670344de6b2e77fc958254c7 Mon Sep 17 00:00:00 2001 From: Renato Botelho Date: Thu, 7 Jan 2016 10:39:13 +0000 Subject: [PATCH 30/48] Make cap_mkdb and services_mkdb file operations sync Similar fix was done for passwd and group operations in r285050. 
When a temporary file is created and then renamed to replace official file there are no checks to make sure data was written to disk and if a power cycle happens at this time, system can end up with a 0 length file Approved by: bapt MFC after: 1 week Sponsored by: Netgate Differential Revision: https://reviews.freebsd.org/D2982 --- usr.bin/cap_mkdb/cap_mkdb.c | 2 +- usr.sbin/services_mkdb/services_mkdb.c | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/usr.bin/cap_mkdb/cap_mkdb.c b/usr.bin/cap_mkdb/cap_mkdb.c index 2f8bd96f16af..bbcedd5a8d54 100644 --- a/usr.bin/cap_mkdb/cap_mkdb.c +++ b/usr.bin/cap_mkdb/cap_mkdb.c @@ -119,7 +119,7 @@ main(int argc, char *argv[]) (void)snprintf(buf, sizeof(buf), "%s.db", capname ? capname : *argv); if ((capname = strdup(buf)) == NULL) errx(1, "strdup failed"); - if ((capdbp = dbopen(capname, O_CREAT | O_TRUNC | O_RDWR, + if ((capdbp = dbopen(capname, O_CREAT | O_TRUNC | O_RDWR | O_SYNC, DEFFILEMODE, DB_HASH, &openinfo)) == NULL) err(1, "%s", buf); diff --git a/usr.sbin/services_mkdb/services_mkdb.c b/usr.sbin/services_mkdb/services_mkdb.c index a91340e7d7d2..9ea66deed689 100644 --- a/usr.sbin/services_mkdb/services_mkdb.c +++ b/usr.sbin/services_mkdb/services_mkdb.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -91,6 +92,8 @@ main(int argc, char *argv[]) size_t cnt = 0; StringList *sl, ***svc; size_t port, proto; + char *dbname_dir; + int dbname_dir_fd = -1; setprogname(argv[0]); @@ -138,7 +141,7 @@ main(int argc, char *argv[]) err(1, "Cannot install exit handler"); (void)snprintf(tname, sizeof(tname), "%s.tmp", dbname); - db = dbopen(tname, O_RDWR | O_CREAT | O_EXCL, + db = dbopen(tname, O_RDWR | O_CREAT | O_EXCL | O_SYNC, (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH), DB_HASH, &hinfo); if (!db) err(1, "Error opening temporary database `%s'", tname); @@ -164,8 +167,21 @@ main(int argc, char *argv[]) if ((db->close)(db)) err(1, "Error 
closing temporary database `%s'", tname); - if (rename(tname, dbname) == -1) + /* + * Make sure file is safe on disk. To improve performance we will call + * fsync() to the directory where file lies + */ + if (rename(tname, dbname) == -1 || + (dbname_dir = dirname(dbname)) == NULL || + (dbname_dir_fd = open(dbname_dir, O_RDONLY|O_DIRECTORY)) == -1 || + fsync(dbname_dir_fd) != 0) { + if (dbname_dir_fd != -1) + close(dbname_dir_fd); err(1, "Cannot rename `%s' to `%s'", tname, dbname); + } + + if (dbname_dir_fd != -1) + close(dbname_dir_fd); return 0; } From 49b375e74b9813477d95292cd321dde141cfe1c3 Mon Sep 17 00:00:00 2001 From: "Jonathan T. Looney" Date: Thu, 7 Jan 2016 11:54:20 +0000 Subject: [PATCH 31/48] Apply the changes from r293284 to one additional file. Discussed with: glebius --- sys/netinet/tcp_stacks/fastpath.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sys/netinet/tcp_stacks/fastpath.c b/sys/netinet/tcp_stacks/fastpath.c index 85b24f67c7a3..a49f85bbaa06 100644 --- a/sys/netinet/tcp_stacks/fastpath.c +++ b/sys/netinet/tcp_stacks/fastpath.c @@ -158,13 +158,11 @@ static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *, * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. - * - Delayed acks are enabled or this is a half-synchronized T/TCP - * connection. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tlen <= tp->t_maxopd) && \ + (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* From a72d531f3563245375a44e834e184d00a207a5e1 Mon Sep 17 00:00:00 2001 From: "Alexander V. Chernikov" Date: Thu, 7 Jan 2016 12:22:29 +0000 Subject: [PATCH 32/48] Do not use 'struct route_in6' inside hash6_insert(). rin6 was used only as sockaddr_in6 storage. 
Make rtalloc1_fib() use on-stack sin6 and return rtentry directly, instead of doing useless work with 'struct route_in6'. --- sys/netgraph/netflow/netflow.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/sys/netgraph/netflow/netflow.c b/sys/netgraph/netflow/netflow.c index 9b3f7d839ce5..644d46f7a712 100644 --- a/sys/netgraph/netflow/netflow.c +++ b/sys/netgraph/netflow/netflow.c @@ -395,9 +395,8 @@ hash6_insert(priv_p priv, struct flow_hash_entry *hsh6, struct flow6_rec *r, int plen, uint8_t flags, uint8_t tcp_flags) { struct flow6_entry *fle6; - struct sockaddr_in6 *src, *dst; + struct sockaddr_in6 sin6; struct rtentry *rt; - struct route_in6 rin6; mtx_assert(&hsh6->mtx, MA_OWNED); @@ -425,16 +424,14 @@ hash6_insert(priv_p priv, struct flow_hash_entry *hsh6, struct flow6_rec *r, * fill in out_ifx, dst_mask, nexthop, and dst_as in future releases. */ if ((flags & NG_NETFLOW_CONF_NODSTLOOKUP) == 0) { - bzero(&rin6, sizeof(struct route_in6)); - dst = (struct sockaddr_in6 *)&rin6.ro_dst; - dst->sin6_len = sizeof(struct sockaddr_in6); - dst->sin6_family = AF_INET6; - dst->sin6_addr = r->dst.r_dst6; + bzero(&sin6, sizeof(struct sockaddr_in6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = r->dst.r_dst6; - rin6.ro_rt = rtalloc1_fib((struct sockaddr *)dst, 0, 0, r->fib); + rt = rtalloc1_fib((struct sockaddr *)&sin6, 0, 0, r->fib); - if (rin6.ro_rt != NULL) { - rt = rin6.ro_rt; + if (rt != NULL) { fle6->f.fle_o_ifx = rt->rt_ifp->if_index; if (rt->rt_flags & RTF_GATEWAY && @@ -453,17 +450,14 @@ hash6_insert(priv_p priv, struct flow_hash_entry *hsh6, struct flow6_rec *r, if ((flags & NG_NETFLOW_CONF_NOSRCLOOKUP) == 0) { /* Do route lookup on source address, to fill in src_mask.
*/ - bzero(&rin6, sizeof(struct route_in6)); - src = (struct sockaddr_in6 *)&rin6.ro_dst; - src->sin6_len = sizeof(struct sockaddr_in6); - src->sin6_family = AF_INET6; - src->sin6_addr = r->src.r_src6; + bzero(&sin6, sizeof(struct sockaddr_in6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = r->src.r_src6; - rin6.ro_rt = rtalloc1_fib((struct sockaddr *)src, 0, 0, r->fib); - - if (rin6.ro_rt != NULL) { - rt = rin6.ro_rt; + rt = rtalloc1_fib((struct sockaddr *)&sin6, 0, 0, r->fib); + if (rt != NULL) { if (rt_mask(rt)) fle6->f.src_mask = RT_MASK6(rt); else From 514ba65500d01b433c2faa6d7132af4804eb0865 Mon Sep 17 00:00:00 2001 From: Svatopluk Kraus Date: Thu, 7 Jan 2016 12:31:49 +0000 Subject: [PATCH 33/48] Print curpmap in "show pcpu" command. Approved by: kib (mentor) --- sys/arm/arm/db_interface.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sys/arm/arm/db_interface.c b/sys/arm/arm/db_interface.c index 25d1706c8ff1..43831462d896 100644 --- a/sys/arm/arm/db_interface.c +++ b/sys/arm/arm/db_interface.c @@ -152,6 +152,10 @@ int db_frame(struct db_variable *vp, db_expr_t *valp, int rw) void db_show_mdpcpu(struct pcpu *pc) { + +#if __ARM_ARCH >= 6 + db_printf("curpmap = %p\n", pc->pc_curpmap); +#endif } int db_validate_address(vm_offset_t addr) From 454f163b9fcf0e871b35e1d36874d519561f64be Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 15:55:41 +0000 Subject: [PATCH 34/48] nvd: set DISKFLAG_DIRECT_COMPLETION Submitted by: gallatin MFC after: 3 days --- sys/dev/nvd/nvd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/dev/nvd/nvd.c b/sys/dev/nvd/nvd.c index f459e06b2ab4..a890686ebc6e 100644 --- a/sys/dev/nvd/nvd.c +++ b/sys/dev/nvd/nvd.c @@ -287,7 +287,7 @@ nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg) disk->d_unit = TAILQ_LAST(&disk_head, disk_list)->disk->d_unit + 1; - disk->d_flags = 0; + disk->d_flags = DISKFLAG_DIRECT_COMPLETION; if (nvme_ns_get_flags(ns) & 
NVME_NS_DEALLOCATE_SUPPORTED) disk->d_flags |= DISKFLAG_CANDELETE; From 8fe5c0d2867653bb073062361cd954a64798167b Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 15:57:17 +0000 Subject: [PATCH 35/48] nvd: do not wait for previous bios before submitting ordered bio Still wait until all in-flight bios (including the ordered bio) complete before processing more bios from the queue. MFC after: 3 days Sponsored by: Intel --- sys/dev/nvd/nvd.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sys/dev/nvd/nvd.c b/sys/dev/nvd/nvd.c index a890686ebc6e..e6d4c0cb3be1 100644 --- a/sys/dev/nvd/nvd.c +++ b/sys/dev/nvd/nvd.c @@ -204,19 +204,6 @@ nvd_bioq_process(void *arg, int pending) if (bp == NULL) break; -#ifdef BIO_ORDERED - /* - * BIO_ORDERED flag dictates that all outstanding bios - * must be completed before processing the bio with - * BIO_ORDERED flag set. - */ - if (bp->bio_flags & BIO_ORDERED) { - while (ndisk->cur_depth > 0) { - pause("nvd flush", 1); - } - } -#endif - bp->bio_driver1 = NULL; atomic_add_int(&ndisk->cur_depth, 1); From 26ca317aef1ba366f08136da9342e99e525383fa Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 15:58:44 +0000 Subject: [PATCH 36/48] nvd: skip BIO_ORDERED logic when bio fails submission This ensures the bio flags are not read after biodone(). The ordering will still be enforced, after the bio is submitted successfully. 
MFC after: 3 days Sponsored by: Intel --- sys/dev/nvd/nvd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/dev/nvd/nvd.c b/sys/dev/nvd/nvd.c index e6d4c0cb3be1..7d78a8ca52c7 100644 --- a/sys/dev/nvd/nvd.c +++ b/sys/dev/nvd/nvd.c @@ -215,6 +215,7 @@ nvd_bioq_process(void *arg, int pending) bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; biodone(bp); + continue; } #ifdef BIO_ORDERED From 47ef4244f580eb12dc49a8aa7ef3d581b66c4219 Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 15:59:51 +0000 Subject: [PATCH 37/48] nvd: break out submission logic into separate function This enables a future patch using this same logic to submit I/O directly bypassing the taskqueue. MFC after: 3 days Sponsored by: Intel --- sys/dev/nvd/nvd.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/sys/dev/nvd/nvd.c b/sys/dev/nvd/nvd.c index 7d78a8ca52c7..710b04bd7ddc 100644 --- a/sys/dev/nvd/nvd.c +++ b/sys/dev/nvd/nvd.c @@ -47,6 +47,8 @@ struct nvd_disk; static disk_ioctl_t nvd_ioctl; static disk_strategy_t nvd_strategy; +static void nvd_done(void *arg, const struct nvme_completion *cpl); + static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr); static void destroy_geom_disk(struct nvd_disk *ndisk); @@ -148,6 +150,26 @@ nvd_unload() nvme_unregister_consumer(consumer_handle); } +static int +nvd_bio_submit(struct nvd_disk *ndisk, struct bio *bp) +{ + int err; + + bp->bio_driver1 = NULL; + atomic_add_int(&ndisk->cur_depth, 1); + err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done); + if (err) { + atomic_add_int(&ndisk->cur_depth, -1); + bp->bio_error = err; + bp->bio_flags |= BIO_ERROR; + bp->bio_resid = bp->bio_bcount; + biodone(bp); + return (-1); + } + + return (0); +} + static void nvd_strategy(struct bio *bp) { @@ -195,7 +217,6 @@ nvd_bioq_process(void *arg, int pending) { struct nvd_disk *ndisk = arg; struct bio *bp; - int err; for (;;) { mtx_lock(&ndisk->bioqlock); @@ -204,17 +225,7 @@ 
nvd_bioq_process(void *arg, int pending) if (bp == NULL) break; - bp->bio_driver1 = NULL; - atomic_add_int(&ndisk->cur_depth, 1); - - err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done); - - if (err) { - atomic_add_int(&ndisk->cur_depth, -1); - bp->bio_error = err; - bp->bio_flags |= BIO_ERROR; - bp->bio_resid = bp->bio_bcount; - biodone(bp); + if (nvd_bio_submit(ndisk, bp) != 0) { continue; } From 58d0b8f3c3e2b5441ea4c2323feccc1b9454fed6 Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 16:06:23 +0000 Subject: [PATCH 38/48] nvd: submit bios directly when BIO_ORDERED not set or in flight This significantly improves parallelism in the most common case. The taskqueue is still used whenever BIO_ORDERED bios are in flight. This patch is based heavily on a patch from gallatin@. MFC after: 3 days Sponsored by: Intel --- sys/dev/nvd/nvd.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sys/dev/nvd/nvd.c b/sys/dev/nvd/nvd.c index 710b04bd7ddc..24ee07583034 100644 --- a/sys/dev/nvd/nvd.c +++ b/sys/dev/nvd/nvd.c @@ -73,6 +73,7 @@ struct nvd_disk { struct nvme_namespace *ns; uint32_t cur_depth; + uint32_t ordered_in_flight; TAILQ_ENTRY(nvd_disk) global_tailq; TAILQ_ENTRY(nvd_disk) ctrlr_tailq; @@ -160,6 +161,8 @@ nvd_bio_submit(struct nvd_disk *ndisk, struct bio *bp) err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done); if (err) { atomic_add_int(&ndisk->cur_depth, -1); + if (__predict_false(bp->bio_flags & BIO_ORDERED)) + atomic_add_int(&ndisk->ordered_in_flight, -1); bp->bio_error = err; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; @@ -177,6 +180,18 @@ nvd_strategy(struct bio *bp) ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1; + if (__predict_false(bp->bio_flags & BIO_ORDERED)) + atomic_add_int(&ndisk->ordered_in_flight, 1); + + if (__predict_true(ndisk->ordered_in_flight == 0)) { + nvd_bio_submit(ndisk, bp); + return; + } + + /* + * There are ordered bios in flight, so we need to submit + * bios through the task queue to 
enforce ordering. + */ mtx_lock(&ndisk->bioqlock); bioq_insert_tail(&ndisk->bioq, bp); mtx_unlock(&ndisk->bioqlock); @@ -208,6 +223,8 @@ nvd_done(void *arg, const struct nvme_completion *cpl) ndisk = bp->bio_disk->d_drv1; atomic_add_int(&ndisk->cur_depth, -1); + if (__predict_false(bp->bio_flags & BIO_ORDERED)) + atomic_add_int(&ndisk->ordered_in_flight, -1); biodone(bp); } @@ -316,6 +333,7 @@ nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg) ndisk->ns = ns; ndisk->disk = disk; ndisk->cur_depth = 0; + ndisk->ordered_in_flight = 0; mtx_init(&ndisk->bioqlock, "NVD bioq lock", NULL, MTX_DEF); bioq_init(&ndisk->bioq); From d85f84abb80933c07138584631bb91283e2c0952 Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 16:08:04 +0000 Subject: [PATCH 39/48] nvme: simplify some of the nested ifs in interrupt setup code This prepares for some follow-up commits which do more work in this area. MFC after: 3 days Sponsored by: Intel --- sys/dev/nvme/nvme_ctrlr.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index ee4b901a9c75..c0117fe0442b 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -999,7 +999,9 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) if (pci_msix_count(dev) < 2) { ctrlr->msix_enabled = 0; goto intx; - } else if (pci_msix_count(dev) < num_vectors_requested) { + } + + if (pci_msix_count(dev) < num_vectors_requested) { ctrlr->per_cpu_io_queues = FALSE; ctrlr->num_io_queues = 1; num_vectors_requested = 2; /* one for admin, one for I/O */ @@ -1009,26 +1011,28 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { ctrlr->msix_enabled = 0; goto intx; - } else if (num_vectors_allocated < num_vectors_requested) { + } + + if (num_vectors_allocated < num_vectors_requested) { if (num_vectors_allocated < 2) { pci_release_msi(dev); 
ctrlr->msix_enabled = 0; goto intx; - } else { - ctrlr->per_cpu_io_queues = FALSE; - ctrlr->num_io_queues = 1; - /* - * Release whatever vectors were allocated, and just - * reallocate the two needed for the admin and single - * I/O qpair. - */ - num_vectors_allocated = 2; - pci_release_msi(dev); - if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) - panic("could not reallocate any vectors\n"); - if (num_vectors_allocated != 2) - panic("could not reallocate 2 vectors\n"); } + + ctrlr->per_cpu_io_queues = FALSE; + ctrlr->num_io_queues = 1; + /* + * Release whatever vectors were allocated, and just + * reallocate the two needed for the admin and single + * I/O qpair. + */ + num_vectors_allocated = 2; + pci_release_msi(dev); + if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) + panic("could not reallocate any vectors\n"); + if (num_vectors_allocated != 2) + panic("could not reallocate 2 vectors\n"); } /* From c75ad8ce5a37c30e5a0507211c472b3b2bb20e75 Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 16:09:56 +0000 Subject: [PATCH 40/48] nvme: remove per_cpu_io_queues from struct nvme_controller Instead just use num_io_queues to make this determination. This prepares for some future changes enabling use of multiple queues when we do not have enough queues or MSI-X vectors for one queue per CPU. 
MFC after: 3 days Sponsored by: Intel --- sys/dev/nvme/nvme_ctrlr.c | 11 +++-------- sys/dev/nvme/nvme_private.h | 1 - 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index c0117fe0442b..d5c7373aa260 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -160,7 +160,7 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) num_trackers, ctrlr); - if (ctrlr->per_cpu_io_queues) + if (ctrlr->num_io_queues > 1) bus_bind_intr(ctrlr->dev, qpair->res, i); } @@ -402,7 +402,6 @@ nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) nvme_io_qpair_destroy(&ctrlr->ioq[i]); ctrlr->num_io_queues = 1; - ctrlr->per_cpu_io_queues = 0; } return (0); @@ -779,7 +778,6 @@ nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr) { ctrlr->num_io_queues = 1; - ctrlr->per_cpu_io_queues = 0; ctrlr->rid = 0; ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE); @@ -969,9 +967,8 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) per_cpu_io_queues = 1; TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); - ctrlr->per_cpu_io_queues = per_cpu_io_queues ? 
TRUE : FALSE; - if (ctrlr->per_cpu_io_queues) + if (per_cpu_io_queues) ctrlr->num_io_queues = mp_ncpus; else ctrlr->num_io_queues = 1; @@ -1002,7 +999,6 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) } if (pci_msix_count(dev) < num_vectors_requested) { - ctrlr->per_cpu_io_queues = FALSE; ctrlr->num_io_queues = 1; num_vectors_requested = 2; /* one for admin, one for I/O */ } @@ -1020,7 +1016,6 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) goto intx; } - ctrlr->per_cpu_io_queues = FALSE; ctrlr->num_io_queues = 1; /* * Release whatever vectors were allocated, and just @@ -1192,7 +1187,7 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, { struct nvme_qpair *qpair; - if (ctrlr->per_cpu_io_queues) + if (ctrlr->num_io_queues > 1) qpair = &ctrlr->ioq[curcpu]; else qpair = &ctrlr->ioq[0]; diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index 6137b41e6425..a6d3eff51b37 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -265,7 +265,6 @@ struct nvme_controller { uint32_t enable_aborts; uint32_t num_io_queues; - boolean_t per_cpu_io_queues; /* Fields for tracking progress during controller initialization. */ struct intr_config_hook config_hook; From e5af5854ff3b396ed514c3231463b7ef654ca0bb Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 16:11:31 +0000 Subject: [PATCH 41/48] nvme: do not pre-allocate MSI-X IRQ resources The issue referenced here was resolved by other changes in recent commits, so this code is no longer needed. 
MFC after: 3 days Sponsored by: Intel --- sys/dev/nvme/nvme_ctrlr.c | 37 +------------------------------------ sys/dev/nvme/nvme_private.h | 2 -- sys/dev/nvme/nvme_qpair.c | 3 ++- 3 files changed, 3 insertions(+), 39 deletions(-) diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index d5c7373aa260..35ee07b2b7e4 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -929,7 +929,7 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) { union cap_lo_register cap_lo; union cap_hi_register cap_hi; - int i, per_cpu_io_queues, rid; + int per_cpu_io_queues; int num_vectors_requested, num_vectors_allocated; int status, timeout_period; @@ -1030,41 +1030,6 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) panic("could not reallocate 2 vectors\n"); } - /* - * On earlier FreeBSD releases, there are reports that - * pci_alloc_msix() can return successfully with all vectors - * requested, but a subsequent bus_alloc_resource_any() - * for one of those vectors fails. This issue occurs more - * readily with multiple devices using per-CPU vectors. - * To workaround this issue, try to allocate the resources now, - * and fall back to INTx if we cannot allocate all of them. - * This issue cannot be reproduced on more recent versions of - * FreeBSD which have increased the maximum number of MSI-X - * vectors, but adding the workaround makes it easier for - * vendors wishing to import this driver into kernels based on - * older versions of FreeBSD. 
- */ - for (i = 0; i < num_vectors_allocated; i++) { - rid = i + 1; - ctrlr->msi_res[i] = bus_alloc_resource_any(ctrlr->dev, - SYS_RES_IRQ, &rid, RF_ACTIVE); - - if (ctrlr->msi_res[i] == NULL) { - ctrlr->msix_enabled = 0; - while (i > 0) { - i--; - bus_release_resource(ctrlr->dev, - SYS_RES_IRQ, - rman_get_rid(ctrlr->msi_res[i]), - ctrlr->msi_res[i]); - } - pci_release_msi(dev); - nvme_printf(ctrlr, "could not obtain all MSI-X " - "resources, reverting to intx\n"); - break; - } - } - intx: if (!ctrlr->msix_enabled) diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index a6d3eff51b37..7e049eef0adf 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -275,8 +275,6 @@ struct nvme_controller { struct task fail_req_task; struct taskqueue *taskqueue; - struct resource *msi_res[MAXCPU + 1]; - /* For shared legacy interrupt. */ int rid; struct resource *res; diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c index d0cb8c6212c7..92fe6722e4f4 100644 --- a/sys/dev/nvme/nvme_qpair.c +++ b/sys/dev/nvme/nvme_qpair.c @@ -479,8 +479,9 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, * the queue's vector to get the corresponding rid to use. 
*/ qpair->rid = vector + 1; - qpair->res = ctrlr->msi_res[vector]; + qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, + &qpair->rid, RF_ACTIVE); bus_setup_intr(ctrlr->dev, qpair->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_qpair_msix_handler, qpair, &qpair->tag); From d400f790b12fc5a8b945b6dfbde3b90e723a5632 Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 16:12:42 +0000 Subject: [PATCH 42/48] nvme: break out interrupt setup code into a separate function MFC after: 3 days Sponsored by: Intel --- sys/dev/nvme/nvme_ctrlr.c | 133 +++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 68 deletions(-) diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index 35ee07b2b7e4..76f9fb3ec996 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer); +static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr); static int nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr) @@ -777,6 +778,7 @@ static int nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr) { + ctrlr->msix_enabled = 0; ctrlr->num_io_queues = 1; ctrlr->rid = 0; ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, @@ -924,13 +926,73 @@ static struct cdevsw nvme_ctrlr_cdevsw = { .d_ioctl = nvme_ctrlr_ioctl }; +static void +nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr) +{ + device_t dev; + int per_cpu_io_queues; + int num_vectors_requested, num_vectors_allocated; + + dev = ctrlr->dev; + per_cpu_io_queues = 1; + TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); + + ctrlr->force_intx = 0; + TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx); + + if (ctrlr->force_intx || pci_msix_count(dev) < 2) { + nvme_ctrlr_configure_intx(ctrlr); + return; + } + + ctrlr->msix_enabled = 1; + + if (per_cpu_io_queues) + ctrlr->num_io_queues = 
mp_ncpus; + else + ctrlr->num_io_queues = 1; + + /* One vector per IO queue, plus one vector for admin queue. */ + num_vectors_requested = ctrlr->num_io_queues + 1; + + if (pci_msix_count(dev) < num_vectors_requested) { + ctrlr->num_io_queues = 1; + num_vectors_requested = 2; /* one for admin, one for I/O */ + } + + num_vectors_allocated = num_vectors_requested; + if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { + nvme_ctrlr_configure_intx(ctrlr); + return; + } + + if (num_vectors_allocated < num_vectors_requested) { + if (num_vectors_allocated < 2) { + pci_release_msi(dev); + nvme_ctrlr_configure_intx(ctrlr); + return; + } + + ctrlr->num_io_queues = 1; + /* + * Release whatever vectors were allocated, and just + * reallocate the two needed for the admin and single + * I/O qpair. + */ + num_vectors_allocated = 2; + pci_release_msi(dev); + if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) + panic("could not reallocate any vectors\n"); + if (num_vectors_allocated != 2) + panic("could not reallocate 2 vectors\n"); + } +} + int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) { union cap_lo_register cap_lo; union cap_hi_register cap_hi; - int per_cpu_io_queues; - int num_vectors_requested, num_vectors_allocated; int status, timeout_period; ctrlr->dev = dev; @@ -965,75 +1027,10 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) nvme_retry_count = NVME_DEFAULT_RETRY_COUNT; TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count); - per_cpu_io_queues = 1; - TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); - - if (per_cpu_io_queues) - ctrlr->num_io_queues = mp_ncpus; - else - ctrlr->num_io_queues = 1; - - ctrlr->force_intx = 0; - TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx); - ctrlr->enable_aborts = 0; TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts); - ctrlr->msix_enabled = 1; - - if (ctrlr->force_intx) { - ctrlr->msix_enabled = 0; - goto intx; - } - - /* One vector per IO 
queue, plus one vector for admin queue. */ - num_vectors_requested = ctrlr->num_io_queues + 1; - - /* - * If we cannot even allocate 2 vectors (one for admin, one for - * I/O), then revert to INTx. - */ - if (pci_msix_count(dev) < 2) { - ctrlr->msix_enabled = 0; - goto intx; - } - - if (pci_msix_count(dev) < num_vectors_requested) { - ctrlr->num_io_queues = 1; - num_vectors_requested = 2; /* one for admin, one for I/O */ - } - - num_vectors_allocated = num_vectors_requested; - if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { - ctrlr->msix_enabled = 0; - goto intx; - } - - if (num_vectors_allocated < num_vectors_requested) { - if (num_vectors_allocated < 2) { - pci_release_msi(dev); - ctrlr->msix_enabled = 0; - goto intx; - } - - ctrlr->num_io_queues = 1; - /* - * Release whatever vectors were allocated, and just - * reallocate the two needed for the admin and single - * I/O qpair. - */ - num_vectors_allocated = 2; - pci_release_msi(dev); - if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) - panic("could not reallocate any vectors\n"); - if (num_vectors_allocated != 2) - panic("could not reallocate 2 vectors\n"); - } - -intx: - - if (!ctrlr->msix_enabled) - nvme_ctrlr_configure_intx(ctrlr); + nvme_ctrlr_setup_interrupts(ctrlr); ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE; nvme_ctrlr_construct_admin_qpair(ctrlr); From 2b647da7a0e7239bff2b30bda457bda3dd82a691 Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Thu, 7 Jan 2016 16:18:32 +0000 Subject: [PATCH 43/48] nvme: do not revert to single I/O queue when per-CPU queues not possible Previously nvme(4) would revert to a single I/O queue if it could not allocate enough interrupt vectors or NVMe submission/completion queues to have one I/O queue per core. This patch determines how to utilize a smaller number of available interrupt vectors, and assigns (as closely as possible) an equal number of cores to each associated I/O queue.
MFC after: 3 days Sponsored by: Intel --- sys/dev/nvme/nvme.c | 2 - sys/dev/nvme/nvme_ctrlr.c | 167 +++++++++++++++++++++++------------- sys/dev/nvme/nvme_private.h | 1 + 3 files changed, 106 insertions(+), 64 deletions(-) diff --git a/sys/dev/nvme/nvme.c b/sys/dev/nvme/nvme.c index cc14d34afbc4..9db2b14d13d0 100644 --- a/sys/dev/nvme/nvme.c +++ b/sys/dev/nvme/nvme.c @@ -270,8 +270,6 @@ nvme_attach(device_t dev) return (status); } - nvme_sysctl_initialize_ctrlr(ctrlr); - pci_enable_busmaster(dev); ctrlr->config_hook.ich_func = nvme_ctrlr_start_config_hook; diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index 76f9fb3ec996..151f02542626 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -42,6 +42,12 @@ __FBSDID("$FreeBSD$"); #include "nvme_private.h" +/* + * Used for calculating number of CPUs to assign to each core and number of I/O + * queues to allocate per controller. + */ +#define NVME_CEILING(num, div) ((((num) - 1) / (div)) + 1) + static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer); static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr); @@ -141,6 +147,13 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) */ num_trackers = min(num_trackers, (num_entries-1)); + /* + * This was calculated previously when setting up interrupts, but + * a controller could theoretically support fewer I/O queues than + * MSI-X vectors. So calculate again here just to be safe. + */ + ctrlr->num_cpus_per_ioq = NVME_CEILING(mp_ncpus, ctrlr->num_io_queues); + ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair), M_NVME, M_ZERO | M_WAITOK); @@ -161,8 +174,13 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) num_trackers, ctrlr); + /* + * Do not bother binding interrupts if we only have one I/O + * interrupt thread for this controller. 
+ */ if (ctrlr->num_io_queues > 1) - bus_bind_intr(ctrlr->dev, qpair->res, i); + bus_bind_intr(ctrlr->dev, qpair->res, + i * ctrlr->num_cpus_per_ioq); } return (0); @@ -307,8 +325,15 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) int i; nvme_admin_qpair_disable(&ctrlr->adminq); - for (i = 0; i < ctrlr->num_io_queues; i++) - nvme_io_qpair_disable(&ctrlr->ioq[i]); + /* + * I/O queues are not allocated before the initial HW + * reset, so do not try to disable them. Use is_initialized + * to determine if this is the initial HW reset. + */ + if (ctrlr->is_initialized) { + for (i = 0; i < ctrlr->num_io_queues; i++) + nvme_io_qpair_disable(&ctrlr->ioq[i]); + } DELAY(100*1000); @@ -364,7 +389,7 @@ static int nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; - int cq_allocated, i, sq_allocated; + int cq_allocated, sq_allocated; status.done = FALSE; nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues, @@ -385,25 +410,12 @@ nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) cq_allocated = (status.cpl.cdw0 >> 16) + 1; /* - * Check that the controller was able to allocate the number of - * queues we requested. If not, revert to one IO queue pair. + * Controller may allocate more queues than we requested, + * so use the minimum of the number requested and what was + * actually allocated. */ - if (sq_allocated < ctrlr->num_io_queues || - cq_allocated < ctrlr->num_io_queues) { - - /* - * Destroy extra IO queue pairs that were created at - * controller construction time but are no longer - * needed. This will only happen when a controller - * supports fewer queues than MSI-X vectors. This - * is not the normal case, but does occur with the - * Chatham prototype board. 
- */ - for (i = 1; i < ctrlr->num_io_queues; i++) - nvme_io_qpair_destroy(&ctrlr->ioq[i]); - - ctrlr->num_io_queues = 1; - } + ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated); + ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated); return (0); } @@ -687,9 +699,20 @@ static void nvme_ctrlr_start(void *ctrlr_arg) { struct nvme_controller *ctrlr = ctrlr_arg; + uint32_t old_num_io_queues; int i; - nvme_qpair_reset(&ctrlr->adminq); + /* + * Only reset adminq here when we are restarting the + * controller after a reset. During initialization, + * we have already submitted admin commands to get + * the number of I/O queues supported, so cannot reset + * the adminq again here. + */ + if (ctrlr->is_resetting) { + nvme_qpair_reset(&ctrlr->adminq); + } + for (i = 0; i < ctrlr->num_io_queues; i++) nvme_qpair_reset(&ctrlr->ioq[i]); @@ -700,11 +723,25 @@ nvme_ctrlr_start(void *ctrlr_arg) return; } + /* + * The number of qpairs are determined during controller initialization, + * including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the + * HW limit. We call SET_FEATURES again here so that it gets called + * after any reset for controllers that depend on the driver to + * explicit specify how many queues it will use. This value should + * never change between resets, so panic if somehow that does happen. 
+ */ + old_num_io_queues = ctrlr->num_io_queues; if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } + if (old_num_io_queues != ctrlr->num_io_queues) { + panic("num_io_queues changed from %u to %u", old_num_io_queues, + ctrlr->num_io_queues); + } + if (nvme_ctrlr_create_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; @@ -727,7 +764,16 @@ nvme_ctrlr_start_config_hook(void *arg) { struct nvme_controller *ctrlr = arg; - nvme_ctrlr_start(ctrlr); + nvme_qpair_reset(&ctrlr->adminq); + nvme_admin_qpair_enable(&ctrlr->adminq); + + if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 && + nvme_ctrlr_construct_io_qpairs(ctrlr) == 0) + nvme_ctrlr_start(ctrlr); + else + nvme_ctrlr_fail(ctrlr); + + nvme_sysctl_initialize_ctrlr(ctrlr); config_intrhook_disestablish(&ctrlr->config_hook); ctrlr->is_initialized = 1; @@ -780,6 +826,7 @@ nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr) ctrlr->msix_enabled = 0; ctrlr->num_io_queues = 1; + ctrlr->num_cpus_per_ioq = mp_ncpus; ctrlr->rid = 0; ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE); @@ -932,6 +979,7 @@ nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr) device_t dev; int per_cpu_io_queues; int num_vectors_requested, num_vectors_allocated; + int num_vectors_available; dev = ctrlr->dev; per_cpu_io_queues = 1; @@ -940,52 +988,55 @@ nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr) ctrlr->force_intx = 0; TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx); - if (ctrlr->force_intx || pci_msix_count(dev) < 2) { + /* + * FreeBSD currently cannot allocate more than about 190 vectors at + * boot, meaning that systems with high core count and many devices + * requesting per-CPU interrupt vectors will not get their full + * allotment. So first, try to allocate as many as we may need to + * understand what is available, then immediately release them. 
+ * Then figure out how many of those we will actually use, based on + * assigning an equal number of cores to each I/O queue. + */ + + /* One vector for per core I/O queue, plus one vector for admin queue. */ + num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1); + if (pci_alloc_msix(dev, &num_vectors_available) != 0) { + num_vectors_available = 0; + } + pci_release_msi(dev); + + if (ctrlr->force_intx || num_vectors_available < 2) { nvme_ctrlr_configure_intx(ctrlr); return; } - ctrlr->msix_enabled = 1; - if (per_cpu_io_queues) - ctrlr->num_io_queues = mp_ncpus; + ctrlr->num_cpus_per_ioq = NVME_CEILING(mp_ncpus, num_vectors_available + 1); else - ctrlr->num_io_queues = 1; + ctrlr->num_cpus_per_ioq = mp_ncpus; - /* One vector per IO queue, plus one vector for admin queue. */ + ctrlr->num_io_queues = NVME_CEILING(mp_ncpus, ctrlr->num_cpus_per_ioq); num_vectors_requested = ctrlr->num_io_queues + 1; - - if (pci_msix_count(dev) < num_vectors_requested) { - ctrlr->num_io_queues = 1; - num_vectors_requested = 2; /* one for admin, one for I/O */ - } - num_vectors_allocated = num_vectors_requested; + + /* + * Now just allocate the number of vectors we need. This should + * succeed, since we previously called pci_alloc_msix() + * successfully returning at least this many vectors, but just to + * be safe, if something goes wrong just revert to INTx. + */ if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { nvme_ctrlr_configure_intx(ctrlr); return; } if (num_vectors_allocated < num_vectors_requested) { - if (num_vectors_allocated < 2) { - pci_release_msi(dev); - nvme_ctrlr_configure_intx(ctrlr); - return; - } - - ctrlr->num_io_queues = 1; - /* - * Release whatever vectors were allocated, and just - * reallocate the two needed for the admin and single - * I/O qpair. 
- */ - num_vectors_allocated = 2; pci_release_msi(dev); - if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) - panic("could not reallocate any vectors\n"); - if (num_vectors_allocated != 2) - panic("could not reallocate 2 vectors\n"); + nvme_ctrlr_configure_intx(ctrlr); + return; } + + ctrlr->msix_enabled = 1; } int @@ -1034,10 +1085,6 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE; nvme_ctrlr_construct_admin_qpair(ctrlr); - status = nvme_ctrlr_construct_io_qpairs(ctrlr); - - if (status != 0) - return (status); ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, device_get_unit(dev), UID_ROOT, GID_WHEEL, 0600, "nvme%d", device_get_unit(dev)); @@ -1149,11 +1196,7 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, { struct nvme_qpair *qpair; - if (ctrlr->num_io_queues > 1) - qpair = &ctrlr->ioq[curcpu]; - else - qpair = &ctrlr->ioq[0]; - + qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq]; nvme_qpair_submit_request(qpair, req); } diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h index 7e049eef0adf..33307117b5be 100644 --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -265,6 +265,7 @@ struct nvme_controller { uint32_t enable_aborts; uint32_t num_io_queues; + uint32_t num_cpus_per_ioq; /* Fields for tracking progress during controller initialization. */ struct intr_config_hook config_hook; From 712b97a63019863268edb2fe9067dc26ee1faf4a Mon Sep 17 00:00:00 2001 From: Sean Bruno Date: Thu, 7 Jan 2016 16:20:55 +0000 Subject: [PATCH 44/48] Switch em(4) to the extended RX descriptor format. This matches the e1000/e1000e split in linux. 
MFC after: 2 weeks Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D3447 --- sys/dev/e1000/if_em.c | 108 ++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 45 deletions(-) diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index d9b3ca559d6e..9fa5ee40898b 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -3733,29 +3733,38 @@ em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, offload |= CSUM_TCP; tucss = hdr_len; tucso = hdr_len + offsetof(struct tcphdr, th_sum); - /* - * Setting up new checksum offload context for every frames - * takes a lot of processing time for hardware. This also - * reduces performance a lot for small sized frames so avoid - * it if driver can use previously configured checksum - * offload context. - */ - if (txr->last_hw_offload == offload) { - if (offload & CSUM_IP) { - if (txr->last_hw_ipcss == ipcss && - txr->last_hw_ipcso == ipcso && - txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } else { - if (txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } - } - txr->last_hw_offload = offload; - txr->last_hw_tucss = tucss; - txr->last_hw_tucso = tucso; + /* + * The 82574L can only remember the *last* context used + * regardless of queue that it was use for. We cannot reuse + * contexts on this hardware platform and must generate a new + * context every time. 82574L hardware spec, section 7.2.6, + * second note. + */ + if (adapter->num_queues < 2) { + /* + * Setting up new checksum offload context for every + * frames takes a lot of processing time for hardware. + * This also reduces performance a lot for small sized + * frames so avoid it if driver can use previously + * configured checksum offload context. 
+ */ + if (txr->last_hw_offload == offload) { + if (offload & CSUM_IP) { + if (txr->last_hw_ipcss == ipcss && + txr->last_hw_ipcso == ipcso && + txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } else { + if (txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } + } + txr->last_hw_offload = offload; + txr->last_hw_tucss = tucss; + txr->last_hw_tucso = tucso; + } /* * Start offset for payload checksum calculation. * End offset for payload checksum calculation. @@ -3771,29 +3780,38 @@ em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, *txd_upper |= E1000_TXD_POPTS_TXSM << 8; tucss = hdr_len; tucso = hdr_len + offsetof(struct udphdr, uh_sum); - /* - * Setting up new checksum offload context for every frames - * takes a lot of processing time for hardware. This also - * reduces performance a lot for small sized frames so avoid - * it if driver can use previously configured checksum - * offload context. - */ - if (txr->last_hw_offload == offload) { - if (offload & CSUM_IP) { - if (txr->last_hw_ipcss == ipcss && - txr->last_hw_ipcso == ipcso && - txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } else { - if (txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; + /* + * The 82574L can only remember the *last* context used + * regardless of queue that it was use for. We cannot reuse + * contexts on this hardware platform and must generate a new + * context every time. 82574L hardware spec, section 7.2.6, + * second note. + */ + if (adapter->num_queues < 2) { + /* + * Setting up new checksum offload context for every + * frames takes a lot of processing time for hardware. + * This also reduces performance a lot for small sized + * frames so avoid it if driver can use previously + * configured checksum offload context. 
+ */ + if (txr->last_hw_offload == offload) { + if (offload & CSUM_IP) { + if (txr->last_hw_ipcss == ipcss && + txr->last_hw_ipcso == ipcso && + txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } else { + if (txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } } - } - txr->last_hw_offload = offload; - txr->last_hw_tucss = tucss; - txr->last_hw_tucso = tucso; + txr->last_hw_offload = offload; + txr->last_hw_tucss = tucss; + txr->last_hw_tucso = tucso; + } /* * Start offset for header checksum calculation. * End offset for header checksum calculation. From 8061e8bb1e379c9bbe7305423c387ea6523484bc Mon Sep 17 00:00:00 2001 From: Sean Bruno Date: Thu, 7 Jan 2016 16:24:18 +0000 Subject: [PATCH 45/48] Wow, um ... sorry about that. The commit log for this code should have read that it was for EM_MULTIQUEUE. Revert this and try again. --- sys/dev/e1000/if_em.c | 108 ++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 63 deletions(-) diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index 9fa5ee40898b..d9b3ca559d6e 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -3733,38 +3733,29 @@ em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, offload |= CSUM_TCP; tucss = hdr_len; tucso = hdr_len + offsetof(struct tcphdr, th_sum); - /* - * The 82574L can only remember the *last* context used - * regardless of queue that it was use for. We cannot reuse - * contexts on this hardware platform and must generate a new - * context every time. 82574L hardware spec, section 7.2.6, - * second note. - */ - if (adapter->num_queues < 2) { - /* - * Setting up new checksum offload context for every - * frames takes a lot of processing time for hardware. - * This also reduces performance a lot for small sized - * frames so avoid it if driver can use previously - * configured checksum offload context. 
- */ - if (txr->last_hw_offload == offload) { - if (offload & CSUM_IP) { - if (txr->last_hw_ipcss == ipcss && - txr->last_hw_ipcso == ipcso && - txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } else { - if (txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } - } - txr->last_hw_offload = offload; - txr->last_hw_tucss = tucss; - txr->last_hw_tucso = tucso; - } + /* + * Setting up new checksum offload context for every frames + * takes a lot of processing time for hardware. This also + * reduces performance a lot for small sized frames so avoid + * it if driver can use previously configured checksum + * offload context. + */ + if (txr->last_hw_offload == offload) { + if (offload & CSUM_IP) { + if (txr->last_hw_ipcss == ipcss && + txr->last_hw_ipcso == ipcso && + txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } else { + if (txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } + } + txr->last_hw_offload = offload; + txr->last_hw_tucss = tucss; + txr->last_hw_tucso = tucso; /* * Start offset for payload checksum calculation. * End offset for payload checksum calculation. @@ -3780,38 +3771,29 @@ em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, *txd_upper |= E1000_TXD_POPTS_TXSM << 8; tucss = hdr_len; tucso = hdr_len + offsetof(struct udphdr, uh_sum); - /* - * The 82574L can only remember the *last* context used - * regardless of queue that it was use for. We cannot reuse - * contexts on this hardware platform and must generate a new - * context every time. 82574L hardware spec, section 7.2.6, - * second note. - */ - if (adapter->num_queues < 2) { - /* - * Setting up new checksum offload context for every - * frames takes a lot of processing time for hardware. - * This also reduces performance a lot for small sized - * frames so avoid it if driver can use previously - * configured checksum offload context. 
- */ - if (txr->last_hw_offload == offload) { - if (offload & CSUM_IP) { - if (txr->last_hw_ipcss == ipcss && - txr->last_hw_ipcso == ipcso && - txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } else { - if (txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } + /* + * Setting up new checksum offload context for every frames + * takes a lot of processing time for hardware. This also + * reduces performance a lot for small sized frames so avoid + * it if driver can use previously configured checksum + * offload context. + */ + if (txr->last_hw_offload == offload) { + if (offload & CSUM_IP) { + if (txr->last_hw_ipcss == ipcss && + txr->last_hw_ipcso == ipcso && + txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } else { + if (txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; } - txr->last_hw_offload = offload; - txr->last_hw_tucss = tucss; - txr->last_hw_tucso = tucso; - } + } + txr->last_hw_offload = offload; + txr->last_hw_tucss = tucss; + txr->last_hw_tucso = tucso; /* * Start offset for header checksum calculation. * End offset for header checksum calculation. From b834dcea9a55500a0aa8e79443a398810b3bfdbe Mon Sep 17 00:00:00 2001 From: Sean Bruno Date: Thu, 7 Jan 2016 16:42:48 +0000 Subject: [PATCH 46/48] Switch em(4) to the extended RX descriptor format. This matches the e1000/e1000e split in linux. Split rxbuffer and txbuffer apart to support the new RX descriptor format structures. Move rxbuffer manipulation to em_setup_rxdesc() to unify the new behavior changes. Add a RSSKEYLEN macro for help in generating the RSSKEY data structures in the card. Change em_receive_checksum() to process the new rxdescriptor format status bit. 
MFC after: 2 weeks Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D3447 --- sys/dev/e1000/if_em.c | 188 +++++++++++++++++++--------------- sys/dev/e1000/if_em.h | 20 +++- sys/dev/netmap/if_em_netmap.h | 16 +-- 3 files changed, 129 insertions(+), 95 deletions(-) diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index d9b3ca559d6e..de53d8b9f229 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -260,7 +260,9 @@ static bool em_rxeof(struct rx_ring *, int, int *); #ifndef __NO_STRICT_ALIGNMENT static int em_fixup_rx(struct rx_ring *); #endif -static void em_receive_checksum(struct e1000_rx_desc *, struct mbuf *); +static void em_setup_rxdesc(union e1000_rx_desc_extended *, + const struct em_rxbuffer *rxbuf); +static void em_receive_checksum(uint32_t status, struct mbuf *); static void em_transmit_checksum_setup(struct tx_ring *, struct mbuf *, int, struct ip *, u32 *, u32 *); static void em_tso_setup(struct tx_ring *, struct mbuf *, int, struct ip *, @@ -631,7 +633,7 @@ em_attach(device_t dev) } else adapter->num_tx_desc = em_txd; - if (((em_rxd * sizeof(struct e1000_rx_desc)) % EM_DBA_ALIGN) != 0 || + if (((em_rxd * sizeof(union e1000_rx_desc_extended)) % EM_DBA_ALIGN) != 0 || (em_rxd > EM_MAX_RXD) || (em_rxd < EM_MIN_RXD)) { device_printf(dev, "Using %d RX descriptors instead of %d!\n", EM_DEFAULT_RXD, em_rxd); @@ -1872,7 +1874,7 @@ em_xmit(struct tx_ring *txr, struct mbuf **m_headp) struct adapter *adapter = txr->adapter; bus_dma_segment_t segs[EM_MAX_SCATTER]; bus_dmamap_t map; - struct em_buffer *tx_buffer, *tx_buffer_mapped; + struct em_txbuffer *tx_buffer, *tx_buffer_mapped; struct e1000_tx_desc *ctxd = NULL; struct mbuf *m_head; struct ether_header *eh; @@ -3296,7 +3298,7 @@ em_allocate_queues(struct adapter *adapter) * Next the RX queues... 
*/ rsize = roundup2(adapter->num_rx_desc * - sizeof(struct e1000_rx_desc), EM_DBA_ALIGN); + sizeof(union e1000_rx_desc_extended), EM_DBA_ALIGN); for (int i = 0; i < adapter->num_queues; i++, rxconf++) { rxr = &adapter->rx_rings[i]; rxr->adapter = adapter; @@ -3314,7 +3316,7 @@ em_allocate_queues(struct adapter *adapter) error = ENOMEM; goto err_rx_desc; } - rxr->rx_base = (struct e1000_rx_desc *)rxr->rxdma.dma_vaddr; + rxr->rx_base = (union e1000_rx_desc_extended *)rxr->rxdma.dma_vaddr; bzero((void *)rxr->rx_base, rsize); /* Allocate receive buffers for the ring*/ @@ -3357,7 +3359,7 @@ em_allocate_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; device_t dev = adapter->dev; - struct em_buffer *txbuf; + struct em_txbuffer *txbuf; int error, i; /* @@ -3380,7 +3382,7 @@ em_allocate_transmit_buffers(struct tx_ring *txr) } if (!(txr->tx_buffers = - (struct em_buffer *) malloc(sizeof(struct em_buffer) * + (struct em_txbuffer *) malloc(sizeof(struct em_txbuffer) * adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; @@ -3413,7 +3415,7 @@ static void em_setup_transmit_ring(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; - struct em_buffer *txbuf; + struct em_txbuffer *txbuf; int i; #ifdef DEV_NETMAP struct netmap_slot *slot; @@ -3632,7 +3634,7 @@ static void em_free_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; - struct em_buffer *txbuf; + struct em_txbuffer *txbuf; INIT_DEBUGOUT("free_transmit_ring: begin"); @@ -3699,7 +3701,7 @@ em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, { struct adapter *adapter = txr->adapter; struct e1000_context_desc *TXD = NULL; - struct em_buffer *tx_buffer; + struct em_txbuffer *tx_buffer; int cur, hdr_len; u32 cmd = 0; u16 offload = 0; @@ -3836,7 +3838,7 @@ em_tso_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, { struct adapter *adapter = 
txr->adapter; struct e1000_context_desc *TXD; - struct em_buffer *tx_buffer; + struct em_txbuffer *tx_buffer; int cur, hdr_len; /* @@ -3914,7 +3916,7 @@ em_txeof(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; int first, last, done, processed; - struct em_buffer *tx_buffer; + struct em_txbuffer *tx_buffer; struct e1000_tx_desc *tx_desc, *eop_desc; if_t ifp = adapter->ifp; @@ -4020,7 +4022,6 @@ em_txeof(struct tx_ring *txr) txr->busy = EM_TX_IDLE; } - /********************************************************************* * * Refresh RX descriptor mbufs from system mbuf buffer pool. @@ -4031,8 +4032,8 @@ em_refresh_mbufs(struct rx_ring *rxr, int limit) { struct adapter *adapter = rxr->adapter; struct mbuf *m; - bus_dma_segment_t segs[1]; - struct em_buffer *rxbuf; + bus_dma_segment_t segs; + struct em_rxbuffer *rxbuf; int i, j, error, nsegs; bool cleaned = FALSE; @@ -4067,7 +4068,7 @@ em_refresh_mbufs(struct rx_ring *rxr, int limit) /* Use bus_dma machinery to setup the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->rxtag, rxbuf->map, - m, segs, &nsegs, BUS_DMA_NOWAIT); + m, &segs, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: hdr dmamap load" " failure - %d\n", error); @@ -4076,9 +4077,10 @@ em_refresh_mbufs(struct rx_ring *rxr, int limit) goto update; } rxbuf->m_head = m; + rxbuf->paddr = segs.ds_addr; bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - rxr->rx_base[i].buffer_addr = htole64(segs[0].ds_addr); + em_setup_rxdesc(&rxr->rx_base[i], rxbuf); cleaned = TRUE; i = j; /* Next is precalulated for us */ @@ -4113,10 +4115,10 @@ em_allocate_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; device_t dev = adapter->dev; - struct em_buffer *rxbuf; + struct em_rxbuffer *rxbuf; int error; - rxr->rx_buffers = malloc(sizeof(struct em_buffer) * + rxr->rx_buffers = malloc(sizeof(struct em_rxbuffer) * adapter->num_rx_desc, M_DEVBUF, M_NOWAIT | M_ZERO); if (rxr->rx_buffers == NULL) { 
device_printf(dev, "Unable to allocate rx_buffer memory\n"); @@ -4169,7 +4171,7 @@ static int em_setup_receive_ring(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; - struct em_buffer *rxbuf; + struct em_rxbuffer *rxbuf; bus_dma_segment_t seg[1]; int rsize, nsegs, error = 0; #ifdef DEV_NETMAP @@ -4181,7 +4183,7 @@ em_setup_receive_ring(struct rx_ring *rxr) /* Clear the ring contents */ EM_RX_LOCK(rxr); rsize = roundup2(adapter->num_rx_desc * - sizeof(struct e1000_rx_desc), EM_DBA_ALIGN); + sizeof(union e1000_rx_desc_extended), EM_DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_RX, rxr->me, 0); @@ -4212,8 +4214,7 @@ em_setup_receive_ring(struct rx_ring *rxr) addr = PNMB(na, slot + si, &paddr); netmap_load_map(na, rxr->rxtag, rxbuf->map, addr); - /* Update descriptor */ - rxr->rx_base[j].buffer_addr = htole64(paddr); + em_setup_rxdesc(&rxr->rx_base[j], rxbuf); continue; } #endif /* DEV_NETMAP */ @@ -4239,8 +4240,8 @@ em_setup_receive_ring(struct rx_ring *rxr) bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - /* Update descriptor */ - rxr->rx_base[j].buffer_addr = htole64(seg[0].ds_addr); + rxbuf->paddr = seg[0].ds_addr; + em_setup_rxdesc(&rxr->rx_base[j], rxbuf); } rxr->next_to_check = 0; rxr->next_to_refresh = 0; @@ -4277,7 +4278,7 @@ em_setup_receive_structures(struct adapter *adapter) for (int i = 0; i < q; ++i) { rxr = &adapter->rx_rings[i]; for (int n = 0; n < adapter->num_rx_desc; n++) { - struct em_buffer *rxbuf; + struct em_rxbuffer *rxbuf; rxbuf = &rxr->rx_buffers[n]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->rxtag, rxbuf->map, @@ -4324,7 +4325,7 @@ static void em_free_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; - struct em_buffer *rxbuf = NULL; + struct em_rxbuffer *rxbuf = NULL; INIT_DEBUGOUT("free_receive_buffers: begin"); @@ -4366,11 +4367,10 @@ em_free_receive_buffers(struct rx_ring *rxr) static void em_initialize_receive_unit(struct 
adapter *adapter) { - struct rx_ring *rxr = adapter->rx_rings; + struct rx_ring *rxr = adapter->rx_rings; if_t ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; - u64 bus_addr; - u32 rctl, rxcsum; + u32 rctl, rxcsum, rfctl; INIT_DEBUGOUT("em_initialize_receive_units: begin"); @@ -4383,6 +4383,25 @@ em_initialize_receive_unit(struct adapter *adapter) if ((hw->mac.type != e1000_82574) && (hw->mac.type != e1000_82583)) E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN); + /* Setup the Receive Control Register */ + rctl &= ~(3 << E1000_RCTL_MO_SHIFT); + rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | + E1000_RCTL_LBM_NO | E1000_RCTL_RDMTS_HALF | + (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT); + + /* Do not store bad packets */ + rctl &= ~E1000_RCTL_SBP; + + /* Enable Long Packet receive */ + if (if_getmtu(ifp) > ETHERMTU) + rctl |= E1000_RCTL_LPE; + else + rctl &= ~E1000_RCTL_LPE; + + /* Strip the CRC */ + if (!em_disable_crc_stripping) + rctl |= E1000_RCTL_SECRC; + E1000_WRITE_REG(&adapter->hw, E1000_RADV, adapter->rx_abs_int_delay.value); @@ -4394,20 +4413,21 @@ em_initialize_receive_unit(struct adapter *adapter) */ E1000_WRITE_REG(hw, E1000_ITR, DEFAULT_ITR); + /* Use extended rx descriptor formats */ + rfctl = E1000_READ_REG(hw, E1000_RFCTL); + rfctl |= E1000_RFCTL_EXTEN; /* ** When using MSIX interrupts we need to throttle ** using the EITR register (82574 only) */ if (hw->mac.type == e1000_82574) { - u32 rfctl; for (int i = 0; i < 4; i++) E1000_WRITE_REG(hw, E1000_EITR_82574(i), DEFAULT_ITR); /* Disable accelerated acknowledge */ - rfctl = E1000_READ_REG(hw, E1000_RFCTL); rfctl |= E1000_RFCTL_ACK_DIS; - E1000_WRITE_REG(hw, E1000_RFCTL, rfctl); } + E1000_WRITE_REG(hw, E1000_RFCTL, rfctl); rxcsum = E1000_READ_REG(hw, E1000_RXCSUM); if (if_getcapenable(ifp) & IFCAP_RXCSUM) { @@ -4424,38 +4444,44 @@ em_initialize_receive_unit(struct adapter *adapter) E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum); #ifdef EM_MULTIQUEUE +#define RSSKEYLEN 10 if (adapter->num_queues > 
1) { - uint32_t rss_key[10]; - uint32_t reta; + uint8_t rss_key[4 * RSSKEYLEN]; + uint32_t reta = 0; int i; /* * Configure RSS key */ arc4rand(rss_key, sizeof(rss_key), 0); - for (i = 0; i < 10; ++i) - E1000_WRITE_REG_ARRAY(hw,E1000_RSSRK(0), i, rss_key[i]); + for (i = 0; i < RSSKEYLEN; ++i) { + uint32_t rssrk = 0; + + rssrk = EM_RSSRK_VAL(rss_key, i); + E1000_WRITE_REG(hw,E1000_RSSRK(i), rssrk); + } /* * Configure RSS redirect table in following fashion: * (hash & ring_cnt_mask) == rdr_table[(hash & rdr_table_mask)] */ - reta = 0; - for (i = 0; i < 4; ++i) { + for (i = 0; i < sizeof(reta); ++i) { uint32_t q; + q = (i % adapter->num_queues) << 7; reta |= q << (8 * i); } - for (i = 0; i < 32; ++i) + + for (i = 0; i < 32; ++i) { E1000_WRITE_REG(hw, E1000_RETA(i), reta); + } E1000_WRITE_REG(hw, E1000_MRQC, E1000_MRQC_RSS_ENABLE_2Q | E1000_MRQC_RSS_FIELD_IPV4_TCP | E1000_MRQC_RSS_FIELD_IPV4 | E1000_MRQC_RSS_FIELD_IPV6_TCP_EX | E1000_MRQC_RSS_FIELD_IPV6_EX | - E1000_MRQC_RSS_FIELD_IPV6 | - E1000_MRQC_RSS_FIELD_IPV6_TCP); + E1000_MRQC_RSS_FIELD_IPV6); } #endif /* @@ -4470,11 +4496,11 @@ em_initialize_receive_unit(struct adapter *adapter) for (int i = 0; i < adapter->num_queues; i++, rxr++) { /* Setup the Base and Length of the Rx Descriptor Ring */ + u64 bus_addr = rxr->rxdma.dma_paddr; u32 rdt = adapter->num_rx_desc - 1; /* default */ - bus_addr = rxr->rxdma.dma_paddr; E1000_WRITE_REG(hw, E1000_RDLEN(i), - adapter->num_rx_desc * sizeof(struct e1000_rx_desc)); + adapter->num_rx_desc * sizeof(union e1000_rx_desc_extended)); E1000_WRITE_REG(hw, E1000_RDBAH(i), (u32)(bus_addr >> 32)); E1000_WRITE_REG(hw, E1000_RDBAL(i), (u32)bus_addr); /* Setup the Head and Tail Descriptor Pointers */ @@ -4505,14 +4531,13 @@ em_initialize_receive_unit(struct adapter *adapter) (if_getmtu(ifp) > ETHERMTU)) { u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0)); E1000_WRITE_REG(hw, E1000_RXDCTL(0), rxdctl | 3); - } else if ((adapter->hw.mac.type == e1000_82574) && - (if_getmtu(ifp) > ETHERMTU)) { 
+ } else if (adapter->hw.mac.type == e1000_82574) { for (int i = 0; i < adapter->num_queues; i++) { u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i)); - rxdctl |= 0x20; /* PTHRESH */ - rxdctl |= 4 << 8; /* HTHRESH */ - rxdctl |= 4 << 16;/* WTHRESH */ + rxdctl |= 0x20; /* PTHRESH */ + rxdctl |= 4 << 8; /* HTHRESH */ + rxdctl |= 4 << 16;/* WTHRESH */ rxdctl |= 1 << 24; /* Switch to granularity */ E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl); } @@ -4525,19 +4550,8 @@ em_initialize_receive_unit(struct adapter *adapter) e1000_lv_jumbo_workaround_ich8lan(hw, FALSE); } - /* Setup the Receive Control Register */ - rctl &= ~(3 << E1000_RCTL_MO_SHIFT); - rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | - E1000_RCTL_LBM_NO | E1000_RCTL_RDMTS_HALF | - (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT); - - /* Strip the CRC */ - if (!em_disable_crc_stripping) - rctl |= E1000_RCTL_SECRC; - /* Make sure VLAN Filters are off */ rctl &= ~E1000_RCTL_VFE; - rctl &= ~E1000_RCTL_SBP; if (adapter->rx_mbuf_sz == MCLBYTES) rctl |= E1000_RCTL_SZ_2048; @@ -4546,11 +4560,8 @@ em_initialize_receive_unit(struct adapter *adapter) else if (adapter->rx_mbuf_sz > MJUMPAGESIZE) rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX; - if (if_getmtu(ifp) > ETHERMTU) - rctl |= E1000_RCTL_LPE; - else - rctl &= ~E1000_RCTL_LPE; - + /* ensure we clear use DTYPE of 00 here */ + rctl &= ~0x00000C00; /* Write out the settings */ E1000_WRITE_REG(hw, E1000_RCTL, rctl); @@ -4575,11 +4586,11 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) struct adapter *adapter = rxr->adapter; if_t ifp = adapter->ifp; struct mbuf *mp, *sendmp; - u8 status = 0; + u32 status = 0; u16 len; int i, processed, rxdone = 0; bool eop; - struct e1000_rx_desc *cur; + union e1000_rx_desc_extended *cur; EM_RX_LOCK(rxr); @@ -4596,21 +4607,20 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) #endif /* DEV_NETMAP */ for (i = rxr->next_to_check, processed = 0; count != 0;) { - if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) break; cur = 
&rxr->rx_base[i]; - status = cur->status; + status = le32toh(cur->wb.upper.status_error); mp = sendmp = NULL; if ((status & E1000_RXD_STAT_DD) == 0) break; - len = le16toh(cur->length); + len = le16toh(cur->wb.upper.length); eop = (status & E1000_RXD_STAT_EOP) != 0; - if ((cur->errors & E1000_RXD_ERR_FRAME_ERR_MASK) || + if ((status & E1000_RXDEXT_ERR_FRAME_ERR_MASK) || (rxr->discard == TRUE)) { adapter->dropped_pkts++; ++rxr->rx_discarded; @@ -4647,7 +4657,7 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) sendmp = rxr->fmp; if_setrcvif(sendmp, ifp); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); - em_receive_checksum(cur, sendmp); + em_receive_checksum(status, sendmp); #ifndef __NO_STRICT_ALIGNMENT if (adapter->hw.mac.max_frame_size > (MCLBYTES - ETHER_ALIGN) && @@ -4656,7 +4666,7 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) #endif if (status & E1000_RXD_STAT_VP) { if_setvtag(sendmp, - le16toh(cur->special)); + le16toh(cur->wb.upper.vlan)); sendmp->m_flags |= M_VLANTAG; } #ifndef __NO_STRICT_ALIGNMENT @@ -4670,7 +4680,7 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* Zero out the receive descriptors status. 
*/ - cur->status = 0; + cur->wb.upper.status_error &= htole32(~0xFF); ++rxdone; /* cumulative for POLL */ ++processed; @@ -4709,7 +4719,7 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) static __inline void em_rx_discard(struct rx_ring *rxr, int i) { - struct em_buffer *rbuf; + struct em_rxbuffer *rbuf; rbuf = &rxr->rx_buffers[i]; bus_dmamap_unload(rxr->rxtag, rbuf->map); @@ -4781,6 +4791,14 @@ em_fixup_rx(struct rx_ring *rxr) } #endif +static void +em_setup_rxdesc(union e1000_rx_desc_extended *rxd, const struct em_rxbuffer *rxbuf) +{ + rxd->read.buffer_addr = htole64(rxbuf->paddr); + /* DD bits must be cleared */ + rxd->wb.upper.status_error= 0; +} + /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. @@ -4789,23 +4807,27 @@ em_fixup_rx(struct rx_ring *rxr) * *********************************************************************/ static void -em_receive_checksum(struct e1000_rx_desc *rx_desc, struct mbuf *mp) +em_receive_checksum(uint32_t status, struct mbuf *mp) { mp->m_pkthdr.csum_flags = 0; /* Ignore Checksum bit is set */ - if (rx_desc->status & E1000_RXD_STAT_IXSM) + if (status & E1000_RXD_STAT_IXSM) return; - if (rx_desc->errors & (E1000_RXD_ERR_TCPE | E1000_RXD_ERR_IPE)) - return; - - /* IP Checksum Good? 
*/ - if (rx_desc->status & E1000_RXD_STAT_IPCS) + /* If the IP checksum exists and there is no IP Checksum error */ + if ((status & (E1000_RXD_STAT_IPCS | E1000_RXDEXT_STATERR_IPE)) == + E1000_RXD_STAT_IPCS) { mp->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID); + } /* TCP or UDP checksum */ - if (rx_desc->status & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) { + if ((status & (E1000_RXD_STAT_TCPCS | E1000_RXDEXT_STATERR_TCPE)) == + E1000_RXD_STAT_TCPCS) { + mp->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + mp->m_pkthdr.csum_data = htons(0xffff); + } + if (status & E1000_RXD_STAT_UDPCS) { mp->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); mp->m_pkthdr.csum_data = htons(0xffff); } diff --git a/sys/dev/e1000/if_em.h b/sys/dev/e1000/if_em.h index 8725de35e81d..be9fdc96e6d3 100644 --- a/sys/dev/e1000/if_em.h +++ b/sys/dev/e1000/if_em.h @@ -330,7 +330,7 @@ struct tx_ring { struct taskqueue *tq; u32 next_avail_desc; u32 next_to_clean; - struct em_buffer *tx_buffers; + struct em_txbuffer *tx_buffers; volatile u16 tx_avail; u32 tx_tso; /* last tx was tso */ u16 last_hw_offload; @@ -362,11 +362,11 @@ struct rx_ring { u32 payload; struct task rx_task; struct taskqueue *tq; - struct e1000_rx_desc *rx_base; + union e1000_rx_desc_extended *rx_base; struct em_dma_alloc rxdma; u32 next_to_refresh; u32 next_to_check; - struct em_buffer *rx_buffers; + struct em_rxbuffer *rx_buffers; struct mbuf *fmp; struct mbuf *lmp; @@ -499,12 +499,19 @@ typedef struct _em_vendor_info_t { unsigned int index; } em_vendor_info_t; -struct em_buffer { +struct em_txbuffer { int next_eop; /* Index of the desc to watch */ struct mbuf *m_head; bus_dmamap_t map; /* bus_dma map for packet */ }; +struct em_rxbuffer { + int next_eop; /* Index of the desc to watch */ + struct mbuf *m_head; + bus_dmamap_t map; /* bus_dma map for packet */ + bus_addr_t paddr; +}; + /* ** Find the number of unrefreshed RX descriptors @@ -541,4 +548,9 @@ e1000_rx_unrefreshed(struct rx_ring 
*rxr) #define EM_TX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->tx_mtx, MA_OWNED) #define EM_RX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->rx_mtx, MA_OWNED) +#define EM_RSSRK_SIZE 4 +#define EM_RSSRK_VAL(key, i) (key[(i) * EM_RSSRK_SIZE] | \ + key[(i) * EM_RSSRK_SIZE + 1] << 8 | \ + key[(i) * EM_RSSRK_SIZE + 2] << 16 | \ + key[(i) * EM_RSSRK_SIZE + 3] << 24) #endif /* _EM_H_DEFINED_ */ diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h index eae4f8c18ca0..28f2dd4bbc64 100644 --- a/sys/dev/netmap/if_em_netmap.h +++ b/sys/dev/netmap/if_em_netmap.h @@ -148,7 +148,7 @@ em_netmap_txsync(struct netmap_kring *kring, int flags) /* device-specific */ struct e1000_tx_desc *curr = &txr->tx_base[nic_i]; - struct em_buffer *txbuf = &txr->tx_buffers[nic_i]; + struct em_txbuffer *txbuf = &txr->tx_buffers[nic_i]; int flags = (slot->flags & NS_REPORT || nic_i == 0 || nic_i == report_frequency) ? E1000_TXD_CMD_RS : 0; @@ -239,12 +239,12 @@ em_netmap_rxsync(struct netmap_kring *kring, int flags) nm_i = netmap_idx_n2k(kring, nic_i); for (n = 0; ; n++) { // XXX no need to count - struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; - uint32_t staterr = le32toh(curr->status); + union e1000_rx_desc_extended *curr = &rxr->rx_base[nic_i]; + uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[nm_i].len = le16toh(curr->length); + ring->slot[nm_i].len = le16toh(curr->wb.upper.length); ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[nic_i].map, BUS_DMASYNC_POSTREAD); @@ -271,19 +271,19 @@ em_netmap_rxsync(struct netmap_kring *kring, int flags) uint64_t paddr; void *addr = PNMB(na, slot, &paddr); - struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; - struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i]; + union e1000_rx_desc_extended *curr = &rxr->rx_base[nic_i]; + struct em_rxbuffer *rxbuf = &rxr->rx_buffers[nic_i]; if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; if 
(slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - curr->buffer_addr = htole64(paddr); + curr->read.buffer_addr = htole64(paddr); netmap_reload_map(na, rxr->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } - curr->status = 0; + curr->wb.upper.status_error = 0; bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); nm_i = nm_next(nm_i, lim); From 676822acb5d3da091aed044b47b8c8cbcb0aa815 Mon Sep 17 00:00:00 2001 From: Sean Bruno Date: Thu, 7 Jan 2016 16:48:47 +0000 Subject: [PATCH 47/48] Disable the reuse of checksum offload context descriptors in the case of multiple queues in em(4). Document errata in the code. MFC after: 2 weeks Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D3995 --- sys/dev/e1000/if_em.c | 108 ++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 45 deletions(-) diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index de53d8b9f229..f586d399822f 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -3735,29 +3735,38 @@ em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, offload |= CSUM_TCP; tucss = hdr_len; tucso = hdr_len + offsetof(struct tcphdr, th_sum); - /* - * Setting up new checksum offload context for every frames - * takes a lot of processing time for hardware. This also - * reduces performance a lot for small sized frames so avoid - * it if driver can use previously configured checksum - * offload context. 
- */ - if (txr->last_hw_offload == offload) { - if (offload & CSUM_IP) { - if (txr->last_hw_ipcss == ipcss && - txr->last_hw_ipcso == ipcso && - txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } else { - if (txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } - } - txr->last_hw_offload = offload; - txr->last_hw_tucss = tucss; - txr->last_hw_tucso = tucso; + /* + * The 82574L can only remember the *last* context used + * regardless of queue that it was use for. We cannot reuse + * contexts on this hardware platform and must generate a new + * context every time. 82574L hardware spec, section 7.2.6, + * second note. + */ + if (adapter->num_queues < 2) { + /* + * Setting up new checksum offload context for every + * frames takes a lot of processing time for hardware. + * This also reduces performance a lot for small sized + * frames so avoid it if driver can use previously + * configured checksum offload context. + */ + if (txr->last_hw_offload == offload) { + if (offload & CSUM_IP) { + if (txr->last_hw_ipcss == ipcss && + txr->last_hw_ipcso == ipcso && + txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } else { + if (txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } + } + txr->last_hw_offload = offload; + txr->last_hw_tucss = tucss; + txr->last_hw_tucso = tucso; + } /* * Start offset for payload checksum calculation. * End offset for payload checksum calculation. @@ -3773,29 +3782,38 @@ em_transmit_checksum_setup(struct tx_ring *txr, struct mbuf *mp, int ip_off, *txd_upper |= E1000_TXD_POPTS_TXSM << 8; tucss = hdr_len; tucso = hdr_len + offsetof(struct udphdr, uh_sum); - /* - * Setting up new checksum offload context for every frames - * takes a lot of processing time for hardware. This also - * reduces performance a lot for small sized frames so avoid - * it if driver can use previously configured checksum - * offload context. 
- */ - if (txr->last_hw_offload == offload) { - if (offload & CSUM_IP) { - if (txr->last_hw_ipcss == ipcss && - txr->last_hw_ipcso == ipcso && - txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; - } else { - if (txr->last_hw_tucss == tucss && - txr->last_hw_tucso == tucso) - return; + /* + * The 82574L can only remember the *last* context used + * regardless of queue that it was use for. We cannot reuse + * contexts on this hardware platform and must generate a new + * context every time. 82574L hardware spec, section 7.2.6, + * second note. + */ + if (adapter->num_queues < 2) { + /* + * Setting up new checksum offload context for every + * frames takes a lot of processing time for hardware. + * This also reduces performance a lot for small sized + * frames so avoid it if driver can use previously + * configured checksum offload context. + */ + if (txr->last_hw_offload == offload) { + if (offload & CSUM_IP) { + if (txr->last_hw_ipcss == ipcss && + txr->last_hw_ipcso == ipcso && + txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } else { + if (txr->last_hw_tucss == tucss && + txr->last_hw_tucso == tucso) + return; + } } - } - txr->last_hw_offload = offload; - txr->last_hw_tucss = tucss; - txr->last_hw_tucso = tucso; + txr->last_hw_offload = offload; + txr->last_hw_tucss = tucss; + txr->last_hw_tucso = tucso; + } /* * Start offset for header checksum calculation. * End offset for header checksum calculation. From 97f9586e97f7a4b1e1b4b5dab6dba6ec12a36f38 Mon Sep 17 00:00:00 2001 From: Sean Bruno Date: Thu, 7 Jan 2016 17:02:34 +0000 Subject: [PATCH 48/48] Fixup SFP module insertion on the 82599 when insertion happens after the system is booted and running. Add PHY detection logic to ixgbe_handle_mod() and add locking to ixgbe_handle_msf() as well. 
PR: 150251 Submitted by: aboyer@averesystems.com MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D3188 --- sys/dev/ixgbe/if_ix.c | 58 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/sys/dev/ixgbe/if_ix.c b/sys/dev/ixgbe/if_ix.c index c7ac59399ad2..90d70d98a464 100644 --- a/sys/dev/ixgbe/if_ix.c +++ b/sys/dev/ixgbe/if_ix.c @@ -2947,12 +2947,7 @@ ixgbe_config_link(struct adapter *adapter) sfp = ixgbe_is_sfp(hw); if (sfp) { - if (hw->phy.multispeed_fiber) { - hw->mac.ops.setup_sfp(hw); - ixgbe_enable_tx_laser(hw); - taskqueue_enqueue(adapter->tq, &adapter->msf_task); - } else - taskqueue_enqueue(adapter->tq, &adapter->mod_task); + taskqueue_enqueue(adapter->tq, &adapter->mod_task); } else { if (hw->mac.ops.check_link) err = ixgbe_check_link(hw, &adapter->link_speed, @@ -3758,23 +3753,66 @@ ixgbe_handle_mod(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; + enum ixgbe_phy_type orig_type = hw->phy.type; device_t dev = adapter->dev; u32 err; + IXGBE_CORE_LOCK(adapter); + + /* Check to see if the PHY type changed */ + if (hw->phy.ops.identify) { + hw->phy.type = ixgbe_phy_unknown; + hw->phy.ops.identify(hw); + } + + if (hw->phy.type != orig_type) { + device_printf(dev, "Detected phy_type %d\n", hw->phy.type); + + if (hw->phy.type == ixgbe_phy_none) { + hw->phy.sfp_type = ixgbe_sfp_type_unknown; + goto out; + } + + /* Try to do the initialization that was skipped before */ + if (hw->phy.ops.init) + hw->phy.ops.init(hw); + if (hw->phy.ops.reset) + hw->phy.ops.reset(hw); + } + err = hw->phy.ops.identify_sfp(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module type was detected.\n"); - return; + goto out; } err = hw->mac.ops.setup_sfp(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Setup failure - unsupported SFP+ module type.\n"); - return; + goto out; } - taskqueue_enqueue(adapter->tq, 
&adapter->msf_task); + if (hw->phy.multispeed_fiber) + taskqueue_enqueue(adapter->tq, &adapter->msf_task); +out: + /* Update media type */ + switch (hw->mac.ops.get_media_type(hw)) { + case ixgbe_media_type_fiber: + adapter->optics = IFM_10G_SR; + break; + case ixgbe_media_type_copper: + adapter->optics = IFM_10G_TWINAX; + break; + case ixgbe_media_type_cx4: + adapter->optics = IFM_10G_CX4; + break; + default: + adapter->optics = 0; + break; + } + + IXGBE_CORE_UNLOCK(adapter); return; } @@ -3790,6 +3828,7 @@ ixgbe_handle_msf(void *context, int pending) u32 autoneg; bool negotiate; + IXGBE_CORE_LOCK(adapter); /* get_supported_phy_layer will call hw->phy.ops.identify_sfp() */ adapter->phy_layer = ixgbe_get_supported_physical_layer(hw); @@ -3802,6 +3841,7 @@ ixgbe_handle_msf(void *context, int pending) /* Adjust media types shown in ifconfig */ ifmedia_removeall(&adapter->media); ixgbe_add_media_types(adapter); + IXGBE_CORE_UNLOCK(adapter); return; }