From 72d32bf80dfdcfe0e69da200b66f195e919653f7 Mon Sep 17 00:00:00 2001
From: jkim <jkim@FreeBSD.org>
Date: Tue, 1 Mar 2016 17:57:01 +0000
Subject: [PATCH] Import OpenSSL 1.0.2g.

---
 CHANGES                              |  134 ++-
 Configure                            |    8 +-
 Makefile                             |    6 +-
 Makefile.shared                      |    6 +-
 NEWS                                 |   13 +
 README                               |    2 +-
 apps/apps.c                          |    8 +-
 apps/apps.h                          |    2 +-
 apps/pkeyutl.c                       |   90 +-
 apps/req.c                           |    4 +-
 apps/rsautl.c                        |    6 +-
 apps/s_client.c                      |    2 -
 apps/s_server.c                      |   49 +-
 config                               |    3 +-
 crypto/asn1/tasn_dec.c               |   14 +-
 crypto/bio/b_print.c                 |  187 ++--
 crypto/bio/bio.h                     |    4 +-
 crypto/bio/bss_mem.c                 |    6 +-
 crypto/bn/Makefile                   |    4 +-
 crypto/bn/asm/rsaz-avx2.pl           |  215 +++--
 crypto/bn/asm/rsaz-x86_64.pl         |  375 ++++++--
 crypto/bn/asm/x86_64-mont.pl         |  235 +++--
 crypto/bn/asm/x86_64-mont5.pl        | 1280 +++++++++++++++-----------
 crypto/bn/bn.h                       |   14 +-
 crypto/bn/bn_exp.c                   |  103 ++-
 crypto/bn/bn_print.c                 |   17 +-
 crypto/bn/bn_recp.c                  |    1 +
 crypto/cmac/cmac.c                   |    8 +
 crypto/cryptlib.c                    |    6 +-
 crypto/crypto.h                      |    2 +-
 crypto/dh/dh.h                       |    2 +-
 crypto/dh/dh_check.c                 |    7 +-
 crypto/dsa/dsa_ameth.c               |   24 +-
 crypto/dso/dso_lib.c                 |    1 +
 crypto/ec/asm/ecp_nistz256-x86_64.pl |   11 +-
 crypto/ec/ecp_nistp224.c             |    4 +-
 crypto/ec/ecp_nistp256.c             |    4 +-
 crypto/ec/ecp_nistp521.c             |    4 +-
 crypto/ec/ectest.c                   |    9 +
 crypto/engine/eng_dyn.c              |    4 +-
 crypto/evp/e_des.c                   |   11 +-
 crypto/evp/e_des3.c                  |   13 +-
 crypto/modes/asm/aesni-gcm-x86_64.pl |    4 +-
 crypto/modes/asm/ghash-x86_64.pl     |    2 +-
 crypto/modes/ctr128.c                |   41 +-
 crypto/opensslconf.h                 |   12 +
 crypto/opensslv.h                    |    6 +-
 crypto/perlasm/x86_64-xlate.pl       |    7 +-
 crypto/pkcs7/pk7_smime.c             |   17 +
 crypto/rsa/rsa_sign.c                |    4 +-
 crypto/srp/srp.h                     |   10 +
 crypto/srp/srp_vfy.c                 |   61 +-
 crypto/stack/stack.c                 |    2 +-
 crypto/x509/x509_vfy.c               |   70 +-
 doc/apps/ciphers.pod                 |   59 +-
 doc/apps/pkeyutl.pod                 |   13 +
 doc/apps/req.pod                     |    9 +-
 doc/apps/s_client.pod                |   12 +-
 doc/apps/s_server.pod                |    8 +-
 doc/crypto/BIO_s_mem.pod             |    4 +-
 doc/ssl/SSL_CONF_cmd.pod             |   33 +-
 doc/ssl/SSL_CTX_new.pod              |  162 +++-
 doc/ssl/SSL_CTX_set_options.pod      |   10 +
 doc/ssl/ssl.pod                      |   79 +-
 engines/e_capi.c                     |   32 +
 ssl/Makefile                         |   69 +-
 ssl/s2_lib.c                         |    6 +
 ssl/s3_lib.c                         |   69 +-
 ssl/ssl.h                            |    1 -
 ssl/ssl_conf.c                       |   10 +-
 ssl/ssl_err.c                        |    1 -
 ssl/ssl_lib.c                        |   14 +-
 ssl/sslv2conftest.c                  |  231 +++++
 util/libeay.num                      |    2 +
 util/mk1mf.pl                        |    4 +-
 util/pl/BC-32.pl                     |    4 +-
 util/pl/Mingw32.pl                   |    2 +-
 util/pl/OS2-EMX.pl                   |    4 +-
 util/pl/VC-32.pl                     |   10 +-
 util/pl/linux.pl                     |    2 +-
 util/pl/netware.pl                   |    8 +-
 util/pl/ultrix.pl                    |    2 +-
 util/pl/unix.pl                      |    2 +-
 83 files changed, 2673 insertions(+), 1323 deletions(-)
 create mode 100644 ssl/sslv2conftest.c

diff --git a/CHANGES b/CHANGES
index 18693f70efe9..7578f7eb7ace 100644
--- a/CHANGES
+++ b/CHANGES
@@ -2,6 +2,138 @@
  OpenSSL CHANGES
  _______________
 
+ Changes between 1.0.2f and 1.0.2g [1 Mar 2016]
+
+  * Disable weak ciphers in SSLv3 and up in default builds of OpenSSL.
+    Builds that are not configured with "enable-weak-ssl-ciphers" will not
+    provide any "EXPORT" or "LOW" strength ciphers.
+    [Viktor Dukhovni]
+
+  * Disable SSLv2 default build, default negotiation and weak ciphers.  SSLv2
+    is by default disabled at build-time.  Builds that are not configured with
+    "enable-ssl2" will not support SSLv2.  Even if "enable-ssl2" is used,
+    users who want to negotiate SSLv2 via the version-flexible SSLv23_method()
+    will need to explicitly call either of:
+
+        SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2);
+    or
+        SSL_clear_options(ssl, SSL_OP_NO_SSLv2);
+
+    as appropriate.  Even if either of those is used, or the application
+    explicitly uses the version-specific SSLv2_method() or its client and
+    server variants, SSLv2 ciphers vulnerable to exhaustive search key
+    recovery have been removed.  Specifically, the SSLv2 40-bit EXPORT
+    ciphers, and SSLv2 56-bit DES are no longer available.
+    (CVE-2016-0800)
+    [Viktor Dukhovni]
+
+  *) Fix a double-free in DSA code
+
+     A double free bug was discovered when OpenSSL parses malformed DSA private
+     keys and could lead to a DoS attack or memory corruption for applications
+     that receive DSA private keys from untrusted sources.  This scenario is
+     considered rare.
+
+     This issue was reported to OpenSSL by Adam Langley (Google/BoringSSL) using
+     libFuzzer.
+     (CVE-2016-0705)
+     [Stephen Henson]
+
+  *) Disable SRP fake user seed to address a server memory leak.
+
+     Add a new method SRP_VBASE_get1_by_user that handles the seed properly.
+
+     SRP_VBASE_get_by_user had inconsistent memory management behaviour.
+     In order to fix an unavoidable memory leak, SRP_VBASE_get_by_user
+     was changed to ignore the "fake user" SRP seed, even if the seed
+     is configured.
+
+     Users should use SRP_VBASE_get1_by_user instead. Note that in
+     SRP_VBASE_get1_by_user, caller must free the returned value. Note
+     also that even though configuring the SRP seed attempts to hide
+     invalid usernames by continuing the handshake with fake
+     credentials, this behaviour is not constant time and no strong
+     guarantees are made that the handshake is indistinguishable from
+     that of a valid user.
+     (CVE-2016-0798)
+     [Emilia Käsper]
+
+  *) Fix BN_hex2bn/BN_dec2bn NULL pointer deref/heap corruption
+
+     In the BN_hex2bn function the number of hex digits is calculated using an
+     int value |i|. Later |bn_expand| is called with a value of |i * 4|. For
+     large values of |i| this can result in |bn_expand| not allocating any
+     memory because |i * 4| is negative. This can leave the internal BIGNUM data
+     field as NULL leading to a subsequent NULL ptr deref. For very large values
+     of |i|, the calculation |i * 4| could be a positive value smaller than |i|.
+     In this case memory is allocated to the internal BIGNUM data field, but it
+     is insufficiently sized leading to heap corruption. A similar issue exists
+     in BN_dec2bn. This could have security consequences if BN_hex2bn/BN_dec2bn
+     is ever called by user applications with very large untrusted hex/dec data.
+     This is anticipated to be a rare occurrence.
+
+     All OpenSSL internal usage of these functions uses data that is not expected
+     to be untrusted, e.g. config file data or application command line
+     arguments. If user developed applications generate config file data based
+     on untrusted data then it is possible that this could also lead to security
+     consequences. This is also anticipated to be rare.
+
+     This issue was reported to OpenSSL by Guido Vranken.
+     (CVE-2016-0797)
+     [Matt Caswell]
+
+  *) Fix memory issues in BIO_*printf functions
+
+     The internal |fmtstr| function used in processing a "%s" format string in
+     the BIO_*printf functions could overflow while calculating the length of a
+     string and cause an OOB read when printing very long strings.
+
+     Additionally the internal |doapr_outch| function can attempt to write to an
+     OOB memory location (at an offset from the NULL pointer) in the event of a
+     memory allocation failure. In 1.0.2 and below this could be caused where
+     the size of a buffer to be allocated is greater than INT_MAX. E.g. this
+     could be in processing a very long "%s" format string. Memory leaks can
+     also occur.
+
+     The first issue may mask the second issue dependent on compiler behaviour.
+     These problems could enable attacks where large amounts of untrusted data
+     are passed to the BIO_*printf functions. If applications use these functions
+     in this way then they could be vulnerable. OpenSSL itself uses these
+     functions when printing out human-readable dumps of ASN.1 data. Therefore
+     applications that print this data could be vulnerable if the data is from
+     untrusted sources. OpenSSL command line applications could also be
+     vulnerable where they print out ASN.1 data, or if untrusted data is passed
+     as command line arguments.
+
+     Libssl is not considered directly vulnerable. Additionally certificates etc
+     received via remote connections via libssl are also unlikely to be able to
+     trigger these issues because of message size limits enforced within libssl.
+
+     This issue was reported to OpenSSL by Guido Vranken.
+     (CVE-2016-0799)
+     [Matt Caswell]
+
+  *) Side channel attack on modular exponentiation
+
+     A side-channel attack was found which makes use of cache-bank conflicts on
+     the Intel Sandy-Bridge microarchitecture which could lead to the recovery
+     of RSA keys.  The ability to exploit this issue is limited as it relies on
+     an attacker who has control of code in a thread running on the same
+     hyper-threaded core as the victim thread which is performing decryptions.
+
+     This issue was reported to OpenSSL by Yuval Yarom, The University of
+     Adelaide and NICTA, Daniel Genkin, Technion and Tel Aviv University, and
+     Nadia Heninger, University of Pennsylvania with more information at
+     http://cachebleed.info.
+     (CVE-2016-0702)
+     [Andy Polyakov]
+
+  *) Change the req app to generate a 2048-bit RSA/DSA key by default,
+     if no keysize is specified with default_bits. This fixes an
+     omission in an earlier change that changed all RSA/DSA key generation
+     apps to use 2048 bits by default.
+     [Emilia Käsper]
+
  Changes between 1.0.2e and 1.0.2f [28 Jan 2016]
 
   *) DH small subgroups
@@ -105,7 +237,7 @@
      [Emilia Käsper]
 
   *) In DSA_generate_parameters_ex, if the provided seed is too short,
-     return an error
+     use a random seed, as already documented.
      [Rich Salz and Ismo Puustinen <ismo.puustinen@intel.com>]
 
  Changes between 1.0.2c and 1.0.2d [9 Jul 2015]
diff --git a/Configure b/Configure
index 4a715dc43732..c98107a48718 100755
--- a/Configure
+++ b/Configure
@@ -58,6 +58,10 @@ my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [experimenta
 #		library and will be loaded in run-time by the OpenSSL library.
 # sctp          include SCTP support
 # 386           generate 80386 code
+# enable-weak-ssl-ciphers
+#		Enable EXPORT and LOW SSLv3 ciphers that are disabled by
+#		default.  Note, weak SSLv2 ciphers are unconditionally
+#		disabled.
 # no-sse2	disables IA-32 SSE2 code, above option implies no-sse2
 # no-<cipher>   build without specified algorithm (rsa, idea, rc5, ...)
 # -<xxx> +<xxx> compiler options are passed through 
@@ -781,11 +785,13 @@ my %disabled = ( # "what"         => "comment" [or special keyword "experimental
 		 "md2"            => "default",
 		 "rc5"            => "default",
 		 "rfc3779"	  => "default",
-		 "sctp"       => "default",
+		 "sctp"           => "default",
 		 "shared"         => "default",
 		 "ssl-trace"	  => "default",
+		 "ssl2"           => "default",
 		 "store"	  => "experimental",
 		 "unit-test"	  => "default",
+		 "weak-ssl-ciphers" => "default",
 		 "zlib"           => "default",
 		 "zlib-dynamic"   => "default"
 	       );
diff --git a/Makefile b/Makefile
index ee04c02cc1ca..190d064d89ef 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@
 ## Makefile for OpenSSL
 ##
 
-VERSION=1.0.2f
+VERSION=1.0.2g
 MAJOR=1
 MINOR=0.2
 SHLIB_VERSION_NUMBER=1.0.0
@@ -13,7 +13,7 @@ SHLIB_MAJOR=1
 SHLIB_MINOR=0.0
 SHLIB_EXT=
 PLATFORM=dist
-OPTIONS= no-ec_nistp_64_gcc_128 no-gmp no-jpake no-krb5 no-libunbound no-md2 no-rc5 no-rfc3779 no-sctp no-shared no-ssl-trace no-store no-unit-test no-zlib no-zlib-dynamic static-engine
+OPTIONS= no-ec_nistp_64_gcc_128 no-gmp no-jpake no-krb5 no-libunbound no-md2 no-rc5 no-rfc3779 no-sctp no-shared no-ssl-trace no-ssl2 no-store no-unit-test no-weak-ssl-ciphers no-zlib no-zlib-dynamic static-engine
 CONFIGURE_ARGS=dist
 SHLIB_TARGET=
 
@@ -61,7 +61,7 @@ OPENSSLDIR=/usr/local/ssl
 
 CC= cc
 CFLAG= -O
-DEPFLAG= -DOPENSSL_NO_EC_NISTP_64_GCC_128 -DOPENSSL_NO_GMP -DOPENSSL_NO_JPAKE -DOPENSSL_NO_LIBUNBOUND -DOPENSSL_NO_MD2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SCTP -DOPENSSL_NO_SSL_TRACE -DOPENSSL_NO_STORE -DOPENSSL_NO_UNIT_TEST
+DEPFLAG= -DOPENSSL_NO_EC_NISTP_64_GCC_128 -DOPENSSL_NO_GMP -DOPENSSL_NO_JPAKE -DOPENSSL_NO_LIBUNBOUND -DOPENSSL_NO_MD2 -DOPENSSL_NO_RC5 -DOPENSSL_NO_RFC3779 -DOPENSSL_NO_SCTP -DOPENSSL_NO_SSL_TRACE -DOPENSSL_NO_SSL2 -DOPENSSL_NO_STORE -DOPENSSL_NO_UNIT_TEST -DOPENSSL_NO_WEAK_SSL_CIPHERS
 PEX_LIBS= 
 EX_LIBS= 
 EXE_EXT= 
diff --git a/Makefile.shared b/Makefile.shared
index e753f44e18fd..a2aa9804c1d9 100644
--- a/Makefile.shared
+++ b/Makefile.shared
@@ -272,7 +272,7 @@ link_o.cygwin:
 	SHLIB_SOVER=${LIBVERSION:+"-$(LIBVERSION)"}; \
 	ALLSYMSFLAGS='-Wl,--whole-archive'; \
 	NOALLSYMSFLAGS='-Wl,--no-whole-archive'; \
-	SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base $$deffile -Wl,-s,-Bsymbolic"; \
+	SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base $$deffile -Wl,-Bsymbolic"; \
 	$(LINK_SO_O)
 #for mingw target if def-file is in use dll-name should match library-name
 link_a.cygwin:
@@ -289,7 +289,7 @@ link_a.cygwin:
 		SHLIB_SOVER=32; \
 		extras="$(LIBNAME).def"; \
 		$(PERL) util/mkdef.pl 32 $$SHLIB > $$extras; \
-		base=; [ $(LIBNAME) = "crypto" ] && base=-Wl,--image-base,0x63000000; \
+		base=; [ $(LIBNAME) = "crypto" -a -n "$(FIPSCANLIB)" ] && base=-Wl,--image-base,0x63000000; \
 	fi; \
 	dll_name=$$SHLIB$$SHLIB_SOVER$$SHLIB_SUFFIX; \
 	$(PERL) util/mkrc.pl $$dll_name | \
@@ -297,7 +297,7 @@ link_a.cygwin:
 	extras="$$extras rc.o"; \
 	ALLSYMSFLAGS='-Wl,--whole-archive'; \
 	NOALLSYMSFLAGS='-Wl,--no-whole-archive'; \
-	SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base -Wl,-s,-Bsymbolic -Wl,--out-implib,lib$(LIBNAME).dll.a $$extras"; \
+	SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base -Wl,-Bsymbolic -Wl,--out-implib,lib$(LIBNAME).dll.a $$extras"; \
 	[ -f apps/$$dll_name ] && rm apps/$$dll_name; \
 	[ -f test/$$dll_name ] && rm test/$$dll_name; \
 	$(LINK_SO_A) || exit 1; \
diff --git a/NEWS b/NEWS
index 06c77025e999..33242c83624d 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,19 @@
   This file gives a brief overview of the major changes between each OpenSSL
   release. For more details please read the CHANGES file.
 
+  Major changes between OpenSSL 1.0.2f and OpenSSL 1.0.2g [1 Mar 2016]
+
+      o Disable weak ciphers in SSLv3 and up in default builds of OpenSSL.
+      o Disable SSLv2 default build, default negotiation and weak ciphers
+        (CVE-2016-0800)
+      o Fix a double-free in DSA code (CVE-2016-0705)
+      o Disable SRP fake user seed to address a server memory leak
+        (CVE-2016-0798)
+      o Fix BN_hex2bn/BN_dec2bn NULL pointer deref/heap corruption
+        (CVE-2016-0797)
+      o Fix memory issues in BIO_*printf functions (CVE-2016-0799)
+      o Fix side channel attack on modular exponentiation (CVE-2016-0702)
+
   Major changes between OpenSSL 1.0.2e and OpenSSL 1.0.2f [28 Jan 2016]
 
       o DH small subgroups (CVE-2016-0701)
diff --git a/README b/README
index 1e9869daee00..2077b04eb271 100644
--- a/README
+++ b/README
@@ -1,5 +1,5 @@
 
- OpenSSL 1.0.2f 28 Jan 2016
+ OpenSSL 1.0.2g 1 Mar 2016
 
  Copyright (c) 1998-2015 The OpenSSL Project
  Copyright (c) 1995-1998 Eric A. Young, Tim J. Hudson
diff --git a/apps/apps.c b/apps/apps.c
index 2e778054ca8f..b1dd97038f7d 100644
--- a/apps/apps.c
+++ b/apps/apps.c
@@ -2442,7 +2442,11 @@ int bio_to_mem(unsigned char **out, int maxlen, BIO *in)
         else
             len = 1024;
         len = BIO_read(in, tbuf, len);
-        if (len <= 0)
+        if (len < 0) {
+            BIO_free(mem);
+            return -1;
+        }
+        if (len == 0)
             break;
         if (BIO_write(mem, tbuf, len) != len) {
             BIO_free(mem);
@@ -2459,7 +2463,7 @@ int bio_to_mem(unsigned char **out, int maxlen, BIO *in)
     return ret;
 }
 
-int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value)
+int pkey_ctrl_string(EVP_PKEY_CTX *ctx, const char *value)
 {
     int rv;
     char *stmp, *vtmp = NULL;
diff --git a/apps/apps.h b/apps/apps.h
index 8276e708694d..19bf5cc3337d 100644
--- a/apps/apps.h
+++ b/apps/apps.h
@@ -321,7 +321,7 @@ int args_verify(char ***pargs, int *pargc,
                 int *badarg, BIO *err, X509_VERIFY_PARAM **pm);
 void policies_print(BIO *out, X509_STORE_CTX *ctx);
 int bio_to_mem(unsigned char **out, int maxlen, BIO *in);
-int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value);
+int pkey_ctrl_string(EVP_PKEY_CTX *ctx, const char *value);
 int init_gen_str(BIO *err, EVP_PKEY_CTX **pctx,
                  const char *algname, ENGINE *e, int do_param);
 int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md,
diff --git a/apps/pkeyutl.c b/apps/pkeyutl.c
index c8d513b44ac4..e47206c40a11 100644
--- a/apps/pkeyutl.c
+++ b/apps/pkeyutl.c
@@ -73,7 +73,7 @@ static void usage(void);
 #define PROG pkeyutl_main
 
 static EVP_PKEY_CTX *init_ctx(int *pkeysize,
-                              char *keyfile, int keyform, int key_type,
+                              const char *keyfile, int keyform, int key_type,
                               char *passargin, int pkey_op, ENGINE *e,
                               int   impl);
 
@@ -99,10 +99,12 @@ int MAIN(int argc, char **argv)
     char *passargin = NULL;
     int keysize = -1;
     int engine_impl = 0;
-
     unsigned char *buf_in = NULL, *buf_out = NULL, *sig = NULL;
-    size_t buf_outlen;
+    size_t buf_outlen = 0;
     int buf_inlen = 0, siglen = -1;
+    const char *inkey = NULL;
+    const char *peerkey = NULL;
+    STACK_OF(OPENSSL_STRING) *pkeyopts = NULL;
 
     int ret = 1, rv = -1;
 
@@ -136,21 +138,13 @@ int MAIN(int argc, char **argv)
         } else if (!strcmp(*argv, "-inkey")) {
             if (--argc < 1)
                 badarg = 1;
-            else {
-                ctx = init_ctx(&keysize,
-                               *(++argv), keyform, key_type,
-                               passargin, pkey_op, e, engine_impl);
-                if (!ctx) {
-                    BIO_puts(bio_err, "Error initializing context\n");
-                    ERR_print_errors(bio_err);
-                    badarg = 1;
-                }
-            }
+            else
+                inkey = *++argv;
         } else if (!strcmp(*argv, "-peerkey")) {
             if (--argc < 1)
                 badarg = 1;
-            else if (!setup_peer(bio_err, ctx, peerform, *(++argv), e))
-                badarg = 1;
+            else
+                peerkey = *++argv;
         } else if (!strcmp(*argv, "-passin")) {
             if (--argc < 1)
                 badarg = 1;
@@ -191,23 +185,21 @@ int MAIN(int argc, char **argv)
             pkey_op = EVP_PKEY_OP_VERIFY;
         else if (!strcmp(*argv, "-verifyrecover"))
             pkey_op = EVP_PKEY_OP_VERIFYRECOVER;
-        else if (!strcmp(*argv, "-rev"))
-            rev = 1;
         else if (!strcmp(*argv, "-encrypt"))
             pkey_op = EVP_PKEY_OP_ENCRYPT;
         else if (!strcmp(*argv, "-decrypt"))
             pkey_op = EVP_PKEY_OP_DECRYPT;
         else if (!strcmp(*argv, "-derive"))
             pkey_op = EVP_PKEY_OP_DERIVE;
+        else if (!strcmp(*argv, "-rev"))
+            rev = 1;
         else if (strcmp(*argv, "-pkeyopt") == 0) {
             if (--argc < 1)
                 badarg = 1;
-            else if (!ctx) {
-                BIO_puts(bio_err, "-pkeyopt command before -inkey\n");
-                badarg = 1;
-            } else if (pkey_ctrl_string(ctx, *(++argv)) <= 0) {
-                BIO_puts(bio_err, "parameter setting error\n");
-                ERR_print_errors(bio_err);
+            else if ((pkeyopts == NULL &&
+                     (pkeyopts = sk_OPENSSL_STRING_new_null()) == NULL) ||
+                    sk_OPENSSL_STRING_push(pkeyopts, *++argv) == 0) {
+                BIO_puts(bio_err, "out of memory\n");
                 goto end;
             }
         } else
@@ -220,10 +212,37 @@ int MAIN(int argc, char **argv)
         argv++;
     }
 
-    if (!ctx) {
+    if (inkey == NULL ||
+        (peerkey != NULL && pkey_op != EVP_PKEY_OP_DERIVE)) {
         usage();
         goto end;
     }
+    ctx = init_ctx(&keysize, inkey, keyform, key_type,
+                   passargin, pkey_op, e, engine_impl);
+    if (!ctx) {
+        BIO_puts(bio_err, "Error initializing context\n");
+        ERR_print_errors(bio_err);
+        goto end;
+    }
+    if (peerkey != NULL && !setup_peer(bio_err, ctx, peerform, peerkey, e)) {
+        BIO_puts(bio_err, "Error setting up peer key\n");
+        ERR_print_errors(bio_err);
+        goto end;
+    }
+    if (pkeyopts != NULL) {
+        int num = sk_OPENSSL_STRING_num(pkeyopts);
+        int i;
+
+        for (i = 0; i < num; ++i) {
+            const char *opt = sk_OPENSSL_STRING_value(pkeyopts, i);
+
+            if (pkey_ctrl_string(ctx, opt) <= 0) {
+                BIO_puts(bio_err, "parameter setting error\n");
+                ERR_print_errors(bio_err);
+                goto end;
+            }
+        }
+    }
 
     if (sigfile && (pkey_op != EVP_PKEY_OP_VERIFY)) {
         BIO_puts(bio_err, "Signature file specified for non verify\n");
@@ -273,7 +292,7 @@ int MAIN(int argc, char **argv)
         }
         siglen = bio_to_mem(&sig, keysize * 10, sigbio);
         BIO_free(sigbio);
-        if (siglen <= 0) {
+        if (siglen < 0) {
             BIO_printf(bio_err, "Error reading signature data\n");
             goto end;
         }
@@ -282,7 +301,7 @@ int MAIN(int argc, char **argv)
     if (in) {
         /* Read the input data */
         buf_inlen = bio_to_mem(&buf_in, keysize * 10, in);
-        if (buf_inlen <= 0) {
+        if (buf_inlen < 0) {
             BIO_printf(bio_err, "Error reading input Data\n");
             exit(1);
         }
@@ -310,7 +329,7 @@ int MAIN(int argc, char **argv)
     } else {
         rv = do_keyop(ctx, pkey_op, NULL, (size_t *)&buf_outlen,
                       buf_in, (size_t)buf_inlen);
-        if (rv > 0) {
+        if (rv > 0 && buf_outlen != 0) {
             buf_out = OPENSSL_malloc(buf_outlen);
             if (!buf_out)
                 rv = -1;
@@ -340,12 +359,14 @@ int MAIN(int argc, char **argv)
         EVP_PKEY_CTX_free(ctx);
     BIO_free(in);
     BIO_free_all(out);
-    if (buf_in)
+    if (buf_in != NULL)
         OPENSSL_free(buf_in);
-    if (buf_out)
+    if (buf_out != NULL)
         OPENSSL_free(buf_out);
-    if (sig)
+    if (sig != NULL)
         OPENSSL_free(sig);
+    if (pkeyopts != NULL)
+        sk_OPENSSL_STRING_free(pkeyopts);
     return ret;
 }
 
@@ -380,7 +401,7 @@ static void usage()
 }
 
 static EVP_PKEY_CTX *init_ctx(int *pkeysize,
-                              char *keyfile, int keyform, int key_type,
+                              const char *keyfile, int keyform, int key_type,
                               char *passargin, int pkey_op, ENGINE *e,
                               int   engine_impl)
 {
@@ -484,14 +505,9 @@ static int setup_peer(BIO *err, EVP_PKEY_CTX *ctx, int peerform,
     EVP_PKEY *peer = NULL;
     ENGINE* engine = NULL;
     int ret;
-    if (!ctx) {
-        BIO_puts(err, "-peerkey command before -inkey\n");
-        return 0;
-    }
 
     if (peerform == FORMAT_ENGINE)
-      engine = e;
-
+        engine = e;
     peer = load_pubkey(bio_err, file, peerform, 0, NULL, engine, "Peer Key");
 
     if (!peer) {
diff --git a/apps/req.c b/apps/req.c
index 57781c93c4ca..e818bd2976d6 100644
--- a/apps/req.c
+++ b/apps/req.c
@@ -101,8 +101,8 @@
 #define STRING_MASK     "string_mask"
 #define UTF8_IN         "utf8"
 
-#define DEFAULT_KEY_LENGTH      512
-#define MIN_KEY_LENGTH          384
+#define DEFAULT_KEY_LENGTH      2048
+#define MIN_KEY_LENGTH          512
 
 #undef PROG
 #define PROG    req_main
diff --git a/apps/rsautl.c b/apps/rsautl.c
index d642f9ad97f3..5b6f849ea74d 100644
--- a/apps/rsautl.c
+++ b/apps/rsautl.c
@@ -250,7 +250,7 @@ int MAIN(int argc, char **argv)
 
     if (outfile) {
         if (!(out = BIO_new_file(outfile, "wb"))) {
-            BIO_printf(bio_err, "Error Reading Output File\n");
+            BIO_printf(bio_err, "Error Writing Output File\n");
             ERR_print_errors(bio_err);
             goto end;
         }
@@ -276,7 +276,7 @@ int MAIN(int argc, char **argv)
 
     /* Read the input data */
     rsa_inlen = BIO_read(in, rsa_in, keysize * 2);
-    if (rsa_inlen <= 0) {
+    if (rsa_inlen < 0) {
         BIO_printf(bio_err, "Error reading input Data\n");
         exit(1);
     }
@@ -311,7 +311,7 @@ int MAIN(int argc, char **argv)
 
     }
 
-    if (rsa_outlen <= 0) {
+    if (rsa_outlen < 0) {
         BIO_printf(bio_err, "RSA operation error\n");
         ERR_print_errors(bio_err);
         goto end;
diff --git a/apps/s_client.c b/apps/s_client.c
index caf76d35dc5a..0c1102b9c36a 100644
--- a/apps/s_client.c
+++ b/apps/s_client.c
@@ -390,8 +390,6 @@ static void sc_usage(void)
                " -no_tls1_2/-no_tls1_1/-no_tls1/-no_ssl3/-no_ssl2 - turn off that protocol\n");
     BIO_printf(bio_err,
                " -bugs         - Switch on all SSL implementation bug workarounds\n");
-    BIO_printf(bio_err,
-               " -serverpref   - Use server's cipher preferences (only SSLv2)\n");
     BIO_printf(bio_err,
                " -cipher       - preferred cipher to use, use the 'openssl ciphers'\n");
     BIO_printf(bio_err,
diff --git a/apps/s_server.c b/apps/s_server.c
index 65cbaaf6eb9b..09c755b55cfe 100644
--- a/apps/s_server.c
+++ b/apps/s_server.c
@@ -429,6 +429,8 @@ typedef struct srpsrvparm_st {
 static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg)
 {
     srpsrvparm *p = (srpsrvparm *) arg;
+    int ret = SSL3_AL_FATAL;
+
     if (p->login == NULL && p->user == NULL) {
         p->login = SSL_get_srp_username(s);
         BIO_printf(bio_err, "SRP username = \"%s\"\n", p->login);
@@ -437,21 +439,25 @@ static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg)
 
     if (p->user == NULL) {
         BIO_printf(bio_err, "User %s doesn't exist\n", p->login);
-        return SSL3_AL_FATAL;
+        goto err;
     }
+
     if (SSL_set_srp_server_param
         (s, p->user->N, p->user->g, p->user->s, p->user->v,
          p->user->info) < 0) {
         *ad = SSL_AD_INTERNAL_ERROR;
-        return SSL3_AL_FATAL;
+        goto err;
     }
     BIO_printf(bio_err,
                "SRP parameters set: username = \"%s\" info=\"%s\" \n",
                p->login, p->user->info);
-    /* need to check whether there are memory leaks */
+    ret = SSL_ERROR_NONE;
+
+err:
+    SRP_user_pwd_free(p->user);
     p->user = NULL;
     p->login = NULL;
-    return SSL_ERROR_NONE;
+    return ret;
 }
 
 #endif
@@ -2452,9 +2458,10 @@ static int sv_body(char *hostname, int s, int stype, unsigned char *context)
 #ifndef OPENSSL_NO_SRP
                 while (SSL_get_error(con, k) == SSL_ERROR_WANT_X509_LOOKUP) {
                     BIO_printf(bio_s_out, "LOOKUP renego during write\n");
+                    SRP_user_pwd_free(srp_callback_parm.user);
                     srp_callback_parm.user =
-                        SRP_VBASE_get_by_user(srp_callback_parm.vb,
-                                              srp_callback_parm.login);
+                        SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+                                               srp_callback_parm.login);
                     if (srp_callback_parm.user)
                         BIO_printf(bio_s_out, "LOOKUP done %s\n",
                                    srp_callback_parm.user->info);
@@ -2508,9 +2515,10 @@ static int sv_body(char *hostname, int s, int stype, unsigned char *context)
 #ifndef OPENSSL_NO_SRP
                 while (SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) {
                     BIO_printf(bio_s_out, "LOOKUP renego during read\n");
+                    SRP_user_pwd_free(srp_callback_parm.user);
                     srp_callback_parm.user =
-                        SRP_VBASE_get_by_user(srp_callback_parm.vb,
-                                              srp_callback_parm.login);
+                        SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+                                               srp_callback_parm.login);
                     if (srp_callback_parm.user)
                         BIO_printf(bio_s_out, "LOOKUP done %s\n",
                                    srp_callback_parm.user->info);
@@ -2605,9 +2613,10 @@ static int init_ssl_connection(SSL *con)
     while (i <= 0 && SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) {
         BIO_printf(bio_s_out, "LOOKUP during accept %s\n",
                    srp_callback_parm.login);
+        SRP_user_pwd_free(srp_callback_parm.user);
         srp_callback_parm.user =
-            SRP_VBASE_get_by_user(srp_callback_parm.vb,
-                                  srp_callback_parm.login);
+            SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+                                   srp_callback_parm.login);
         if (srp_callback_parm.user)
             BIO_printf(bio_s_out, "LOOKUP done %s\n",
                        srp_callback_parm.user->info);
@@ -2849,9 +2858,10 @@ static int www_body(char *hostname, int s, int stype, unsigned char *context)
                    && SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) {
                 BIO_printf(bio_s_out, "LOOKUP during accept %s\n",
                            srp_callback_parm.login);
+                SRP_user_pwd_free(srp_callback_parm.user);
                 srp_callback_parm.user =
-                    SRP_VBASE_get_by_user(srp_callback_parm.vb,
-                                          srp_callback_parm.login);
+                    SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+                                           srp_callback_parm.login);
                 if (srp_callback_parm.user)
                     BIO_printf(bio_s_out, "LOOKUP done %s\n",
                                srp_callback_parm.user->info);
@@ -2891,9 +2901,10 @@ static int www_body(char *hostname, int s, int stype, unsigned char *context)
                 if (BIO_should_io_special(io)
                     && BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) {
                     BIO_printf(bio_s_out, "LOOKUP renego during read\n");
+                    SRP_user_pwd_free(srp_callback_parm.user);
                     srp_callback_parm.user =
-                        SRP_VBASE_get_by_user(srp_callback_parm.vb,
-                                              srp_callback_parm.login);
+                        SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+                                               srp_callback_parm.login);
                     if (srp_callback_parm.user)
                         BIO_printf(bio_s_out, "LOOKUP done %s\n",
                                    srp_callback_parm.user->info);
@@ -3236,9 +3247,10 @@ static int rev_body(char *hostname, int s, int stype, unsigned char *context)
         if (BIO_should_io_special(io)
             && BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) {
             BIO_printf(bio_s_out, "LOOKUP renego during accept\n");
+            SRP_user_pwd_free(srp_callback_parm.user);
             srp_callback_parm.user =
-                SRP_VBASE_get_by_user(srp_callback_parm.vb,
-                                      srp_callback_parm.login);
+                SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+                                       srp_callback_parm.login);
             if (srp_callback_parm.user)
                 BIO_printf(bio_s_out, "LOOKUP done %s\n",
                            srp_callback_parm.user->info);
@@ -3264,9 +3276,10 @@ static int rev_body(char *hostname, int s, int stype, unsigned char *context)
                 if (BIO_should_io_special(io)
                     && BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) {
                     BIO_printf(bio_s_out, "LOOKUP renego during read\n");
+                    SRP_user_pwd_free(srp_callback_parm.user);
                     srp_callback_parm.user =
-                        SRP_VBASE_get_by_user(srp_callback_parm.vb,
-                                              srp_callback_parm.login);
+                        SRP_VBASE_get1_by_user(srp_callback_parm.vb,
+                                               srp_callback_parm.login);
                     if (srp_callback_parm.user)
                         BIO_printf(bio_s_out, "LOOKUP done %s\n",
                                    srp_callback_parm.user->info);
diff --git a/config b/config
index 77f730f093e6..bba370c4f3f1 100755
--- a/config
+++ b/config
@@ -852,7 +852,8 @@ case "$GUESSOS" in
   # *-dgux) OUT="dgux" ;;
   mips-sony-newsos4) OUT="newsos4-gcc" ;;
   *-*-cygwin_pre1.3) OUT="Cygwin-pre1.3" ;;
-  *-*-cygwin) OUT="Cygwin" ;;
+  i[3456]86-*-cygwin) OUT="Cygwin" ;;
+  *-*-cygwin) OUT="Cygwin-${MACHINE}" ;;
   t3e-cray-unicosmk) OUT="cray-t3e" ;;
   j90-cray-unicos) OUT="cray-j90" ;;
   nsr-tandem-nsk) OUT="tandem-c89" ;;
diff --git a/crypto/asn1/tasn_dec.c b/crypto/asn1/tasn_dec.c
index 9256049d1588..5a507967c894 100644
--- a/crypto/asn1/tasn_dec.c
+++ b/crypto/asn1/tasn_dec.c
@@ -717,7 +717,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
     long plen;
     char cst, inf, free_cont = 0;
     const unsigned char *p;
-    BUF_MEM buf;
+    BUF_MEM buf = { 0, NULL, 0 };
     const unsigned char *cont = NULL;
     long len;
     if (!pval) {
@@ -793,7 +793,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
         } else {
             len = p - cont + plen;
             p += plen;
-            buf.data = NULL;
         }
     } else if (cst) {
         if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN
@@ -802,9 +801,9 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
             ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE);
             return 0;
         }
-        buf.length = 0;
-        buf.max = 0;
-        buf.data = NULL;
+
+        /* Free any returned 'buf' content */
+        free_cont = 1;
         /*
          * Should really check the internal tags are correct but some things
          * may get this wrong. The relevant specs say that constructed string
@@ -812,18 +811,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
          * So instead just check for UNIVERSAL class and ignore the tag.
          */
         if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) {
-            free_cont = 1;
             goto err;
         }
         len = buf.length;
         /* Append a final null to string */
         if (!BUF_MEM_grow_clean(&buf, len + 1)) {
             ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE);
-            return 0;
+            goto err;
         }
         buf.data[len] = 0;
         cont = (const unsigned char *)buf.data;
-        free_cont = 1;
     } else {
         cont = p;
         len = plen;
@@ -831,6 +828,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval,
     }
 
     /* We now have content length and type: translate into a structure */
+    /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */
     if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it))
         goto err;
 
diff --git a/crypto/bio/b_print.c b/crypto/bio/b_print.c
index 7c81e25d482c..90248fa2aaba 100644
--- a/crypto/bio/b_print.c
+++ b/crypto/bio/b_print.c
@@ -125,16 +125,16 @@
 # define LLONG long
 #endif
 
-static void fmtstr(char **, char **, size_t *, size_t *,
-                   const char *, int, int, int);
-static void fmtint(char **, char **, size_t *, size_t *,
-                   LLONG, int, int, int, int);
-static void fmtfp(char **, char **, size_t *, size_t *,
-                  LDOUBLE, int, int, int);
-static void doapr_outch(char **, char **, size_t *, size_t *, int);
-static void _dopr(char **sbuffer, char **buffer,
-                  size_t *maxlen, size_t *retlen, int *truncated,
-                  const char *format, va_list args);
+static int fmtstr(char **, char **, size_t *, size_t *,
+                  const char *, int, int, int);
+static int fmtint(char **, char **, size_t *, size_t *,
+                  LLONG, int, int, int, int);
+static int fmtfp(char **, char **, size_t *, size_t *,
+                 LDOUBLE, int, int, int);
+static int doapr_outch(char **, char **, size_t *, size_t *, int);
+static int _dopr(char **sbuffer, char **buffer,
+                 size_t *maxlen, size_t *retlen, int *truncated,
+                 const char *format, va_list args);
 
 /* format read states */
 #define DP_S_DEFAULT    0
@@ -165,7 +165,7 @@ static void _dopr(char **sbuffer, char **buffer,
 #define char_to_int(p) (p - '0')
 #define OSSL_MAX(p,q) ((p >= q) ? p : q)
 
-static void
+static int
 _dopr(char **sbuffer,
       char **buffer,
       size_t *maxlen,
@@ -196,7 +196,8 @@ _dopr(char **sbuffer,
             if (ch == '%')
                 state = DP_S_FLAGS;
             else
-                doapr_outch(sbuffer, buffer, &currlen, maxlen, ch);
+                if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+                    return 0;
             ch = *format++;
             break;
         case DP_S_FLAGS:
@@ -302,8 +303,9 @@ _dopr(char **sbuffer,
                     value = va_arg(args, int);
                     break;
                 }
-                fmtint(sbuffer, buffer, &currlen, maxlen,
-                       value, 10, min, max, flags);
+                if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, 10, min,
+                            max, flags))
+                    return 0;
                 break;
             case 'X':
                 flags |= DP_F_UP;
@@ -326,17 +328,19 @@ _dopr(char **sbuffer,
                     value = (LLONG) va_arg(args, unsigned int);
                     break;
                 }
-                fmtint(sbuffer, buffer, &currlen, maxlen, value,
-                       ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
-                       min, max, flags);
+                if (!fmtint(sbuffer, buffer, &currlen, maxlen, value,
+                            ch == 'o' ? 8 : (ch == 'u' ? 10 : 16),
+                            min, max, flags))
+                    return 0;
                 break;
             case 'f':
                 if (cflags == DP_C_LDOUBLE)
                     fvalue = va_arg(args, LDOUBLE);
                 else
                     fvalue = va_arg(args, double);
-                fmtfp(sbuffer, buffer, &currlen, maxlen,
-                      fvalue, min, max, flags);
+                if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max,
+                           flags))
+                    return 0;
                 break;
             case 'E':
                 flags |= DP_F_UP;
@@ -355,8 +359,9 @@ _dopr(char **sbuffer,
                     fvalue = va_arg(args, double);
                 break;
             case 'c':
-                doapr_outch(sbuffer, buffer, &currlen, maxlen,
-                            va_arg(args, int));
+                if(!doapr_outch(sbuffer, buffer, &currlen, maxlen,
+                            va_arg(args, int)))
+                    return 0;
                 break;
             case 's':
                 strvalue = va_arg(args, char *);
@@ -366,13 +371,15 @@ _dopr(char **sbuffer,
                     else
                         max = *maxlen;
                 }
-                fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
-                       flags, min, max);
+                if (!fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue,
+                            flags, min, max))
+                    return 0;
                 break;
             case 'p':
                 value = (long)va_arg(args, void *);
-                fmtint(sbuffer, buffer, &currlen, maxlen,
-                       value, 16, min, max, flags | DP_F_NUM);
+                if (!fmtint(sbuffer, buffer, &currlen, maxlen,
+                            value, 16, min, max, flags | DP_F_NUM))
+                    return 0;
                 break;
             case 'n':          /* XXX */
                 if (cflags == DP_C_SHORT) {
@@ -394,7 +401,8 @@ _dopr(char **sbuffer,
                 }
                 break;
             case '%':
-                doapr_outch(sbuffer, buffer, &currlen, maxlen, ch);
+                if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch))
+                    return 0;
                 break;
             case 'w':
                 /* not supported yet, treat as next char */
@@ -418,46 +426,56 @@ _dopr(char **sbuffer,
     *truncated = (currlen > *maxlen - 1);
     if (*truncated)
         currlen = *maxlen - 1;
-    doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0');
+    if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0'))
+        return 0;
     *retlen = currlen - 1;
-    return;
+    return 1;
 }
 
-static void
+static int
 fmtstr(char **sbuffer,
        char **buffer,
        size_t *currlen,
        size_t *maxlen, const char *value, int flags, int min, int max)
 {
-    int padlen, strln;
+    int padlen;
+    size_t strln;
     int cnt = 0;
 
     if (value == 0)
         value = "<NULL>";
-    for (strln = 0; value[strln]; ++strln) ;
+
+    strln = strlen(value);
+    if (strln > INT_MAX)
+        strln = INT_MAX;
+
     padlen = min - strln;
-    if (padlen < 0)
+    if (min < 0 || padlen < 0)
         padlen = 0;
     if (flags & DP_F_MINUS)
         padlen = -padlen;
 
     while ((padlen > 0) && (cnt < max)) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+        if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+            return 0;
         --padlen;
         ++cnt;
     }
     while (*value && (cnt < max)) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, *value++);
+        if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *value++))
+            return 0;
         ++cnt;
     }
     while ((padlen < 0) && (cnt < max)) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+        if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+            return 0;
         ++padlen;
         ++cnt;
     }
+    return 1;
 }
 
-static void
+static int
 fmtint(char **sbuffer,
        char **buffer,
        size_t *currlen,
@@ -517,37 +535,44 @@ fmtint(char **sbuffer,
 
     /* spaces */
     while (spadlen > 0) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+        if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+            return 0;
         --spadlen;
     }
 
     /* sign */
     if (signvalue)
-        doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+        if(!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+            return 0;
 
     /* prefix */
     while (*prefix) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix);
+        if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix))
+            return 0;
         prefix++;
     }
 
     /* zeros */
     if (zpadlen > 0) {
         while (zpadlen > 0) {
-            doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+            if(!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+                return 0;
             --zpadlen;
         }
     }
     /* digits */
-    while (place > 0)
-        doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]);
+    while (place > 0) {
+        if (!doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]))
+            return 0;
+    }
 
     /* left justified spaces */
     while (spadlen < 0) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+        if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+            return 0;
         ++spadlen;
     }
-    return;
+    return 1;
 }
 
 static LDOUBLE abs_val(LDOUBLE value)
@@ -578,7 +603,7 @@ static long roundv(LDOUBLE value)
     return intpart;
 }
 
-static void
+static int
 fmtfp(char **sbuffer,
       char **buffer,
       size_t *currlen,
@@ -657,47 +682,61 @@ fmtfp(char **sbuffer,
 
     if ((flags & DP_F_ZERO) && (padlen > 0)) {
         if (signvalue) {
-            doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+            if (!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+                return 0;
             --padlen;
             signvalue = 0;
         }
         while (padlen > 0) {
-            doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+            if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+                return 0;
             --padlen;
         }
     }
     while (padlen > 0) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+        if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+            return 0;
         --padlen;
     }
-    if (signvalue)
-        doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue);
+    if (signvalue && !doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue))
+        return 0;
 
-    while (iplace > 0)
-        doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]);
+    while (iplace > 0) {
+        if (!doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]))
+            return 0;
+    }
 
     /*
      * Decimal point. This should probably use locale to find the correct
      * char to print out.
      */
     if (max > 0 || (flags & DP_F_NUM)) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, '.');
+        if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '.'))
+            return 0;
 
-        while (fplace > 0)
-            doapr_outch(sbuffer, buffer, currlen, maxlen, fconvert[--fplace]);
+        while (fplace > 0) {
+            if(!doapr_outch(sbuffer, buffer, currlen, maxlen,
+                            fconvert[--fplace]))
+                return 0;
+        }
     }
     while (zpadlen > 0) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, '0');
+        if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0'))
+            return 0;
         --zpadlen;
     }
 
     while (padlen < 0) {
-        doapr_outch(sbuffer, buffer, currlen, maxlen, ' ');
+        if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' '))
+            return 0;
         ++padlen;
     }
+    return 1;
 }
 
-static void
+#define BUFFER_INC  1024
+
+static int
 doapr_outch(char **sbuffer,
             char **buffer, size_t *currlen, size_t *maxlen, int c)
 {
@@ -708,24 +747,25 @@ doapr_outch(char **sbuffer,
     assert(*currlen <= *maxlen);
 
     if (buffer && *currlen == *maxlen) {
-        *maxlen += 1024;
+        if (*maxlen > INT_MAX - BUFFER_INC)
+            return 0;
+
+        *maxlen += BUFFER_INC;
         if (*buffer == NULL) {
             *buffer = OPENSSL_malloc(*maxlen);
-            if (!*buffer) {
-                /* Panic! Can't really do anything sensible. Just return */
-                return;
-            }
+            if (*buffer == NULL)
+                return 0;
             if (*currlen > 0) {
                 assert(*sbuffer != NULL);
                 memcpy(*buffer, *sbuffer, *currlen);
             }
             *sbuffer = NULL;
         } else {
-            *buffer = OPENSSL_realloc(*buffer, *maxlen);
-            if (!*buffer) {
-                /* Panic! Can't really do anything sensible. Just return */
-                return;
-            }
+            char *tmpbuf;
+            tmpbuf = OPENSSL_realloc(*buffer, *maxlen);
+            if (tmpbuf == NULL)
+                return 0;
+            *buffer = tmpbuf;
         }
     }
 
@@ -736,7 +776,7 @@ doapr_outch(char **sbuffer,
             (*buffer)[(*currlen)++] = (char)c;
     }
 
-    return;
+    return 1;
 }
 
 /***************************************************************************/
@@ -768,7 +808,11 @@ int BIO_vprintf(BIO *bio, const char *format, va_list args)
 
     dynbuf = NULL;
     CRYPTO_push_info("doapr()");
-    _dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format, args);
+    if (!_dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format,
+                args)) {
+        OPENSSL_free(dynbuf);
+        return -1;
+    }
     if (dynbuf) {
         ret = BIO_write(bio, dynbuf, (int)retlen);
         OPENSSL_free(dynbuf);
@@ -803,7 +847,8 @@ int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args)
     size_t retlen;
     int truncated;
 
-    _dopr(&buf, NULL, &n, &retlen, &truncated, format, args);
+    if(!_dopr(&buf, NULL, &n, &retlen, &truncated, format, args))
+        return -1;
 
     if (truncated)
         /*
diff --git a/crypto/bio/bio.h b/crypto/bio/bio.h
index 6e2293bc66da..6790aed28e0b 100644
--- a/crypto/bio/bio.h
+++ b/crypto/bio/bio.h
@@ -479,7 +479,7 @@ struct bio_dgram_sctp_prinfo {
 # define BIO_get_conn_hostname(b)  BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,0)
 # define BIO_get_conn_port(b)      BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1)
 # define BIO_get_conn_ip(b)               BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2)
-# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,0,NULL)
+# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL)
 
 # define BIO_set_nbio(b,n)       BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL)
 
@@ -689,7 +689,7 @@ long BIO_debug_callback(BIO *bio, int cmd, const char *argp, int argi,
                         long argl, long ret);
 
 BIO_METHOD *BIO_s_mem(void);
-BIO *BIO_new_mem_buf(void *buf, int len);
+BIO *BIO_new_mem_buf(const void *buf, int len);
 BIO_METHOD *BIO_s_socket(void);
 BIO_METHOD *BIO_s_connect(void);
 BIO_METHOD *BIO_s_accept(void);
diff --git a/crypto/bio/bss_mem.c b/crypto/bio/bss_mem.c
index d190765dc201..b0394a960da1 100644
--- a/crypto/bio/bss_mem.c
+++ b/crypto/bio/bss_mem.c
@@ -91,7 +91,8 @@ BIO_METHOD *BIO_s_mem(void)
     return (&mem_method);
 }
 
-BIO *BIO_new_mem_buf(void *buf, int len)
+
+BIO *BIO_new_mem_buf(const void *buf, int len)
 {
     BIO *ret;
     BUF_MEM *b;
@@ -105,7 +106,8 @@ BIO *BIO_new_mem_buf(void *buf, int len)
     if (!(ret = BIO_new(BIO_s_mem())))
         return NULL;
     b = (BUF_MEM *)ret->ptr;
-    b->data = buf;
+    /* Cast away const and trust in the MEM_RDONLY flag. */
+    b->data = (void *)buf;
     b->length = sz;
     b->max = sz;
     ret->flags |= BIO_FLAGS_MEM_RDONLY;
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
index 215855ecae91..c4c640951759 100644
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile
@@ -252,8 +252,8 @@ bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
 bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
 bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
 bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
-bn_exp.o: rsaz_exp.h
+bn_exp.o: ../../include/openssl/symhacks.h ../constant_time_locl.h
+bn_exp.o: ../cryptlib.h bn_exp.c bn_lcl.h rsaz_exp.h
 bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
 bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
 bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
index 3b6ccf83d13e..712a77fe8ca3 100755
--- a/crypto/bn/asm/rsaz-avx2.pl
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -443,7 +443,7 @@ $TEMP2 = $B2;
 $TEMP3 = $Y1;
 $TEMP4 = $Y2;
 $code.=<<___;
-	#we need to fix indexes 32-39 to avoid overflow
+	# we need to fix indices 32-39 to avoid overflow
 	vmovdqu		32*8(%rsp), $ACC8		# 32*8-192($tp0),
 	vmovdqu		32*9(%rsp), $ACC1		# 32*9-192($tp0)
 	vmovdqu		32*10(%rsp), $ACC2		# 32*10-192($tp0)
@@ -1592,68 +1592,128 @@ rsaz_1024_scatter5_avx2:
 .type	rsaz_1024_gather5_avx2,\@abi-omnipotent
 .align	32
 rsaz_1024_gather5_avx2:
+	vzeroupper
+	mov	%rsp,%r11
 ___
 $code.=<<___ if ($win64);
 	lea	-0x88(%rsp),%rax
-	vzeroupper
 .LSEH_begin_rsaz_1024_gather5:
 	# I can't trust assembler to use specific encoding:-(
-	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
-	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6,-0x20(%rax)
-	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7,-0x10(%rax)
-	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8,0(%rax)
-	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9,0x10(%rax)
-	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10,0x20(%rax)
-	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11,0x30(%rax)
-	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12,0x40(%rax)
-	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13,0x50(%rax)
-	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14,0x60(%rax)
-	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15,0x70(%rax)
+	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax),%rsp
+	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6,-0x20(%rax)
+	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7,-0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8,0(%rax)
+	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9,0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,0x20(%rax)
+	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11,0x30(%rax)
+	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12,0x40(%rax)
+	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13,0x50(%rax)
+	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14,0x60(%rax)
+	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15,0x70(%rax)
 ___
 $code.=<<___;
-	lea	.Lgather_table(%rip),%r11
-	mov	$power,%eax
-	and	\$3,$power
-	shr	\$2,%eax			# cache line number
-	shl	\$4,$power			# offset within cache line
+	lea	-0x100(%rsp),%rsp
+	and	\$-32, %rsp
+	lea	.Linc(%rip), %r10
+	lea	-128(%rsp),%rax			# control u-op density
 
-	vmovdqu		-32(%r11),%ymm7		# .Lgather_permd
-	vpbroadcastb	8(%r11,%rax), %xmm8
-	vpbroadcastb	7(%r11,%rax), %xmm9
-	vpbroadcastb	6(%r11,%rax), %xmm10
-	vpbroadcastb	5(%r11,%rax), %xmm11
-	vpbroadcastb	4(%r11,%rax), %xmm12
-	vpbroadcastb	3(%r11,%rax), %xmm13
-	vpbroadcastb	2(%r11,%rax), %xmm14
-	vpbroadcastb	1(%r11,%rax), %xmm15
+	vmovd		$power, %xmm4
+	vmovdqa		(%r10),%ymm0
+	vmovdqa		32(%r10),%ymm1
+	vmovdqa		64(%r10),%ymm5
+	vpbroadcastd	%xmm4,%ymm4
 
-	lea	64($inp,$power),$inp
-	mov	\$64,%r11			# size optimization
-	mov	\$9,%eax
-	jmp	.Loop_gather_1024
+	vpaddd		%ymm5, %ymm0, %ymm2
+	vpcmpeqd	%ymm4, %ymm0, %ymm0
+	vpaddd		%ymm5, %ymm1, %ymm3
+	vpcmpeqd	%ymm4, %ymm1, %ymm1
+	vmovdqa		%ymm0, 32*0+128(%rax)
+	vpaddd		%ymm5, %ymm2, %ymm0
+	vpcmpeqd	%ymm4, %ymm2, %ymm2
+	vmovdqa		%ymm1, 32*1+128(%rax)
+	vpaddd		%ymm5, %ymm3, %ymm1
+	vpcmpeqd	%ymm4, %ymm3, %ymm3
+	vmovdqa		%ymm2, 32*2+128(%rax)
+	vpaddd		%ymm5, %ymm0, %ymm2
+	vpcmpeqd	%ymm4, %ymm0, %ymm0
+	vmovdqa		%ymm3, 32*3+128(%rax)
+	vpaddd		%ymm5, %ymm1, %ymm3
+	vpcmpeqd	%ymm4, %ymm1, %ymm1
+	vmovdqa		%ymm0, 32*4+128(%rax)
+	vpaddd		%ymm5, %ymm2, %ymm8
+	vpcmpeqd	%ymm4, %ymm2, %ymm2
+	vmovdqa		%ymm1, 32*5+128(%rax)
+	vpaddd		%ymm5, %ymm3, %ymm9
+	vpcmpeqd	%ymm4, %ymm3, %ymm3
+	vmovdqa		%ymm2, 32*6+128(%rax)
+	vpaddd		%ymm5, %ymm8, %ymm10
+	vpcmpeqd	%ymm4, %ymm8, %ymm8
+	vmovdqa		%ymm3, 32*7+128(%rax)
+	vpaddd		%ymm5, %ymm9, %ymm11
+	vpcmpeqd	%ymm4, %ymm9, %ymm9
+	vpaddd		%ymm5, %ymm10, %ymm12
+	vpcmpeqd	%ymm4, %ymm10, %ymm10
+	vpaddd		%ymm5, %ymm11, %ymm13
+	vpcmpeqd	%ymm4, %ymm11, %ymm11
+	vpaddd		%ymm5, %ymm12, %ymm14
+	vpcmpeqd	%ymm4, %ymm12, %ymm12
+	vpaddd		%ymm5, %ymm13, %ymm15
+	vpcmpeqd	%ymm4, %ymm13, %ymm13
+	vpcmpeqd	%ymm4, %ymm14, %ymm14
+	vpcmpeqd	%ymm4, %ymm15, %ymm15
+
+	vmovdqa	-32(%r10),%ymm7			# .Lgather_permd
+	lea	128($inp), $inp
+	mov	\$9,$power
 
-.align	32
 .Loop_gather_1024:
-	vpand		-64($inp),		%xmm8,%xmm0
-	vpand		($inp),			%xmm9,%xmm1
-	vpand		64($inp),		%xmm10,%xmm2
-	vpand		($inp,%r11,2),		%xmm11,%xmm3
-	 vpor					%xmm0,%xmm1,%xmm1
-	vpand		64($inp,%r11,2),	%xmm12,%xmm4
-	 vpor					%xmm2,%xmm3,%xmm3
-	vpand		($inp,%r11,4),		%xmm13,%xmm5
-	 vpor					%xmm1,%xmm3,%xmm3
-	vpand		64($inp,%r11,4),	%xmm14,%xmm6
-	 vpor					%xmm4,%xmm5,%xmm5
-	vpand		-128($inp,%r11,8),	%xmm15,%xmm2
-	lea		($inp,%r11,8),$inp
-	 vpor					%xmm3,%xmm5,%xmm5
-	 vpor					%xmm2,%xmm6,%xmm6
-	 vpor					%xmm5,%xmm6,%xmm6
-	vpermd		%ymm6,%ymm7,%ymm6
-	vmovdqu		%ymm6,($out)
+	vmovdqa		32*0-128($inp),	%ymm0
+	vmovdqa		32*1-128($inp),	%ymm1
+	vmovdqa		32*2-128($inp),	%ymm2
+	vmovdqa		32*3-128($inp),	%ymm3
+	vpand		32*0+128(%rax),	%ymm0,	%ymm0
+	vpand		32*1+128(%rax),	%ymm1,	%ymm1
+	vpand		32*2+128(%rax),	%ymm2,	%ymm2
+	vpor		%ymm0, %ymm1, %ymm4
+	vpand		32*3+128(%rax),	%ymm3,	%ymm3
+	vmovdqa		32*4-128($inp),	%ymm0
+	vmovdqa		32*5-128($inp),	%ymm1
+	vpor		%ymm2, %ymm3, %ymm5
+	vmovdqa		32*6-128($inp),	%ymm2
+	vmovdqa		32*7-128($inp),	%ymm3
+	vpand		32*4+128(%rax),	%ymm0,	%ymm0
+	vpand		32*5+128(%rax),	%ymm1,	%ymm1
+	vpand		32*6+128(%rax),	%ymm2,	%ymm2
+	vpor		%ymm0, %ymm4, %ymm4
+	vpand		32*7+128(%rax),	%ymm3,	%ymm3
+	vpand		32*8-128($inp),	%ymm8,	%ymm0
+	vpor		%ymm1, %ymm5, %ymm5
+	vpand		32*9-128($inp),	%ymm9,	%ymm1
+	vpor		%ymm2, %ymm4, %ymm4
+	vpand		32*10-128($inp),%ymm10,	%ymm2
+	vpor		%ymm3, %ymm5, %ymm5
+	vpand		32*11-128($inp),%ymm11,	%ymm3
+	vpor		%ymm0, %ymm4, %ymm4
+	vpand		32*12-128($inp),%ymm12,	%ymm0
+	vpor		%ymm1, %ymm5, %ymm5
+	vpand		32*13-128($inp),%ymm13,	%ymm1
+	vpor		%ymm2, %ymm4, %ymm4
+	vpand		32*14-128($inp),%ymm14,	%ymm2
+	vpor		%ymm3, %ymm5, %ymm5
+	vpand		32*15-128($inp),%ymm15,	%ymm3
+	lea		32*16($inp), $inp
+	vpor		%ymm0, %ymm4, %ymm4
+	vpor		%ymm1, %ymm5, %ymm5
+	vpor		%ymm2, %ymm4, %ymm4
+	vpor		%ymm3, %ymm5, %ymm5
+
+	vpor		%ymm5, %ymm4, %ymm4
+	vextracti128	\$1, %ymm4, %xmm5	# upper half is cleared
+	vpor		%xmm4, %xmm5, %xmm5
+	vpermd		%ymm5,%ymm7,%ymm5
+	vmovdqu		%ymm5,($out)
 	lea		32($out),$out
-	dec	%eax
+	dec	$power
 	jnz	.Loop_gather_1024
 
 	vpxor	%ymm0,%ymm0,%ymm0
@@ -1661,20 +1721,20 @@ $code.=<<___;
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	movaps	(%rsp),%xmm6
-	movaps	0x10(%rsp),%xmm7
-	movaps	0x20(%rsp),%xmm8
-	movaps	0x30(%rsp),%xmm9
-	movaps	0x40(%rsp),%xmm10
-	movaps	0x50(%rsp),%xmm11
-	movaps	0x60(%rsp),%xmm12
-	movaps	0x70(%rsp),%xmm13
-	movaps	0x80(%rsp),%xmm14
-	movaps	0x90(%rsp),%xmm15
-	lea	0xa8(%rsp),%rsp
+	movaps	-0xa8(%r11),%xmm6
+	movaps	-0x98(%r11),%xmm7
+	movaps	-0x88(%r11),%xmm8
+	movaps	-0x78(%r11),%xmm9
+	movaps	-0x68(%r11),%xmm10
+	movaps	-0x58(%r11),%xmm11
+	movaps	-0x48(%r11),%xmm12
+	movaps	-0x38(%r11),%xmm13
+	movaps	-0x28(%r11),%xmm14
+	movaps	-0x18(%r11),%xmm15
 .LSEH_end_rsaz_1024_gather5:
 ___
 $code.=<<___;
+	lea	(%r11),%rsp
 	ret
 .size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
 ___
@@ -1708,8 +1768,10 @@ $code.=<<___;
 	.long	0,2,4,6,7,7,7,7
 .Lgather_permd:
 	.long	0,7,1,7,2,7,3,7
-.Lgather_table:
-	.byte	0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
+.Linc:
+	.long	0,0,0,0, 1,1,1,1
+	.long	2,2,2,2, 3,3,3,3
+	.long	4,4,4,4, 4,4,4,4
 .align	64
 ___
 
@@ -1837,18 +1899,19 @@ rsaz_se_handler:
 	.rva	rsaz_se_handler
 	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
 .LSEH_info_rsaz_1024_gather5:
-	.byte	0x01,0x33,0x16,0x00
-	.byte	0x36,0xf8,0x09,0x00	#vmovaps 0x90(rsp),xmm15
-	.byte	0x31,0xe8,0x08,0x00	#vmovaps 0x80(rsp),xmm14
-	.byte	0x2c,0xd8,0x07,0x00	#vmovaps 0x70(rsp),xmm13
-	.byte	0x27,0xc8,0x06,0x00	#vmovaps 0x60(rsp),xmm12
-	.byte	0x22,0xb8,0x05,0x00	#vmovaps 0x50(rsp),xmm11
-	.byte	0x1d,0xa8,0x04,0x00	#vmovaps 0x40(rsp),xmm10
-	.byte	0x18,0x98,0x03,0x00	#vmovaps 0x30(rsp),xmm9
-	.byte	0x13,0x88,0x02,0x00	#vmovaps 0x20(rsp),xmm8
-	.byte	0x0e,0x78,0x01,0x00	#vmovaps 0x10(rsp),xmm7
-	.byte	0x09,0x68,0x00,0x00	#vmovaps 0x00(rsp),xmm6
-	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
+	.byte	0x01,0x36,0x17,0x0b
+	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
+	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
+	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
+	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
+	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
+	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
+	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
+	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
+	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
+	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
+	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
 ___
 }
 
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index 091cdc2069da..87ce2c34d90c 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -915,9 +915,76 @@ rsaz_512_mul_gather4:
 	push	%r14
 	push	%r15
 
-	mov	$pwr, $pwr
-	subq	\$128+24, %rsp
+	subq	\$`128+24+($win64?0xb0:0)`, %rsp
+___
+$code.=<<___	if ($win64);
+	movaps	%xmm6,0xa0(%rsp)
+	movaps	%xmm7,0xb0(%rsp)
+	movaps	%xmm8,0xc0(%rsp)
+	movaps	%xmm9,0xd0(%rsp)
+	movaps	%xmm10,0xe0(%rsp)
+	movaps	%xmm11,0xf0(%rsp)
+	movaps	%xmm12,0x100(%rsp)
+	movaps	%xmm13,0x110(%rsp)
+	movaps	%xmm14,0x120(%rsp)
+	movaps	%xmm15,0x130(%rsp)
+___
+$code.=<<___;
 .Lmul_gather4_body:
+	movd	$pwr,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
+	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
+
+	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $pwr
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+	paddd	%xmm`$i`,%xmm`$i+1`
+	pcmpeqd	%xmm8,%xmm`$i`
+	movdqa	%xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+	paddd	%xmm`$i`,%xmm`$i+1`
+	pcmpeqd	%xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+	pcmpeqd	%xmm8,%xmm7
+
+	movdqa	16*0($bp),%xmm8
+	movdqa	16*1($bp),%xmm9
+	movdqa	16*2($bp),%xmm10
+	movdqa	16*3($bp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	16*4($bp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	16*5($bp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	16*6($bp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	16*7($bp),%xmm15
+	leaq	128($bp), %rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	\$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
 ___
 $code.=<<___ if ($addx);
 	movl	\$0x80100,%r11d
@@ -926,45 +993,38 @@ $code.=<<___ if ($addx);
 	je	.Lmulx_gather
 ___
 $code.=<<___;
-	movl	64($bp,$pwr,4), %eax
-	movq	$out, %xmm0		# off-load arguments
-	movl	($bp,$pwr,4), %ebx
-	movq	$mod, %xmm1
-	movq	$n0, 128(%rsp)
+	movq	%xmm8,%rbx
+
+	movq	$n0, 128(%rsp)		# off-load arguments
+	movq	$out, 128+8(%rsp)
+	movq	$mod, 128+16(%rsp)
 
-	shlq	\$32, %rax
-	or	%rax, %rbx
 	movq	($ap), %rax
 	 movq	8($ap), %rcx
-	 leaq	128($bp,$pwr,4), %rbp
 	mulq	%rbx			# 0 iteration
 	movq	%rax, (%rsp)
 	movq	%rcx, %rax
 	movq	%rdx, %r8
 
 	mulq	%rbx
-	 movd	(%rbp), %xmm4
 	addq	%rax, %r8
 	movq	16($ap), %rax
 	movq	%rdx, %r9
 	adcq	\$0, %r9
 
 	mulq	%rbx
-	 movd	64(%rbp), %xmm5
 	addq	%rax, %r9
 	movq	24($ap), %rax
 	movq	%rdx, %r10
 	adcq	\$0, %r10
 
 	mulq	%rbx
-	 pslldq	\$4, %xmm5
 	addq	%rax, %r10
 	movq	32($ap), %rax
 	movq	%rdx, %r11
 	adcq	\$0, %r11
 
 	mulq	%rbx
-	 por	%xmm5, %xmm4
 	addq	%rax, %r11
 	movq	40($ap), %rax
 	movq	%rdx, %r12
@@ -977,14 +1037,12 @@ $code.=<<___;
 	adcq	\$0, %r13
 
 	mulq	%rbx
-	 leaq	128(%rbp), %rbp
 	addq	%rax, %r13
 	movq	56($ap), %rax
 	movq	%rdx, %r14
 	adcq	\$0, %r14
 	
 	mulq	%rbx
-	 movq	%xmm4, %rbx
 	addq	%rax, %r14
 	 movq	($ap), %rax
 	movq	%rdx, %r15
@@ -996,6 +1054,35 @@ $code.=<<___;
 
 .align	32
 .Loop_mul_gather:
+	movdqa	16*0(%rbp),%xmm8
+	movdqa	16*1(%rbp),%xmm9
+	movdqa	16*2(%rbp),%xmm10
+	movdqa	16*3(%rbp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	16*4(%rbp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	16*5(%rbp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	16*6(%rbp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	16*7(%rbp),%xmm15
+	leaq	128(%rbp), %rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	\$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,%rbx
+
 	mulq	%rbx
 	addq	%rax, %r8
 	movq	8($ap), %rax
@@ -1004,7 +1091,6 @@ $code.=<<___;
 	adcq	\$0, %r8
 
 	mulq	%rbx
-	 movd	(%rbp), %xmm4
 	addq	%rax, %r9
 	movq	16($ap), %rax
 	adcq	\$0, %rdx
@@ -1013,7 +1099,6 @@ $code.=<<___;
 	adcq	\$0, %r9
 
 	mulq	%rbx
-	 movd	64(%rbp), %xmm5
 	addq	%rax, %r10
 	movq	24($ap), %rax
 	adcq	\$0, %rdx
@@ -1022,7 +1107,6 @@ $code.=<<___;
 	adcq	\$0, %r10
 
 	mulq	%rbx
-	 pslldq	\$4, %xmm5
 	addq	%rax, %r11
 	movq	32($ap), %rax
 	adcq	\$0, %rdx
@@ -1031,7 +1115,6 @@ $code.=<<___;
 	adcq	\$0, %r11
 
 	mulq	%rbx
-	 por	%xmm5, %xmm4
 	addq	%rax, %r12
 	movq	40($ap), %rax
 	adcq	\$0, %rdx
@@ -1056,7 +1139,6 @@ $code.=<<___;
 	adcq	\$0, %r14
 
 	mulq	%rbx
-	 movq	%xmm4, %rbx
 	addq	%rax, %r15
 	 movq	($ap), %rax
 	adcq	\$0, %rdx
@@ -1064,7 +1146,6 @@ $code.=<<___;
 	movq	%rdx, %r15	
 	adcq	\$0, %r15
 
-	leaq	128(%rbp), %rbp
 	leaq	8(%rdi), %rdi
 
 	decl	%ecx
@@ -1079,8 +1160,8 @@ $code.=<<___;
 	movq	%r14, 48(%rdi)
 	movq	%r15, 56(%rdi)
 
-	movq	%xmm0, $out
-	movq	%xmm1, %rbp
+	movq	128+8(%rsp), $out
+	movq	128+16(%rsp), %rbp
 
 	movq	(%rsp), %r8
 	movq	8(%rsp), %r9
@@ -1098,45 +1179,37 @@ $code.=<<___ if ($addx);
 
 .align	32
 .Lmulx_gather:
-	mov	64($bp,$pwr,4), %eax
-	movq	$out, %xmm0		# off-load arguments
-	lea	128($bp,$pwr,4), %rbp
-	mov	($bp,$pwr,4), %edx
-	movq	$mod, %xmm1
-	mov	$n0, 128(%rsp)
+	movq	%xmm8,%rdx
+
+	mov	$n0, 128(%rsp)		# off-load arguments
+	mov	$out, 128+8(%rsp)
+	mov	$mod, 128+16(%rsp)
 
-	shl	\$32, %rax
-	or	%rax, %rdx
 	mulx	($ap), %rbx, %r8	# 0 iteration
 	mov	%rbx, (%rsp)
 	xor	%edi, %edi		# cf=0, of=0
 
 	mulx	8($ap), %rax, %r9
-	 movd	(%rbp), %xmm4
 
 	mulx	16($ap), %rbx, %r10
-	 movd	64(%rbp), %xmm5
 	adcx	%rax, %r8
 
 	mulx	24($ap), %rax, %r11
-	 pslldq	\$4, %xmm5
 	adcx	%rbx, %r9
 
 	mulx	32($ap), %rbx, %r12
-	 por	%xmm5, %xmm4
 	adcx	%rax, %r10
 
 	mulx	40($ap), %rax, %r13
 	adcx	%rbx, %r11
 
 	mulx	48($ap), %rbx, %r14
-	 lea	128(%rbp), %rbp
 	adcx	%rax, %r12
 	
 	mulx	56($ap), %rax, %r15
-	 movq	%xmm4, %rdx
 	adcx	%rbx, %r13
 	adcx	%rax, %r14
+	.byte	0x67
 	mov	%r8, %rbx
 	adcx	%rdi, %r15		# %rdi is 0
 
@@ -1145,24 +1218,48 @@ $code.=<<___ if ($addx);
 
 .align	32
 .Loop_mulx_gather:
-	mulx	($ap), %rax, %r8
+	movdqa	16*0(%rbp),%xmm8
+	movdqa	16*1(%rbp),%xmm9
+	movdqa	16*2(%rbp),%xmm10
+	movdqa	16*3(%rbp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	16*4(%rbp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	16*5(%rbp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	16*6(%rbp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	16*7(%rbp),%xmm15
+	leaq	128(%rbp), %rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	\$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,%rdx
+
+	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
 	adcx	%rax, %rbx
 	adox	%r9, %r8
 
 	mulx	8($ap), %rax, %r9
-	.byte	0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00		# movd	(%rbp), %xmm4
 	adcx	%rax, %r8
 	adox	%r10, %r9
 
 	mulx	16($ap), %rax, %r10
-	 movd	64(%rbp), %xmm5
-	 lea	128(%rbp), %rbp
 	adcx	%rax, %r9
 	adox	%r11, %r10
 
 	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
-	 pslldq	\$4, %xmm5
-	 por	%xmm5, %xmm4
 	adcx	%rax, %r10
 	adox	%r12, %r11
 
@@ -1176,10 +1273,10 @@ $code.=<<___ if ($addx);
 
 	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
 	adcx	%rax, %r13
+	.byte	0x67
 	adox	%r15, %r14
 
 	mulx	56($ap), %rax, %r15
-	 movq	%xmm4, %rdx
 	 mov	%rbx, 64(%rsp,%rcx,8)
 	adcx	%rax, %r14
 	adox	%rdi, %r15
@@ -1198,10 +1295,10 @@ $code.=<<___ if ($addx);
 	mov	%r14, 64+48(%rsp)
 	mov	%r15, 64+56(%rsp)
 
-	movq	%xmm0, $out
-	movq	%xmm1, %rbp
+	mov	128(%rsp), %rdx		# pull arguments
+	mov	128+8(%rsp), $out
+	mov	128+16(%rsp), %rbp
 
-	mov	128(%rsp), %rdx		# pull $n0
 	mov	(%rsp), %r8
 	mov	8(%rsp), %r9
 	mov	16(%rsp), %r10
@@ -1229,6 +1326,21 @@ $code.=<<___;
 	call	__rsaz_512_subtract
 
 	leaq	128+24+48(%rsp), %rax
+___
+$code.=<<___	if ($win64);
+	movaps	0xa0-0xc8(%rax),%xmm6
+	movaps	0xb0-0xc8(%rax),%xmm7
+	movaps	0xc0-0xc8(%rax),%xmm8
+	movaps	0xd0-0xc8(%rax),%xmm9
+	movaps	0xe0-0xc8(%rax),%xmm10
+	movaps	0xf0-0xc8(%rax),%xmm11
+	movaps	0x100-0xc8(%rax),%xmm12
+	movaps	0x110-0xc8(%rax),%xmm13
+	movaps	0x120-0xc8(%rax),%xmm14
+	movaps	0x130-0xc8(%rax),%xmm15
+	lea	0xb0(%rax),%rax
+___
+$code.=<<___;
 	movq	-48(%rax), %r15
 	movq	-40(%rax), %r14
 	movq	-32(%rax), %r13
@@ -1258,7 +1370,7 @@ rsaz_512_mul_scatter4:
 	mov	$pwr, $pwr
 	subq	\$128+24, %rsp
 .Lmul_scatter4_body:
-	leaq	($tbl,$pwr,4), $tbl
+	leaq	($tbl,$pwr,8), $tbl
 	movq	$out, %xmm0		# off-load arguments
 	movq	$mod, %xmm1
 	movq	$tbl, %xmm2
@@ -1329,30 +1441,14 @@ $code.=<<___;
 
 	call	__rsaz_512_subtract
 
-	movl	%r8d, 64*0($inp)	# scatter
-	shrq	\$32, %r8
-	movl	%r9d, 64*2($inp)
-	shrq	\$32, %r9
-	movl	%r10d, 64*4($inp)
-	shrq	\$32, %r10
-	movl	%r11d, 64*6($inp)
-	shrq	\$32, %r11
-	movl	%r12d, 64*8($inp)
-	shrq	\$32, %r12
-	movl	%r13d, 64*10($inp)
-	shrq	\$32, %r13
-	movl	%r14d, 64*12($inp)
-	shrq	\$32, %r14
-	movl	%r15d, 64*14($inp)
-	shrq	\$32, %r15
-	movl	%r8d, 64*1($inp)
-	movl	%r9d, 64*3($inp)
-	movl	%r10d, 64*5($inp)
-	movl	%r11d, 64*7($inp)
-	movl	%r12d, 64*9($inp)
-	movl	%r13d, 64*11($inp)
-	movl	%r14d, 64*13($inp)
-	movl	%r15d, 64*15($inp)
+	movq	%r8, 128*0($inp)	# scatter
+	movq	%r9, 128*1($inp)
+	movq	%r10, 128*2($inp)
+	movq	%r11, 128*3($inp)
+	movq	%r12, 128*4($inp)
+	movq	%r13, 128*5($inp)
+	movq	%r14, 128*6($inp)
+	movq	%r15, 128*7($inp)
 
 	leaq	128+24+48(%rsp), %rax
 	movq	-48(%rax), %r15
@@ -1956,16 +2052,14 @@ $code.=<<___;
 .type	rsaz_512_scatter4,\@abi-omnipotent
 .align	16
 rsaz_512_scatter4:
-	leaq	($out,$power,4), $out
+	leaq	($out,$power,8), $out
 	movl	\$8, %r9d
 	jmp	.Loop_scatter
 .align	16
 .Loop_scatter:
 	movq	($inp), %rax
 	leaq	8($inp), $inp
-	movl	%eax, ($out)
-	shrq	\$32, %rax
-	movl	%eax, 64($out)
+	movq	%rax, ($out)
 	leaq	128($out), $out
 	decl	%r9d
 	jnz	.Loop_scatter
@@ -1976,22 +2070,106 @@ rsaz_512_scatter4:
 .type	rsaz_512_gather4,\@abi-omnipotent
 .align	16
 rsaz_512_gather4:
-	leaq	($inp,$power,4), $inp
+___
+$code.=<<___	if ($win64);
+.LSEH_begin_rsaz_512_gather4:
+	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
+	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
+	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
+	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
+	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
+	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
+	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
+	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
+	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
+	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
+___
+$code.=<<___;
+	movd	$power,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
+	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
+
+	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..15 to $power
+#
+for($i=0;$i<4;$i++) {
+$code.=<<___;
+	paddd	%xmm`$i`,%xmm`$i+1`
+	pcmpeqd	%xmm8,%xmm`$i`
+	movdqa	%xmm7,%xmm`$i+3`
+___
+}
+for(;$i<7;$i++) {
+$code.=<<___;
+	paddd	%xmm`$i`,%xmm`$i+1`
+	pcmpeqd	%xmm8,%xmm`$i`
+___
+}
+$code.=<<___;
+	pcmpeqd	%xmm8,%xmm7
 	movl	\$8, %r9d
 	jmp	.Loop_gather
 .align	16
 .Loop_gather:
-	movl	($inp), %eax
-	movl	64($inp), %r8d
+	movdqa	16*0($inp),%xmm8
+	movdqa	16*1($inp),%xmm9
+	movdqa	16*2($inp),%xmm10
+	movdqa	16*3($inp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	16*4($inp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	16*5($inp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	16*6($inp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	16*7($inp),%xmm15
 	leaq	128($inp), $inp
-	shlq	\$32, %r8
-	or	%r8, %rax
-	movq	%rax, ($out)
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	\$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,($out)
 	leaq	8($out), $out
 	decl	%r9d
 	jnz	.Loop_gather
+___
+$code.=<<___	if ($win64);
+	movaps	0x00(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	add	\$0xa8,%rsp
+___
+$code.=<<___;
 	ret
+.LSEH_end_rsaz_512_gather4:
 .size	rsaz_512_gather4,.-rsaz_512_gather4
+
+.align	64
+.Linc:
+	.long	0,0, 1,1
+	.long	2,2, 2,2
 ___
 }
 
@@ -2039,6 +2217,18 @@ se_handler:
 
 	lea	128+24+48(%rax),%rax
 
+	lea	.Lmul_gather4_epilogue(%rip),%rbx
+	cmp	%r10,%rbx
+	jne	.Lse_not_in_mul_gather4
+
+	lea	0xb0(%rax),%rax
+
+	lea	-48-0xa8(%rax),%rsi
+	lea	512($context),%rdi
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lse_not_in_mul_gather4:
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
@@ -2090,7 +2280,7 @@ se_handler:
 	pop	%rdi
 	pop	%rsi
 	ret
-.size	sqr_handler,.-sqr_handler
+.size	se_handler,.-se_handler
 
 .section	.pdata
 .align	4
@@ -2114,6 +2304,10 @@ se_handler:
 	.rva	.LSEH_end_rsaz_512_mul_by_one
 	.rva	.LSEH_info_rsaz_512_mul_by_one
 
+	.rva	.LSEH_begin_rsaz_512_gather4
+	.rva	.LSEH_end_rsaz_512_gather4
+	.rva	.LSEH_info_rsaz_512_gather4
+
 .section	.xdata
 .align	8
 .LSEH_info_rsaz_512_sqr:
@@ -2136,6 +2330,19 @@ se_handler:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
+.LSEH_info_rsaz_512_gather4:
+	.byte	0x01,0x46,0x16,0x00
+	.byte	0x46,0xf8,0x09,0x00	# movaps 0x90(rsp),xmm15
+	.byte	0x3d,0xe8,0x08,0x00	# movaps 0x80(rsp),xmm14
+	.byte	0x34,0xd8,0x07,0x00	# movaps 0x70(rsp),xmm13
+	.byte	0x2e,0xc8,0x06,0x00	# movaps 0x60(rsp),xmm12
+	.byte	0x28,0xb8,0x05,0x00	# movaps 0x50(rsp),xmm11
+	.byte	0x22,0xa8,0x04,0x00	# movaps 0x40(rsp),xmm10
+	.byte	0x1c,0x98,0x03,0x00	# movaps 0x30(rsp),xmm9
+	.byte	0x16,0x88,0x02,0x00	# movaps 0x20(rsp),xmm8
+	.byte	0x10,0x78,0x01,0x00	# movaps 0x10(rsp),xmm7
+	.byte	0x0b,0x68,0x00,0x00	# movaps 0x00(rsp),xmm6
+	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
 ___
 }
 
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index e82e451388c7..29ba1224e36b 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -775,100 +775,126 @@ bn_sqr8x_mont:
 	# 4096. this is done to allow memory disambiguation logic
 	# do its job.
 	#
-	lea	-64(%rsp,$num,4),%r11
+	lea	-64(%rsp,$num,2),%r11
 	mov	($n0),$n0		# *n0
 	sub	$aptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lsqr8x_sp_alt
 	sub	%r11,%rsp		# align with $aptr
-	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 	jmp	.Lsqr8x_sp_done
 
 .align	32
 .Lsqr8x_sp_alt:
-	lea	4096-64(,$num,4),%r10	# 4096-frame-4*$num
-	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
+	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
 	sub	%r11,%rsp
 .Lsqr8x_sp_done:
 	and	\$-64,%rsp
-	mov	$num,%r10	
+	mov	$num,%r10
 	neg	$num
 
-	lea	64(%rsp,$num,2),%r11	# copy of modulus
 	mov	$n0,  32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
 .Lsqr8x_body:
 
-	mov	$num,$i
-	movq	%r11, %xmm2		# save pointer to modulus copy
-	shr	\$3+2,$i
-	mov	OPENSSL_ia32cap_P+8(%rip),%eax
-	jmp	.Lsqr8x_copy_n
-
-.align	32
-.Lsqr8x_copy_n:
-	movq	8*0($nptr),%xmm0
-	movq	8*1($nptr),%xmm1
-	movq	8*2($nptr),%xmm3
-	movq	8*3($nptr),%xmm4
-	lea	8*4($nptr),$nptr
-	movdqa	%xmm0,16*0(%r11)
-	movdqa	%xmm1,16*1(%r11)
-	movdqa	%xmm3,16*2(%r11)
-	movdqa	%xmm4,16*3(%r11)
-	lea	16*4(%r11),%r11
-	dec	$i
-	jnz	.Lsqr8x_copy_n
-
+	movq	$nptr, %xmm2		# save pointer to modulus
 	pxor	%xmm0,%xmm0
 	movq	$rptr,%xmm1		# save $rptr
 	movq	%r10, %xmm3		# -$num
 ___
 $code.=<<___ if ($addx);
+	mov	OPENSSL_ia32cap_P+8(%rip),%eax
 	and	\$0x80100,%eax
 	cmp	\$0x80100,%eax
 	jne	.Lsqr8x_nox
 
 	call	bn_sqrx8x_internal	# see x86_64-mont5 module
-
-	pxor	%xmm0,%xmm0
-	lea	48(%rsp),%rax
-	lea	64(%rsp,$num,2),%rdx
-	shr	\$3+2,$num
-	mov	40(%rsp),%rsi		# restore %rsp
-	jmp	.Lsqr8x_zero
+					# %rax	top-most carry
+					# %rbp	nptr
+					# %rcx	-8*num
+					# %r8	end of tp[2*num]
+	lea	(%r8,%rcx),%rbx
+	mov	%rcx,$num
+	mov	%rcx,%rdx
+	movq	%xmm1,$rptr
+	sar	\$3+2,%rcx		# %cf=0
+	jmp	.Lsqr8x_sub
 
 .align	32
 .Lsqr8x_nox:
 ___
 $code.=<<___;
 	call	bn_sqr8x_internal	# see x86_64-mont5 module
-
-	pxor	%xmm0,%xmm0
-	lea	48(%rsp),%rax
-	lea	64(%rsp,$num,2),%rdx
-	shr	\$3+2,$num
-	mov	40(%rsp),%rsi		# restore %rsp
-	jmp	.Lsqr8x_zero
+					# %rax	top-most carry
+					# %rbp	nptr
+					# %r8	-8*num
+					# %rdi	end of tp[2*num]
+	lea	(%rdi,$num),%rbx
+	mov	$num,%rcx
+	mov	$num,%rdx
+	movq	%xmm1,$rptr
+	sar	\$3+2,%rcx		# %cf=0
+	jmp	.Lsqr8x_sub
 
 .align	32
-.Lsqr8x_zero:
-	movdqa	%xmm0,16*0(%rax)	# wipe t
-	movdqa	%xmm0,16*1(%rax)
-	movdqa	%xmm0,16*2(%rax)
-	movdqa	%xmm0,16*3(%rax)
-	lea	16*4(%rax),%rax
-	movdqa	%xmm0,16*0(%rdx)	# wipe n
-	movdqa	%xmm0,16*1(%rdx)
-	movdqa	%xmm0,16*2(%rdx)
-	movdqa	%xmm0,16*3(%rdx)
-	lea	16*4(%rdx),%rdx
-	dec	$num
-	jnz	.Lsqr8x_zero
+.Lsqr8x_sub:
+	mov	8*0(%rbx),%r12
+	mov	8*1(%rbx),%r13
+	mov	8*2(%rbx),%r14
+	mov	8*3(%rbx),%r15
+	lea	8*4(%rbx),%rbx
+	sbb	8*0(%rbp),%r12
+	sbb	8*1(%rbp),%r13
+	sbb	8*2(%rbp),%r14
+	sbb	8*3(%rbp),%r15
+	lea	8*4(%rbp),%rbp
+	mov	%r12,8*0($rptr)
+	mov	%r13,8*1($rptr)
+	mov	%r14,8*2($rptr)
+	mov	%r15,8*3($rptr)
+	lea	8*4($rptr),$rptr
+	inc	%rcx			# preserves %cf
+	jnz	.Lsqr8x_sub
+
+	sbb	\$0,%rax		# top-most carry
+	lea	(%rbx,$num),%rbx	# rewind
+	lea	($rptr,$num),$rptr	# rewind
+
+	movq	%rax,%xmm1
+	pxor	%xmm0,%xmm0
+	pshufd	\$0,%xmm1,%xmm1
+	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lsqr8x_cond_copy
+
+.align	32
+.Lsqr8x_cond_copy:
+	movdqa	16*0(%rbx),%xmm2
+	movdqa	16*1(%rbx),%xmm3
+	lea	16*2(%rbx),%rbx
+	movdqu	16*0($rptr),%xmm4
+	movdqu	16*1($rptr),%xmm5
+	lea	16*2($rptr),$rptr
+	movdqa	%xmm0,-16*2(%rbx)	# zero tp
+	movdqa	%xmm0,-16*1(%rbx)
+	movdqa	%xmm0,-16*2(%rbx,%rdx)
+	movdqa	%xmm0,-16*1(%rbx,%rdx)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-16*2($rptr)
+	movdqu	%xmm5,-16*1($rptr)
+	add	\$32,$num
+	jnz	.Lsqr8x_cond_copy
 
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
@@ -1135,64 +1161,75 @@ $code.=<<___;
 	adc	$zero,%r15		# modulo-scheduled
 	sub	0*8($tptr),$zero	# pull top-most carry
 	adc	%r15,%r14
-	mov	-8($nptr),$mi
 	sbb	%r15,%r15		# top-most carry
 	mov	%r14,-1*8($tptr)
 
 	cmp	16(%rsp),$bptr
 	jne	.Lmulx4x_outer
 
-	sub	%r14,$mi		# compare top-most words
-	sbb	$mi,$mi
-	or	$mi,%r15
-
-	neg	$num
-	xor	%rdx,%rdx
-	mov	32(%rsp),$rptr		# restore rp
 	lea	64(%rsp),$tptr
-
-	pxor	%xmm0,%xmm0
-	mov	0*8($nptr,$num),%r8
-	mov	1*8($nptr,$num),%r9
-	neg	%r8
-	jmp	.Lmulx4x_sub_entry
+	sub	$num,$nptr		# rewind $nptr
+	neg	%r15
+	mov	$num,%rdx
+	shr	\$3+2,$num		# %cf=0
+	mov	32(%rsp),$rptr		# restore rp
+	jmp	.Lmulx4x_sub
 
 .align	32
 .Lmulx4x_sub:
-	mov	0*8($nptr,$num),%r8
-	mov	1*8($nptr,$num),%r9
-	not	%r8
-.Lmulx4x_sub_entry:
-	mov	2*8($nptr,$num),%r10
-	not	%r9
-	and	%r15,%r8
-	mov	3*8($nptr,$num),%r11
-	not	%r10
-	and	%r15,%r9
-	not	%r11
-	and	%r15,%r10
-	and	%r15,%r11
-
-	neg	%rdx			# mov %rdx,%cf
-	adc	0*8($tptr),%r8
-	adc	1*8($tptr),%r9
-	movdqa	%xmm0,($tptr)
-	adc	2*8($tptr),%r10
-	adc	3*8($tptr),%r11
-	movdqa	%xmm0,16($tptr)
-	lea	4*8($tptr),$tptr
-	sbb	%rdx,%rdx		# mov %cf,%rdx
-
-	mov	%r8,0*8($rptr)
-	mov	%r9,1*8($rptr)
-	mov	%r10,2*8($rptr)
-	mov	%r11,3*8($rptr)
-	lea	4*8($rptr),$rptr
-
-	add	\$32,$num
+	mov	8*0($tptr),%r11
+	mov	8*1($tptr),%r12
+	mov	8*2($tptr),%r13
+	mov	8*3($tptr),%r14
+	lea	8*4($tptr),$tptr
+	sbb	8*0($nptr),%r11
+	sbb	8*1($nptr),%r12
+	sbb	8*2($nptr),%r13
+	sbb	8*3($nptr),%r14
+	lea	8*4($nptr),$nptr
+	mov	%r11,8*0($rptr)
+	mov	%r12,8*1($rptr)
+	mov	%r13,8*2($rptr)
+	mov	%r14,8*3($rptr)
+	lea	8*4($rptr),$rptr
+	dec	$num			# preserves %cf
 	jnz	.Lmulx4x_sub
 
+	sbb	\$0,%r15		# top-most carry
+	lea	64(%rsp),$tptr
+	sub	%rdx,$rptr		# rewind
+
+	movq	%r15,%xmm1
+	pxor	%xmm0,%xmm0
+	pshufd	\$0,%xmm1,%xmm1
 	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lmulx4x_cond_copy
+
+.align	32
+.Lmulx4x_cond_copy:
+	movdqa	16*0($tptr),%xmm2
+	movdqa	16*1($tptr),%xmm3
+	lea	16*2($tptr),$tptr
+	movdqu	16*0($rptr),%xmm4
+	movdqu	16*1($rptr),%xmm5
+	lea	16*2($rptr),$rptr
+	movdqa	%xmm0,-16*2($tptr)	# zero tp
+	movdqa	%xmm0,-16*1($tptr)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-16*2($rptr)
+	movdqu	%xmm5,-16*1($rptr)
+	sub	\$32,%rdx
+	jnz	.Lmulx4x_cond_copy
+
+	mov	%rdx,($tptr)
+
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
 	mov	-40(%rsi),%r14
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 292409c4ffb8..2e8c9db32cbc 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -99,58 +99,111 @@ $code.=<<___;
 .Lmul_enter:
 	mov	${num}d,${num}d
 	mov	%rsp,%rax
-	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
+	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
+	lea	.Linc(%rip),%r10
 	push	%rbx
 	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
 	push	%r15
-___
-$code.=<<___ if ($win64);
-	lea	-0x28(%rsp),%rsp
-	movaps	%xmm6,(%rsp)
-	movaps	%xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
 	lea	2($num),%r11
 	neg	%r11
-	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
+	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
 	and	\$-1024,%rsp		# minimize TLB usage
 
 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
 .Lmul_body:
-	mov	$bp,%r12		# reassign $bp
+	lea	128($bp),%r12		# reassign $bp (+size optimization)
 ___
 		$bp="%r12";
 		$STRIDE=2**5*8;		# 5 is "window size"
 		$N=$STRIDE/4;		# should match cache line size
 $code.=<<___;
-	mov	%r10,%r11
-	shr	\$`log($N/8)/log(2)`,%r10
-	and	\$`$N/8-1`,%r11
-	not	%r10
-	lea	.Lmagic_masks(%rip),%rax
-	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
-	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
-	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
-	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
-	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
-	movq	24(%rax,%r10,8),%xmm7
+	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
+	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
+	lea	24-112(%rsp,$num,8),%r10	# place the mask after tp[num+3] (+ICache optimization)
+	and	\$-16,%r10
 
-	movq	`0*$STRIDE/4-96`($bp),%xmm0
-	movq	`1*$STRIDE/4-96`($bp),%xmm1
-	pand	%xmm4,%xmm0
-	movq	`2*$STRIDE/4-96`($bp),%xmm2
-	pand	%xmm5,%xmm1
-	movq	`3*$STRIDE/4-96`($bp),%xmm3
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	pand	%xmm7,%xmm3
+	pshufd	\$0,%xmm5,%xmm5		# broadcast index
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
+	.byte	0x67
+	movdqa	%xmm4,%xmm3
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
+	movdqa	%xmm0,`16*($k+0)+112`(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
+	movdqa	%xmm1,`16*($k+1)+112`(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
+	movdqa	%xmm2,`16*($k+2)+112`(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,`16*($k+3)+112`(%r10)
+	movdqa	%xmm4,%xmm3
+___
+}
+$code.=<<___;				# last iteration can be optimized
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,`16*($k+0)+112`(%r10)
+
+	paddd	%xmm2,%xmm3
+	.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,`16*($k+1)+112`(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,`16*($k+2)+112`(%r10)
+	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
+
+	pand	`16*($k+1)-128`($bp),%xmm1
+	pand	`16*($k+2)-128`($bp),%xmm2
+	movdqa	%xmm3,`16*($k+3)+112`(%r10)
+	pand	`16*($k+3)-128`($bp),%xmm3
 	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+	movdqa	`16*($k+0)-128`($bp),%xmm4
+	movdqa	`16*($k+1)-128`($bp),%xmm5
+	movdqa	`16*($k+2)-128`($bp),%xmm2
+	pand	`16*($k+0)+112`(%r10),%xmm4
+	movdqa	`16*($k+3)-128`($bp),%xmm3
+	pand	`16*($k+1)+112`(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	`16*($k+2)+112`(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	`16*($k+3)+112`(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+___
+}
+$code.=<<___;
+	por	%xmm1,%xmm0
+	pshufd	\$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
 	lea	$STRIDE($bp),$bp
-	por	%xmm3,%xmm0
-
 	movq	%xmm0,$m0		# m0=bp[0]
 
 	mov	($n0),$n0		# pull n0[0] value
@@ -159,29 +212,14 @@ $code.=<<___;
 	xor	$i,$i			# i=0
 	xor	$j,$j			# j=0
 
-	movq	`0*$STRIDE/4-96`($bp),%xmm0
-	movq	`1*$STRIDE/4-96`($bp),%xmm1
-	pand	%xmm4,%xmm0
-	movq	`2*$STRIDE/4-96`($bp),%xmm2
-	pand	%xmm5,%xmm1
-
 	mov	$n0,$m1
 	mulq	$m0			# ap[0]*bp[0]
 	mov	%rax,$lo0
 	mov	($np),%rax
 
-	movq	`3*$STRIDE/4-96`($bp),%xmm3
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	pand	%xmm7,%xmm3
-
 	imulq	$lo0,$m1		# "tp[0]"*n0
 	mov	%rdx,$hi0
 
-	por	%xmm2,%xmm0
-	lea	$STRIDE($bp),$bp
-	por	%xmm3,%xmm0
-
 	mulq	$m1			# np[0]*m1
 	add	%rax,$lo0		# discarded
 	mov	8($ap),%rax
@@ -212,16 +250,14 @@ $code.=<<___;
 
 	mulq	$m1			# np[j]*m1
 	cmp	$num,$j
-	jne	.L1st
-
-	movq	%xmm0,$m0		# bp[1]
+	jne	.L1st			# note that upon exit $j==$num, so
+					# they can be used interchangeably
 
 	add	%rax,$hi1
-	mov	($ap),%rax		# ap[0]
 	adc	\$0,%rdx
 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
 	adc	\$0,%rdx
-	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
 	mov	%rdx,$hi1
 	mov	$lo0,$hi0
 
@@ -235,33 +271,48 @@ $code.=<<___;
 	jmp	.Louter
 .align	16
 .Louter:
+	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
+	and	\$-16,%rdx
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+___
+for($k=0;$k<$STRIDE/16;$k+=4) {
+$code.=<<___;
+	movdqa	`16*($k+0)-128`($bp),%xmm0
+	movdqa	`16*($k+1)-128`($bp),%xmm1
+	movdqa	`16*($k+2)-128`($bp),%xmm2
+	movdqa	`16*($k+3)-128`($bp),%xmm3
+	pand	`16*($k+0)-128`(%rdx),%xmm0
+	pand	`16*($k+1)-128`(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	`16*($k+2)-128`(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	`16*($k+3)-128`(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+___
+}
+$code.=<<___;
+	por	%xmm5,%xmm4
+	pshufd	\$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	lea	$STRIDE($bp),$bp
+
+	mov	($ap),%rax		# ap[0]
+	movq	%xmm0,$m0		# m0=bp[i]
+
 	xor	$j,$j			# j=0
 	mov	$n0,$m1
 	mov	(%rsp),$lo0
 
-	movq	`0*$STRIDE/4-96`($bp),%xmm0
-	movq	`1*$STRIDE/4-96`($bp),%xmm1
-	pand	%xmm4,%xmm0
-	movq	`2*$STRIDE/4-96`($bp),%xmm2
-	pand	%xmm5,%xmm1
-
 	mulq	$m0			# ap[0]*bp[i]
 	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
 	mov	($np),%rax
 	adc	\$0,%rdx
 
-	movq	`3*$STRIDE/4-96`($bp),%xmm3
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	pand	%xmm7,%xmm3
-
 	imulq	$lo0,$m1		# tp[0]*n0
 	mov	%rdx,$hi0
 
-	por	%xmm2,%xmm0
-	lea	$STRIDE($bp),$bp
-	por	%xmm3,%xmm0
-
 	mulq	$m1			# np[0]*m1
 	add	%rax,$lo0		# discarded
 	mov	8($ap),%rax
@@ -295,17 +346,14 @@ $code.=<<___;
 
 	mulq	$m1			# np[j]*m1
 	cmp	$num,$j
-	jne	.Linner
-
-	movq	%xmm0,$m0		# bp[i+1]
-
+	jne	.Linner			# note that upon exit $j==$num, so
+					# they can be used interchangeably
 	add	%rax,$hi1
-	mov	($ap),%rax		# ap[0]
 	adc	\$0,%rdx
 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
-	mov	(%rsp,$j,8),$lo0
+	mov	(%rsp,$num,8),$lo0
 	adc	\$0,%rdx
-	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
 	mov	%rdx,$hi1
 
 	xor	%rdx,%rdx
@@ -352,12 +400,7 @@ $code.=<<___;
 
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
 	mov	\$1,%rax
-___
-$code.=<<___ if ($win64);
-	movaps	-88(%rsi),%xmm6
-	movaps	-72(%rsi),%xmm7
-___
-$code.=<<___;
+
 	mov	-48(%rsi),%r15
 	mov	-40(%rsi),%r14
 	mov	-32(%rsi),%r13
@@ -379,8 +422,8 @@ bn_mul4x_mont_gather5:
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
-	and	\$0x80100,%r11d
-	cmp	\$0x80100,%r11d
+	and	\$0x80108,%r11d
+	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
 	je	.Lmulx4x_enter
 ___
 $code.=<<___;
@@ -392,39 +435,34 @@ $code.=<<___;
 	push	%r13
 	push	%r14
 	push	%r15
-___
-$code.=<<___ if ($win64);
-	lea	-0x28(%rsp),%rsp
-	movaps	%xmm6,(%rsp)
-	movaps	%xmm7,0x10(%rsp)
-___
-$code.=<<___;
+
 	.byte	0x67
-	mov	${num}d,%r10d
-	shl	\$3,${num}d
-	shl	\$3+2,%r10d		# 4*$num
+	shl	\$3,${num}d		# convert $num to bytes
+	lea	($num,$num,2),%r10	# 3*$num in bytes
 	neg	$num			# -$num
 
 	##############################################################
-	# ensure that stack frame doesn't alias with $aptr+4*$num
-	# modulo 4096, which covers ret[num], am[num] and n[2*num]
-	# (see bn_exp.c). this is done to allow memory disambiguation
-	# logic do its magic. [excessive frame is allocated in order
-	# to allow bn_from_mont8x to clear it.]
+	# Ensure that stack frame doesn't alias with $rp+3*$num
+	# modulo 4096, which covers ret[num], am[num] and n[num]
+	# (see bn_exp.c). This is done to allow memory disambiguation
+	# logic to do its magic. [Extra [num] is allocated in order
+	# to align with bn_power5's frame, which is cleansed after
+	# completing exponentiation. Extra 256 bytes are for power mask
+	# calculated from 7th argument, the index.]
 	#
-	lea	-64(%rsp,$num,2),%r11
-	sub	$ap,%r11
+	lea	-320(%rsp,$num,2),%r11
+	sub	$rp,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lmul4xsp_alt
-	sub	%r11,%rsp		# align with $ap
-	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
+	sub	%r11,%rsp		# align with $rp
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
 	jmp	.Lmul4xsp_done
 
 .align	32
 .Lmul4xsp_alt:
-	lea	4096-64(,$num,2),%r10
-	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
+	lea	4096-320(,$num,2),%r10
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
@@ -440,12 +478,7 @@ $code.=<<___;
 
 	mov	40(%rsp),%rsi		# restore %rsp
 	mov	\$1,%rax
-___
-$code.=<<___ if ($win64);
-	movaps	-88(%rsi),%xmm6
-	movaps	-72(%rsi),%xmm7
-___
-$code.=<<___;
+
 	mov	-48(%rsi),%r15
 	mov	-40(%rsi),%r14
 	mov	-32(%rsi),%r13
@@ -460,9 +493,10 @@ $code.=<<___;
 .type	mul4x_internal,\@abi-omnipotent
 .align	32
 mul4x_internal:
-	shl	\$5,$num
-	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
-	lea	256(%rdx,$num),%r13
+	shl	\$5,$num		# $num was in bytes
+	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument, index
+	lea	.Linc(%rip),%rax
+	lea	128(%rdx,$num),%r13	# end of powers table (+size optimization)
 	shr	\$5,$num		# restore $num
 ___
 		$bp="%r12";
@@ -470,44 +504,92 @@ ___
 		$N=$STRIDE/4;		# should match cache line size
 		$tp=$i;
 $code.=<<___;
-	mov	%r10,%r11
-	shr	\$`log($N/8)/log(2)`,%r10
-	and	\$`$N/8-1`,%r11
-	not	%r10
-	lea	.Lmagic_masks(%rip),%rax
-	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
-	lea	96(%rdx,%r11,8),$bp	# pointer within 1st cache line
-	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
-	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
-	add	\$7,%r11
-	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
-	movq	24(%rax,%r10,8),%xmm7
-	and	\$7,%r11
+	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
+	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
+	lea	88-112(%rsp,$num),%r10	# place the mask after tp[num+1] (+ICache optimization)
+	lea	128(%rdx),$bp		# size optimization
 
-	movq	`0*$STRIDE/4-96`($bp),%xmm0
-	lea	$STRIDE($bp),$tp	# borrow $tp
-	movq	`1*$STRIDE/4-96`($bp),%xmm1
-	pand	%xmm4,%xmm0
-	movq	`2*$STRIDE/4-96`($bp),%xmm2
-	pand	%xmm5,%xmm1
-	movq	`3*$STRIDE/4-96`($bp),%xmm3
-	pand	%xmm6,%xmm2
+	pshufd	\$0,%xmm5,%xmm5		# broadcast index
+	movdqa	%xmm1,%xmm4
+	.byte	0x67,0x67
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
 	.byte	0x67
-	por	%xmm1,%xmm0
-	movq	`0*$STRIDE/4-96`($tp),%xmm1
-	.byte	0x67
-	pand	%xmm7,%xmm3
+	movdqa	%xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
+	movdqa	%xmm0,`16*($i+0)+112`(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
+	movdqa	%xmm1,`16*($i+1)+112`(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
+	movdqa	%xmm2,`16*($i+2)+112`(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,`16*($i+3)+112`(%r10)
+	movdqa	%xmm4,%xmm3
+___
+}
+$code.=<<___;				# last iteration can be optimized
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,`16*($i+0)+112`(%r10)
+
+	paddd	%xmm2,%xmm3
 	.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,`16*($i+1)+112`(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,`16*($i+2)+112`(%r10)
+	pand	`16*($i+0)-128`($bp),%xmm0	# while it's still in register
+
+	pand	`16*($i+1)-128`($bp),%xmm1
+	pand	`16*($i+2)-128`($bp),%xmm2
+	movdqa	%xmm3,`16*($i+3)+112`(%r10)
+	pand	`16*($i+3)-128`($bp),%xmm3
 	por	%xmm2,%xmm0
-	movq	`1*$STRIDE/4-96`($tp),%xmm2
-	.byte	0x67
-	pand	%xmm4,%xmm1
-	.byte	0x67
-	por	%xmm3,%xmm0
-	movq	`2*$STRIDE/4-96`($tp),%xmm3
-
+	por	%xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+	movdqa	`16*($i+0)-128`($bp),%xmm4
+	movdqa	`16*($i+1)-128`($bp),%xmm5
+	movdqa	`16*($i+2)-128`($bp),%xmm2
+	pand	`16*($i+0)+112`(%r10),%xmm4
+	movdqa	`16*($i+3)-128`($bp),%xmm3
+	pand	`16*($i+1)+112`(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	`16*($i+2)+112`(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	`16*($i+3)+112`(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+___
+}
+$code.=<<___;
+	por	%xmm1,%xmm0
+	pshufd	\$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	lea	$STRIDE($bp),$bp
 	movq	%xmm0,$m0		# m0=bp[0]
-	movq	`3*$STRIDE/4-96`($tp),%xmm0
+
 	mov	%r13,16+8(%rsp)		# save end of b[num]
 	mov	$rp, 56+8(%rsp)		# save $rp
 
@@ -521,26 +603,10 @@ $code.=<<___;
 	mov	%rax,$A[0]
 	mov	($np),%rax
 
-	pand	%xmm5,%xmm2
-	pand	%xmm6,%xmm3
-	por	%xmm2,%xmm1
-
 	imulq	$A[0],$m1		# "tp[0]"*n0
-	##############################################################
-	# $tp is chosen so that writing to top-most element of the
-	# vector occurs just "above" references to powers table,
-	# "above" modulo cache-line size, which effectively precludes
-	# possibility of memory disambiguation logic failure when
-	# accessing the table.
-	# 
-	lea	64+8(%rsp,%r11,8),$tp
+	lea	64+8(%rsp),$tp
 	mov	%rdx,$A[1]
 
-	pand	%xmm7,%xmm0
-	por	%xmm3,%xmm1
-	lea	2*$STRIDE($bp),$bp
-	por	%xmm1,%xmm0
-
 	mulq	$m1			# np[0]*m1
 	add	%rax,$A[0]		# discarded
 	mov	8($ap,$num),%rax
@@ -549,7 +615,7 @@ $code.=<<___;
 
 	mulq	$m0
 	add	%rax,$A[1]
-	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
+	mov	8*1($np),%rax
 	adc	\$0,%rdx
 	mov	%rdx,$A[0]
 
@@ -559,7 +625,7 @@ $code.=<<___;
 	adc	\$0,%rdx
 	add	$A[1],$N[1]
 	lea	4*8($num),$j		# j=4
-	lea	16*4($np),$np
+	lea	8*4($np),$np
 	adc	\$0,%rdx
 	mov	$N[1],($tp)
 	mov	%rdx,$N[0]
@@ -569,7 +635,7 @@ $code.=<<___;
 .L1st4x:
 	mulq	$m0			# ap[j]*bp[0]
 	add	%rax,$A[0]
-	mov	-16*2($np),%rax
+	mov	-8*2($np),%rax
 	lea	32($tp),$tp
 	adc	\$0,%rdx
 	mov	%rdx,$A[1]
@@ -585,7 +651,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[0]
 	add	%rax,$A[1]
-	mov	-16*1($np),%rax
+	mov	-8*1($np),%rax
 	adc	\$0,%rdx
 	mov	%rdx,$A[0]
 
@@ -600,7 +666,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[0]
 	add	%rax,$A[0]
-	mov	16*0($np),%rax
+	mov	8*0($np),%rax
 	adc	\$0,%rdx
 	mov	%rdx,$A[1]
 
@@ -615,7 +681,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[0]
 	add	%rax,$A[1]
-	mov	16*1($np),%rax
+	mov	8*1($np),%rax
 	adc	\$0,%rdx
 	mov	%rdx,$A[0]
 
@@ -624,7 +690,7 @@ $code.=<<___;
 	mov	16($ap,$j),%rax
 	adc	\$0,%rdx
 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
-	lea	16*4($np),$np
+	lea	8*4($np),$np
 	adc	\$0,%rdx
 	mov	$N[1],($tp)		# tp[j-1]
 	mov	%rdx,$N[0]
@@ -634,7 +700,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[0]
 	add	%rax,$A[0]
-	mov	-16*2($np),%rax
+	mov	-8*2($np),%rax
 	lea	32($tp),$tp
 	adc	\$0,%rdx
 	mov	%rdx,$A[1]
@@ -650,7 +716,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[0]
 	add	%rax,$A[1]
-	mov	-16*1($np),%rax
+	mov	-8*1($np),%rax
 	adc	\$0,%rdx
 	mov	%rdx,$A[0]
 
@@ -663,8 +729,7 @@ $code.=<<___;
 	mov	$N[1],-16($tp)		# tp[j-1]
 	mov	%rdx,$N[0]
 
-	movq	%xmm0,$m0		# bp[1]
-	lea	($np,$num,2),$np	# rewind $np
+	lea	($np,$num),$np		# rewind $np
 
 	xor	$N[1],$N[1]
 	add	$A[0],$N[0]
@@ -675,6 +740,33 @@ $code.=<<___;
 
 .align	32
 .Louter4x:
+	lea	16+128($tp),%rdx	# where 256-byte mask is (+size optimization)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+	movdqa	`16*($i+0)-128`($bp),%xmm0
+	movdqa	`16*($i+1)-128`($bp),%xmm1
+	movdqa	`16*($i+2)-128`($bp),%xmm2
+	movdqa	`16*($i+3)-128`($bp),%xmm3
+	pand	`16*($i+0)-128`(%rdx),%xmm0
+	pand	`16*($i+1)-128`(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	`16*($i+2)-128`(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	`16*($i+3)-128`(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+___
+}
+$code.=<<___;
+	por	%xmm5,%xmm4
+	pshufd	\$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	lea	$STRIDE($bp),$bp
+	movq	%xmm0,$m0		# m0=bp[i]
+
 	mov	($tp,$num),$A[0]
 	mov	$n0,$m1
 	mulq	$m0			# ap[0]*bp[i]
@@ -682,25 +774,11 @@ $code.=<<___;
 	mov	($np),%rax
 	adc	\$0,%rdx
 
-	movq	`0*$STRIDE/4-96`($bp),%xmm0
-	movq	`1*$STRIDE/4-96`($bp),%xmm1
-	pand	%xmm4,%xmm0
-	movq	`2*$STRIDE/4-96`($bp),%xmm2
-	pand	%xmm5,%xmm1
-	movq	`3*$STRIDE/4-96`($bp),%xmm3
-
 	imulq	$A[0],$m1		# tp[0]*n0
-	.byte	0x67
 	mov	%rdx,$A[1]
 	mov	$N[1],($tp)		# store upmost overflow bit
 
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	pand	%xmm7,%xmm3
-	por	%xmm2,%xmm0
 	lea	($tp,$num),$tp		# rewind $tp
-	lea	$STRIDE($bp),$bp
-	por	%xmm3,%xmm0
 
 	mulq	$m1			# np[0]*m1
 	add	%rax,$A[0]		# "$N[0]", discarded
@@ -710,7 +788,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[i]
 	add	%rax,$A[1]
-	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
+	mov	8*1($np),%rax
 	adc	\$0,%rdx
 	add	8($tp),$A[1]		# +tp[1]
 	adc	\$0,%rdx
@@ -722,7 +800,7 @@ $code.=<<___;
 	adc	\$0,%rdx
 	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
 	lea	4*8($num),$j		# j=4
-	lea	16*4($np),$np
+	lea	8*4($np),$np
 	adc	\$0,%rdx
 	mov	%rdx,$N[0]
 	jmp	.Linner4x
@@ -731,7 +809,7 @@ $code.=<<___;
 .Linner4x:
 	mulq	$m0			# ap[j]*bp[i]
 	add	%rax,$A[0]
-	mov	-16*2($np),%rax
+	mov	-8*2($np),%rax
 	adc	\$0,%rdx
 	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
 	lea	32($tp),$tp
@@ -749,7 +827,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[i]
 	add	%rax,$A[1]
-	mov	-16*1($np),%rax
+	mov	-8*1($np),%rax
 	adc	\$0,%rdx
 	add	-8($tp),$A[1]
 	adc	\$0,%rdx
@@ -766,7 +844,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[i]
 	add	%rax,$A[0]
-	mov	16*0($np),%rax
+	mov	8*0($np),%rax
 	adc	\$0,%rdx
 	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
 	adc	\$0,%rdx
@@ -783,7 +861,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[i]
 	add	%rax,$A[1]
-	mov	16*1($np),%rax
+	mov	8*1($np),%rax
 	adc	\$0,%rdx
 	add	8($tp),$A[1]
 	adc	\$0,%rdx
@@ -794,7 +872,7 @@ $code.=<<___;
 	mov	16($ap,$j),%rax
 	adc	\$0,%rdx
 	add	$A[1],$N[1]
-	lea	16*4($np),$np
+	lea	8*4($np),$np
 	adc	\$0,%rdx
 	mov	$N[0],-8($tp)		# tp[j-1]
 	mov	%rdx,$N[0]
@@ -804,7 +882,7 @@ $code.=<<___;
 
 	mulq	$m0			# ap[j]*bp[i]
 	add	%rax,$A[0]
-	mov	-16*2($np),%rax
+	mov	-8*2($np),%rax
 	adc	\$0,%rdx
 	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
 	lea	32($tp),$tp
@@ -823,7 +901,7 @@ $code.=<<___;
 	mulq	$m0			# ap[j]*bp[i]
 	add	%rax,$A[1]
 	mov	$m1,%rax
-	mov	-16*1($np),$m1
+	mov	-8*1($np),$m1
 	adc	\$0,%rdx
 	add	-8($tp),$A[1]
 	adc	\$0,%rdx
@@ -838,9 +916,8 @@ $code.=<<___;
 	mov	$N[0],-24($tp)		# tp[j-1]
 	mov	%rdx,$N[0]
 
-	movq	%xmm0,$m0		# bp[i+1]
 	mov	$N[1],-16($tp)		# tp[j-1]
-	lea	($np,$num,2),$np	# rewind $np
+	lea	($np,$num),$np		# rewind $np
 
 	xor	$N[1],$N[1]
 	add	$A[0],$N[0]
@@ -854,16 +931,23 @@ $code.=<<___;
 ___
 if (1) {
 $code.=<<___;
+	xor	%rax,%rax
 	sub	$N[0],$m1		# compare top-most words
 	adc	$j,$j			# $j is zero
 	or	$j,$N[1]
-	xor	\$1,$N[1]
+	sub	$N[1],%rax		# %rax=-$N[1]
 	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
-	lea	($np,$N[1],8),%rbp	# nptr in .sqr4x_sub
+	mov	($np),%r12
+	lea	($np),%rbp		# nptr in .sqr4x_sub
 	mov	%r9,%rcx
-	sar	\$3+2,%rcx		# cf=0
+	sar	\$3+2,%rcx
 	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
-	jmp	.Lsqr4x_sub
+	dec	%r12			# so that after 'not' we get -n[0]
+	xor	%r10,%r10
+	mov	8*1(%rbp),%r13
+	mov	8*2(%rbp),%r14
+	mov	8*3(%rbp),%r15
+	jmp	.Lsqr4x_sub_entry
 ___
 } else {
 my @ri=("%rax",$bp,$m0,$m1);
@@ -930,8 +1014,8 @@ bn_power5:
 ___
 $code.=<<___ if ($addx);
 	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
-	and	\$0x80100,%r11d
-	cmp	\$0x80100,%r11d
+	and	\$0x80108,%r11d
+	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
 	je	.Lpowerx5_enter
 ___
 $code.=<<___;
@@ -942,38 +1026,32 @@ $code.=<<___;
 	push	%r13
 	push	%r14
 	push	%r15
-___
-$code.=<<___ if ($win64);
-	lea	-0x28(%rsp),%rsp
-	movaps	%xmm6,(%rsp)
-	movaps	%xmm7,0x10(%rsp)
-___
-$code.=<<___;
-	mov	${num}d,%r10d
+
 	shl	\$3,${num}d		# convert $num to bytes
-	shl	\$3+2,%r10d		# 4*$num
+	lea	($num,$num,2),%r10d	# 3*$num
 	neg	$num
 	mov	($n0),$n0		# *n0
 
 	##############################################################
-	# ensure that stack frame doesn't alias with $aptr+4*$num
-	# modulo 4096, which covers ret[num], am[num] and n[2*num]
-	# (see bn_exp.c). this is done to allow memory disambiguation
-	# logic do its magic.
+	# Ensure that stack frame doesn't alias with $rptr+3*$num
+	# modulo 4096, which covers ret[num], am[num] and n[num]
+	# (see bn_exp.c). This is done to allow memory disambiguation
+	# logic to do its magic. [The extra 256 bytes are for the power
+	# mask calculated from the 7th argument, the index.]
 	#
-	lea	-64(%rsp,$num,2),%r11
-	sub	$aptr,%r11
+	lea	-320(%rsp,$num,2),%r11
+	sub	$rptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lpwr_sp_alt
 	sub	%r11,%rsp		# align with $aptr
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
 	jmp	.Lpwr_sp_done
 
 .align	32
 .Lpwr_sp_alt:
-	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	lea	4096-320(,$num,2),%r10
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
@@ -995,16 +1073,21 @@ $code.=<<___;
 	mov	$n0,  32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
 .Lpower5_body:
-	movq	$rptr,%xmm1		# save $rptr
+	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
 	movq	$nptr,%xmm2		# save $nptr
-	movq	%r10, %xmm3		# -$num
+	movq	%r10, %xmm3		# -$num, used in sqr8x
 	movq	$bptr,%xmm4
 
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 
 	movq	%xmm2,$nptr
 	movq	%xmm4,$bptr
@@ -1565,9 +1648,9 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
 
 $code.=<<___;
 	movq	%xmm2,$nptr
-sqr8x_reduction:
+__bn_sqr8x_reduction:
 	xor	%rax,%rax
-	lea	($nptr,$num,2),%rcx	# end of n[]
+	lea	($nptr,$num),%rcx	# end of n[]
 	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
 	mov	%rcx,0+8(%rsp)
 	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
@@ -1593,21 +1676,21 @@ sqr8x_reduction:
 	.byte	0x67
 	mov	$m0,%r8
 	imulq	32+8(%rsp),$m0		# n0*a[0]
-	mov	16*0($nptr),%rax	# n[0]
+	mov	8*0($nptr),%rax		# n[0]
 	mov	\$8,%ecx
 	jmp	.L8x_reduce
 
 .align	32
 .L8x_reduce:
 	mulq	$m0
-	 mov	16*1($nptr),%rax	# n[1]
+	 mov	8*1($nptr),%rax		# n[1]
 	neg	%r8
 	mov	%rdx,%r8
 	adc	\$0,%r8
 
 	mulq	$m0
 	add	%rax,%r9
-	 mov	16*2($nptr),%rax
+	 mov	8*2($nptr),%rax
 	adc	\$0,%rdx
 	add	%r9,%r8
 	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
@@ -1616,7 +1699,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r10
-	 mov	16*3($nptr),%rax
+	 mov	8*3($nptr),%rax
 	adc	\$0,%rdx
 	add	%r10,%r9
 	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
@@ -1625,7 +1708,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r11
-	 mov	16*4($nptr),%rax
+	 mov	8*4($nptr),%rax
 	adc	\$0,%rdx
 	 imulq	%r8,$carry		# modulo-scheduled
 	add	%r11,%r10
@@ -1634,7 +1717,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r12
-	 mov	16*5($nptr),%rax
+	 mov	8*5($nptr),%rax
 	adc	\$0,%rdx
 	add	%r12,%r11
 	mov	%rdx,%r12
@@ -1642,7 +1725,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r13
-	 mov	16*6($nptr),%rax
+	 mov	8*6($nptr),%rax
 	adc	\$0,%rdx
 	add	%r13,%r12
 	mov	%rdx,%r13
@@ -1650,7 +1733,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r14
-	 mov	16*7($nptr),%rax
+	 mov	8*7($nptr),%rax
 	adc	\$0,%rdx
 	add	%r14,%r13
 	mov	%rdx,%r14
@@ -1659,7 +1742,7 @@ sqr8x_reduction:
 	mulq	$m0
 	 mov	$carry,$m0		# n0*a[i]
 	add	%rax,%r15
-	 mov	16*0($nptr),%rax	# n[0]
+	 mov	8*0($nptr),%rax		# n[0]
 	adc	\$0,%rdx
 	add	%r15,%r14
 	mov	%rdx,%r15
@@ -1668,7 +1751,7 @@ sqr8x_reduction:
 	dec	%ecx
 	jnz	.L8x_reduce
 
-	lea	16*8($nptr),$nptr
+	lea	8*8($nptr),$nptr
 	xor	%rax,%rax
 	mov	8+8(%rsp),%rdx		# pull end of t[]
 	cmp	0+8(%rsp),$nptr		# end of n[]?
@@ -1687,21 +1770,21 @@ sqr8x_reduction:
 
 	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
 	mov	\$8,%ecx
-	mov	16*0($nptr),%rax
+	mov	8*0($nptr),%rax
 	jmp	.L8x_tail
 
 .align	32
 .L8x_tail:
 	mulq	$m0
 	add	%rax,%r8
-	 mov	16*1($nptr),%rax
+	 mov	8*1($nptr),%rax
 	 mov	%r8,($tptr)		# save result
 	mov	%rdx,%r8
 	adc	\$0,%r8
 
 	mulq	$m0
 	add	%rax,%r9
-	 mov	16*2($nptr),%rax
+	 mov	8*2($nptr),%rax
 	adc	\$0,%rdx
 	add	%r9,%r8
 	 lea	8($tptr),$tptr		# $tptr++
@@ -1710,7 +1793,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r10
-	 mov	16*3($nptr),%rax
+	 mov	8*3($nptr),%rax
 	adc	\$0,%rdx
 	add	%r10,%r9
 	mov	%rdx,%r10
@@ -1718,7 +1801,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r11
-	 mov	16*4($nptr),%rax
+	 mov	8*4($nptr),%rax
 	adc	\$0,%rdx
 	add	%r11,%r10
 	mov	%rdx,%r11
@@ -1726,7 +1809,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r12
-	 mov	16*5($nptr),%rax
+	 mov	8*5($nptr),%rax
 	adc	\$0,%rdx
 	add	%r12,%r11
 	mov	%rdx,%r12
@@ -1734,7 +1817,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r13
-	 mov	16*6($nptr),%rax
+	 mov	8*6($nptr),%rax
 	adc	\$0,%rdx
 	add	%r13,%r12
 	mov	%rdx,%r13
@@ -1742,7 +1825,7 @@ sqr8x_reduction:
 
 	mulq	$m0
 	add	%rax,%r14
-	 mov	16*7($nptr),%rax
+	 mov	8*7($nptr),%rax
 	adc	\$0,%rdx
 	add	%r14,%r13
 	mov	%rdx,%r14
@@ -1753,14 +1836,14 @@ sqr8x_reduction:
 	add	%rax,%r15
 	adc	\$0,%rdx
 	add	%r15,%r14
-	 mov	16*0($nptr),%rax	# pull n[0]
+	 mov	8*0($nptr),%rax		# pull n[0]
 	mov	%rdx,%r15
 	adc	\$0,%r15
 
 	dec	%ecx
 	jnz	.L8x_tail
 
-	lea	16*8($nptr),$nptr
+	lea	8*8($nptr),$nptr
 	mov	8+8(%rsp),%rdx		# pull end of t[]
 	cmp	0+8(%rsp),$nptr		# end of n[]?
 	jae	.L8x_tail_done		# break out of loop
@@ -1806,7 +1889,7 @@ sqr8x_reduction:
 	adc	8*6($tptr),%r14
 	adc	8*7($tptr),%r15
 	adc	\$0,%rax		# top-most carry
-	 mov	-16($nptr),%rcx		# np[num-1]
+	 mov	-8($nptr),%rcx		# np[num-1]
 	 xor	$carry,$carry
 
 	movq	%xmm2,$nptr		# restore $nptr
@@ -1824,6 +1907,8 @@ sqr8x_reduction:
 
 	cmp	%rdx,$tptr		# end of t[]?
 	jb	.L8x_reduction_loop
+	ret
+.size	bn_sqr8x_internal,.-bn_sqr8x_internal
 ___
 }
 ##############################################################
@@ -1832,48 +1917,62 @@ ___
 {
 my ($tptr,$nptr)=("%rbx","%rbp");
 $code.=<<___;
-	#xor	%rsi,%rsi		# %rsi was $carry above
-	sub	%r15,%rcx		# compare top-most words
-	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
-	adc	%rsi,%rsi
-	mov	$num,%rcx
-	or	%rsi,%rax
-	movq	%xmm1,$rptr		# restore $rptr
-	xor	\$1,%rax
-	movq	%xmm1,$aptr		# prepare for back-to-back call
-	lea	($nptr,%rax,8),$nptr
-	sar	\$3+2,%rcx		# cf=0
-	jmp	.Lsqr4x_sub
-
+.type	__bn_post4x_internal,\@abi-omnipotent
 .align	32
+__bn_post4x_internal:
+	mov	8*0($nptr),%r12
+	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
+	mov	$num,%rcx
+	movq	%xmm1,$rptr		# restore $rptr
+	neg	%rax
+	movq	%xmm1,$aptr		# prepare for back-to-back call
+	sar	\$3+2,%rcx
+	dec	%r12			# so that after 'not' we get -n[0]
+	xor	%r10,%r10
+	mov	8*1($nptr),%r13
+	mov	8*2($nptr),%r14
+	mov	8*3($nptr),%r15
+	jmp	.Lsqr4x_sub_entry
+
+.align	16
 .Lsqr4x_sub:
-	.byte	0x66
-	mov	8*0($tptr),%r12
-	mov	8*1($tptr),%r13
-	sbb	16*0($nptr),%r12
-	mov	8*2($tptr),%r14
-	sbb	16*1($nptr),%r13
-	mov	8*3($tptr),%r15
-	lea	8*4($tptr),$tptr
-	sbb	16*2($nptr),%r14
+	mov	8*0($nptr),%r12
+	mov	8*1($nptr),%r13
+	mov	8*2($nptr),%r14
+	mov	8*3($nptr),%r15
+.Lsqr4x_sub_entry:
+	lea	8*4($nptr),$nptr
+	not	%r12
+	not	%r13
+	not	%r14
+	not	%r15
+	and	%rax,%r12
+	and	%rax,%r13
+	and	%rax,%r14
+	and	%rax,%r15
+
+	neg	%r10			# mov %r10,%cf
+	adc	8*0($tptr),%r12
+	adc	8*1($tptr),%r13
+	adc	8*2($tptr),%r14
+	adc	8*3($tptr),%r15
 	mov	%r12,8*0($rptr)
-	sbb	16*3($nptr),%r15
-	lea	16*4($nptr),$nptr
+	lea	8*4($tptr),$tptr
 	mov	%r13,8*1($rptr)
+	sbb	%r10,%r10		# mov %cf,%r10
 	mov	%r14,8*2($rptr)
 	mov	%r15,8*3($rptr)
 	lea	8*4($rptr),$rptr
 
 	inc	%rcx			# pass %cf
 	jnz	.Lsqr4x_sub
-___
-}
-$code.=<<___;
+
 	mov	$num,%r10		# prepare for back-to-back call
 	neg	$num			# restore $num	
 	ret
-.size	bn_sqr8x_internal,.-bn_sqr8x_internal
+.size	__bn_post4x_internal,.-__bn_post4x_internal
 ___
+}
 {
 $code.=<<___;
 .globl	bn_from_montgomery
@@ -1897,39 +1996,32 @@ bn_from_mont8x:
 	push	%r13
 	push	%r14
 	push	%r15
-___
-$code.=<<___ if ($win64);
-	lea	-0x28(%rsp),%rsp
-	movaps	%xmm6,(%rsp)
-	movaps	%xmm7,0x10(%rsp)
-___
-$code.=<<___;
-	.byte	0x67
-	mov	${num}d,%r10d
+
 	shl	\$3,${num}d		# convert $num to bytes
-	shl	\$3+2,%r10d		# 4*$num
+	lea	($num,$num,2),%r10	# 3*$num in bytes
 	neg	$num
 	mov	($n0),$n0		# *n0
 
 	##############################################################
-	# ensure that stack frame doesn't alias with $aptr+4*$num
-	# modulo 4096, which covers ret[num], am[num] and n[2*num]
-	# (see bn_exp.c). this is done to allow memory disambiguation
-	# logic do its magic.
+	# Ensure that stack frame doesn't alias with $rptr+3*$num
+	# modulo 4096, which covers ret[num], am[num] and n[num]
+	# (see bn_exp.c). The stack is allocated to align with
+	# bn_power5's frame, and as bn_from_montgomery happens to be
+	# the last operation, we use the opportunity to cleanse it.
 	#
-	lea	-64(%rsp,$num,2),%r11
-	sub	$aptr,%r11
+	lea	-320(%rsp,$num,2),%r11
+	sub	$rptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lfrom_sp_alt
 	sub	%r11,%rsp		# align with $aptr
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
 	jmp	.Lfrom_sp_done
 
 .align	32
 .Lfrom_sp_alt:
-	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	lea	4096-320(,$num,2),%r10
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
@@ -1983,12 +2075,13 @@ $code.=<<___;
 ___
 $code.=<<___ if ($addx);
 	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
-	and	\$0x80100,%r11d
-	cmp	\$0x80100,%r11d
+	and	\$0x80108,%r11d
+	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
 	jne	.Lfrom_mont_nox
 
 	lea	(%rax,$num),$rptr
-	call	sqrx8x_reduction
+	call	__bn_sqrx8x_reduction
+	call	__bn_postx4x_internal
 
 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
@@ -1999,7 +2092,8 @@ $code.=<<___ if ($addx);
 .Lfrom_mont_nox:
 ___
 $code.=<<___;
-	call	sqr8x_reduction
+	call	__bn_sqr8x_reduction
+	call	__bn_post4x_internal
 
 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
@@ -2039,7 +2133,6 @@ $code.=<<___;
 .align	32
 bn_mulx4x_mont_gather5:
 .Lmulx4x_enter:
-	.byte	0x67
 	mov	%rsp,%rax
 	push	%rbx
 	push	%rbp
@@ -2047,40 +2140,33 @@ bn_mulx4x_mont_gather5:
 	push	%r13
 	push	%r14
 	push	%r15
-___
-$code.=<<___ if ($win64);
-	lea	-0x28(%rsp),%rsp
-	movaps	%xmm6,(%rsp)
-	movaps	%xmm7,0x10(%rsp)
-___
-$code.=<<___;
-	.byte	0x67
-	mov	${num}d,%r10d
+
 	shl	\$3,${num}d		# convert $num to bytes
-	shl	\$3+2,%r10d		# 4*$num
+	lea	($num,$num,2),%r10	# 3*$num in bytes
 	neg	$num			# -$num
 	mov	($n0),$n0		# *n0
 
 	##############################################################
-	# ensure that stack frame doesn't alias with $aptr+4*$num
-	# modulo 4096, which covers a[num], ret[num] and n[2*num]
-	# (see bn_exp.c). this is done to allow memory disambiguation
-	# logic do its magic. [excessive frame is allocated in order
-	# to allow bn_from_mont8x to clear it.]
+	# Ensure that stack frame doesn't alias with $rptr+3*$num
+	# modulo 4096, which covers ret[num], am[num] and n[num]
+	# (see bn_exp.c). This is done to allow memory disambiguation
+	# logic to do its magic. [An extra [num] is allocated in order
+	# to align with bn_power5's frame, which is cleansed after
+	# completing exponentiation. The extra 256 bytes are for the
+	# power mask calculated from the 7th argument, the index.]
 	#
-	lea	-64(%rsp,$num,2),%r11
-	sub	$ap,%r11
+	lea	-320(%rsp,$num,2),%r11
+	sub	$rp,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lmulx4xsp_alt
 	sub	%r11,%rsp		# align with $aptr
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
 	jmp	.Lmulx4xsp_done
 
-.align	32
 .Lmulx4xsp_alt:
-	lea	4096-64(,$num,2),%r10	# 4096-frame-$num
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
+	lea	4096-320(,$num,2),%r10
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
@@ -2106,12 +2192,7 @@ $code.=<<___;
 
 	mov	40(%rsp),%rsi		# restore %rsp
 	mov	\$1,%rax
-___
-$code.=<<___ if ($win64);
-	movaps	-88(%rsi),%xmm6
-	movaps	-72(%rsi),%xmm7
-___
-$code.=<<___;
+
 	mov	-48(%rsi),%r15
 	mov	-40(%rsi),%r14
 	mov	-32(%rsi),%r13
@@ -2126,14 +2207,16 @@ $code.=<<___;
 .type	mulx4x_internal,\@abi-omnipotent
 .align	32
 mulx4x_internal:
-	.byte	0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00	# mov	$num,8(%rsp)		# save -$num
-	.byte	0x67
+	mov	$num,8(%rsp)		# save -$num (it was in bytes)
+	mov	$num,%r10
 	neg	$num			# restore $num
 	shl	\$5,$num
-	lea	256($bp,$num),%r13
+	neg	%r10			# %r10=$num in bytes (positive copy)
+	lea	128($bp,$num),%r13	# end of powers table (+size optimization)
 	shr	\$5+5,$num
-	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
+	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument
 	sub	\$1,$num
+	lea	.Linc(%rip),%rax
 	mov	%r13,16+8(%rsp)		# end of b[num]
 	mov	$num,24+8(%rsp)		# inner counter
 	mov	$rp, 56+8(%rsp)		# save $rp
@@ -2144,52 +2227,92 @@ my $rptr=$bptr;
 my $STRIDE=2**5*8;		# 5 is "window size"
 my $N=$STRIDE/4;		# should match cache line size
 $code.=<<___;
-	mov	%r10,%r11
-	shr	\$`log($N/8)/log(2)`,%r10
-	and	\$`$N/8-1`,%r11
-	not	%r10
-	lea	.Lmagic_masks(%rip),%rax
-	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
-	lea	96($bp,%r11,8),$bptr	# pointer within 1st cache line
-	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
-	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
-	add	\$7,%r11
-	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
-	movq	24(%rax,%r10,8),%xmm7
-	and	\$7,%r11
+	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
+	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
+	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
+	lea	128($bp),$bptr		# size optimization
 
-	movq	`0*$STRIDE/4-96`($bptr),%xmm0
-	lea	$STRIDE($bptr),$tptr	# borrow $tptr
-	movq	`1*$STRIDE/4-96`($bptr),%xmm1
-	pand	%xmm4,%xmm0
-	movq	`2*$STRIDE/4-96`($bptr),%xmm2
-	pand	%xmm5,%xmm1
-	movq	`3*$STRIDE/4-96`($bptr),%xmm3
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	movq	`0*$STRIDE/4-96`($tptr),%xmm1
-	pand	%xmm7,%xmm3
+	pshufd	\$0,%xmm5,%xmm5		# broadcast index
+	movdqa	%xmm1,%xmm4
+	.byte	0x67
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+	.byte	0x67
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
+	movdqa	%xmm4,%xmm3
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
+	movdqa	%xmm0,`16*($i+0)+112`(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
+	movdqa	%xmm1,`16*($i+1)+112`(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
+	movdqa	%xmm2,`16*($i+2)+112`(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,`16*($i+3)+112`(%r10)
+	movdqa	%xmm4,%xmm3
+___
+}
+$code.=<<___;				# last iteration can be optimized
+	.byte	0x67
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,`16*($i+0)+112`(%r10)
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,`16*($i+1)+112`(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,`16*($i+2)+112`(%r10)
+
+	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
+	pand	`16*($i+1)-128`($bptr),%xmm1
+	pand	`16*($i+2)-128`($bptr),%xmm2
+	movdqa	%xmm3,`16*($i+3)+112`(%r10)
+	pand	`16*($i+3)-128`($bptr),%xmm3
 	por	%xmm2,%xmm0
-	movq	`1*$STRIDE/4-96`($tptr),%xmm2
-	por	%xmm3,%xmm0
-	.byte	0x67,0x67
-	pand	%xmm4,%xmm1
-	movq	`2*$STRIDE/4-96`($tptr),%xmm3
-
+	por	%xmm3,%xmm1
+___
+for($i=0;$i<$STRIDE/16-4;$i+=4) {
+$code.=<<___;
+	movdqa	`16*($i+0)-128`($bptr),%xmm4
+	movdqa	`16*($i+1)-128`($bptr),%xmm5
+	movdqa	`16*($i+2)-128`($bptr),%xmm2
+	pand	`16*($i+0)+112`(%r10),%xmm4
+	movdqa	`16*($i+3)-128`($bptr),%xmm3
+	pand	`16*($i+1)+112`(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	`16*($i+2)+112`(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	`16*($i+3)+112`(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+___
+}
+$code.=<<___;
+	pxor	%xmm1,%xmm0
+	pshufd	\$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	lea	$STRIDE($bptr),$bptr
 	movq	%xmm0,%rdx		# bp[0]
-	movq	`3*$STRIDE/4-96`($tptr),%xmm0
-	lea	2*$STRIDE($bptr),$bptr	# next &b[i]
-	pand	%xmm5,%xmm2
-	.byte	0x67,0x67
-	pand	%xmm6,%xmm3
-	##############################################################
-	# $tptr is chosen so that writing to top-most element of the
-	# vector occurs just "above" references to powers table,
-	# "above" modulo cache-line size, which effectively precludes
-	# possibility of memory disambiguation logic failure when
-	# accessing the table.
-	# 
-	lea	64+8*4+8(%rsp,%r11,8),$tptr
+	lea	64+8*4+8(%rsp),$tptr
 
 	mov	%rdx,$bi
 	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
@@ -2205,37 +2328,31 @@ $code.=<<___;
 	xor	$zero,$zero		# cf=0, of=0
 	mov	$mi,%rdx
 
-	por	%xmm2,%xmm1
-	pand	%xmm7,%xmm0
-	por	%xmm3,%xmm1
 	mov	$bptr,8+8(%rsp)		# off-load &b[i]
-	por	%xmm1,%xmm0
 
-	.byte	0x48,0x8d,0xb6,0x20,0x00,0x00,0x00	# lea	4*8($aptr),$aptr
+	lea	4*8($aptr),$aptr
 	adcx	%rax,%r13
 	adcx	$zero,%r14		# cf=0
 
-	mulx	0*16($nptr),%rax,%r10
+	mulx	0*8($nptr),%rax,%r10
 	adcx	%rax,%r15		# discarded
 	adox	%r11,%r10
-	mulx	1*16($nptr),%rax,%r11
+	mulx	1*8($nptr),%rax,%r11
 	adcx	%rax,%r10
 	adox	%r12,%r11
-	mulx	2*16($nptr),%rax,%r12
+	mulx	2*8($nptr),%rax,%r12
 	mov	24+8(%rsp),$bptr	# counter value
-	.byte	0x66
 	mov	%r10,-8*4($tptr)
 	adcx	%rax,%r11
 	adox	%r13,%r12
-	mulx	3*16($nptr),%rax,%r15
-	 .byte	0x67,0x67
+	mulx	3*8($nptr),%rax,%r15
 	 mov	$bi,%rdx
 	mov	%r11,-8*3($tptr)
 	adcx	%rax,%r12
 	adox	$zero,%r15		# of=0
-	.byte	0x48,0x8d,0x89,0x40,0x00,0x00,0x00	# lea	4*16($nptr),$nptr
+	lea	4*8($nptr),$nptr
 	mov	%r12,-8*2($tptr)
-	#jmp	.Lmulx4x_1st
+	jmp	.Lmulx4x_1st
 
 .align	32
 .Lmulx4x_1st:
@@ -2255,30 +2372,29 @@ $code.=<<___;
 	lea	4*8($tptr),$tptr
 
 	adox	%r15,%r10
-	mulx	0*16($nptr),%rax,%r15
+	mulx	0*8($nptr),%rax,%r15
 	adcx	%rax,%r10
 	adox	%r15,%r11
-	mulx	1*16($nptr),%rax,%r15
+	mulx	1*8($nptr),%rax,%r15
 	adcx	%rax,%r11
 	adox	%r15,%r12
-	mulx	2*16($nptr),%rax,%r15
+	mulx	2*8($nptr),%rax,%r15
 	mov	%r10,-5*8($tptr)
 	adcx	%rax,%r12
 	mov	%r11,-4*8($tptr)
 	adox	%r15,%r13
-	mulx	3*16($nptr),%rax,%r15
+	mulx	3*8($nptr),%rax,%r15
 	 mov	$bi,%rdx
 	mov	%r12,-3*8($tptr)
 	adcx	%rax,%r13
 	adox	$zero,%r15
-	lea	4*16($nptr),$nptr
+	lea	4*8($nptr),$nptr
 	mov	%r13,-2*8($tptr)
 
 	dec	$bptr			# of=0, pass cf
 	jnz	.Lmulx4x_1st
 
 	mov	8(%rsp),$num		# load -num
-	movq	%xmm0,%rdx		# bp[1]
 	adc	$zero,%r15		# modulo-scheduled
 	lea	($aptr,$num),$aptr	# rewind $aptr
 	add	%r15,%r14
@@ -2289,6 +2405,34 @@ $code.=<<___;
 
 .align	32
 .Lmulx4x_outer:
+	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
+	pxor	%xmm4,%xmm4
+	.byte	0x67,0x67
+	pxor	%xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+	movdqa	`16*($i+0)-128`($bptr),%xmm0
+	movdqa	`16*($i+1)-128`($bptr),%xmm1
+	movdqa	`16*($i+2)-128`($bptr),%xmm2
+	pand	`16*($i+0)+256`(%r10),%xmm0
+	movdqa	`16*($i+3)-128`($bptr),%xmm3
+	pand	`16*($i+1)+256`(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	`16*($i+2)+256`(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	`16*($i+3)+256`(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+___
+}
+$code.=<<___;
+	por	%xmm5,%xmm4
+	pshufd	\$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	lea	$STRIDE($bptr),$bptr
+	movq	%xmm0,%rdx		# m0=bp[i]
+
 	mov	$zero,($tptr)		# save top-most carry
 	lea	4*8($tptr,$num),$tptr	# rewind $tptr
 	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
@@ -2303,54 +2447,37 @@ $code.=<<___;
 	mulx	3*8($aptr),%rdx,%r14
 	adox	-2*8($tptr),%r12
 	adcx	%rdx,%r13
-	lea	($nptr,$num,2),$nptr	# rewind $nptr
+	lea	($nptr,$num),$nptr	# rewind $nptr
 	lea	4*8($aptr),$aptr
 	adox	-1*8($tptr),%r13
 	adcx	$zero,%r14
 	adox	$zero,%r14
 
-	.byte	0x67
 	mov	$mi,%r15
 	imulq	32+8(%rsp),$mi		# "t[0]"*n0
 
-	movq	`0*$STRIDE/4-96`($bptr),%xmm0
-	.byte	0x67,0x67
 	mov	$mi,%rdx
-	movq	`1*$STRIDE/4-96`($bptr),%xmm1
-	.byte	0x67
-	pand	%xmm4,%xmm0
-	movq	`2*$STRIDE/4-96`($bptr),%xmm2
-	.byte	0x67
-	pand	%xmm5,%xmm1
-	movq	`3*$STRIDE/4-96`($bptr),%xmm3
-	add	\$$STRIDE,$bptr		# next &b[i]
-	.byte	0x67
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	pand	%xmm7,%xmm3
 	xor	$zero,$zero		# cf=0, of=0
 	mov	$bptr,8+8(%rsp)		# off-load &b[i]
 
-	mulx	0*16($nptr),%rax,%r10
+	mulx	0*8($nptr),%rax,%r10
 	adcx	%rax,%r15		# discarded
 	adox	%r11,%r10
-	mulx	1*16($nptr),%rax,%r11
+	mulx	1*8($nptr),%rax,%r11
 	adcx	%rax,%r10
 	adox	%r12,%r11
-	mulx	2*16($nptr),%rax,%r12
+	mulx	2*8($nptr),%rax,%r12
 	adcx	%rax,%r11
 	adox	%r13,%r12
-	mulx	3*16($nptr),%rax,%r15
+	mulx	3*8($nptr),%rax,%r15
 	 mov	$bi,%rdx
-	 por	%xmm2,%xmm0
 	mov	24+8(%rsp),$bptr	# counter value
 	mov	%r10,-8*4($tptr)
-	 por	%xmm3,%xmm0
 	adcx	%rax,%r12
 	mov	%r11,-8*3($tptr)
 	adox	$zero,%r15		# of=0
 	mov	%r12,-8*2($tptr)
-	lea	4*16($nptr),$nptr
+	lea	4*8($nptr),$nptr
 	jmp	.Lmulx4x_inner
 
 .align	32
@@ -2375,20 +2502,20 @@ $code.=<<___;
 	adcx	$zero,%r14		# cf=0
 
 	adox	%r15,%r10
-	mulx	0*16($nptr),%rax,%r15
+	mulx	0*8($nptr),%rax,%r15
 	adcx	%rax,%r10
 	adox	%r15,%r11
-	mulx	1*16($nptr),%rax,%r15
+	mulx	1*8($nptr),%rax,%r15
 	adcx	%rax,%r11
 	adox	%r15,%r12
-	mulx	2*16($nptr),%rax,%r15
+	mulx	2*8($nptr),%rax,%r15
 	mov	%r10,-5*8($tptr)
 	adcx	%rax,%r12
 	adox	%r15,%r13
 	mov	%r11,-4*8($tptr)
-	mulx	3*16($nptr),%rax,%r15
+	mulx	3*8($nptr),%rax,%r15
 	 mov	$bi,%rdx
-	lea	4*16($nptr),$nptr
+	lea	4*8($nptr),$nptr
 	mov	%r12,-3*8($tptr)
 	adcx	%rax,%r13
 	adox	$zero,%r15
@@ -2398,7 +2525,6 @@ $code.=<<___;
 	jnz	.Lmulx4x_inner
 
 	mov	0+8(%rsp),$num		# load -num
-	movq	%xmm0,%rdx		# bp[i+1]
 	adc	$zero,%r15		# modulo-scheduled
 	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
 	mov	8+8(%rsp),$bptr		# re-load &b[i]
@@ -2411,20 +2537,26 @@ $code.=<<___;
 	cmp	%r10,$bptr
 	jb	.Lmulx4x_outer
 
-	mov	-16($nptr),%r10
+	mov	-8($nptr),%r10
+	mov	$zero,%r8
+	mov	($nptr,$num),%r12
+	lea	($nptr,$num),%rbp	# rewind $nptr
+	mov	$num,%rcx
+	lea	($tptr,$num),%rdi	# rewind $tptr
+	xor	%eax,%eax
 	xor	%r15,%r15
 	sub	%r14,%r10		# compare top-most words
 	adc	%r15,%r15
-	or	%r15,$zero
-	xor	\$1,$zero
-	lea	($tptr,$num),%rdi	# rewind $tptr
-	lea	($nptr,$num,2),$nptr	# rewind $nptr
-	.byte	0x67,0x67
-	sar	\$3+2,$num		# cf=0
-	lea	($nptr,$zero,8),%rbp
+	or	%r15,%r8
+	sar	\$3+2,%rcx
+	sub	%r8,%rax		# %rax=-%r8
 	mov	56+8(%rsp),%rdx		# restore rp
-	mov	$num,%rcx
-	jmp	.Lsqrx4x_sub		# common post-condition
+	dec	%r12			# so that after 'not' we get -n[0]
+	mov	8*1(%rbp),%r13
+	xor	%r8,%r8
+	mov	8*2(%rbp),%r14
+	mov	8*3(%rbp),%r15
+	jmp	.Lsqrx4x_sub_entry	# common post-condition
 .size	mulx4x_internal,.-mulx4x_internal
 ___
 }{
@@ -2448,7 +2580,6 @@ $code.=<<___;
 .align	32
 bn_powerx5:
 .Lpowerx5_enter:
-	.byte	0x67
 	mov	%rsp,%rax
 	push	%rbx
 	push	%rbp
@@ -2456,39 +2587,32 @@ bn_powerx5:
 	push	%r13
 	push	%r14
 	push	%r15
-___
-$code.=<<___ if ($win64);
-	lea	-0x28(%rsp),%rsp
-	movaps	%xmm6,(%rsp)
-	movaps	%xmm7,0x10(%rsp)
-___
-$code.=<<___;
-	.byte	0x67
-	mov	${num}d,%r10d
+
 	shl	\$3,${num}d		# convert $num to bytes
-	shl	\$3+2,%r10d		# 4*$num
+	lea	($num,$num,2),%r10	# 3*$num in bytes
 	neg	$num
 	mov	($n0),$n0		# *n0
 
 	##############################################################
-	# ensure that stack frame doesn't alias with $aptr+4*$num
-	# modulo 4096, which covers ret[num], am[num] and n[2*num]
-	# (see bn_exp.c). this is done to allow memory disambiguation
-	# logic do its magic.
+	# Ensure that stack frame doesn't alias with $rptr+3*$num
+	# modulo 4096, which covers ret[num], am[num] and n[num]
+	# (see bn_exp.c). This is done to allow memory disambiguation
+	# logic to do its magic. [The extra 256 bytes are for the power
+	# mask calculated from the 7th argument, the index.]
 	#
-	lea	-64(%rsp,$num,2),%r11
-	sub	$aptr,%r11
+	lea	-320(%rsp,$num,2),%r11
+	sub	$rptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lpwrx_sp_alt
 	sub	%r11,%rsp		# align with $aptr
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
 	jmp	.Lpwrx_sp_done
 
 .align	32
 .Lpwrx_sp_alt:
-	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	lea	4096-320(,$num,2),%r10
+	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
@@ -2519,10 +2643,15 @@ $code.=<<___;
 .Lpowerx5_body:
 
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 
 	mov	%r10,$num		# -num
 	mov	$aptr,$rptr
@@ -2534,12 +2663,7 @@ $code.=<<___;
 
 	mov	40(%rsp),%rsi		# restore %rsp
 	mov	\$1,%rax
-___
-$code.=<<___ if ($win64);
-	movaps	-88(%rsi),%xmm6
-	movaps	-72(%rsi),%xmm7
-___
-$code.=<<___;
+
 	mov	-48(%rsi),%r15
 	mov	-40(%rsi),%r14
 	mov	-32(%rsi),%r13
@@ -2973,11 +3097,11 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
 
 $code.=<<___;
 	movq	%xmm2,$nptr
-sqrx8x_reduction:
+__bn_sqrx8x_reduction:
 	xor	%eax,%eax		# initial top-most carry bit
 	mov	32+8(%rsp),%rbx		# n0
 	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
-	lea	-128($nptr,$num,2),%rcx	# end of n[]
+	lea	-8*8($nptr,$num),%rcx	# end of n[]
 	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
 	mov	%rcx, 0+8(%rsp)		# save end of n[]
 	mov	$tptr,8+8(%rsp)		# save end of t[]
@@ -3006,23 +3130,23 @@ sqrx8x_reduction:
 .align	32
 .Lsqrx8x_reduce:
 	mov	%r8, %rbx
-	mulx	16*0($nptr),%rax,%r8	# n[0]
+	mulx	8*0($nptr),%rax,%r8	# n[0]
 	adcx	%rbx,%rax		# discarded
 	adox	%r9,%r8
 
-	mulx	16*1($nptr),%rbx,%r9	# n[1]
+	mulx	8*1($nptr),%rbx,%r9	# n[1]
 	adcx	%rbx,%r8
 	adox	%r10,%r9
 
-	mulx	16*2($nptr),%rbx,%r10
+	mulx	8*2($nptr),%rbx,%r10
 	adcx	%rbx,%r9
 	adox	%r11,%r10
 
-	mulx	16*3($nptr),%rbx,%r11
+	mulx	8*3($nptr),%rbx,%r11
 	adcx	%rbx,%r10
 	adox	%r12,%r11
 
-	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rbx,%r12
+	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rbx,%r12
 	 mov	%rdx,%rax
 	 mov	%r8,%rdx
 	adcx	%rbx,%r11
@@ -3032,15 +3156,15 @@ sqrx8x_reduction:
 	 mov	%rax,%rdx
 	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
 
-	mulx	16*5($nptr),%rax,%r13
+	mulx	8*5($nptr),%rax,%r13
 	adcx	%rax,%r12
 	adox	%r14,%r13
 
-	mulx	16*6($nptr),%rax,%r14
+	mulx	8*6($nptr),%rax,%r14
 	adcx	%rax,%r13
 	adox	%r15,%r14
 
-	mulx	16*7($nptr),%rax,%r15
+	mulx	8*7($nptr),%rax,%r15
 	 mov	%rbx,%rdx
 	adcx	%rax,%r14
 	adox	$carry,%r15		# $carry is 0
@@ -3056,7 +3180,7 @@ sqrx8x_reduction:
 
 	mov	48+8(%rsp),%rdx		# pull n0*a[0]
 	add	8*0($tptr),%r8
-	lea	16*8($nptr),$nptr
+	lea	8*8($nptr),$nptr
 	mov	\$-8,%rcx
 	adcx	8*1($tptr),%r9
 	adcx	8*2($tptr),%r10
@@ -3075,35 +3199,35 @@ sqrx8x_reduction:
 .align	32
 .Lsqrx8x_tail:
 	mov	%r8,%rbx
-	mulx	16*0($nptr),%rax,%r8
+	mulx	8*0($nptr),%rax,%r8
 	adcx	%rax,%rbx
 	adox	%r9,%r8
 
-	mulx	16*1($nptr),%rax,%r9
+	mulx	8*1($nptr),%rax,%r9
 	adcx	%rax,%r8
 	adox	%r10,%r9
 
-	mulx	16*2($nptr),%rax,%r10
+	mulx	8*2($nptr),%rax,%r10
 	adcx	%rax,%r9
 	adox	%r11,%r10
 
-	mulx	16*3($nptr),%rax,%r11
+	mulx	8*3($nptr),%rax,%r11
 	adcx	%rax,%r10
 	adox	%r12,%r11
 
-	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rax,%r12
+	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rax,%r12
 	adcx	%rax,%r11
 	adox	%r13,%r12
 
-	mulx	16*5($nptr),%rax,%r13
+	mulx	8*5($nptr),%rax,%r13
 	adcx	%rax,%r12
 	adox	%r14,%r13
 
-	mulx	16*6($nptr),%rax,%r14
+	mulx	8*6($nptr),%rax,%r14
 	adcx	%rax,%r13
 	adox	%r15,%r14
 
-	mulx	16*7($nptr),%rax,%r15
+	mulx	8*7($nptr),%rax,%r15
 	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
 	adcx	%rax,%r14
 	adox	$carry,%r15
@@ -3119,7 +3243,7 @@ sqrx8x_reduction:
 
 	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
 	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
-	 lea	16*8($nptr),$nptr
+	 lea	8*8($nptr),$nptr
 	adc	8*0($tptr),%r8
 	adc	8*1($tptr),%r9
 	adc	8*2($tptr),%r10
@@ -3155,7 +3279,7 @@ sqrx8x_reduction:
 	adc	8*0($tptr),%r8
 	 movq	%xmm3,%rcx
 	adc	8*1($tptr),%r9
-	 mov	16*7($nptr),$carry
+	 mov	8*7($nptr),$carry
 	 movq	%xmm2,$nptr		# restore $nptr
 	adc	8*2($tptr),%r10
 	adc	8*3($tptr),%r11
@@ -3181,6 +3305,8 @@ sqrx8x_reduction:
 	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
 	cmp	8+8(%rsp),%r8		# end of t[]?
 	jb	.Lsqrx8x_reduction_loop
+	ret
+.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
 ___
 }
 ##############################################################
@@ -3188,52 +3314,59 @@ ___
 #
 {
 my ($rptr,$nptr)=("%rdx","%rbp");
-my @ri=map("%r$_",(10..13));
-my @ni=map("%r$_",(14..15));
 $code.=<<___;
-	xor	%ebx,%ebx
-	sub	%r15,%rsi		# compare top-most words
-	adc	%rbx,%rbx
+.align	32
+__bn_postx4x_internal:
+	mov	8*0($nptr),%r12
 	mov	%rcx,%r10		# -$num
-	or	%rbx,%rax
 	mov	%rcx,%r9		# -$num
-	xor	\$1,%rax
-	sar	\$3+2,%rcx		# cf=0
+	neg	%rax
+	sar	\$3+2,%rcx
 	#lea	48+8(%rsp,%r9),$tptr
-	lea	($nptr,%rax,8),$nptr
 	movq	%xmm1,$rptr		# restore $rptr
 	movq	%xmm1,$aptr		# prepare for back-to-back call
-	jmp	.Lsqrx4x_sub
+	dec	%r12			# so that after 'not' we get -n[0]
+	mov	8*1($nptr),%r13
+	xor	%r8,%r8
+	mov	8*2($nptr),%r14
+	mov	8*3($nptr),%r15
+	jmp	.Lsqrx4x_sub_entry
 
-.align	32
+.align	16
 .Lsqrx4x_sub:
-	.byte	0x66
-	mov	8*0($tptr),%r12
-	mov	8*1($tptr),%r13
-	sbb	16*0($nptr),%r12
-	mov	8*2($tptr),%r14
-	sbb	16*1($nptr),%r13
-	mov	8*3($tptr),%r15
-	lea	8*4($tptr),$tptr
-	sbb	16*2($nptr),%r14
+	mov	8*0($nptr),%r12
+	mov	8*1($nptr),%r13
+	mov	8*2($nptr),%r14
+	mov	8*3($nptr),%r15
+.Lsqrx4x_sub_entry:
+	andn	%rax,%r12,%r12
+	lea	8*4($nptr),$nptr
+	andn	%rax,%r13,%r13
+	andn	%rax,%r14,%r14
+	andn	%rax,%r15,%r15
+
+	neg	%r8			# mov %r8,%cf
+	adc	8*0($tptr),%r12
+	adc	8*1($tptr),%r13
+	adc	8*2($tptr),%r14
+	adc	8*3($tptr),%r15
 	mov	%r12,8*0($rptr)
-	sbb	16*3($nptr),%r15
-	lea	16*4($nptr),$nptr
+	lea	8*4($tptr),$tptr
 	mov	%r13,8*1($rptr)
+	sbb	%r8,%r8			# mov %cf,%r8
 	mov	%r14,8*2($rptr)
 	mov	%r15,8*3($rptr)
 	lea	8*4($rptr),$rptr
 
 	inc	%rcx
 	jnz	.Lsqrx4x_sub
-___
-}
-$code.=<<___;
+
 	neg	%r9			# restore $num
 
 	ret
-.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.size	__bn_postx4x_internal,.-__bn_postx4x_internal
 ___
+}
 }}}
 {
 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
@@ -3282,56 +3415,91 @@ bn_scatter5:
 
 .globl	bn_gather5
 .type	bn_gather5,\@abi-omnipotent
-.align	16
+.align	32
 bn_gather5:
-___
-$code.=<<___ if ($win64);
-.LSEH_begin_bn_gather5:
+.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
 	# I can't trust assembler to use specific encoding:-(
-	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
-	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
-	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
+	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
+	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	\$0x108,%rsp
+	lea	.Linc(%rip),%rax
+	and	\$-16,%rsp		# shouldn't be formally required
+
+	movd	$idx,%xmm5
+	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
+	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
+	lea	128($tbl),%r11		# size optimization
+	lea	128(%rsp),%rax		# size optimization
+
+	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to $idx and save result to stack
+#
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
+___
+$code.=<<___	if ($i);
+	movdqa	%xmm3,`16*($i-1)-128`(%rax)
 ___
 $code.=<<___;
-	mov	$idx,%r11d
-	shr	\$`log($N/8)/log(2)`,$idx
-	and	\$`$N/8-1`,%r11
-	not	$idx
-	lea	.Lmagic_masks(%rip),%rax
-	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
-	lea	128($tbl,%r11,8),$tbl	# pointer within 1st cache line
-	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
-	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
-	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
-	movq	24(%rax,$idx,8),%xmm7
-	jmp	.Lgather
-.align	16
-.Lgather:
-	movq	`0*$STRIDE/4-128`($tbl),%xmm0
-	movq	`1*$STRIDE/4-128`($tbl),%xmm1
-	pand	%xmm4,%xmm0
-	movq	`2*$STRIDE/4-128`($tbl),%xmm2
-	pand	%xmm5,%xmm1
-	movq	`3*$STRIDE/4-128`($tbl),%xmm3
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	pand	%xmm7,%xmm3
-	.byte	0x67,0x67
-	por	%xmm2,%xmm0
-	lea	$STRIDE($tbl),$tbl
-	por	%xmm3,%xmm0
+	movdqa	%xmm4,%xmm3
 
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
+	movdqa	%xmm0,`16*($i+0)-128`(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
+	movdqa	%xmm1,`16*($i+1)-128`(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
+	movdqa	%xmm2,`16*($i+2)-128`(%rax)
+	movdqa	%xmm4,%xmm2
+___
+}
+$code.=<<___;
+	movdqa	%xmm3,`16*($i-1)-128`(%rax)
+	jmp	.Lgather
+
+.align	32
+.Lgather:
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+___
+for($i=0;$i<$STRIDE/16;$i+=4) {
+$code.=<<___;
+	movdqa	`16*($i+0)-128`(%r11),%xmm0
+	movdqa	`16*($i+1)-128`(%r11),%xmm1
+	movdqa	`16*($i+2)-128`(%r11),%xmm2
+	pand	`16*($i+0)-128`(%rax),%xmm0
+	movdqa	`16*($i+3)-128`(%r11),%xmm3
+	pand	`16*($i+1)-128`(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	`16*($i+2)-128`(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	`16*($i+3)-128`(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+___
+}
+$code.=<<___;
+	por	%xmm5,%xmm4
+	lea	$STRIDE(%r11),%r11
+	pshufd	\$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
 	movq	%xmm0,($out)		# m0=bp[0]
 	lea	8($out),$out
 	sub	\$1,$num
 	jnz	.Lgather
-___
-$code.=<<___ if ($win64);
-	movaps	(%rsp),%xmm6
-	movaps	0x10(%rsp),%xmm7
-	lea	0x28(%rsp),%rsp
-___
-$code.=<<___;
+
+	lea	(%r10),%rsp
 	ret
 .LSEH_end_bn_gather5:
 .size	bn_gather5,.-bn_gather5
@@ -3339,9 +3507,9 @@ ___
 }
 $code.=<<___;
 .align	64
-.Lmagic_masks:
-	.long	0,0, 0,0, 0,0, -1,-1
-	.long	0,0, 0,0, 0,0,  0,0
+.Linc:
+	.long	0,0, 1,1
+	.long	2,2, 2,2
 .asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
@@ -3389,19 +3557,16 @@ mul_handler:
 
 	lea	.Lmul_epilogue(%rip),%r10
 	cmp	%r10,%rbx
-	jb	.Lbody_40
+	ja	.Lbody_40
 
 	mov	192($context),%r10	# pull $num
 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
+
 	jmp	.Lbody_proceed
 
 .Lbody_40:
 	mov	40(%rax),%rax		# pull saved stack pointer
 .Lbody_proceed:
-
-	movaps	-88(%rax),%xmm0
-	movaps	-72(%rax),%xmm1
-
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
@@ -3414,8 +3579,6 @@ mul_handler:
 	mov	%r13,224($context)	# restore context->R13
 	mov	%r14,232($context)	# restore context->R14
 	mov	%r15,240($context)	# restore context->R15
-	movups	%xmm0,512($context)	# restore context->Xmm6
-	movups	%xmm1,528($context)	# restore context->Xmm7
 
 .Lcommon_seh_tail:
 	mov	8(%rax),%rdi
@@ -3526,10 +3689,9 @@ ___
 $code.=<<___;
 .align	8
 .LSEH_info_bn_gather5:
-        .byte   0x01,0x0d,0x05,0x00
-        .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
-        .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
-        .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
+	.byte	0x01,0x0b,0x03,0x0a
+	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
+	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
 .align	8
 ___
 }
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index 5696965e9a09..86264ae6315f 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -125,6 +125,7 @@
 #ifndef HEADER_BN_H
 # define HEADER_BN_H
 
+# include <limits.h>
 # include <openssl/e_os2.h>
 # ifndef OPENSSL_NO_FP_API
 #  include <stdio.h>            /* FILE */
@@ -721,8 +722,17 @@ const BIGNUM *BN_get0_nist_prime_521(void);
 
 /* library internal functions */
 
-# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
-        (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2))
+/*
+ * NULL if rounding 'bits' up to words would overflow int; (a) if that word
+ * count fits (a)->dmax; else bn_expand2(). 'bits' is evaluated repeatedly.
+ */
+# define bn_expand(a,bits) \
+    ( \
+        (bits) > (INT_MAX - BN_BITS2 + 1) ? NULL \
+        : ((((bits)+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? (a) \
+        : bn_expand2((a),((bits)+BN_BITS2-1)/BN_BITS2) \
+    )
+
 # define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
 BIGNUM *bn_expand2(BIGNUM *a, int words);
 # ifndef OPENSSL_NO_DEPRECATED
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index 6d30d1e0fff5..1670f01d1d8c 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -110,6 +110,7 @@
  */
 
 #include "cryptlib.h"
+#include "constant_time_locl.h"
 #include "bn_lcl.h"
 
 #include <stdlib.h>
@@ -606,15 +607,17 @@ static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
 
 static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
                                         unsigned char *buf, int idx,
-                                        int width)
+                                        int window)
 {
-    size_t i, j;
+    int i, j;
+    int width = 1 << window;
+    BN_ULONG *table = (BN_ULONG *)buf;
 
     if (top > b->top)
         top = b->top;           /* this works because 'buf' is explicitly
                                  * zeroed */
-    for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
-        buf[j] = ((unsigned char *)b->d)[i];
+    for (i = 0, j = idx; i < top; i++, j += width) {
+        table[j] = b->d[i];
     }
 
     return 1;
@@ -622,15 +625,51 @@ static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
 
 static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
                                           unsigned char *buf, int idx,
-                                          int width)
+                                          int window)
 {
-    size_t i, j;
+    int i, j;
+    int width = 1 << window;
+    volatile BN_ULONG *table = (volatile BN_ULONG *)buf;
 
     if (bn_wexpand(b, top) == NULL)
         return 0;
 
-    for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
-        ((unsigned char *)b->d)[i] = buf[j];
+    if (window <= 3) {
+        for (i = 0; i < top; i++, table += width) {
+            BN_ULONG acc = 0;
+
+            for (j = 0; j < width; j++) {
+                acc |= table[j] &
+                       ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
+            }
+
+            b->d[i] = acc;
+        }
+    } else {
+        int xstride = 1 << (window - 2);
+        BN_ULONG y0, y1, y2, y3;
+
+        i = idx >> (window - 2);        /* equivalent of idx / xstride */
+        idx &= xstride - 1;             /* equivalent of idx % xstride */
+
+        y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
+        y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
+        y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
+        y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);
+
+        for (i = 0; i < top; i++, table += width) {
+            BN_ULONG acc = 0;
+
+            for (j = 0; j < xstride; j++) {
+                acc |= ( (table[j + 0 * xstride] & y0) |
+                         (table[j + 1 * xstride] & y1) |
+                         (table[j + 2 * xstride] & y2) |
+                         (table[j + 3 * xstride] & y3) )
+                       & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
+            }
+
+            b->d[i] = acc;
+        }
     }
 
     b->top = top;
@@ -749,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
     if (window >= 5) {
         window = 5;             /* ~5% improvement for RSA2048 sign, and even
                                  * for RSA4096 */
-        if ((top & 7) == 0)
-            powerbufLen += 2 * top * sizeof(m->d[0]);
+        /* reserve space for mont->N.d[] copy */
+        powerbufLen += top * sizeof(mont->N.d[0]);
     }
 #endif
     (void)0;
@@ -971,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                                const BN_ULONG *not_used, const BN_ULONG *np,
                                const BN_ULONG *n0, int num);
 
-        BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
+        BN_ULONG *n0 = mont->n0, *np;
 
         /*
          * BN_to_montgomery can contaminate words above .top [in
@@ -982,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
         for (i = tmp.top; i < top; i++)
             tmp.d[i] = 0;
 
-        if (top & 7)
-            np2 = np;
-        else
-            for (np2 = am.d + top, i = 0; i < top; i++)
-                np2[2 * i] = np[i];
+        /*
+         * copy mont->N.d[] to improve cache locality
+         */
+        for (np = am.d + top, i = 0; i < top; i++)
+            np[i] = mont->N.d[i];
 
         bn_scatter5(tmp.d, top, powerbuf, 0);
         bn_scatter5(am.d, am.top, powerbuf, 1);
@@ -996,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 # if 0
         for (i = 3; i < 32; i++) {
             /* Calculate a^i = a^(i-1) * a */
-            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
             bn_scatter5(tmp.d, top, powerbuf, i);
         }
 # else
@@ -1007,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
         }
         for (i = 3; i < 8; i += 2) {
             int j;
-            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
             bn_scatter5(tmp.d, top, powerbuf, i);
             for (j = 2 * i; j < 32; j *= 2) {
                 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -1015,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
             }
         }
         for (; i < 16; i += 2) {
-            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
             bn_scatter5(tmp.d, top, powerbuf, i);
             bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
             bn_scatter5(tmp.d, top, powerbuf, 2 * i);
         }
         for (; i < 32; i += 2) {
-            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
             bn_scatter5(tmp.d, top, powerbuf, i);
         }
 # endif
@@ -1050,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
             while (bits >= 0) {
                 wvalue = bn_get_bits5(p->d, bits - 4);
                 bits -= 5;
-                bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+                bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
             }
         }
 
-        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
+        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
         tmp.top = top;
         bn_correct_top(&tmp);
         if (ret) {
@@ -1065,9 +1104,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
     } else
 #endif
     {
-        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers))
+        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window))
             goto err;
-        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers))
+        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window))
             goto err;
 
         /*
@@ -1079,15 +1118,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
         if (window > 1) {
             if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
                 goto err;
-            if (!MOD_EXP_CTIME_COPY_TO_PREBUF
-                (&tmp, top, powerbuf, 2, numPowers))
+            if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2,
+                                              window))
                 goto err;
             for (i = 3; i < numPowers; i++) {
                 /* Calculate a^i = a^(i-1) * a */
                 if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx))
                     goto err;
-                if (!MOD_EXP_CTIME_COPY_TO_PREBUF
-                    (&tmp, top, powerbuf, i, numPowers))
+                if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i,
+                                                  window))
                     goto err;
             }
         }
@@ -1095,8 +1134,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
         bits--;
         for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
             wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
-        if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
-            (&tmp, top, powerbuf, wvalue, numPowers))
+        if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue,
+                                            window))
             goto err;
 
         /*
@@ -1116,8 +1155,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
             /*
              * Fetch the appropriate pre-computed value from the pre-buf
              */
-            if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
-                (&am, top, powerbuf, wvalue, numPowers))
+            if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue,
+                                                window))
                 goto err;
 
             /* Multiply the result into the intermediate result */
diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index ab10b957ba27..bfa31efc5621 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c
@@ -58,6 +58,7 @@
 
 #include <stdio.h>
 #include <ctype.h>
+#include <limits.h>
 #include "cryptlib.h"
 #include <openssl/buffer.h>
 #include "bn_lcl.h"
@@ -189,7 +190,11 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
         a++;
     }
 
-    for (i = 0; isxdigit((unsigned char)a[i]); i++) ;
+    for (i = 0; i <= (INT_MAX/4) && isxdigit((unsigned char)a[i]); i++)
+        continue;
+
+    if (i > INT_MAX/4)
+        goto err;
 
     num = i + neg;
     if (bn == NULL)
@@ -204,7 +209,7 @@ int BN_hex2bn(BIGNUM **bn, const char *a)
         BN_zero(ret);
     }
 
-    /* i is the number of hex digests; */
+    /* i is the number of hex digits */
     if (bn_expand(ret, i * 4) == NULL)
         goto err;
 
@@ -260,7 +265,11 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
         a++;
     }
 
-    for (i = 0; isdigit((unsigned char)a[i]); i++) ;
+    for (i = 0; i <= (INT_MAX/4) && isdigit((unsigned char)a[i]); i++)
+        continue;
+
+    if (i > INT_MAX/4)
+        goto err;
 
     num = i + neg;
     if (bn == NULL)
@@ -278,7 +287,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a)
         BN_zero(ret);
     }
 
-    /* i is the number of digests, a bit of an over expand; */
+    /* i is the number of digits, a bit of an over expand */
     if (bn_expand(ret, i * 4) == NULL)
         goto err;
 
diff --git a/crypto/bn/bn_recp.c b/crypto/bn/bn_recp.c
index 7497ac624d94..f047040efe03 100644
--- a/crypto/bn/bn_recp.c
+++ b/crypto/bn/bn_recp.c
@@ -65,6 +65,7 @@ void BN_RECP_CTX_init(BN_RECP_CTX *recp)
     BN_init(&(recp->N));
     BN_init(&(recp->Nr));
     recp->num_bits = 0;
+    recp->shift = 0;
     recp->flags = 0;
 }
 
diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c
index 774e6dc91905..2954b6eb7dcf 100644
--- a/crypto/cmac/cmac.c
+++ b/crypto/cmac/cmac.c
@@ -160,6 +160,14 @@ int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
             EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
             return 0;
         }
+
+        /* Switch to FIPS cipher implementation if possible */
+        if (cipher != NULL) {
+            const EVP_CIPHER *fcipher;
+            fcipher = FIPS_get_cipherbynid(EVP_CIPHER_nid(cipher));
+            if (fcipher != NULL)
+                cipher = fcipher;
+        }
         /*
          * Other algorithm blocking will be done in FIPS_cmac_init, via
          * FIPS_cipherinit().
diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index c9f674ba8e62..1925428f5ec5 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@@ -1016,11 +1016,11 @@ void *OPENSSL_stderr(void)
     return stderr;
 }
 
-int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len)
+int CRYPTO_memcmp(const volatile void *in_a, const volatile void *in_b, size_t len)
 {
     size_t i;
-    const unsigned char *a = in_a;
-    const unsigned char *b = in_b;
+    const volatile unsigned char *a = in_a;
+    const volatile unsigned char *b = in_b;
     unsigned char x = 0;
 
     for (i = 0; i < len; i++)
diff --git a/crypto/crypto.h b/crypto/crypto.h
index c450d7a3c374..6c644ce12a82 100644
--- a/crypto/crypto.h
+++ b/crypto/crypto.h
@@ -628,7 +628,7 @@ void OPENSSL_init(void);
  * into a defined order as the return value when a != b is undefined, other
  * than to be non-zero.
  */
-int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+int CRYPTO_memcmp(const volatile void *a, const volatile void *b, size_t len);
 
 /* BEGIN ERROR CODES */
 /*
diff --git a/crypto/dh/dh.h b/crypto/dh/dh.h
index 5498a9dc1060..a5bd9016aae8 100644
--- a/crypto/dh/dh.h
+++ b/crypto/dh/dh.h
@@ -174,7 +174,7 @@ struct dh_st {
 /* DH_check_pub_key error codes */
 # define DH_CHECK_PUBKEY_TOO_SMALL       0x01
 # define DH_CHECK_PUBKEY_TOO_LARGE       0x02
-# define DH_CHECK_PUBKEY_INVALID         0x03
+# define DH_CHECK_PUBKEY_INVALID         0x04
 
 /*
  * primes p where (p-1)/2 is prime too are called "safe"; we define this for
diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
index 5adedc0d264e..027704111432 100644
--- a/crypto/dh/dh_check.c
+++ b/crypto/dh/dh_check.c
@@ -160,13 +160,12 @@ int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *ret)
         goto err;
     BN_CTX_start(ctx);
     tmp = BN_CTX_get(ctx);
-    if (tmp == NULL)
+    if (tmp == NULL || !BN_set_word(tmp, 1))
         goto err;
-    BN_set_word(tmp, 1);
     if (BN_cmp(pub_key, tmp) <= 0)
         *ret |= DH_CHECK_PUBKEY_TOO_SMALL;
-    BN_copy(tmp, dh->p);
-    BN_sub_word(tmp, 1);
+    if (BN_copy(tmp, dh->p) == NULL || !BN_sub_word(tmp, 1))
+        goto err;
     if (BN_cmp(pub_key, tmp) >= 0)
         *ret |= DH_CHECK_PUBKEY_TOO_LARGE;
 
diff --git a/crypto/dsa/dsa_ameth.c b/crypto/dsa/dsa_ameth.c
index c40e1777ade1..cc83d6e6ad3b 100644
--- a/crypto/dsa/dsa_ameth.c
+++ b/crypto/dsa/dsa_ameth.c
@@ -191,6 +191,8 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
     STACK_OF(ASN1_TYPE) *ndsa = NULL;
     DSA *dsa = NULL;
 
+    int ret = 0;
+
     if (!PKCS8_pkey_get0(NULL, &p, &pklen, &palg, p8))
         return 0;
     X509_ALGOR_get0(NULL, &ptype, &pval, palg);
@@ -262,23 +264,21 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8)
     }
 
     EVP_PKEY_assign_DSA(pkey, dsa);
+
+    ret = 1;
+    goto done;
+
+ decerr:
+    DSAerr(DSA_F_DSA_PRIV_DECODE, DSA_R_DECODE_ERROR);
+ dsaerr:
+    DSA_free(dsa);
+ done:
     BN_CTX_free(ctx);
     if (ndsa)
         sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
     else
         ASN1_STRING_clear_free(privkey);
-
-    return 1;
-
- decerr:
-    DSAerr(DSA_F_DSA_PRIV_DECODE, EVP_R_DECODE_ERROR);
- dsaerr:
-    BN_CTX_free(ctx);
-    if (privkey)
-        ASN1_STRING_clear_free(privkey);
-    sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free);
-    DSA_free(dsa);
-    return 0;
+    return ret;
 }
 
 static int dsa_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey)
diff --git a/crypto/dso/dso_lib.c b/crypto/dso/dso_lib.c
index 3312450eae67..2beb7c1ba542 100644
--- a/crypto/dso/dso_lib.c
+++ b/crypto/dso/dso_lib.c
@@ -122,6 +122,7 @@ DSO *DSO_new_method(DSO_METHOD *meth)
         ret->meth = meth;
     ret->references = 1;
     if ((ret->meth->init != NULL) && !ret->meth->init(ret)) {
+        sk_void_free(ret->meth_data);
         OPENSSL_free(ret);
         ret = NULL;
     }
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index e6acfd59f0d4..7140860e245b 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -2001,6 +2001,7 @@ $code.=<<___;
 	push	%r15
 	sub	\$32*5+8, %rsp
 
+.Lpoint_double_shortcut$x:
 	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
 	mov	$a_ptr, $b_ptr			# backup copy
 	movdqu	0x10($a_ptr), %xmm1
@@ -2291,6 +2292,7 @@ $code.=<<___;
 	 mov	0x40+8*1($b_ptr), $acc6
 	 mov	0x40+8*2($b_ptr), $acc7
 	 mov	0x40+8*3($b_ptr), $acc0
+	movq	$b_ptr, %xmm1
 
 	lea	0x40-$bias($b_ptr), $a_ptr
 	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
@@ -2346,7 +2348,7 @@ $code.=<<___;
 	test	$acc0, $acc0
 	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
 	test	$acc1, $acc1
-	jz	.Ladd_proceed$x			# is_equal(S1,S2)?
+	jz	.Ladd_double$x			# is_equal(S1,S2)?
 
 	movq	%xmm0, $r_ptr			# restore $r_ptr
 	pxor	%xmm0, %xmm0
@@ -2358,6 +2360,13 @@ $code.=<<___;
 	movdqu	%xmm0, 0x50($r_ptr)
 	jmp	.Ladd_done$x
 
+.align	32
+.Ladd_double$x:
+	movq	%xmm1, $a_ptr			# restore $a_ptr
+	movq	%xmm0, $r_ptr			# restore $r_ptr
+	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
+	jmp	.Lpoint_double_shortcut$x
+
 .align	32
 .Ladd_proceed$x:
 	`&load_for_sqr("$R(%rsp)", "$src0")`
diff --git a/crypto/ec/ecp_nistp224.c b/crypto/ec/ecp_nistp224.c
index ed09f97ade68..d81cc9ce6b1a 100644
--- a/crypto/ec/ecp_nistp224.c
+++ b/crypto/ec/ecp_nistp224.c
@@ -1657,8 +1657,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
      */
     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
-        ret = 1;
-        goto err;
+        goto done;
     }
     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) ||
         (!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) ||
@@ -1736,6 +1735,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
     }
     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
 
+ done:
     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
                              nistp224_pre_comp_free,
                              nistp224_pre_comp_clear_free))
diff --git a/crypto/ec/ecp_nistp256.c b/crypto/ec/ecp_nistp256.c
index a5887086c638..78d191aac7af 100644
--- a/crypto/ec/ecp_nistp256.c
+++ b/crypto/ec/ecp_nistp256.c
@@ -2249,8 +2249,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
      */
     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
-        ret = 1;
-        goto err;
+        goto done;
     }
     if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
         (!BN_to_felem(y_tmp, &group->generator->Y)) ||
@@ -2337,6 +2336,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
     }
     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
 
+ done:
     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
                              nistp256_pre_comp_free,
                              nistp256_pre_comp_clear_free))
diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c
index 360b9a3516f6..c53a61bbfb69 100644
--- a/crypto/ec/ecp_nistp521.c
+++ b/crypto/ec/ecp_nistp521.c
@@ -2056,8 +2056,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
      */
     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
-        ret = 1;
-        goto err;
+        goto done;
     }
     if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
         (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
@@ -2115,6 +2114,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
     }
     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
 
+ done:
     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
                              nistp521_pre_comp_free,
                              nistp521_pre_comp_clear_free))
diff --git a/crypto/ec/ectest.c b/crypto/ec/ectest.c
index efab0b07b1d2..40a1f003259f 100644
--- a/crypto/ec/ectest.c
+++ b/crypto/ec/ectest.c
@@ -1758,9 +1758,18 @@ static void nistp_single_test(const struct nistp_test_params *test)
     if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx))
         ABORT;
 
+    /*
+     * We have not performed precomputation, so EC_GROUP_have_precompute_mult()
+     * should return false
+     */
+    if (EC_GROUP_have_precompute_mult(NISTP))
+        ABORT;
+
     /* now repeat all tests with precomputation */
     if (!EC_GROUP_precompute_mult(NISTP, ctx))
         ABORT;
+    if (!EC_GROUP_have_precompute_mult(NISTP))
+        ABORT;
 
     /* fixed point multiplication */
     EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
diff --git a/crypto/engine/eng_dyn.c b/crypto/engine/eng_dyn.c
index 3169b09ad865..40f30e9d585e 100644
--- a/crypto/engine/eng_dyn.c
+++ b/crypto/engine/eng_dyn.c
@@ -243,8 +243,10 @@ static int dynamic_set_data_ctx(ENGINE *e, dynamic_data_ctx **ctx)
      * If we lost the race to set the context, c is non-NULL and *ctx is the
      * context of the thread that won.
      */
-    if (c)
+    if (c) {
+        sk_OPENSSL_STRING_free(c->dirs);
         OPENSSL_free(c);
+    }
     return 1;
 }
 
diff --git a/crypto/evp/e_des.c b/crypto/evp/e_des.c
index aae13a675694..8ca65cd03ae1 100644
--- a/crypto/evp/e_des.c
+++ b/crypto/evp/e_des.c
@@ -71,12 +71,13 @@ typedef struct {
         DES_key_schedule ks;
     } ks;
     union {
-        void (*cbc) (const void *, void *, size_t, const void *, void *);
+        void (*cbc) (const void *, void *, size_t,
+                     const DES_key_schedule *, unsigned char *);
     } stream;
 } EVP_DES_KEY;
 
 # if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
-/* ---------^^^ this is not a typo, just a way to detect that
+/* ----------^^^ this is not a typo, just a way to detect that
  * assembler support was in general requested... */
 #  include "sparc_arch.h"
 
@@ -86,9 +87,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[];
 
 void des_t4_key_expand(const void *key, DES_key_schedule *ks);
 void des_t4_cbc_encrypt(const void *inp, void *out, size_t len,
-                        DES_key_schedule *ks, unsigned char iv[8]);
+                        const DES_key_schedule *ks, unsigned char iv[8]);
 void des_t4_cbc_decrypt(const void *inp, void *out, size_t len,
-                        DES_key_schedule *ks, unsigned char iv[8]);
+                        const DES_key_schedule *ks, unsigned char iv[8]);
 # endif
 
 static int des_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -130,7 +131,7 @@ static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 {
     EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data;
 
-    if (dat->stream.cbc) {
+    if (dat->stream.cbc != NULL) {
         (*dat->stream.cbc) (in, out, inl, &dat->ks.ks, ctx->iv);
         return 1;
     }
diff --git a/crypto/evp/e_des3.c b/crypto/evp/e_des3.c
index bf6c1d2d3d39..0e910d6d8085 100644
--- a/crypto/evp/e_des3.c
+++ b/crypto/evp/e_des3.c
@@ -75,7 +75,8 @@ typedef struct {
         DES_key_schedule ks[3];
     } ks;
     union {
-        void (*cbc) (const void *, void *, size_t, const void *, void *);
+        void (*cbc) (const void *, void *, size_t,
+                     const DES_key_schedule *, unsigned char *);
     } stream;
 } DES_EDE_KEY;
 # define ks1 ks.ks[0]
@@ -93,9 +94,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[];
 
 void des_t4_key_expand(const void *key, DES_key_schedule *ks);
 void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len,
-                             DES_key_schedule *ks, unsigned char iv[8]);
+                             const DES_key_schedule ks[3], unsigned char iv[8]);
 void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len,
-                             DES_key_schedule *ks, unsigned char iv[8]);
+                             const DES_key_schedule ks[3], unsigned char iv[8]);
 # endif
 
 static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
@@ -162,7 +163,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
     }
 # endif                         /* KSSL_DEBUG */
     if (dat->stream.cbc) {
-        (*dat->stream.cbc) (in, out, inl, &dat->ks, ctx->iv);
+        (*dat->stream.cbc) (in, out, inl, dat->ks.ks, ctx->iv);
         return 1;
     }
 
@@ -395,7 +396,7 @@ static int des_ede3_unwrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
     int rv = -1;
     if (inl < 24)
         return -1;
-    if (!out)
+    if (out == NULL)
         return inl - 16;
     memcpy(ctx->iv, wrap_iv, 8);
     /* Decrypt first block which will end up as icv */
@@ -438,7 +439,7 @@ static int des_ede3_wrap(EVP_CIPHER_CTX *ctx, unsigned char *out,
                          const unsigned char *in, size_t inl)
 {
     unsigned char sha1tmp[SHA_DIGEST_LENGTH];
-    if (!out)
+    if (out == NULL)
         return inl + 16;
     /* Copy input to output buffer + 8 so we have space for IV */
     memmove(out + 8, in, inl);
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
index bd6bf72fe487..980cfd23efe3 100755
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -43,7 +43,7 @@ die "can't locate x86_64-xlate.pl";
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-	$avx = ($1>=2.19) + ($1>=2.22);
+	$avx = ($1>=2.20) + ($1>=2.22);
 }
 
 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
@@ -489,7 +489,7 @@ $code.=<<___;
 ___
 $code.=<<___ if ($win64);
 	movaps	-0xd8(%rax),%xmm6
-	movaps	-0xd8(%rax),%xmm7
+	movaps	-0xc8(%rax),%xmm7
 	movaps	-0xb8(%rax),%xmm8
 	movaps	-0xa8(%rax),%xmm9
 	movaps	-0x98(%rax),%xmm10
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 4ff2d39aa7b2..f889f2018789 100755
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -92,7 +92,7 @@ die "can't locate x86_64-xlate.pl";
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-	$avx = ($1>=2.19) + ($1>=2.22);
+	$avx = ($1>=2.20) + ($1>=2.22);
 }
 
 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
diff --git a/crypto/modes/ctr128.c b/crypto/modes/ctr128.c
index f3bbcbf72376..bcafd6b6bfb1 100644
--- a/crypto/modes/ctr128.c
+++ b/crypto/modes/ctr128.c
@@ -67,23 +67,20 @@
 /* increment counter (128-bit int) by 1 */
 static void ctr128_inc(unsigned char *counter)
 {
-    u32 n = 16;
-    u8 c;
+    u32 n = 16, c = 1;
 
     do {
         --n;
-        c = counter[n];
-        ++c;
-        counter[n] = c;
-        if (c)
-            return;
+        c += counter[n];
+        counter[n] = (u8)c;
+        c >>= 8;
     } while (n);
 }
 
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
 static void ctr128_inc_aligned(unsigned char *counter)
 {
-    size_t *data, c, n;
+    size_t *data, c, d, n;
     const union {
         long one;
         char little;
@@ -91,20 +88,19 @@ static void ctr128_inc_aligned(unsigned char *counter)
         1
     };
 
-    if (is_endian.little) {
+    if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) {
         ctr128_inc(counter);
         return;
     }
 
     data = (size_t *)counter;
+    c = 1;
     n = 16 / sizeof(size_t);
     do {
         --n;
-        c = data[n];
-        ++c;
-        data[n] = c;
-        if (c)
-            return;
+        d = data[n] += c;
+        /* did addition carry? */
+        c = ((d - c) ^ d) >> (sizeof(size_t) * 8 - 1);
     } while (n);
 }
 #endif
@@ -144,14 +140,14 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
             }
 
 # if defined(STRICT_ALIGNMENT)
-            if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
-                0)
+            if (((size_t)in | (size_t)out | (size_t)ecount_buf)
+                % sizeof(size_t) != 0)
                 break;
 # endif
             while (len >= 16) {
                 (*block) (ivec, ecount_buf, key);
                 ctr128_inc_aligned(ivec);
-                for (; n < 16; n += sizeof(size_t))
+                for (n = 0; n < 16; n += sizeof(size_t))
                     *(size_t *)(out + n) =
                         *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
                 len -= 16;
@@ -189,16 +185,13 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
 /* increment upper 96 bits of 128-bit counter by 1 */
 static void ctr96_inc(unsigned char *counter)
 {
-    u32 n = 12;
-    u8 c;
+    u32 n = 12, c = 1;
 
     do {
         --n;
-        c = counter[n];
-        ++c;
-        counter[n] = c;
-        if (c)
-            return;
+        c += counter[n];
+        counter[n] = (u8)c;
+        c >>= 8;
     } while (n);
 }
 
diff --git a/crypto/opensslconf.h b/crypto/opensslconf.h
index b4d522e68505..f533508b152c 100644
--- a/crypto/opensslconf.h
+++ b/crypto/opensslconf.h
@@ -38,12 +38,18 @@ extern "C" {
 #ifndef OPENSSL_NO_SSL_TRACE
 # define OPENSSL_NO_SSL_TRACE
 #endif
+#ifndef OPENSSL_NO_SSL2
+# define OPENSSL_NO_SSL2
+#endif
 #ifndef OPENSSL_NO_STORE
 # define OPENSSL_NO_STORE
 #endif
 #ifndef OPENSSL_NO_UNIT_TEST
 # define OPENSSL_NO_UNIT_TEST
 #endif
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
+# define OPENSSL_NO_WEAK_SSL_CIPHERS
+#endif
 
 #endif /* OPENSSL_DOING_MAKEDEPEND */
 
@@ -86,12 +92,18 @@ extern "C" {
 # if defined(OPENSSL_NO_SSL_TRACE) && !defined(NO_SSL_TRACE)
 #  define NO_SSL_TRACE
 # endif
+# if defined(OPENSSL_NO_SSL2) && !defined(NO_SSL2)
+#  define NO_SSL2
+# endif
 # if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
 #  define NO_STORE
 # endif
 # if defined(OPENSSL_NO_UNIT_TEST) && !defined(NO_UNIT_TEST)
 #  define NO_UNIT_TEST
 # endif
+# if defined(OPENSSL_NO_WEAK_SSL_CIPHERS) && !defined(NO_WEAK_SSL_CIPHERS)
+#  define NO_WEAK_SSL_CIPHERS
+# endif
 #endif
 
 /* crypto/opensslconf.h.in */
diff --git a/crypto/opensslv.h b/crypto/opensslv.h
index 03b8c4843784..4334fd15cd87 100644
--- a/crypto/opensslv.h
+++ b/crypto/opensslv.h
@@ -30,11 +30,11 @@ extern "C" {
  * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
  *  major minor fix final patch/beta)
  */
-# define OPENSSL_VERSION_NUMBER  0x1000206fL
+# define OPENSSL_VERSION_NUMBER  0x1000207fL
 # ifdef OPENSSL_FIPS
-#  define OPENSSL_VERSION_TEXT    "OpenSSL 1.0.2f-fips  28 Jan 2016"
+#  define OPENSSL_VERSION_TEXT    "OpenSSL 1.0.2g-fips  1 Mar 2016"
 # else
-#  define OPENSSL_VERSION_TEXT    "OpenSSL 1.0.2f  28 Jan 2016"
+#  define OPENSSL_VERSION_TEXT    "OpenSSL 1.0.2g  1 Mar 2016"
 # endif
 # define OPENSSL_VERSION_PTEXT   " part of " OPENSSL_VERSION_TEXT
 
diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl
index 9c70b8c2c6e9..ee04221c7e1d 100755
--- a/crypto/perlasm/x86_64-xlate.pl
+++ b/crypto/perlasm/x86_64-xlate.pl
@@ -198,8 +198,11 @@ my %globals;
 	if ($gas) {
 	    # Solaris /usr/ccs/bin/as can't handle multiplications
 	    # in $self->{value}
-	    $self->{value} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
-	    $self->{value} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+	    my $value = $self->{value};
+	    $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+	    if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) {
+		$self->{value} = $value;
+	    }
 	    sprintf "\$%s",$self->{value};
 	} else {
 	    $self->{value} =~ s/(0b[0-1]+)/oct($1)/eig;
diff --git a/crypto/pkcs7/pk7_smime.c b/crypto/pkcs7/pk7_smime.c
index c4d3724d2a48..dc9b484078af 100644
--- a/crypto/pkcs7/pk7_smime.c
+++ b/crypto/pkcs7/pk7_smime.c
@@ -274,12 +274,29 @@ int PKCS7_verify(PKCS7 *p7, STACK_OF(X509) *certs, X509_STORE *store,
         PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_NO_CONTENT);
         return 0;
     }
+#if 0
+    /*
+     * NB: this test commented out because some versions of Netscape
+     * illegally include zero length content when signing data. Also
+     * Microsoft Authenticode includes a SpcIndirectDataContent data
+     * structure which describes the content to be protected by the
+     * signature, rather than directly embedding that content. So
+     * Authenticode implementations are also expected to use
+     * PKCS7_verify() with explicit external data, on non-detached
+     * PKCS#7 signatures.
+     *
+     * In OpenSSL 1.1 a new flag PKCS7_NO_DUAL_CONTENT has been
+     * introduced to disable this sanity check. For the 1.0.2 branch
+     * this change is not acceptable, so the check remains completely
+     * commented out (as it has been for a long time).
+     */
 
     /* Check for data and content: two sets of data */
     if (!PKCS7_get_detached(p7) && indata) {
         PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_CONTENT_AND_DATA_PRESENT);
         return 0;
     }
+#endif
 
     sinfos = PKCS7_get_signer_info(p7);
 
diff --git a/crypto/rsa/rsa_sign.c b/crypto/rsa/rsa_sign.c
index ed63a1d8b0e3..82ca8324dfbc 100644
--- a/crypto/rsa/rsa_sign.c
+++ b/crypto/rsa/rsa_sign.c
@@ -84,7 +84,7 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len,
         return 0;
     }
 #endif
-    if (rsa->meth->rsa_sign) {
+    if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign) {
         return rsa->meth->rsa_sign(type, m, m_len, sigret, siglen, rsa);
     }
     /* Special case: SSL signature, just check the length */
@@ -293,7 +293,7 @@ int RSA_verify(int dtype, const unsigned char *m, unsigned int m_len,
                const unsigned char *sigbuf, unsigned int siglen, RSA *rsa)
 {
 
-    if (rsa->meth->rsa_verify) {
+    if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_verify) {
         return rsa->meth->rsa_verify(dtype, m, m_len, sigbuf, siglen, rsa);
     }
 
diff --git a/crypto/srp/srp.h b/crypto/srp/srp.h
index d072536fec9b..028892a1ff5e 100644
--- a/crypto/srp/srp.h
+++ b/crypto/srp/srp.h
@@ -82,16 +82,21 @@ typedef struct SRP_gN_cache_st {
 DECLARE_STACK_OF(SRP_gN_cache)
 
 typedef struct SRP_user_pwd_st {
+    /* Owned by us. */
     char *id;
     BIGNUM *s;
     BIGNUM *v;
+    /* Not owned by us. */
     const BIGNUM *g;
     const BIGNUM *N;
+    /* Owned by us. */
     char *info;
 } SRP_user_pwd;
 
 DECLARE_STACK_OF(SRP_user_pwd)
 
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd);
+
 typedef struct SRP_VBASE_st {
     STACK_OF(SRP_user_pwd) *users_pwd;
     STACK_OF(SRP_gN_cache) *gN_cache;
@@ -115,7 +120,12 @@ DECLARE_STACK_OF(SRP_gN)
 SRP_VBASE *SRP_VBASE_new(char *seed_key);
 int SRP_VBASE_free(SRP_VBASE *vb);
 int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file);
+
+/* This method ignores the configured seed and fails for an unknown user. */
 SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+/* NOTE: unlike in SRP_VBASE_get_by_user, caller owns the returned pointer.*/
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username);
+
 char *SRP_create_verifier(const char *user, const char *pass, char **salt,
                           char **verifier, const char *N, const char *g);
 int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt,
diff --git a/crypto/srp/srp_vfy.c b/crypto/srp/srp_vfy.c
index a3f1a8a0a4d5..26ad3e07b4bb 100644
--- a/crypto/srp/srp_vfy.c
+++ b/crypto/srp/srp_vfy.c
@@ -185,7 +185,7 @@ static char *t_tob64(char *dst, const unsigned char *src, int size)
     return olddst;
 }
 
-static void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
+void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
 {
     if (user_pwd == NULL)
         return;
@@ -247,6 +247,24 @@ static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v)
     return (vinfo->s != NULL && vinfo->v != NULL);
 }
 
+static SRP_user_pwd *srp_user_pwd_dup(SRP_user_pwd *src)
+{
+    SRP_user_pwd *ret;
+
+    if (src == NULL)
+        return NULL;
+    if ((ret = SRP_user_pwd_new()) == NULL)
+        return NULL;
+
+    SRP_user_pwd_set_gN(ret, src->g, src->N);
+    if (!SRP_user_pwd_set_ids(ret, src->id, src->info)
+        || !SRP_user_pwd_set_sv_BN(ret, BN_dup(src->s), BN_dup(src->v))) {
+            SRP_user_pwd_free(ret);
+            return NULL;
+    }
+    return ret;
+}
+
 SRP_VBASE *SRP_VBASE_new(char *seed_key)
 {
     SRP_VBASE *vb = (SRP_VBASE *)OPENSSL_malloc(sizeof(SRP_VBASE));
@@ -468,21 +486,50 @@ int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file)
 
 }
 
-SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+static SRP_user_pwd *find_user(SRP_VBASE *vb, char *username)
 {
     int i;
     SRP_user_pwd *user;
+
+    if (vb == NULL)
+        return NULL;
+
+    for (i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++) {
+        user = sk_SRP_user_pwd_value(vb->users_pwd, i);
+        if (strcmp(user->id, username) == 0)
+            return user;
+    }
+
+    return NULL;
+}
+
+/*
+ * This method ignores the configured seed and fails for an unknown user.
+ * Ownership of the returned pointer is not released to the caller.
+ * In other words, caller must not free the result.
+ */
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+{
+    return find_user(vb, username);
+}
+
+/*
+ * Ownership of the returned pointer is released to the caller.
+ * In other words, caller must free the result once done.
+ */
+SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username)
+{
+    SRP_user_pwd *user;
     unsigned char digv[SHA_DIGEST_LENGTH];
     unsigned char digs[SHA_DIGEST_LENGTH];
     EVP_MD_CTX ctxt;
 
     if (vb == NULL)
         return NULL;
-    for (i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++) {
-        user = sk_SRP_user_pwd_value(vb->users_pwd, i);
-        if (strcmp(user->id, username) == 0)
-            return user;
-    }
+
+    if ((user = find_user(vb, username)) != NULL)
+        return srp_user_pwd_dup(user);
+
     if ((vb->seed_key == NULL) ||
         (vb->default_g == NULL) || (vb->default_N == NULL))
         return NULL;
diff --git a/crypto/stack/stack.c b/crypto/stack/stack.c
index de437acf6a5c..fa50083e22b3 100644
--- a/crypto/stack/stack.c
+++ b/crypto/stack/stack.c
@@ -360,7 +360,7 @@ void *sk_set(_STACK *st, int i, void *value)
 
 void sk_sort(_STACK *st)
 {
-    if (st && !st->sorted) {
+    if (st && !st->sorted && st->comp != NULL) {
         int (*comp_func) (const void *, const void *);
 
         /*
diff --git a/crypto/x509/x509_vfy.c b/crypto/x509/x509_vfy.c
index 0429767032fd..4d34dbac9314 100644
--- a/crypto/x509/x509_vfy.c
+++ b/crypto/x509/x509_vfy.c
@@ -194,6 +194,9 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
     int num, j, retry;
     int (*cb) (int xok, X509_STORE_CTX *xctx);
     STACK_OF(X509) *sktmp = NULL;
+    int trust = X509_TRUST_UNTRUSTED;
+    int err;
+
     if (ctx->cert == NULL) {
         X509err(X509_F_X509_VERIFY_CERT, X509_R_NO_CERT_SET_FOR_US_TO_VERIFY);
         return -1;
@@ -216,7 +219,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
     if (((ctx->chain = sk_X509_new_null()) == NULL) ||
         (!sk_X509_push(ctx->chain, ctx->cert))) {
         X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
-        goto end;
+        ok = -1;
+        goto err;
     }
     CRYPTO_add(&ctx->cert->references, 1, CRYPTO_LOCK_X509);
     ctx->last_untrusted = 1;
@@ -225,7 +229,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
     if (ctx->untrusted != NULL
         && (sktmp = sk_X509_dup(ctx->untrusted)) == NULL) {
         X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
-        goto end;
+        ok = -1;
+        goto err;
     }
 
     num = sk_X509_num(ctx->chain);
@@ -249,7 +254,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
         if (ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) {
             ok = ctx->get_issuer(&xtmp, ctx, x);
             if (ok < 0)
-                goto end;
+                goto err;
             /*
              * If successful for now free up cert so it will be picked up
              * again later.
@@ -266,7 +271,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
             if (xtmp != NULL) {
                 if (!sk_X509_push(ctx->chain, xtmp)) {
                     X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
-                    goto end;
+                    ok = -1;
+                    goto err;
                 }
                 CRYPTO_add(&xtmp->references, 1, CRYPTO_LOCK_X509);
                 (void)sk_X509_delete_ptr(sktmp, xtmp);
@@ -314,7 +320,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
                     bad_chain = 1;
                     ok = cb(0, ctx);
                     if (!ok)
-                        goto end;
+                        goto err;
                 } else {
                     /*
                      * We have a match: replace certificate with store
@@ -347,25 +353,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
             ok = ctx->get_issuer(&xtmp, ctx, x);
 
             if (ok < 0)
-                goto end;
+                goto err;
             if (ok == 0)
                 break;
             x = xtmp;
             if (!sk_X509_push(ctx->chain, x)) {
                 X509_free(xtmp);
                 X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE);
-                ok = 0;
-                goto end;
+                ok = -1;
+                goto err;
             }
             num++;
         }
 
         /* we now have our chain, lets check it... */
-        i = check_trust(ctx);
+        if ((trust = check_trust(ctx)) == X509_TRUST_REJECTED) {
+            /* Callback already issued */
+            ok = 0;
+            goto err;
+        }
 
-        /* If explicitly rejected error */
-        if (i == X509_TRUST_REJECTED)
-            goto end;
         /*
          * If it's not explicitly trusted then check if there is an alternative
          * chain that could be used. We only do this if we haven't already
@@ -373,14 +380,14 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
          * chain checking
          */
         retry = 0;
-        if (i != X509_TRUST_TRUSTED
+        if (trust != X509_TRUST_TRUSTED
             && !(ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST)
             && !(ctx->param->flags & X509_V_FLAG_NO_ALT_CHAINS)) {
             while (j-- > 1) {
                 xtmp2 = sk_X509_value(ctx->chain, j - 1);
                 ok = ctx->get_issuer(&xtmp, ctx, xtmp2);
                 if (ok < 0)
-                    goto end;
+                    goto err;
                 /* Check if we found an alternate chain */
                 if (ok > 0) {
                     /*
@@ -410,7 +417,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
      * self signed certificate in which case we've indicated an error already
      * and set bad_chain == 1
      */
-    if (i != X509_TRUST_TRUSTED && !bad_chain) {
+    if (trust != X509_TRUST_TRUSTED && !bad_chain) {
         if ((chain_ss == NULL) || !ctx->check_issued(ctx, x, chain_ss)) {
             if (ctx->last_untrusted >= num)
                 ctx->error = X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY;
@@ -431,26 +438,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
         bad_chain = 1;
         ok = cb(0, ctx);
         if (!ok)
-            goto end;
+            goto err;
     }
 
     /* We have the chain complete: now we need to check its purpose */
     ok = check_chain_extensions(ctx);
 
     if (!ok)
-        goto end;
+        goto err;
 
     /* Check name constraints */
 
     ok = check_name_constraints(ctx);
 
     if (!ok)
-        goto end;
+        goto err;
 
     ok = check_id(ctx);
 
     if (!ok)
-        goto end;
+        goto err;
 
     /* We may as well copy down any DSA parameters that are required */
     X509_get_pubkey_parameters(NULL, ctx->chain);
@@ -462,16 +469,16 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
 
     ok = ctx->check_revocation(ctx);
     if (!ok)
-        goto end;
+        goto err;
 
-    i = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
-                                ctx->param->flags);
-    if (i != X509_V_OK) {
-        ctx->error = i;
+    err = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain,
+                                  ctx->param->flags);
+    if (err != X509_V_OK) {
+        ctx->error = err;
         ctx->current_cert = sk_X509_value(ctx->chain, ctx->error_depth);
         ok = cb(0, ctx);
         if (!ok)
-            goto end;
+            goto err;
     }
 
     /* At this point, we have a chain and need to verify it */
@@ -480,25 +487,28 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
     else
         ok = internal_verify(ctx);
     if (!ok)
-        goto end;
+        goto err;
 
 #ifndef OPENSSL_NO_RFC3779
     /* RFC 3779 path validation, now that CRL check has been done */
     ok = v3_asid_validate_path(ctx);
     if (!ok)
-        goto end;
+        goto err;
     ok = v3_addr_validate_path(ctx);
     if (!ok)
-        goto end;
+        goto err;
 #endif
 
     /* If we get this far evaluate policies */
     if (!bad_chain && (ctx->param->flags & X509_V_FLAG_POLICY_CHECK))
         ok = ctx->check_policy(ctx);
     if (!ok)
-        goto end;
+        goto err;
     if (0) {
- end:
+ err:
+        /* Ensure we return an error */
+        if (ok > 0)
+            ok = 0;
         X509_get_pubkey_parameters(NULL, ctx->chain);
     }
     if (sktmp != NULL)
diff --git a/doc/apps/ciphers.pod b/doc/apps/ciphers.pod
index 1c26e3b3da36..9643b4d48ca8 100644
--- a/doc/apps/ciphers.pod
+++ b/doc/apps/ciphers.pod
@@ -38,25 +38,21 @@ SSL v2 and for SSL v3/TLS v1.
 
 Like B<-v>, but include cipher suite codes in output (hex format).
 
-=item B<-ssl3>
+=item B<-ssl3>, B<-tls1>
 
-only include SSL v3 ciphers.
+This lists ciphers compatible with any of SSLv3, TLSv1, TLSv1.1 or TLSv1.2.
 
 =item B<-ssl2>
 
-only include SSL v2 ciphers.
-
-=item B<-tls1>
-
-only include TLS v1 ciphers.
+Only include SSLv2 ciphers.
 
 =item B<-h>, B<-?>
 
-print a brief usage message.
+Print a brief usage message.
 
 =item B<cipherlist>
 
-a cipher list to convert to a cipher preference list. If it is not included
+A cipher list to convert to a cipher preference list. If it is not included
 then the default cipher list will be used. The format is described below.
 
 =back
@@ -109,9 +105,10 @@ The following is a list of all permitted cipher strings and their meanings.
 
 =item B<DEFAULT>
 
-the default cipher list. This is determined at compile time and
-is normally B<ALL:!EXPORT:!aNULL:!eNULL:!SSLv2>. This must be the firstcipher string
-specified.
+The default cipher list.
+This is determined at compile time and is normally
+B<ALL:!EXPORT:!aNULL:!eNULL:!SSLv2>.
+When used, this must be the first cipherstring specified.
 
 =item B<COMPLEMENTOFDEFAULT>
 
@@ -139,34 +136,46 @@ than 128 bits, and some cipher suites with 128-bit keys.
 
 =item B<LOW>
 
-"low" encryption cipher suites, currently those using 64 or 56 bit encryption algorithms
-but excluding export cipher suites.
+Low strength encryption cipher suites, currently those using 64 or 56 bit
+encryption algorithms but excluding export cipher suites.
+As of OpenSSL 1.0.2g, these are disabled in default builds.
 
 =item B<EXP>, B<EXPORT>
 
-export encryption algorithms. Including 40 and 56 bits algorithms.
+Export strength encryption algorithms, including 40-bit and 56-bit algorithms.
+As of OpenSSL 1.0.2g, these are disabled in default builds.
 
 =item B<EXPORT40>
 
-40 bit export encryption algorithms
+40-bit export encryption algorithms.
+As of OpenSSL 1.0.2g, these are disabled in default builds.
 
 =item B<EXPORT56>
 
-56 bit export encryption algorithms. In OpenSSL 0.9.8c and later the set of
+56-bit export encryption algorithms. In OpenSSL 0.9.8c and later the set of
 56 bit export ciphers is empty unless OpenSSL has been explicitly configured
 with support for experimental ciphers.
+As of OpenSSL 1.0.2g, these are disabled in default builds.
 
 =item B<eNULL>, B<NULL>
 
-the "NULL" ciphers that is those offering no encryption. Because these offer no
-encryption at all and are a security risk they are disabled unless explicitly
-included.
+The "NULL" ciphers, that is, those offering no encryption. Because these offer no
+encryption at all and are a security risk they are not enabled via either the
+B<DEFAULT> or B<ALL> cipher strings.
+Be careful when building cipherlists out of lower-level primitives such as
+B<kRSA> or B<aECDSA> as these do overlap with the B<eNULL> ciphers.
+When in doubt, include B<!eNULL> in your cipherlist.
 
 =item B<aNULL>
 
-the cipher suites offering no authentication. This is currently the anonymous
+The cipher suites offering no authentication. This is currently the anonymous
 DH algorithms and anonymous ECDH algorithms. These cipher suites are vulnerable
 to a "man in the middle" attack and so their use is normally discouraged.
+These are excluded from the B<DEFAULT> ciphers, but included in the B<ALL>
+ciphers.
+Be careful when building cipherlists out of lower-level primitives such as
+B<kDHE> or B<AES> as these do overlap with the B<aNULL> ciphers.
+When in doubt, include B<!aNULL> in your cipherlist.
 
 =item B<kRSA>, B<RSA>
 
@@ -582,11 +591,11 @@ Note: these ciphers can also be used in SSL v3.
 =head2 Deprecated SSL v2.0 cipher suites.
 
  SSL_CK_RC4_128_WITH_MD5                 RC4-MD5
- SSL_CK_RC4_128_EXPORT40_WITH_MD5        EXP-RC4-MD5
- SSL_CK_RC2_128_CBC_WITH_MD5             RC2-MD5
- SSL_CK_RC2_128_CBC_EXPORT40_WITH_MD5    EXP-RC2-MD5
+ SSL_CK_RC4_128_EXPORT40_WITH_MD5        Not implemented.
+ SSL_CK_RC2_128_CBC_WITH_MD5             RC2-CBC-MD5
+ SSL_CK_RC2_128_CBC_EXPORT40_WITH_MD5    Not implemented.
  SSL_CK_IDEA_128_CBC_WITH_MD5            IDEA-CBC-MD5
- SSL_CK_DES_64_CBC_WITH_MD5              DES-CBC-MD5
+ SSL_CK_DES_64_CBC_WITH_MD5              Not implemented.
  SSL_CK_DES_192_EDE3_CBC_WITH_MD5        DES-CBC3-MD5
 
 =head1 NOTES
diff --git a/doc/apps/pkeyutl.pod b/doc/apps/pkeyutl.pod
index 27be9a90079f..5da347c97d32 100644
--- a/doc/apps/pkeyutl.pod
+++ b/doc/apps/pkeyutl.pod
@@ -137,6 +137,19 @@ Unless otherwise mentioned all algorithms support the B<digest:alg> option
 which specifies the digest in use for sign, verify and verifyrecover operations.
 The value B<alg> should represent a digest name as used in the
 EVP_get_digestbyname() function for example B<sha1>.
+This value is used only for sanity-checking the lengths of data passed in to
+B<pkeyutl> and for creating the structures that make up the signature
+(e.g. B<DigestInfo> in RSASSA PKCS#1 v1.5 signatures).
+In the case of RSA, ECDSA and DSA signatures, this utility
+will not perform hashing on input data but rather use the data directly as
+input to the signature algorithm. Depending on key type, signature type and mode
+of padding, the maximum acceptable lengths of input data differ. In general,
+with RSA the signed data can't be longer than the key modulus; in the case of ECDSA
+and DSA the data shouldn't be longer than the field size, otherwise it will be
+silently truncated to the field size.
+
+In other words, if the value of digest is B<sha1> the input should be the 20-byte
+binary encoding of the SHA-1 hash function output.
 
 =head1 RSA ALGORITHM
 
diff --git a/doc/apps/req.pod b/doc/apps/req.pod
index 54a4d394d282..30653e509357 100644
--- a/doc/apps/req.pod
+++ b/doc/apps/req.pod
@@ -347,9 +347,12 @@ configuration file values.
 
 =item B<default_bits>
 
-This specifies the default key size in bits. If not specified then
-512 is used. It is used if the B<-new> option is used. It can be
-overridden by using the B<-newkey> option.
+Specifies the default key size in bits.
+
+This option is used in conjunction with the B<-new> option to generate
+a new key. It can be overridden by specifying an explicit key size in
+the B<-newkey> option. The smallest accepted key size is 512 bits. If
+no key size is specified then 2048 bits is used.
 
 =item B<default_keyfile>
 
diff --git a/doc/apps/s_client.pod b/doc/apps/s_client.pod
index 84d052706941..618df9659d3b 100644
--- a/doc/apps/s_client.pod
+++ b/doc/apps/s_client.pod
@@ -201,15 +201,11 @@ Use the PSK key B<key> when using a PSK cipher suite. The key is
 given as a hexadecimal number without leading 0x, for example -psk
 1a2b3c4d.
 
-=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2>
+=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-tls1_1>, B<-tls1_2>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2>
 
-these options disable the use of certain SSL or TLS protocols. By default
-the initial handshake uses a method which should be compatible with all
-servers and permit them to use SSL v3, SSL v2 or TLS as appropriate.
-
-Unfortunately there are still ancient and broken servers in use which
-cannot handle this technique and will fail to connect. Some servers only
-work if TLS is turned off.
+These options require or disable the use of the specified SSL or TLS protocols.
+By default the initial handshake uses a I<version-flexible> method which will
+negotiate the highest mutually supported protocol version.
 
 =item B<-fallback_scsv>
 
diff --git a/doc/apps/s_server.pod b/doc/apps/s_server.pod
index baca7792446f..6f4acb7006ff 100644
--- a/doc/apps/s_server.pod
+++ b/doc/apps/s_server.pod
@@ -217,11 +217,11 @@ Use the PSK key B<key> when using a PSK cipher suite. The key is
 given as a hexadecimal number without leading 0x, for example -psk
 1a2b3c4d.
 
-=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>
+=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-tls1_1>, B<-tls1_2>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2>
 
-these options disable the use of certain SSL or TLS protocols. By default
-the initial handshake uses a method which should be compatible with all
-servers and permit them to use SSL v3, SSL v2 or TLS as appropriate.
+These options require or disable the use of the specified SSL or TLS protocols.
+By default the initial handshake uses a I<version-flexible> method which will
+negotiate the highest mutually supported protocol version.
 
 =item B<-bugs>
 
diff --git a/doc/crypto/BIO_s_mem.pod b/doc/crypto/BIO_s_mem.pod
index 8f85e0dceeb7..9f239648d752 100644
--- a/doc/crypto/BIO_s_mem.pod
+++ b/doc/crypto/BIO_s_mem.pod
@@ -16,7 +16,7 @@ BIO_get_mem_ptr, BIO_new_mem_buf - memory BIO
  BIO_set_mem_buf(BIO *b,BUF_MEM *bm,int c)
  BIO_get_mem_ptr(BIO *b,BUF_MEM **pp)
 
- BIO *BIO_new_mem_buf(void *buf, int len);
+ BIO *BIO_new_mem_buf(const void *buf, int len);
 
 =head1 DESCRIPTION
 
@@ -61,7 +61,7 @@ BIO_get_mem_ptr() places the underlying BUF_MEM structure in B<pp>. It is
 a macro.
 
 BIO_new_mem_buf() creates a memory BIO using B<len> bytes of data at B<buf>,
-if B<len> is -1 then the B<buf> is assumed to be null terminated and its
+if B<len> is -1 then the B<buf> is assumed to be nul terminated and its
 length is determined by B<strlen>. The BIO is set to a read only state and
 as a result cannot be written to. This is useful when some data needs to be
 made available from a static area of memory in the form of a BIO. The
diff --git a/doc/ssl/SSL_CONF_cmd.pod b/doc/ssl/SSL_CONF_cmd.pod
index 2bf1a60e9013..e81d76ae779a 100644
--- a/doc/ssl/SSL_CONF_cmd.pod
+++ b/doc/ssl/SSL_CONF_cmd.pod
@@ -74,7 +74,7 @@ B<prime256v1>). Curve names are case sensitive.
 
 =item B<-named_curve>
 
-This sets the temporary curve used for ephemeral ECDH modes. Only used by 
+This sets the temporary curve used for ephemeral ECDH modes. Only used by
 servers
 
 The B<value> argument is a curve name or the special value B<auto> which
@@ -85,7 +85,7 @@ can be either the B<NIST> name (e.g. B<P-256>) or an OpenSSL OID name
 =item B<-cipher>
 
 Sets the cipher suite list to B<value>. Note: syntax checking of B<value> is
-currently not performed unless a B<SSL> or B<SSL_CTX> structure is 
+currently not performed unless a B<SSL> or B<SSL_CTX> structure is
 associated with B<cctx>.
 
 =item B<-cert>
@@ -111,9 +111,9 @@ operations are permitted.
 
 =item B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2>
 
-Disables protocol support for SSLv2, SSLv3, TLS 1.0, TLS 1.1 or TLS 1.2 
-by setting the corresponding options B<SSL_OP_NO_SSL2>, B<SSL_OP_NO_SSL3>,
-B<SSL_OP_NO_TLS1>, B<SSL_OP_NO_TLS1_1> and B<SSL_OP_NO_TLS1_2> respectively.
+Disables protocol support for SSLv2, SSLv3, TLSv1.0, TLSv1.1 or TLSv1.2
+by setting the corresponding options B<SSL_OP_NO_SSLv2>, B<SSL_OP_NO_SSLv3>,
+B<SSL_OP_NO_TLSv1>, B<SSL_OP_NO_TLSv1_1> and B<SSL_OP_NO_TLSv1_2> respectively.
 
 =item B<-bugs>
 
@@ -177,7 +177,7 @@ Note: the command prefix (if set) alters the recognised B<cmd> values.
 =item B<CipherString>
 
 Sets the cipher suite list to B<value>. Note: syntax checking of B<value> is
-currently not performed unless an B<SSL> or B<SSL_CTX> structure is 
+currently not performed unless an B<SSL> or B<SSL_CTX> structure is
 associated with B<cctx>.
 
 =item B<Certificate>
@@ -244,7 +244,7 @@ B<prime256v1>). Curve names are case sensitive.
 
 =item B<ECDHParameters>
 
-This sets the temporary curve used for ephemeral ECDH modes. Only used by 
+This sets the temporary curve used for ephemeral ECDH modes. Only used by
 servers
 
 The B<value> argument is a curve name or the special value B<Automatic> which
@@ -258,10 +258,11 @@ The supported versions of the SSL or TLS protocol.
 
 The B<value> argument is a comma separated list of supported protocols to
 enable or disable. If an protocol is preceded by B<-> that version is disabled.
-All versions are enabled by default, though applications may choose to
-explicitly disable some. Currently supported protocol values are B<SSLv2>,
-B<SSLv3>, B<TLSv1>, B<TLSv1.1> and B<TLSv1.2>. The special value B<ALL> refers
-to all supported versions.
+Currently supported protocol values are B<SSLv2>, B<SSLv3>, B<TLSv1>,
+B<TLSv1.1> and B<TLSv1.2>.
+All protocol versions other than B<SSLv2> are enabled by default.
+To avoid inadvertent enabling of B<SSLv2>, when SSLv2 is disabled, it is not
+possible to enable it via the B<Protocol> command.
 
 =item B<Options>
 
@@ -339,16 +340,16 @@ The value is a directory name.
 The order of operations is significant. This can be used to set either defaults
 or values which cannot be overridden. For example if an application calls:
 
- SSL_CONF_cmd(ctx, "Protocol", "-SSLv2");
+ SSL_CONF_cmd(ctx, "Protocol", "-SSLv3");
  SSL_CONF_cmd(ctx, userparam, uservalue);
 
-it will disable SSLv2 support by default but the user can override it. If 
+it will disable SSLv3 support by default but the user can override it. If
 however the call sequence is:
 
  SSL_CONF_cmd(ctx, userparam, uservalue);
- SSL_CONF_cmd(ctx, "Protocol", "-SSLv2");
+ SSL_CONF_cmd(ctx, "Protocol", "-SSLv3");
 
-SSLv2 is B<always> disabled and attempt to override this by the user are
+then SSLv3 is B<always> disabled and attempts to override this by the user are
 ignored.
 
 By checking the return code of SSL_CTX_cmd() it is possible to query if a
@@ -372,7 +373,7 @@ can be checked instead. If -3 is returned a required argument is missing
 and an error is indicated. If 0 is returned some other error occurred and
 this can be reported back to the user.
 
-The function SSL_CONF_cmd_value_type() can be used by applications to 
+The function SSL_CONF_cmd_value_type() can be used by applications to
 check for the existence of a command or to perform additional syntax
 checking or translation of the command value. For example if the return
 value is B<SSL_CONF_TYPE_FILE> an application could translate a relative
diff --git a/doc/ssl/SSL_CTX_new.pod b/doc/ssl/SSL_CTX_new.pod
index 491ac8c172cb..b8cc87978451 100644
--- a/doc/ssl/SSL_CTX_new.pod
+++ b/doc/ssl/SSL_CTX_new.pod
@@ -2,13 +2,55 @@
 
 =head1 NAME
 
-SSL_CTX_new - create a new SSL_CTX object as framework for TLS/SSL enabled functions
+SSL_CTX_new,
+SSLv23_method, SSLv23_server_method, SSLv23_client_method,
+TLSv1_2_method, TLSv1_2_server_method, TLSv1_2_client_method,
+TLSv1_1_method, TLSv1_1_server_method, TLSv1_1_client_method,
+TLSv1_method, TLSv1_server_method, TLSv1_client_method,
+SSLv3_method, SSLv3_server_method, SSLv3_client_method,
+SSLv2_method, SSLv2_server_method, SSLv2_client_method,
+DTLS_method, DTLS_server_method, DTLS_client_method,
+DTLSv1_2_method, DTLSv1_2_server_method, DTLSv1_2_client_method,
+DTLSv1_method, DTLSv1_server_method, DTLSv1_client_method -
+create a new SSL_CTX object as framework for TLS/SSL enabled functions
 
 =head1 SYNOPSIS
 
  #include <openssl/ssl.h>
 
  SSL_CTX *SSL_CTX_new(const SSL_METHOD *method);
+ const SSL_METHOD *SSLv23_method(void);
+ const SSL_METHOD *SSLv23_server_method(void);
+ const SSL_METHOD *SSLv23_client_method(void);
+ const SSL_METHOD *TLSv1_2_method(void);
+ const SSL_METHOD *TLSv1_2_server_method(void);
+ const SSL_METHOD *TLSv1_2_client_method(void);
+ const SSL_METHOD *TLSv1_1_method(void);
+ const SSL_METHOD *TLSv1_1_server_method(void);
+ const SSL_METHOD *TLSv1_1_client_method(void);
+ const SSL_METHOD *TLSv1_method(void);
+ const SSL_METHOD *TLSv1_server_method(void);
+ const SSL_METHOD *TLSv1_client_method(void);
+ #ifndef OPENSSL_NO_SSL3_METHOD
+ const SSL_METHOD *SSLv3_method(void);
+ const SSL_METHOD *SSLv3_server_method(void);
+ const SSL_METHOD *SSLv3_client_method(void);
+ #endif
+ #ifndef OPENSSL_NO_SSL2
+ const SSL_METHOD *SSLv2_method(void);
+ const SSL_METHOD *SSLv2_server_method(void);
+ const SSL_METHOD *SSLv2_client_method(void);
+ #endif
+
+ const SSL_METHOD *DTLS_method(void);
+ const SSL_METHOD *DTLS_server_method(void);
+ const SSL_METHOD *DTLS_client_method(void);
+ const SSL_METHOD *DTLSv1_2_method(void);
+ const SSL_METHOD *DTLSv1_2_server_method(void);
+ const SSL_METHOD *DTLSv1_2_client_method(void);
+ const SSL_METHOD *DTLSv1_method(void);
+ const SSL_METHOD *DTLSv1_server_method(void);
+ const SSL_METHOD *DTLSv1_client_method(void);
 
 =head1 DESCRIPTION
 
@@ -23,65 +65,88 @@ client only type. B<method> can be of the following types:
 
 =over 4
 
-=item SSLv2_method(void), SSLv2_server_method(void), SSLv2_client_method(void)
+=item SSLv23_method(), SSLv23_server_method(), SSLv23_client_method()
 
-A TLS/SSL connection established with these methods will only understand
-the SSLv2 protocol. A client will send out SSLv2 client hello messages
-and will also indicate that it only understand SSLv2. A server will only
-understand SSLv2 client hello messages.
+These are the general-purpose I<version-flexible> SSL/TLS methods.
+The actual protocol version used will be negotiated to the highest version
+mutually supported by the client and the server.
+The supported protocols are SSLv2, SSLv3, TLSv1, TLSv1.1 and TLSv1.2.
+Most applications should use these methods, and avoid the version specific
+methods described below.
 
-=item SSLv3_method(void), SSLv3_server_method(void), SSLv3_client_method(void)
+The list of protocols available can be further limited using the
+B<SSL_OP_NO_SSLv2>, B<SSL_OP_NO_SSLv3>, B<SSL_OP_NO_TLSv1>,
+B<SSL_OP_NO_TLSv1_1> and B<SSL_OP_NO_TLSv1_2> options of the
+L<SSL_CTX_set_options(3)> or L<SSL_set_options(3)> functions.
+Clients should avoid creating "holes" in the set of protocols they support.
+When disabling a protocol, make sure that you also disable either all previous
+or all subsequent protocol versions.
+In clients, when a protocol version is disabled without disabling I<all>
+previous protocol versions, the effect is to also disable all subsequent
+protocol versions.
+
+The SSLv2 and SSLv3 protocols are deprecated and should generally not be used.
+Applications should typically use L<SSL_CTX_set_options(3)> in combination with
+the B<SSL_OP_NO_SSLv3> flag to disable negotiation of SSLv3 via the above
+I<version-flexible> SSL/TLS methods.
+The B<SSL_OP_NO_SSLv2> option is set by default, and would need to be cleared
+via L<SSL_CTX_clear_options(3)> in order to enable negotiation of SSLv2.
+
+=item TLSv1_2_method(), TLSv1_2_server_method(), TLSv1_2_client_method()
 
 A TLS/SSL connection established with these methods will only understand the
-SSLv3 protocol. A client will send out SSLv3 client hello messages
-and will indicate that it only understands SSLv3. A server will only understand
-SSLv3 client hello messages. This especially means, that it will
-not understand SSLv2 client hello messages which are widely used for
-compatibility reasons, see SSLv23_*_method().
+TLSv1.2 protocol.  A client will send out TLSv1.2 client hello messages and
+will also indicate that it only understands TLSv1.2.  A server will only
+understand TLSv1.2 client hello messages.
 
-=item TLSv1_method(void), TLSv1_server_method(void), TLSv1_client_method(void)
+=item TLSv1_1_method(), TLSv1_1_server_method(), TLSv1_1_client_method()
 
 A TLS/SSL connection established with these methods will only understand the
-TLSv1 protocol. A client will send out TLSv1 client hello messages
-and will indicate that it only understands TLSv1. A server will only understand
-TLSv1 client hello messages. This especially means, that it will
-not understand SSLv2 client hello messages which are widely used for
-compatibility reasons, see SSLv23_*_method(). It will also not understand
-SSLv3 client hello messages.
+TLSv1.1 protocol.  A client will send out TLSv1.1 client hello messages and
+will also indicate that it only understands TLSv1.1.  A server will only
+understand TLSv1.1 client hello messages.
 
-=item SSLv23_method(void), SSLv23_server_method(void), SSLv23_client_method(void)
+=item TLSv1_method(), TLSv1_server_method(), TLSv1_client_method()
 
-A TLS/SSL connection established with these methods may understand the SSLv2,
-SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols.
+A TLS/SSL connection established with these methods will only understand the
+TLSv1 protocol.  A client will send out TLSv1 client hello messages and will
+indicate that it only understands TLSv1.  A server will only understand TLSv1
+client hello messages.
 
-If the cipher list does not contain any SSLv2 ciphersuites (the default
-cipher list does not) or extensions are required (for example server name)
-a client will send out TLSv1 client hello messages including extensions and
-will indicate that it also understands TLSv1.1, TLSv1.2 and permits a
-fallback to SSLv3. A server will support SSLv3, TLSv1, TLSv1.1 and TLSv1.2
-protocols. This is the best choice when compatibility is a concern.
+=item SSLv3_method(), SSLv3_server_method(), SSLv3_client_method()
 
-If any SSLv2 ciphersuites are included in the cipher list and no extensions
-are required then SSLv2 compatible client hellos will be used by clients and
-SSLv2 will be accepted by servers. This is B<not> recommended due to the
-insecurity of SSLv2 and the limited nature of the SSLv2 client hello
-prohibiting the use of extensions.
+A TLS/SSL connection established with these methods will only understand the
+SSLv3 protocol.  A client will send out SSLv3 client hello messages and will
+indicate that it only understands SSLv3.  A server will only understand SSLv3
+client hello messages.  The SSLv3 protocol is deprecated and should not be
+used.
+
+=item SSLv2_method(), SSLv2_server_method(), SSLv2_client_method()
+
+A TLS/SSL connection established with these methods will only understand the
+SSLv2 protocol.  A client will send out SSLv2 client hello messages and will
+also indicate that it only understands SSLv2.  A server will only understand
+SSLv2 client hello messages.  The SSLv2 protocol offers little to no security
+and should not be used.
+As of OpenSSL 1.0.2g, EXPORT ciphers and 56-bit DES are no longer available
+with SSLv2.
+
+=item DTLS_method(), DTLS_server_method(), DTLS_client_method()
+
+These are the version-flexible DTLS methods.
+
+=item DTLSv1_2_method(), DTLSv1_2_server_method(), DTLSv1_2_client_method()
+
+These are the version-specific methods for DTLSv1.2.
+
+=item DTLSv1_method(), DTLSv1_server_method(), DTLSv1_client_method()
+
+These are the version-specific methods for DTLSv1.
 
 =back
 
-The list of protocols available can later be limited using the SSL_OP_NO_SSLv2,
-SSL_OP_NO_SSLv3, SSL_OP_NO_TLSv1, SSL_OP_NO_TLSv1_1 and SSL_OP_NO_TLSv1_2
-options of the SSL_CTX_set_options() or SSL_set_options() functions.
-Using these options it is possible to choose e.g. SSLv23_server_method() and
-be able to negotiate with all possible clients, but to only allow newer
-protocols like TLSv1, TLSv1.1 or TLS v1.2.
-
-Applications which never want to support SSLv2 (even is the cipher string
-is configured to use SSLv2 ciphersuites) can set SSL_OP_NO_SSLv2.
-
-SSL_CTX_new() initializes the list of ciphers, the session cache setting,
-the callbacks, the keys and certificates and the options to its default
-values.
+SSL_CTX_new() initializes the list of ciphers, the session cache setting, the
+callbacks, the keys and certificates and the options to its default values.
 
 =head1 RETURN VALUES
 
@@ -91,8 +156,8 @@ The following return values can occur:
 
 =item NULL
 
-The creation of a new SSL_CTX object failed. Check the error stack to
-find out the reason.
+The creation of a new SSL_CTX object failed. Check the error stack to find out
+the reason.
 
 =item Pointer to an SSL_CTX object
 
@@ -102,6 +167,7 @@ The return value points to an allocated SSL_CTX object.
 
 =head1 SEE ALSO
 
+L<SSL_CTX_set_options(3)>, L<SSL_CTX_clear_options(3)>, L<SSL_set_options(3)>,
 L<SSL_CTX_free(3)|SSL_CTX_free(3)>, L<SSL_accept(3)|SSL_accept(3)>,
 L<ssl(3)|ssl(3)>,  L<SSL_set_connect_state(3)|SSL_set_connect_state(3)>
 
diff --git a/doc/ssl/SSL_CTX_set_options.pod b/doc/ssl/SSL_CTX_set_options.pod
index e80a72cd4d06..9a7e98c1d414 100644
--- a/doc/ssl/SSL_CTX_set_options.pod
+++ b/doc/ssl/SSL_CTX_set_options.pod
@@ -189,15 +189,25 @@ browser has a cert, it will crash/hang.  Works for 3.x and 4.xbeta
 =item SSL_OP_NO_SSLv2
 
 Do not use the SSLv2 protocol.
+As of OpenSSL 1.0.2g the B<SSL_OP_NO_SSLv2> option is set by default.
 
 =item SSL_OP_NO_SSLv3
 
 Do not use the SSLv3 protocol.
+It is recommended that applications should set this option.
 
 =item SSL_OP_NO_TLSv1
 
 Do not use the TLSv1 protocol.
 
+=item SSL_OP_NO_TLSv1_1
+
+Do not use the TLSv1.1 protocol.
+
+=item SSL_OP_NO_TLSv1_2
+
+Do not use the TLSv1.2 protocol.
+
 =item SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION
 
 When performing renegotiation as a server, always start a new session
diff --git a/doc/ssl/ssl.pod b/doc/ssl/ssl.pod
index 242087e691e3..70cca178a204 100644
--- a/doc/ssl/ssl.pod
+++ b/doc/ssl/ssl.pod
@@ -130,41 +130,86 @@ protocol methods defined in B<SSL_METHOD> structures.
 
 =over 4
 
-=item const SSL_METHOD *B<SSLv2_client_method>(void);
+=item const SSL_METHOD *B<SSLv23_method>(void);
 
-Constructor for the SSLv2 SSL_METHOD structure for a dedicated client.
+Constructor for the I<version-flexible> SSL_METHOD structure for
+clients, servers or both.
+See L<SSL_CTX_new(3)> for details.
 
-=item const SSL_METHOD *B<SSLv2_server_method>(void);
+=item const SSL_METHOD *B<SSLv23_client_method>(void);
 
-Constructor for the SSLv2 SSL_METHOD structure for a dedicated server.
+Constructor for the I<version-flexible> SSL_METHOD structure for
+clients.
 
-=item const SSL_METHOD *B<SSLv2_method>(void);
+=item const SSL_METHOD *B<SSLv23_server_method>(void);
 
-Constructor for the SSLv2 SSL_METHOD structure for combined client and server.
+Constructor for the I<version-flexible> SSL_METHOD structure for
+servers.
 
-=item const SSL_METHOD *B<SSLv3_client_method>(void);
+=item const SSL_METHOD *B<TLSv1_2_method>(void);
 
-Constructor for the SSLv3 SSL_METHOD structure for a dedicated client.
+Constructor for the TLSv1.2 SSL_METHOD structure for clients, servers
+or both.
 
-=item const SSL_METHOD *B<SSLv3_server_method>(void);
+=item const SSL_METHOD *B<TLSv1_2_client_method>(void);
 
-Constructor for the SSLv3 SSL_METHOD structure for a dedicated server.
+Constructor for the TLSv1.2 SSL_METHOD structure for clients.
 
-=item const SSL_METHOD *B<SSLv3_method>(void);
+=item const SSL_METHOD *B<TLSv1_2_server_method>(void);
 
-Constructor for the SSLv3 SSL_METHOD structure for combined client and server.
+Constructor for the TLSv1.2 SSL_METHOD structure for servers.
 
-=item const SSL_METHOD *B<TLSv1_client_method>(void);
+=item const SSL_METHOD *B<TLSv1_1_method>(void);
 
-Constructor for the TLSv1 SSL_METHOD structure for a dedicated client.
+Constructor for the TLSv1.1 SSL_METHOD structure for clients, servers
+or both.
 
-=item const SSL_METHOD *B<TLSv1_server_method>(void);
+=item const SSL_METHOD *B<TLSv1_1_client_method>(void);
 
-Constructor for the TLSv1 SSL_METHOD structure for a dedicated server.
+Constructor for the TLSv1.1 SSL_METHOD structure for clients.
+
+=item const SSL_METHOD *B<TLSv1_1_server_method>(void);
+
+Constructor for the TLSv1.1 SSL_METHOD structure for servers.
 
 =item const SSL_METHOD *B<TLSv1_method>(void);
 
-Constructor for the TLSv1 SSL_METHOD structure for combined client and server.
+Constructor for the TLSv1 SSL_METHOD structure for clients, servers
+or both.
+
+=item const SSL_METHOD *B<TLSv1_client_method>(void);
+
+Constructor for the TLSv1 SSL_METHOD structure for clients.
+
+=item const SSL_METHOD *B<TLSv1_server_method>(void);
+
+Constructor for the TLSv1 SSL_METHOD structure for servers.
+
+=item const SSL_METHOD *B<SSLv3_method>(void);
+
+Constructor for the SSLv3 SSL_METHOD structure for clients, servers
+or both.
+
+=item const SSL_METHOD *B<SSLv3_client_method>(void);
+
+Constructor for the SSLv3 SSL_METHOD structure for clients.
+
+=item const SSL_METHOD *B<SSLv3_server_method>(void);
+
+Constructor for the SSLv3 SSL_METHOD structure for servers.
+
+=item const SSL_METHOD *B<SSLv2_method>(void);
+
+Constructor for the SSLv2 SSL_METHOD structure for clients, servers
+or both.
+
+=item const SSL_METHOD *B<SSLv2_client_method>(void);
+
+Constructor for the SSLv2 SSL_METHOD structure for clients.
+
+=item const SSL_METHOD *B<SSLv2_server_method>(void);
+
+Constructor for the SSLv2 SSL_METHOD structure for servers.
 
 =back
 
diff --git a/engines/e_capi.c b/engines/e_capi.c
index f4cd2ffe7fa1..6e524633f3f0 100644
--- a/engines/e_capi.c
+++ b/engines/e_capi.c
@@ -114,6 +114,26 @@
 #  define CERT_SYSTEM_STORE_CURRENT_USER                  0x00010000
 # endif
 
+# ifndef ALG_SID_SHA_256
+#  define ALG_SID_SHA_256                 12
+# endif
+# ifndef ALG_SID_SHA_384
+#  define ALG_SID_SHA_384                 13
+# endif
+# ifndef ALG_SID_SHA_512
+#  define ALG_SID_SHA_512                 14
+# endif
+
+# ifndef CALG_SHA_256
+#  define CALG_SHA_256            (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_256)
+# endif
+# ifndef CALG_SHA_384
+#  define CALG_SHA_384            (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_384)
+# endif
+# ifndef CALG_SHA_512
+#  define CALG_SHA_512            (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_512)
+# endif
+
 # include <openssl/engine.h>
 # include <openssl/pem.h>
 # include <openssl/x509v3.h>
@@ -800,6 +820,18 @@ int capi_rsa_sign(int dtype, const unsigned char *m, unsigned int m_len,
     }
 /* Convert the signature type to a CryptoAPI algorithm ID */
     switch (dtype) {
+    case NID_sha256:
+        alg = CALG_SHA_256;
+        break;
+
+    case NID_sha384:
+        alg = CALG_SHA_384;
+        break;
+
+    case NID_sha512:
+        alg = CALG_SHA_512;
+        break;
+
     case NID_sha1:
         alg = CALG_SHA1;
         break;
diff --git a/ssl/Makefile b/ssl/Makefile
index 7b90fb037550..b6dee5b5ea52 100644
--- a/ssl/Makefile
+++ b/ssl/Makefile
@@ -15,7 +15,7 @@ KRB5_INCLUDES=
 CFLAGS= $(INCLUDES) $(CFLAG)
 
 GENERAL=Makefile README ssl-lib.com install.com
-TEST=ssltest.c heartbeat_test.c clienthellotest.c
+TEST=ssltest.c heartbeat_test.c clienthellotest.c sslv2conftest.c
 APPS=
 
 LIB=$(TOP)/libssl.a
@@ -399,14 +399,14 @@ s2_clnt.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h
 s2_clnt.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
 s2_clnt.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
 s2_clnt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
-s2_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h
-s2_clnt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s2_clnt.o: ../include/openssl/sha.h ../include/openssl/srtp.h
-s2_clnt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s2_clnt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s2_clnt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s2_clnt.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s2_clnt.o: ../include/openssl/x509_vfy.h s2_clnt.c ssl_locl.h
+s2_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
+s2_clnt.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+s2_clnt.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s2_clnt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s2_clnt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s2_clnt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s2_clnt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_clnt.c
+s2_clnt.o: ssl_locl.h
 s2_enc.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s2_enc.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s2_enc.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -435,18 +435,18 @@ s2_lib.o: ../include/openssl/ec.h ../include/openssl/ecdh.h
 s2_lib.o: ../include/openssl/ecdsa.h ../include/openssl/err.h
 s2_lib.o: ../include/openssl/evp.h ../include/openssl/hmac.h
 s2_lib.o: ../include/openssl/kssl.h ../include/openssl/lhash.h
-s2_lib.o: ../include/openssl/md5.h ../include/openssl/obj_mac.h
-s2_lib.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h
-s2_lib.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h
-s2_lib.o: ../include/openssl/pem.h ../include/openssl/pem2.h
-s2_lib.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
-s2_lib.o: ../include/openssl/rsa.h ../include/openssl/safestack.h
-s2_lib.o: ../include/openssl/sha.h ../include/openssl/srtp.h
-s2_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h
-s2_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h
-s2_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h
-s2_lib.o: ../include/openssl/tls1.h ../include/openssl/x509.h
-s2_lib.o: ../include/openssl/x509_vfy.h s2_lib.c ssl_locl.h
+s2_lib.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h
+s2_lib.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
+s2_lib.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
+s2_lib.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
+s2_lib.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
+s2_lib.o: ../include/openssl/safestack.h ../include/openssl/sha.h
+s2_lib.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
+s2_lib.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
+s2_lib.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
+s2_lib.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
+s2_lib.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_lib.c
+s2_lib.o: ssl_locl.h
 s2_meth.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
 s2_meth.o: ../include/openssl/buffer.h ../include/openssl/comp.h
 s2_meth.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
@@ -487,20 +487,19 @@ s2_pkt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h
 s2_pkt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h
 s2_pkt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_pkt.c
 s2_pkt.o: ssl_locl.h
-s2_srvr.o: ../crypto/constant_time_locl.h ../e_os.h ../include/openssl/asn1.h
-s2_srvr.o: ../include/openssl/bio.h ../include/openssl/buffer.h
-s2_srvr.o: ../include/openssl/comp.h ../include/openssl/crypto.h
-s2_srvr.o: ../include/openssl/dsa.h ../include/openssl/dtls1.h
-s2_srvr.o: ../include/openssl/e_os2.h ../include/openssl/ec.h
-s2_srvr.o: ../include/openssl/ecdh.h ../include/openssl/ecdsa.h
-s2_srvr.o: ../include/openssl/err.h ../include/openssl/evp.h
-s2_srvr.o: ../include/openssl/hmac.h ../include/openssl/kssl.h
-s2_srvr.o: ../include/openssl/lhash.h ../include/openssl/obj_mac.h
-s2_srvr.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h
-s2_srvr.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h
-s2_srvr.o: ../include/openssl/pem.h ../include/openssl/pem2.h
-s2_srvr.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h
-s2_srvr.o: ../include/openssl/rand.h ../include/openssl/rsa.h
+s2_srvr.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h
+s2_srvr.o: ../include/openssl/buffer.h ../include/openssl/comp.h
+s2_srvr.o: ../include/openssl/crypto.h ../include/openssl/dsa.h
+s2_srvr.o: ../include/openssl/dtls1.h ../include/openssl/e_os2.h
+s2_srvr.o: ../include/openssl/ec.h ../include/openssl/ecdh.h
+s2_srvr.o: ../include/openssl/ecdsa.h ../include/openssl/err.h
+s2_srvr.o: ../include/openssl/evp.h ../include/openssl/hmac.h
+s2_srvr.o: ../include/openssl/kssl.h ../include/openssl/lhash.h
+s2_srvr.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h
+s2_srvr.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h
+s2_srvr.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h
+s2_srvr.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h
+s2_srvr.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h
 s2_srvr.o: ../include/openssl/safestack.h ../include/openssl/sha.h
 s2_srvr.o: ../include/openssl/srtp.h ../include/openssl/ssl.h
 s2_srvr.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h
diff --git a/ssl/s2_lib.c b/ssl/s2_lib.c
index d55b93f76bb7..a8036b357f0e 100644
--- a/ssl/s2_lib.c
+++ b/ssl/s2_lib.c
@@ -156,6 +156,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
      128,
      },
 
+# if 0
 /* RC4_128_EXPORT40_WITH_MD5 */
     {
      1,
@@ -171,6 +172,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
      40,
      128,
      },
+# endif
 
 /* RC2_128_CBC_WITH_MD5 */
     {
@@ -188,6 +190,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
      128,
      },
 
+# if 0
 /* RC2_128_CBC_EXPORT40_WITH_MD5 */
     {
      1,
@@ -203,6 +206,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
      40,
      128,
      },
+# endif
 
 # ifndef OPENSSL_NO_IDEA
 /* IDEA_128_CBC_WITH_MD5 */
@@ -222,6 +226,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
      },
 # endif
 
+# if 0
 /* DES_64_CBC_WITH_MD5 */
     {
      1,
@@ -237,6 +242,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = {
      56,
      56,
      },
+# endif
 
 /* DES_192_EDE3_CBC_WITH_MD5 */
     {
diff --git a/ssl/s3_lib.c b/ssl/s3_lib.c
index f846cb5b7b01..4aac3b279280 100644
--- a/ssl/s3_lib.c
+++ b/ssl/s3_lib.c
@@ -198,6 +198,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      },
 
 /* Cipher 03 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_RSA_RC4_40_MD5,
@@ -212,6 +213,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      128,
      },
+#endif
 
 /* Cipher 04 */
     {
@@ -246,6 +248,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      },
 
 /* Cipher 06 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_RSA_RC2_40_MD5,
@@ -260,6 +263,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      128,
      },
+#endif
 
 /* Cipher 07 */
 #ifndef OPENSSL_NO_IDEA
@@ -280,6 +284,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
 #endif
 
 /* Cipher 08 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_RSA_DES_40_CBC_SHA,
@@ -294,8 +299,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      56,
      },
+#endif
 
 /* Cipher 09 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_RSA_DES_64_CBC_SHA,
@@ -310,6 +317,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+#endif
 
 /* Cipher 0A */
     {
@@ -329,6 +337,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
 
 /* The DH ciphers */
 /* Cipher 0B */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      0,
      SSL3_TXT_DH_DSS_DES_40_CBC_SHA,
@@ -343,8 +352,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      56,
      },
+#endif
 
 /* Cipher 0C */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_DH_DSS_DES_64_CBC_SHA,
@@ -359,6 +370,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+#endif
 
 /* Cipher 0D */
     {
@@ -377,6 +389,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      },
 
 /* Cipher 0E */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      0,
      SSL3_TXT_DH_RSA_DES_40_CBC_SHA,
@@ -391,8 +404,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      56,
      },
+#endif
 
 /* Cipher 0F */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_DH_RSA_DES_64_CBC_SHA,
@@ -407,6 +422,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+#endif
 
 /* Cipher 10 */
     {
@@ -426,6 +442,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
 
 /* The Ephemeral DH ciphers */
 /* Cipher 11 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_EDH_DSS_DES_40_CBC_SHA,
@@ -440,8 +457,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      56,
      },
+#endif
 
 /* Cipher 12 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_EDH_DSS_DES_64_CBC_SHA,
@@ -456,6 +475,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+#endif
 
 /* Cipher 13 */
     {
@@ -474,6 +494,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      },
 
 /* Cipher 14 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_EDH_RSA_DES_40_CBC_SHA,
@@ -488,8 +509,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      56,
      },
+#endif
 
 /* Cipher 15 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_EDH_RSA_DES_64_CBC_SHA,
@@ -504,6 +527,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+#endif
 
 /* Cipher 16 */
     {
@@ -522,6 +546,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      },
 
 /* Cipher 17 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_ADH_RC4_40_MD5,
@@ -536,6 +561,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      128,
      },
+#endif
 
 /* Cipher 18 */
     {
@@ -554,6 +580,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      },
 
 /* Cipher 19 */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_ADH_DES_40_CBC_SHA,
@@ -568,8 +595,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      128,
      },
+#endif
 
 /* Cipher 1A */
+#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_ADH_DES_64_CBC_SHA,
@@ -584,6 +613,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+#endif
 
 /* Cipher 1B */
     {
@@ -655,6 +685,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
 #ifndef OPENSSL_NO_KRB5
 /* The Kerberos ciphers*/
 /* Cipher 1E */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_KRB5_DES_64_CBC_SHA,
@@ -669,6 +700,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+# endif
 
 /* Cipher 1F */
     {
@@ -719,6 +751,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      },
 
 /* Cipher 22 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_KRB5_DES_64_CBC_MD5,
@@ -733,6 +766,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+# endif
 
 /* Cipher 23 */
     {
@@ -783,6 +817,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      },
 
 /* Cipher 26 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_KRB5_DES_40_CBC_SHA,
@@ -797,8 +832,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      56,
      },
+# endif
 
 /* Cipher 27 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_KRB5_RC2_40_CBC_SHA,
@@ -813,8 +850,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      128,
      },
+# endif
 
 /* Cipher 28 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_KRB5_RC4_40_SHA,
@@ -829,8 +868,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      128,
      },
+# endif
 
 /* Cipher 29 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_KRB5_DES_40_CBC_MD5,
@@ -845,8 +886,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      56,
      },
+# endif
 
 /* Cipher 2A */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_KRB5_RC2_40_CBC_MD5,
@@ -861,8 +904,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      128,
      },
+# endif
 
 /* Cipher 2B */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      SSL3_TXT_KRB5_RC4_40_MD5,
@@ -877,6 +922,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      40,
      128,
      },
+# endif
 #endif                          /* OPENSSL_NO_KRB5 */
 
 /* New AES ciphersuites */
@@ -1300,6 +1346,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
 # endif
 
     /* Cipher 62 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      TLS1_TXT_RSA_EXPORT1024_WITH_DES_CBC_SHA,
@@ -1314,8 +1361,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+# endif
 
     /* Cipher 63 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      TLS1_TXT_DHE_DSS_EXPORT1024_WITH_DES_CBC_SHA,
@@ -1330,8 +1379,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      56,
      },
+# endif
 
     /* Cipher 64 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      TLS1_TXT_RSA_EXPORT1024_WITH_RC4_56_SHA,
@@ -1346,8 +1397,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      128,
      },
+# endif
 
     /* Cipher 65 */
+# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS
     {
      1,
      TLS1_TXT_DHE_DSS_EXPORT1024_WITH_RC4_56_SHA,
@@ -1362,6 +1415,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
      56,
      128,
      },
+# endif
 
     /* Cipher 66 */
     {
@@ -4326,21 +4380,6 @@ int ssl3_shutdown(SSL *s)
         }
 #endif
     } else if (!(s->shutdown & SSL_RECEIVED_SHUTDOWN)) {
-        if (SSL_in_init(s)) {
-            /*
-             * We can't shutdown properly if we are in the middle of a
-             * handshake. Doing so is problematic because the peer may send a
-             * CCS before it acts on our close_notify. However we should not
-             * continue to process received handshake messages or CCS once our
-             * close_notify has been sent. Therefore any close_notify from
-             * the peer will be unreadable because we have not moved to the next
-             * cipher state. Its best just to avoid this can-of-worms. Return
-             * an error if we are wanting to wait for a close_notify from the
-             * peer and we are in init.
-             */
-            SSLerr(SSL_F_SSL3_SHUTDOWN, SSL_R_SHUTDOWN_WHILE_IN_INIT);
-            return -1;
-        }
         /*
          * If we are waiting for a close from our peer, we are closed
          */
diff --git a/ssl/ssl.h b/ssl/ssl.h
index ae8c92575e03..04d4007eeb8e 100644
--- a/ssl/ssl.h
+++ b/ssl/ssl.h
@@ -2713,7 +2713,6 @@ void ERR_load_SSL_strings(void);
 # define SSL_F_SSL3_SETUP_KEY_BLOCK                       157
 # define SSL_F_SSL3_SETUP_READ_BUFFER                     156
 # define SSL_F_SSL3_SETUP_WRITE_BUFFER                    291
-# define SSL_F_SSL3_SHUTDOWN                              396
 # define SSL_F_SSL3_WRITE_BYTES                           158
 # define SSL_F_SSL3_WRITE_PENDING                         159
 # define SSL_F_SSL_ADD_CERT_CHAIN                         318
diff --git a/ssl/ssl_conf.c b/ssl/ssl_conf.c
index 5478840deae9..8d3709d2b62c 100644
--- a/ssl/ssl_conf.c
+++ b/ssl/ssl_conf.c
@@ -330,11 +330,19 @@ static int cmd_Protocol(SSL_CONF_CTX *cctx, const char *value)
         SSL_FLAG_TBL_INV("TLSv1.1", SSL_OP_NO_TLSv1_1),
         SSL_FLAG_TBL_INV("TLSv1.2", SSL_OP_NO_TLSv1_2)
     };
+    int ret;
+    int sslv2off;
+
     if (!(cctx->flags & SSL_CONF_FLAG_FILE))
         return -2;
     cctx->tbl = ssl_protocol_list;
     cctx->ntbl = sizeof(ssl_protocol_list) / sizeof(ssl_flag_tbl);
-    return CONF_parse_list(value, ',', 1, ssl_set_option_list, cctx);
+
+    sslv2off = *cctx->poptions & SSL_OP_NO_SSLv2;
+    ret = CONF_parse_list(value, ',', 1, ssl_set_option_list, cctx);
+    /* Never turn on SSLv2 through configuration */
+    *cctx->poptions |= sslv2off;
+    return ret;
 }
 
 static int cmd_Options(SSL_CONF_CTX *cctx, const char *value)
diff --git a/ssl/ssl_err.c b/ssl/ssl_err.c
index dd3b2afd1ea6..704088dc469e 100644
--- a/ssl/ssl_err.c
+++ b/ssl/ssl_err.c
@@ -206,7 +206,6 @@ static ERR_STRING_DATA SSL_str_functs[] = {
     {ERR_FUNC(SSL_F_SSL3_SETUP_KEY_BLOCK), "ssl3_setup_key_block"},
     {ERR_FUNC(SSL_F_SSL3_SETUP_READ_BUFFER), "ssl3_setup_read_buffer"},
     {ERR_FUNC(SSL_F_SSL3_SETUP_WRITE_BUFFER), "ssl3_setup_write_buffer"},
-    {ERR_FUNC(SSL_F_SSL3_SHUTDOWN), "ssl3_shutdown"},
     {ERR_FUNC(SSL_F_SSL3_WRITE_BYTES), "ssl3_write_bytes"},
     {ERR_FUNC(SSL_F_SSL3_WRITE_PENDING), "ssl3_write_pending"},
     {ERR_FUNC(SSL_F_SSL_ADD_CERT_CHAIN), "ssl_add_cert_chain"},
diff --git a/ssl/ssl_lib.c b/ssl/ssl_lib.c
index 2744be8ad8ce..f1279bbf9103 100644
--- a/ssl/ssl_lib.c
+++ b/ssl/ssl_lib.c
@@ -1060,7 +1060,12 @@ int SSL_shutdown(SSL *s)
         return -1;
     }
 
-    return s->method->ssl_shutdown(s);
+    if (!SSL_in_init(s)) {
+        return s->method->ssl_shutdown(s);
+    } else {
+        SSLerr(SSL_F_SSL_SHUTDOWN, SSL_R_SHUTDOWN_WHILE_IN_INIT);
+        return -1;
+    }
 }
 
 int SSL_renegotiate(SSL *s)
@@ -2049,6 +2054,13 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *meth)
      */
     ret->options |= SSL_OP_LEGACY_SERVER_CONNECT;
 
+    /*
+     * Disable SSLv2 by default, callers that want to enable SSLv2 will have to
+     * explicitly clear this option via either of SSL_CTX_clear_options() or
+     * SSL_clear_options().
+     */
+    ret->options |= SSL_OP_NO_SSLv2;
+
     return (ret);
  err:
     SSLerr(SSL_F_SSL_CTX_NEW, ERR_R_MALLOC_FAILURE);
diff --git a/ssl/sslv2conftest.c b/ssl/sslv2conftest.c
new file mode 100644
index 000000000000..1fd748b11866
--- /dev/null
+++ b/ssl/sslv2conftest.c
@@ -0,0 +1,231 @@
+/* Written by Matt Caswell for the OpenSSL Project */
+/* ====================================================================
+ * Copyright (c) 2016 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdlib.h>
+#include <openssl/bio.h>
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+
+
+#define TOTAL_NUM_TESTS                         2
+#define TEST_SSL_CTX                            0
+
+#define SSLV2ON                                 1
+#define SSLV2OFF                                0
+
+SSL_CONF_CTX *confctx;
+SSL_CTX *ctx;
+SSL *ssl;
+
+/* Return 1 if the SSLv2 state of the object under |test| matches |sslv2|. */
+static int checksslv2(int test, int sslv2)
+{
+    int opts, v2_enabled;
+
+    opts = (test == TEST_SSL_CTX) ? SSL_CTX_get_options(ctx)
+                                  : SSL_get_options(ssl);
+    v2_enabled = (opts & SSL_OP_NO_SSLv2) == 0;
+    return v2_enabled != (sslv2 == SSLV2OFF);
+}
+
+/* Check that SSL_CONF "Protocol" commands can never switch SSLv2 on. */
+int main(int argc, char *argv[])
+{
+    BIO *err;
+    int testresult = 0;
+    int currtest = 0;           /* read by the failure report below */
+
+    SSL_library_init();
+    SSL_load_error_strings();
+
+    err = BIO_new_fp(stderr, BIO_NOCLOSE | BIO_FP_TEXT);
+
+    CRYPTO_malloc_debug_init();
+    CRYPTO_set_mem_debug_options(V_CRYPTO_MDEBUG_ALL);
+    CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_ON);
+
+    confctx = SSL_CONF_CTX_new();
+    ctx = SSL_CTX_new(SSLv23_method());
+    ssl = SSL_new(ctx);
+    /* The second pass of the loop operates on ssl, so it must exist too */
+    if (confctx == NULL || ctx == NULL || ssl == NULL)
+        goto end;
+
+    SSL_CONF_CTX_set_flags(confctx, SSL_CONF_FLAG_FILE
+                                    | SSL_CONF_FLAG_CLIENT
+                                    | SSL_CONF_FLAG_SERVER);
+
+    /*
+     * For each test set up an SSL_CTX and SSL and see whether SSLv2 is enabled
+     * as expected after various SSL_CONF_cmd("Protocol", ...) calls.
+     */
+    for (currtest = 0; currtest < TOTAL_NUM_TESTS; currtest++) {
+        BIO_printf(err, "SSLv2 CONF Test number %d\n", currtest);
+        if (currtest == TEST_SSL_CTX)
+            SSL_CONF_CTX_set_ssl_ctx(confctx, ctx);
+        else
+            SSL_CONF_CTX_set_ssl(confctx, ssl);
+
+        /* SSLv2 should be off by default */
+        if (!checksslv2(currtest, SSLV2OFF)) {
+            BIO_printf(err, "SSLv2 CONF Test: Off by default test FAIL\n");
+            goto end;
+        }
+
+        if (SSL_CONF_cmd(confctx, "Protocol", "ALL") != 2
+                || !SSL_CONF_CTX_finish(confctx)) {
+            BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+            goto end;
+        }
+
+        /* Should still be off even after ALL Protocols on */
+        if (!checksslv2(currtest, SSLV2OFF)) {
+            BIO_printf(err, "SSLv2 CONF Test: Off after config #1 FAIL\n");
+            goto end;
+        }
+
+        if (SSL_CONF_cmd(confctx, "Protocol", "SSLv2") != 2
+                || !SSL_CONF_CTX_finish(confctx)) {
+            BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+            goto end;
+        }
+
+        /* Should still be off even if explicitly asked for */
+        if (!checksslv2(currtest, SSLV2OFF)) {
+            BIO_printf(err, "SSLv2 CONF Test: Off after config #2 FAIL\n");
+            goto end;
+        }
+
+        if (SSL_CONF_cmd(confctx, "Protocol", "-SSLv2") != 2
+                || !SSL_CONF_CTX_finish(confctx)) {
+            BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+            goto end;
+        }
+
+        if (!checksslv2(currtest, SSLV2OFF)) {
+            BIO_printf(err, "SSLv2 CONF Test: Off after config #3 FAIL\n");
+            goto end;
+        }
+
+        if (currtest == TEST_SSL_CTX)
+            SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2);
+        else
+            SSL_clear_options(ssl, SSL_OP_NO_SSLv2);
+
+        if (!checksslv2(currtest, SSLV2ON)) {
+            BIO_printf(err, "SSLv2 CONF Test: On after clear FAIL\n");
+            goto end;
+        }
+
+        if (SSL_CONF_cmd(confctx, "Protocol", "ALL") != 2
+                || !SSL_CONF_CTX_finish(confctx)) {
+            BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+            goto end;
+        }
+
+        /* Option has been cleared and config says have SSLv2 so should be on */
+        if (!checksslv2(currtest, SSLV2ON)) {
+            BIO_printf(err, "SSLv2 CONF Test: On after config #1 FAIL\n");
+            goto end;
+        }
+
+        if (SSL_CONF_cmd(confctx, "Protocol", "SSLv2") != 2
+                || !SSL_CONF_CTX_finish(confctx)) {
+            BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+            goto end;
+        }
+
+        /* Option has been cleared and config says have SSLv2 so should be on */
+        if (!checksslv2(currtest, SSLV2ON)) {
+            BIO_printf(err, "SSLv2 CONF Test: On after config #2 FAIL\n");
+            goto end;
+        }
+
+        if (SSL_CONF_cmd(confctx, "Protocol", "-SSLv2") != 2
+                || !SSL_CONF_CTX_finish(confctx)) {
+            BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");
+            goto end;
+        }
+
+        /* Option has been cleared but config says no SSLv2 so should be off */
+        if (!checksslv2(currtest, SSLV2OFF)) {
+            BIO_printf(err, "SSLv2 CONF Test: Off after config #4 FAIL\n");
+            goto end;
+        }
+    }
+
+    testresult = 1;
+
+ end:
+    SSL_free(ssl);
+    SSL_CTX_free(ctx);
+    SSL_CONF_CTX_free(confctx);
+
+    if (!testresult) {
+        printf("SSLv2 CONF test: FAILED (Test %d)\n", currtest);
+        ERR_print_errors(err);
+    } else {
+        printf("SSLv2 CONF test: PASSED\n");
+    }
+
+    ERR_free_strings();
+    ERR_remove_thread_state(NULL);
+    EVP_cleanup();
+    CRYPTO_cleanup_all_ex_data();
+    CRYPTO_mem_leaks(err);
+    BIO_free(err);
+
+    return testresult ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/util/libeay.num b/util/libeay.num
index 7f7487df5044..e5b3c6ea841c 100755
--- a/util/libeay.num
+++ b/util/libeay.num
@@ -1807,6 +1807,8 @@ ASN1_UTCTIME_get                        2350	NOEXIST::FUNCTION:
 X509_REQ_digest                         2362	EXIST::FUNCTION:EVP
 X509_CRL_digest                         2391	EXIST::FUNCTION:EVP
 ASN1_STRING_clear_free                  2392	EXIST::FUNCTION:
+SRP_VBASE_get1_by_user                  2393	EXIST::FUNCTION:SRP
+SRP_user_pwd_free                       2394	EXIST::FUNCTION:SRP
 d2i_ASN1_SET_OF_PKCS7                   2397	NOEXIST::FUNCTION:
 X509_ALGOR_cmp                          2398	EXIST::FUNCTION:
 EVP_CIPHER_CTX_set_key_length           2399	EXIST::FUNCTION:
diff --git a/util/mk1mf.pl b/util/mk1mf.pl
index 99652aff918c..2629a1c5dd64 100755
--- a/util/mk1mf.pl
+++ b/util/mk1mf.pl
@@ -290,6 +290,7 @@ $cflags.=" -DOPENSSL_NO_HW"   if $no_hw;
 $cflags.=" -DOPENSSL_FIPS"    if $fips;
 $cflags.=" -DOPENSSL_NO_JPAKE"    if $no_jpake;
 $cflags.=" -DOPENSSL_NO_EC2M"    if $no_ec2m;
+$cflags.=" -DOPENSSL_NO_WEAK_SSL_CIPHERS"   if $no_weak_ssl;
 $cflags.= " -DZLIB" if $zlib_opt;
 $cflags.= " -DZLIB_SHARED" if $zlib_opt == 2;
 
@@ -482,7 +483,7 @@ EX_LIBS=$ex_libs
 # The OpenSSL directory
 SRC_D=$src_dir
 
-LINK=$link
+LINK_CMD=$link
 LFLAGS=$lflags
 RSC=$rsc
 
@@ -1205,6 +1206,7 @@ sub read_options
 		"no-jpake" => \$no_jpake,
 		"no-ec2m" => \$no_ec2m,
 		"no-ec_nistp_64_gcc_128" => 0,
+		"no-weak-ssl-ciphers" => \$no_weak_ssl,
 		"no-err" => \$no_err,
 		"no-sock" => \$no_sock,
 		"no-krb5" => \$no_krb5,
diff --git a/util/pl/BC-32.pl b/util/pl/BC-32.pl
index f7161d7bfe2d..375b0a76dfcb 100644
--- a/util/pl/BC-32.pl
+++ b/util/pl/BC-32.pl
@@ -118,7 +118,7 @@ ___
 		{
 		local($ex)=($target =~ /O_SSL/)?' $(L_CRYPTO)':'';
 		$ex.=' ws2_32.lib gdi32.lib';
-		$ret.="\t\$(LINK) \$(MLFLAGS) $efile$target /def:ms/${Name}.def @<<\n  \$(SHLIB_EX_OBJ) $objs $ex\n<<\n";
+		$ret.="\t\$(LINK_CMD) \$(MLFLAGS) $efile$target /def:ms/${Name}.def @<<\n  \$(SHLIB_EX_OBJ) $objs $ex\n<<\n";
 		}
 	$ret.="\n";
 	return($ret);
@@ -132,7 +132,7 @@ sub do_link_rule
 	$file =~ s/\//$o/g if $o ne '/';
 	$n=&bname($target);
 	$ret.="$target: $files $dep_libs\n";
-	$ret.="\t\$(LINK) \$(LFLAGS) $files \$(APP_EX_OBJ), $target,, $libs\n\n";
+	$ret.="\t\$(LINK_CMD) \$(LFLAGS) $files \$(APP_EX_OBJ), $target,, $libs\n\n";
 	return($ret);
 	}
 
diff --git a/util/pl/Mingw32.pl b/util/pl/Mingw32.pl
index fe3fb27a7860..55c85f644714 100644
--- a/util/pl/Mingw32.pl
+++ b/util/pl/Mingw32.pl
@@ -98,7 +98,7 @@ sub do_link_rule
 	$file =~ s/\//$o/g if $o ne '/';
 	$n=&bname($target);
 	$ret.="$target: $files $dep_libs\n";
-	$ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n";
+	$ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n";
 	return($ret);
 	}
 1;
diff --git a/util/pl/OS2-EMX.pl b/util/pl/OS2-EMX.pl
index 28cd1169079a..92a332e6e906 100644
--- a/util/pl/OS2-EMX.pl
+++ b/util/pl/OS2-EMX.pl
@@ -99,7 +99,7 @@ sub do_lib_rule
 		{
 		local($ex)=($target =~ /O_SSL/)?' $(L_CRYPTO)':'';
 		$ex.=' -lsocket';
-		$ret.="\t\$(LINK) \$(SHLIB_CFLAGS) \$(MLFLAGS) $efile$target \$(SHLIB_EX_OBJ) \$(${Name}OBJ) $ex os2/${Name}.def\n";
+		$ret.="\t\$(LINK_CMD) \$(SHLIB_CFLAGS) \$(MLFLAGS) $efile$target \$(SHLIB_EX_OBJ) \$(${Name}OBJ) $ex os2/${Name}.def\n";
 		$ret.="\temximp -o $out_def/$name.a os2/${Name}.def\n";
 		$ret.="\temximp -o $out_def/$name.lib os2/${Name}.def\n\n";
 		}
@@ -113,7 +113,7 @@ sub do_link_rule
 	$file =~ s/\//$o/g if $o ne '/';
 	$n=&bname($target);
 	$ret.="$target: $files $dep_libs\n";
-	$ret.="\t\$(LINK) ${efile}$target \$(CFLAG) \$(LFLAGS) $files $libs\n\n";
+	$ret.="\t\$(LINK_CMD) ${efile}$target \$(CFLAG) \$(LFLAGS) $files $libs\n\n";
 	return($ret);
 	}
 
diff --git a/util/pl/VC-32.pl b/util/pl/VC-32.pl
index 0f5547f056c2..dba96cba5e75 100644
--- a/util/pl/VC-32.pl
+++ b/util/pl/VC-32.pl
@@ -330,7 +330,7 @@ sub do_lib_rule
  		if ($fips && $target =~ /O_CRYPTO/)
 			{
 			$ret.="$target: $objs \$(PREMAIN_DSO_EXE)";
-			$ret.="\n\tSET FIPS_LINK=\$(LINK)\n";
+			$ret.="\n\tSET FIPS_LINK=\$(LINK_CMD)\n";
 			$ret.="\tSET FIPS_CC=\$(CC)\n";
 			$ret.="\tSET FIPS_CC_ARGS=/Fo\$(OBJ_D)${o}fips_premain.obj \$(SHLIB_CFLAGS) -c\n";
 			$ret.="\tSET PREMAIN_DSO_EXE=\$(PREMAIN_DSO_EXE)\n";
@@ -344,7 +344,7 @@ sub do_lib_rule
 		else
 			{
 			$ret.="$target: $objs";
-			$ret.="\n\t\$(LINK) \$(MLFLAGS) $efile$target $name @<<\n  \$(SHLIB_EX_OBJ) $objs $ex \$(EX_LIBS)\n<<\n";
+			$ret.="\n\t\$(LINK_CMD) \$(MLFLAGS) $efile$target $name @<<\n  \$(SHLIB_EX_OBJ) $objs $ex \$(EX_LIBS)\n<<\n";
 			}
 		$ret.="\tIF EXIST \$@.manifest mt -nologo -manifest \$@.manifest -outputresource:\$@;2\n\n";
 		}
@@ -363,7 +363,7 @@ sub do_link_rule
 		{
 		$ret.=" \$(OBJ_D)${o}applink.obj" if $shlib;
 		$ret.="\n";
-		$ret.="  \$(LINK) \$(LFLAGS) $efile$target @<<\n\t";
+		$ret.="  \$(LINK_CMD) \$(LFLAGS) $efile$target @<<\n\t";
 		if ($files =~ /O_FIPSCANISTER/ && !$fipscanisterbuild) {
 			$ret.= "\$(EX_LIBS) ";
 			$ret.= "\$(OBJ_D)${o}applink.obj " if $shlib;
@@ -373,7 +373,7 @@ sub do_link_rule
 	elsif ($standalone == 2)
 		{
 		$ret.="\n";
-		$ret.="\tSET FIPS_LINK=\$(LINK)\n";
+		$ret.="\tSET FIPS_LINK=\$(LINK_CMD)\n";
 		$ret.="\tSET FIPS_CC=\$(CC)\n";
 		$ret.="\tSET FIPS_CC_ARGS=/Fo\$(OBJ_D)${o}fips_premain.obj \$(SHLIB_CFLAGS) -c\n";
 		$ret.="\tSET PREMAIN_DSO_EXE=\n";
@@ -386,7 +386,7 @@ sub do_link_rule
 	else
 		{
 		$ret.="\n";
-		$ret.="\t\$(LINK) \$(LFLAGS) $efile$target @<<\n";
+		$ret.="\t\$(LINK_CMD) \$(LFLAGS) $efile$target @<<\n";
 		$ret.="\t\$(APP_EX_OBJ) $files $libs\n<<\n";
 		}
     	$ret.="\tIF EXIST \$@.manifest mt -nologo -manifest \$@.manifest -outputresource:\$@;1\n\n";
diff --git a/util/pl/linux.pl b/util/pl/linux.pl
index d24f7b72913c..3362941f7bf3 100644
--- a/util/pl/linux.pl
+++ b/util/pl/linux.pl
@@ -78,7 +78,7 @@ sub do_link_rule
 	$file =~ s/\//$o/g if $o ne '/';
 	$n=&bname($target);
 	$ret.="$target: $files $dep_libs\n";
-	$ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n";
+	$ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n";
 	return($ret);
 	}
 
diff --git a/util/pl/netware.pl b/util/pl/netware.pl
index fe80a9bb8990..16f4f4ee37c5 100644
--- a/util/pl/netware.pl
+++ b/util/pl/netware.pl
@@ -506,22 +506,22 @@ sub do_link_rule
       if ($gnuc)
       {
          $ret.="\t\$(MKLIB) $lib_flags \$(TMP_D)${o}\$(E_EXE).a \$(filter-out \$(TMP_D)${o}\$(E_EXE)${obj},$files)\n";
-         $ret.="\t\$(LINK) \$(LFLAGS) $def_file2\n";
+         $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file2\n";
          $ret.="\t\@$mv \$(E_EXE)2.nlm \$(TEST_D)\n";
       }
       else
       {
-         $ret.="\t\$(LINK) \$(LFLAGS) $def_file2 $files \"$prelude\" $libs -o $target2\n";
+         $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file2 $files \"$prelude\" $libs -o $target2\n";
       }
    }
    if ($gnuc)
    {
-      $ret.="\t\$(LINK) \$(LFLAGS) $def_file\n";
+      $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file\n";
       $ret.="\t\@$mv \$(\@F) \$(TEST_D)\n";
    }
    else
    {
-      $ret.="\t\$(LINK) \$(LFLAGS) $def_file $files \"$prelude\" $libs -o $target\n";
+      $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file $files \"$prelude\" $libs -o $target\n";
    }
 
    $ret.="\n";
diff --git a/util/pl/ultrix.pl b/util/pl/ultrix.pl
index ea370c71f968..0c76c83b4a73 100644
--- a/util/pl/ultrix.pl
+++ b/util/pl/ultrix.pl
@@ -31,7 +31,7 @@ sub do_link_rule
 	$file =~ s/\//$o/g if $o ne '/';
 	$n=&bname($target);
 	$ret.="$target: $files $dep_libs\n";
-	$ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n";
+	$ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n";
 	return($ret);
 	}
 
diff --git a/util/pl/unix.pl b/util/pl/unix.pl
index 1d4e9dc5df19..8818c5bcb1b2 100644
--- a/util/pl/unix.pl
+++ b/util/pl/unix.pl
@@ -164,7 +164,7 @@ sub do_link_rule
 	$file =~ s/\//$o/g if $o ne '/';
 	$n=&bname($target);
 	$ret.="$target: $files $dep_libs\n";
-	$ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n";
+	$ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n";
 	return($ret);
 	}