From 2ab7aa997e7ad73a02856b2f25d993837b9db925 Mon Sep 17 00:00:00 2001 From: "Simon L. B. Nielsen" Date: Sun, 30 Jul 2006 14:17:54 +0000 Subject: [PATCH] Resolve conflicts after import of OpenSSL 0.9.8b. This was missed the first time around since eng_padlock.c was not part of OpenSSL 0.9.7e and therefor did not have the v0_9_7e CVS tag used during original resolve of conflicts. Noticed by: Antoine Brodin --- crypto/openssl/crypto/engine/eng_padlock.c | 268 +++++++++++++++++---- 1 file changed, 215 insertions(+), 53 deletions(-) diff --git a/crypto/openssl/crypto/engine/eng_padlock.c b/crypto/openssl/crypto/engine/eng_padlock.c index 4ca4f39d8969..8d92af6f8bd2 100644 --- a/crypto/openssl/crypto/engine/eng_padlock.c +++ b/crypto/openssl/crypto/engine/eng_padlock.c @@ -65,17 +65,17 @@ #include #include -#include -#ifdef _MSC_VER -# define alloca _alloca -# define snprintf _snprintf -#endif +#include #include #include #include #include +#ifndef OPENSSL_NO_AES #include +#endif +#include +#include #ifndef OPENSSL_NO_HW #ifndef OPENSSL_NO_HW_PADLOCK @@ -101,13 +101,13 @@ compiler choice is limited to GCC and Microsoft C. */ #undef COMPILE_HW_PADLOCK #if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM) -# if defined(__i386__) || defined(__i386) || defined(_M_IX86) +# if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \ + (defined(_MSC_VER) && defined(_M_IX86)) # define COMPILE_HW_PADLOCK +static ENGINE *ENGINE_padlock (void); # endif #endif -static ENGINE *ENGINE_padlock (void); - void ENGINE_load_padlock (void) { /* On non-x86 CPUs it just returns. */ @@ -121,6 +121,15 @@ void ENGINE_load_padlock (void) } #ifdef COMPILE_HW_PADLOCK +/* We do these includes here to avoid header problems on platforms that + do not have the VIA padlock anyway... */ +#ifdef _MSC_VER +# include +# define alloca _alloca +#else +# include +#endif + /* Function for ENGINE detection and control */ static int padlock_available(void); static int padlock_init(ENGINE *e); @@ -129,7 +138,9 @@ static int padlock_init(ENGINE *e); static RAND_METHOD padlock_rand; /* Cipher Stuff */ +#ifndef OPENSSL_NO_AES static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid); +#endif /* Engine names */ static const char *padlock_id = "padlock"; @@ -138,7 +149,9 @@ static char padlock_name[100]; /* Available features */ static int padlock_use_ace = 0; /* Advanced Cryptography Engine */ static int padlock_use_rng = 0; /* Random Number Generator */ +#ifndef OPENSSL_NO_AES static int padlock_aes_align_required = 1; +#endif /* ===== Engine "management" functions ===== */ @@ -154,7 +167,8 @@ padlock_bind_helper(ENGINE *e) #endif /* Generate a nice engine name with available features */ - snprintf(padlock_name, sizeof(padlock_name), "VIA PadLock (%s, %s)", + BIO_snprintf(padlock_name, sizeof(padlock_name), + "VIA PadLock (%s, %s)", padlock_use_rng ? "RNG" : "no-RNG", padlock_use_ace ? "ACE" : "no-ACE"); @@ -163,8 +177,9 @@ padlock_bind_helper(ENGINE *e) !ENGINE_set_name(e, padlock_name) || !ENGINE_set_init_function(e, padlock_init) || - +#ifndef OPENSSL_NO_AES (padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) || +#endif (padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) { return 0; } @@ -222,6 +237,7 @@ IMPLEMENT_DYNAMIC_BIND_FN (padlock_bind_fn); /* ===== Here comes the "real" engine ===== */ +#ifndef OPENSSL_NO_AES /* Some AES-related constants */ #define AES_BLOCK_SIZE 16 #define AES_KEY_SIZE_128 16 @@ -241,10 +257,12 @@ struct padlock_cipher_data union { unsigned int pad[4]; struct { int rounds:4; - int algo:3; - int keygen:1; + int dgst:1; /* n/a in C3 */ + int align:1; /* n/a in C3 */ + int ciphr:1; /* n/a in C3 */ + unsigned int keygen:1; int interm:1; - int encdec:1; + unsigned int encdec:1; int ksize:2; } b; } cword; /* Control word */ @@ -258,6 +276,7 @@ struct padlock_cipher_data * so we accept the penatly... */ static volatile struct padlock_cipher_data *padlock_saved_context; +#endif /* * ======================================================= @@ -349,18 +368,20 @@ padlock_available(void) return padlock_use_ace + padlock_use_rng; } +#ifndef OPENSSL_NO_AES /* Our own htonl()/ntohl() */ static inline void padlock_bswapl(AES_KEY *ks) { size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]); - unsigned long *key = ks->rd_key; + unsigned int *key = ks->rd_key; while (i--) { asm volatile ("bswapl %0" : "+r"(*key)); key++; } } +#endif /* Force key reload from memory to the CPU microcode. Loading EFLAGS from the stack clears EFLAGS[30] @@ -371,6 +392,7 @@ padlock_reload_key(void) asm volatile ("pushfl; popfl"); } +#ifndef OPENSSL_NO_AES /* * This is heuristic key context tracing. At first one * believes that one should use atomic swap instructions, @@ -385,14 +407,14 @@ padlock_verify_context(struct padlock_cipher_data *cdata) { asm volatile ( "pushfl\n" -" bt $30,(%%esp)\n" +" btl $30,(%%esp)\n" " jnc 1f\n" -" cmp %2,%1\n" +" cmpl %2,%1\n" " je 1f\n" -" mov %2,%0\n" " popfl\n" -" sub $4,%%esp\n" -"1: add $4,%%esp" +" subl $4,%%esp\n" +"1: addl $4,%%esp\n" +" movl %2,%0" :"+m"(padlock_saved_context) : "r"(padlock_saved_context), "r"(cdata) : "cc"); } @@ -420,10 +442,11 @@ static inline void *name(size_t cnt, \ } /* Generate all functions with appropriate opcodes */ -PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8"); /* rep xcryptecb */ -PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0"); /* rep xcryptcbc */ -PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0"); /* rep xcryptcfb */ -PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8"); /* rep xcryptofb */ +PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8") /* rep xcryptecb */ +PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0") /* rep xcryptcbc */ +PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0") /* rep xcryptcfb */ +PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8") /* rep xcryptofb */ +#endif /* The RNG call itself */ static inline unsigned int @@ -439,6 +462,29 @@ padlock_xstore(void *addr, unsigned int edx_in) return eax_out; } +/* Why not inline 'rep movsd'? I failed to find information on what + * value in Direction Flag one can expect and consequently have to + * apply "better-safe-than-sorry" approach and assume "undefined." + * I could explicitly clear it and restore the original value upon + * return from padlock_aes_cipher, but it's presumably too much + * trouble for too little gain... + * + * In case you wonder 'rep xcrypt*' instructions above are *not* + * affected by the Direction Flag and pointers advance toward + * larger addresses unconditionally. + */ +static inline unsigned char * +padlock_memcpy(void *dst,const void *src,size_t n) +{ + long *d=dst; + const long *s=src; + + n /= sizeof(*d); + do { *d++ = *s++; } while (--n); + + return dst; +} + #elif defined(_MSC_VER) /* * Unlike GCC these are real functions. In order to minimize impact @@ -492,10 +538,10 @@ padlock_verify_context(void *cdata) jnc skip cmp ecx,padlock_saved_context je skip - mov padlock_saved_context,ecx popfd sub esp,4 skip: add esp,4 + mov padlock_saved_context,ecx } } @@ -563,9 +609,15 @@ padlock_bswapl(void *key) popfd } } + +/* MS actually specifies status of Direction Flag and compiler even + * manages to compile following as 'rep movsd' all by itself... + */ +#define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U)) #endif /* ===== AES encryption/decryption ===== */ +#ifndef OPENSSL_NO_AES #if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb) #define NID_aes_128_cfb NID_aes_128_cfb128 @@ -600,13 +652,13 @@ static int padlock_cipher_nids[] = { NID_aes_192_ecb, NID_aes_192_cbc, -// NID_aes_192_cfb, /* FIXME: AES192/256 CFB/OFB don't work. */ -// NID_aes_192_ofb, + NID_aes_192_cfb, + NID_aes_192_ofb, NID_aes_256_ecb, NID_aes_256_cbc, -// NID_aes_256_cfb, -// NID_aes_256_ofb, + NID_aes_256_cfb, + NID_aes_256_ofb, }; static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/ sizeof(padlock_cipher_nids[0])); @@ -615,19 +667,24 @@ static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/ static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv, int enc); static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, unsigned int nbytes); + const unsigned char *in, size_t nbytes); -#define NEAREST_ALIGNED(ptr) ( (char *)(ptr) + \ +#define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) + \ ( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F ) ) #define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\ NEAREST_ALIGNED(ctx->cipher_data)) +#define EVP_CIPHER_block_size_ECB AES_BLOCK_SIZE +#define EVP_CIPHER_block_size_CBC AES_BLOCK_SIZE +#define EVP_CIPHER_block_size_OFB 1 +#define EVP_CIPHER_block_size_CFB 1 + /* Declaring so many ciphers by hand would be a pain. Instead introduce a bit of preprocessor magic :-) */ #define DECLARE_AES_EVP(ksize,lmode,umode) \ static const EVP_CIPHER padlock_aes_##ksize##_##lmode = { \ NID_aes_##ksize##_##lmode, \ - AES_BLOCK_SIZE, \ + EVP_CIPHER_block_size_##umode, \ AES_KEY_SIZE_##ksize, \ AES_BLOCK_SIZE, \ 0 | EVP_CIPH_##umode##_MODE, \ @@ -729,7 +786,10 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key, memset(cdata, 0, sizeof(struct padlock_cipher_data)); /* Prepare Control word. */ - cdata->cword.b.encdec = (ctx->encrypt == 0); + if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE) + cdata->cword.b.encdec = 0; + else + cdata->cword.b.encdec = (ctx->encrypt == 0); cdata->cword.b.rounds = 10 + (key_len - 128) / 32; cdata->cword.b.ksize = (key_len - 128) / 64; @@ -749,14 +809,16 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key, and is listed as hardware errata. They most likely will fix it at some point and then a check for stepping would be due here. */ - if (enc) + if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CFB_MODE || + EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE || + enc) AES_set_encrypt_key(key, key_len, &cdata->ks); else AES_set_decrypt_key(key, key_len, &cdata->ks); - - /* OpenSSL internal functions use byte-swapped extended key. */ +#ifndef AES_ASM + /* OpenSSL C functions use byte-swapped extended key. */ padlock_bswapl(&cdata->ks); - +#endif cdata->cword.b.keygen = 1; break; @@ -824,7 +886,7 @@ padlock_aes_cipher_omnivorous(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, } #ifndef PADLOCK_CHUNK -# define PADLOCK_CHUNK 4096 /* Must be a power of 2 larger than 16 */ +# define PADLOCK_CHUNK 512 /* Must be a power of 2 larger than 16 */ #endif #if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1) # error "insane PADLOCK_CHUNK..." @@ -838,20 +900,68 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, { struct padlock_cipher_data *cdata; const void *inp; - char *out; + unsigned char *out; void *iv; int inp_misaligned, out_misaligned, realign_in_loop; size_t chunk, allocated=0; + /* ctx->num is maintained in byte-oriented modes, + such as CFB and OFB... */ + if ((chunk = ctx->num)) { /* borrow chunk variable */ + unsigned char *ivp=ctx->iv; + + switch (EVP_CIPHER_CTX_mode(ctx)) { + case EVP_CIPH_CFB_MODE: + if (chunk >= AES_BLOCK_SIZE) + return 0; /* bogus value */ + + if (ctx->encrypt) + while (chunknum = chunk%AES_BLOCK_SIZE; + break; + case EVP_CIPH_OFB_MODE: + if (chunk >= AES_BLOCK_SIZE) + return 0; /* bogus value */ + + while (chunknum = chunk%AES_BLOCK_SIZE; + break; + } + } + if (nbytes == 0) return 1; +#if 0 if (nbytes % AES_BLOCK_SIZE) return 0; /* are we expected to do tail processing? */ +#else + /* nbytes is always multiple of AES_BLOCK_SIZE in ECB and CBC + modes and arbitrary value in byte-oriented modes, such as + CFB and OFB... */ +#endif /* VIA promises CPUs that won't require alignment in the future. For now padlock_aes_align_required is initialized to 1 and the condition is never met... */ - if (!padlock_aes_align_required) + /* C7 core is capable to manage unaligned input in non-ECB[!] + mode, but performance penalties appear to be approximately + same as for software alignment below or ~3x. They promise to + improve it in the future, but for now we can just as well + pretend that it can only handle aligned input... */ + if (!padlock_aes_align_required && (nbytes%AES_BLOCK_SIZE)==0) return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes); inp_misaligned = (((size_t)in_arg) & 0x0F); @@ -863,7 +973,7 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, * in order to improve L1 cache utilization... */ realign_in_loop = out_misaligned|inp_misaligned; - if (!realign_in_loop) + if (!realign_in_loop && (nbytes%AES_BLOCK_SIZE)==0) return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes); /* this takes one "if" out of the loops */ @@ -887,7 +997,7 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, case EVP_CIPH_ECB_MODE: do { if (inp_misaligned) - inp = memcpy(out, in_arg, chunk&~3); + inp = padlock_memcpy(out, in_arg, chunk); else inp = in_arg; in_arg += chunk; @@ -895,7 +1005,7 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, padlock_xcrypt_ecb(chunk/AES_BLOCK_SIZE, cdata, out, inp); if (out_misaligned) - out_arg = (char *)memcpy(out_arg, out, chunk&~3) + chunk; + out_arg = padlock_memcpy(out_arg, out, chunk) + chunk; else out = out_arg+=chunk; @@ -913,7 +1023,7 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, chunk = PADLOCK_CHUNK; cbc_shortcut: /* optimize for small input */ if (inp_misaligned) - inp = memcpy(out, in_arg, chunk&~3); + inp = padlock_memcpy(out, in_arg, chunk); else inp = in_arg; in_arg += chunk; @@ -921,7 +1031,7 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, iv = padlock_xcrypt_cbc(chunk/AES_BLOCK_SIZE, cdata, out, inp); if (out_misaligned) - out_arg = (char *)memcpy(out_arg, out, chunk&~3) + chunk; + out_arg = padlock_memcpy(out_arg, out, chunk) + chunk; else out = out_arg+=chunk; @@ -930,15 +1040,17 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, break; case EVP_CIPH_CFB_MODE: - memcpy (cdata->iv, ctx->iv, AES_BLOCK_SIZE); - goto cfb_shortcut; + memcpy (iv = cdata->iv, ctx->iv, AES_BLOCK_SIZE); + chunk &= ~(AES_BLOCK_SIZE-1); + if (chunk) goto cfb_shortcut; + else goto cfb_skiploop; do { if (iv != cdata->iv) memcpy(cdata->iv, iv, AES_BLOCK_SIZE); chunk = PADLOCK_CHUNK; cfb_shortcut: /* optimize for small input */ if (inp_misaligned) - inp = memcpy(out, in_arg, chunk&~3); + inp = padlock_memcpy(out, in_arg, chunk); else inp = in_arg; in_arg += chunk; @@ -946,19 +1058,53 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, iv = padlock_xcrypt_cfb(chunk/AES_BLOCK_SIZE, cdata, out, inp); if (out_misaligned) - out_arg = (char *)memcpy(out_arg, out, chunk&~3) + chunk; + out_arg = padlock_memcpy(out_arg, out, chunk) + chunk; else out = out_arg+=chunk; - } while (nbytes -= chunk); + nbytes -= chunk; + } while (nbytes >= AES_BLOCK_SIZE); + + cfb_skiploop: + if (nbytes) { + unsigned char *ivp = cdata->iv; + + if (iv != ivp) { + memcpy(ivp, iv, AES_BLOCK_SIZE); + iv = ivp; + } + ctx->num = nbytes; + if (cdata->cword.b.encdec) { + cdata->cword.b.encdec=0; + padlock_reload_key(); + padlock_xcrypt_ecb(1,cdata,ivp,ivp); + cdata->cword.b.encdec=1; + padlock_reload_key(); + while(nbytes) { + unsigned char c = *(in_arg++); + *(out_arg++) = c ^ *ivp; + *(ivp++) = c, nbytes--; + } + } + else { padlock_reload_key(); + padlock_xcrypt_ecb(1,cdata,ivp,ivp); + padlock_reload_key(); + while (nbytes) { + *ivp = *(out_arg++) = *(in_arg++) ^ *ivp; + ivp++, nbytes--; + } + } + } + memcpy(ctx->iv, iv, AES_BLOCK_SIZE); break; case EVP_CIPH_OFB_MODE: memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE); - do { + chunk &= ~(AES_BLOCK_SIZE-1); + if (chunk) do { if (inp_misaligned) - inp = memcpy(out, in_arg, chunk&~3); + inp = padlock_memcpy(out, in_arg, chunk); else inp = in_arg; in_arg += chunk; @@ -966,13 +1112,27 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, padlock_xcrypt_ofb(chunk/AES_BLOCK_SIZE, cdata, out, inp); if (out_misaligned) - out_arg = (char *)memcpy(out_arg, out, chunk&~3) + chunk; + out_arg = padlock_memcpy(out_arg, out, chunk) + chunk; else out = out_arg+=chunk; nbytes -= chunk; chunk = PADLOCK_CHUNK; - } while (nbytes); + } while (nbytes >= AES_BLOCK_SIZE); + + if (nbytes) { + unsigned char *ivp = cdata->iv; + + ctx->num = nbytes; + padlock_reload_key(); /* empirically found */ + padlock_xcrypt_ecb(1,cdata,ivp,ivp); + padlock_reload_key(); /* empirically found */ + while (nbytes) { + *(out_arg++) = *(in_arg++) ^ *ivp; + ivp++, nbytes--; + } + } + memcpy(ctx->iv, cdata->iv, AES_BLOCK_SIZE); break; @@ -992,6 +1152,8 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, return 1; } +#endif /* OPENSSL_NO_AES */ + /* ===== Random Number Generator ===== */ /* * This code is not engaged. The reason is that it does not comply