public inbox for git-commits@fedoraproject.org
help / color / mirror / Atom feed
To: git-commits@fedoraproject.org
Subject: [rpms/openssl] rebase_40beta: - add support for Intel AES-NI
Date: Tue, 09 Jun 2026 12:42:03 GMT [thread overview]
Message-ID: <178100892320.1.15210935346111379515.rpms-openssl-7f0747ce733f@fedoraproject.org> (raw)
A new commit has been pushed.
Repo : rpms/openssl
Branch : rebase_40beta
Commit : 7f0747ce733f379ffe4a1d12f6f7f0f69ad23b24
Author : Tomáš Mráz <tmraz@fedoraproject.org>
Date : 2010-01-13T09:21:02+00:00
Stats : +2395/-2 in 2 file(s)
URL : https://src.fedoraproject.org/rpms/openssl/c/7f0747ce733f379ffe4a1d12f6f7f0f69ad23b24?branch=rebase_40beta
Log:
- add support for Intel AES-NI
---
diff --git a/openssl-1.0.0-beta4-aesni.patch b/openssl-1.0.0-beta4-aesni.patch
new file mode 100644
index 0000000..f57918b
--- /dev/null
+++ b/openssl-1.0.0-beta4-aesni.patch
@@ -0,0 +1,2388 @@
+diff -up openssl-1.0.0-beta4/Configure.aesni openssl-1.0.0-beta4/Configure
+--- openssl-1.0.0-beta4/Configure.aesni 2010-01-07 23:38:31.000000000 +0100
++++ openssl-1.0.0-beta4/Configure 2010-01-12 22:18:06.000000000 +0100
+@@ -123,11 +123,11 @@ my $tlib="-lnsl -lsocket";
+ my $bits1="THIRTY_TWO_BIT ";
+ my $bits2="SIXTY_FOUR_BIT ";
+
+-my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o";
++my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o";
+
+ my $x86_elf_asm="$x86_asm:elf";
+
+-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o";
++my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o";
+ my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void";
+ my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::void";
+ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
+@@ -491,7 +491,7 @@ my %table=(
+ #
+ # Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64
+ "VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ias:win32",
+-"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:auto:win32",
++"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:auto:win32",
+ # x86 Win32 target defaults to ANSI API, if you want UNICODE, complement
+ # 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE'
+ "VC-WIN32","cl:-W3 -WX -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE:::WIN32::BN_LLONG RC4_INDEX EXPORT_VAR_AS_FN ${x86_gcc_opts}:${x86_asm}:win32n:win32",
+@@ -1410,6 +1410,7 @@ if ($rmd160_obj =~ /\.o$/)
+ if ($aes_obj =~ /\.o$/)
+ {
+ $cflags.=" -DAES_ASM";
++ $aes_obj =~ s/\s*aesni\-x86\.o// if ($no_sse2);
+ }
+ else {
+ $aes_obj=$aes_enc;
+diff -up openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86.pl.aesni openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86.pl
+--- openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86.pl.aesni 2010-01-12 22:18:06.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86.pl 2010-01-12 22:18:06.000000000 +0100
+@@ -0,0 +1,765 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++#
++# This module implements support for Intel AES-NI extension. In
++# OpenSSL context it's used with Intel engine, but can also be used as
++# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
++# details].
++
++$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
++ # generates drop-in replacement for
++ # crypto/aes/asm/aes-586.pl:-)
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++push(@INC,"${dir}","${dir}../../perlasm");
++require "x86asm.pl";
++
++&asm_init($ARGV[0],$0);
++
++$movekey = eval($PREFIX eq "aesni" ? "*movaps" : "*movups"); # NOTE(review): typo fix ($RREFIX/"aseni") now selects movaps for the "aesni" build, like aesni-x86_64.pl; confirm key schedules are 16-byte aligned
++
++$len="eax";
++$rounds="ecx";
++$key="edx";
++$inp="esi";
++$out="edi";
++$rounds_="ebx"; # backup copy for $rounds
++$key_="ebp"; # backup copy for $key
++
++$inout0="xmm0";
++$inout1="xmm1";
++$inout2="xmm2";
++$rndkey0="xmm3";
++$rndkey1="xmm4";
++$ivec="xmm5";
++$in0="xmm6";
++$in1="xmm7"; $inout3="xmm7";
++
++# Inline version of internal aesni_[en|de]crypt1
++sub aesni_inline_generate1
++{ my $p=shift; # "enc" or "dec": selects the aes${p}/aes${p}last emitters; note $key and $rounds are modified below
++
++ &$movekey ($rndkey0,&QWP(0,$key)); # first 16 bytes of the key schedule
++ &$movekey ($rndkey1,&QWP(16,$key)); # next 16 bytes
++ &lea ($key,&DWP(32,$key)); # step $key past the two keys just loaded
++ &pxor ($inout0,$rndkey0); # XOR the first round key into the block
++ &set_label("${p}1_loop");
++ eval"&aes${p} ($inout0,$rndkey1)"; # string eval because the emitter name depends on $p
++ &dec ($rounds); # result tested by jnz below; NOTE(review): relies on the load/lea in between leaving flags alone
++ &$movekey ($rndkey1,&QWP(0,$key)); # fetch the key for the next iteration (or for the last round)
++ &lea ($key,&DWP(16,$key));
++ &jnz (&label("${p}1_loop"));
++ eval"&aes${p}last ($inout0,$rndkey1)"; # final round uses the key loaded in the last iteration
++}
++
++sub aesni_generate1 # fully unrolled loop
++{ my $p=shift;
++
++ &function_begin_B("_aesni_${p}rypt1");
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &$movekey ($rndkey1,&QWP(0x10,$key));
++ &cmp ($rounds,11);
++ &pxor ($inout0,$rndkey0);
++ &$movekey ($rndkey0,&QWP(0x20,$key));
++ &lea ($key,&DWP(0x30,$key));
++ &jb (&label("${p}128"));
++ &lea ($key,&DWP(0x20,$key));
++ &je (&label("${p}192"));
++ &lea ($key,&DWP(0x20,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(-0x40,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(-0x30,$key));
++ &set_label("${p}192");
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(-0x20,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(-0x10,$key));
++ &set_label("${p}128");
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(0,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0x10,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(0x20,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0x30,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(0x40,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0x50,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(0x60,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0x70,$key));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p}last ($inout0,$rndkey0)";
++ &ret();
++ &function_end_B("_aesni_${p}rypt1");
++}
++
++# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
++# &aesni_generate1("enc");
++&function_begin_B("${PREFIX}_encrypt");
++ &mov ("eax",&wparam(0)); # eax = inp
++ &mov ($key,&wparam(2)); # $key = key
++ &movups ($inout0,&QWP(0,"eax")); # load the 16-byte input block
++ &mov ($rounds,&DWP(240,$key)); # round count is read from offset 240 of *key
++ &mov ("eax",&wparam(1)); # eax = out
++ &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
++ &movups (&QWP(0,"eax"),$inout0); # store the result block
++ &ret ();
++&function_end_B("${PREFIX}_encrypt");
++
++# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
++# &aesni_generate1("dec");
++&function_begin_B("${PREFIX}_decrypt");
++ &mov ("eax",&wparam(0));
++ &mov ($key,&wparam(2));
++ &movups ($inout0,&QWP(0,"eax"));
++ &mov ($rounds,&DWP(240,$key));
++ &mov ("eax",&wparam(1));
++ &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
++ &movups (&QWP(0,"eax"),$inout0);
++ &ret ();
++&function_end_B("${PREFIX}_decrypt");
++\f
++# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
++# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
++# latency is 6, it turned out that it can be scheduled only every
++# *second* cycle. Thus 3x interleave is the one providing optimal
++# utilization, i.e. when subroutine's throughput is virtually same as
++# of non-interleaved subroutine [for number of input blocks up to 3].
++# This is why it makes no sense to implement 2x subroutine. As soon
++# as/if Intel improves throughput by making it possible to schedule
++# the instructions in question *every* cycle I would have to
++# implement 6x interleave and use it in loop...
++sub aesni_generate3
++{ my $p=shift;
++
++ &function_begin_B("_aesni_${p}rypt3");
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &shr ($rounds,1);
++ &$movekey ($rndkey1,&QWP(16,$key));
++ &lea ($key,&DWP(32,$key));
++ &pxor ($inout0,$rndkey0);
++ &pxor ($inout1,$rndkey0);
++ &pxor ($inout2,$rndkey0);
++ &jmp (&label("${p}3_loop"));
++ &set_label("${p}3_loop",16);
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout1,$rndkey1)";
++ &dec ($rounds);
++ eval"&aes${p} ($inout2,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(16,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &lea ($key,&DWP(32,$key));
++ eval"&aes${p} ($inout1,$rndkey0)";
++ eval"&aes${p} ($inout2,$rndkey0)";
++ &jnz (&label("${p}3_loop"));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout1,$rndkey1)";
++ eval"&aes${p} ($inout2,$rndkey1)";
++ eval"&aes${p}last ($inout0,$rndkey0)";
++ eval"&aes${p}last ($inout1,$rndkey0)";
++ eval"&aes${p}last ($inout2,$rndkey0)";
++ &ret();
++ &function_end_B("_aesni_${p}rypt3");
++}
++
++# 4x interleave is implemented to improve small block performance,
++# most notably [and naturally] 4 block by ~30%. One can argue that one
++# should have implemented 5x as well, but improvement would be <20%,
++# so it's not worth it...
++sub aesni_generate4
++{ my $p=shift;
++
++ &function_begin_B("_aesni_${p}rypt4");
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &$movekey ($rndkey1,&QWP(16,$key));
++ &shr ($rounds,1);
++ &lea ($key,&DWP(32,$key));
++ &pxor ($inout0,$rndkey0);
++ &pxor ($inout1,$rndkey0);
++ &pxor ($inout2,$rndkey0);
++ &pxor ($inout3,$rndkey0);
++ &jmp (&label("${p}3_loop"));
++ &set_label("${p}3_loop",16);
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout1,$rndkey1)";
++ &dec ($rounds);
++ eval"&aes${p} ($inout2,$rndkey1)";
++ eval"&aes${p} ($inout3,$rndkey1)";
++ &$movekey ($rndkey1,&QWP(16,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ &lea ($key,&DWP(32,$key));
++ eval"&aes${p} ($inout1,$rndkey0)";
++ eval"&aes${p} ($inout2,$rndkey0)";
++ eval"&aes${p} ($inout3,$rndkey0)";
++ &jnz (&label("${p}3_loop"));
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout1,$rndkey1)";
++ eval"&aes${p} ($inout2,$rndkey1)";
++ eval"&aes${p} ($inout3,$rndkey1)";
++ eval"&aes${p}last ($inout0,$rndkey0)";
++ eval"&aes${p}last ($inout1,$rndkey0)";
++ eval"&aes${p}last ($inout2,$rndkey0)";
++ eval"&aes${p}last ($inout3,$rndkey0)";
++ &ret();
++ &function_end_B("_aesni_${p}rypt4");
++}
++&aesni_generate3("enc") if ($PREFIX eq "aesni");
++&aesni_generate3("dec");
++&aesni_generate4("enc") if ($PREFIX eq "aesni");
++&aesni_generate4("dec");
++
++if ($PREFIX eq "aesni") {
++# void aesni_ecb_encrypt (const void *in, void *out,
++# size_t length, const AES_KEY *key,
++# int enc);
++&function_begin("aesni_ecb_encrypt");
++ &mov ($inp,&wparam(0));
++ &mov ($out,&wparam(1));
++ &mov ($len,&wparam(2));
++ &mov ($key,&wparam(3));
++ &mov ($rounds,&wparam(4)); # enc flag; $rounds is reused for the round count below
++ &cmp ($len,16);
++ &jb (&label("ecb_ret")); # less than one block: nothing to do
++ &and ($len,-16); # round length down to whole 16-byte blocks
++ &test ($rounds,$rounds); # zero enc flag selects decrypt; result consumed by jz below
++ &mov ($rounds,&DWP(240,$key));
++ &mov ($key_,$key); # backup $key
++ &mov ($rounds_,$rounds); # backup $rounds
++ &jz (&label("ecb_decrypt"));
++
++ &sub ($len,0x40); # bias $len by -0x40; ecb_enc_tail adds it back
++ &jbe (&label("ecb_enc_tail")); # 0x40 bytes or fewer go straight to the tail code
++ &jmp (&label("ecb_enc_loop3"));
++
++&set_label("ecb_enc_loop3",16);
++ &movups ($inout0,&QWP(0,$inp));
++ &movups ($inout1,&QWP(0x10,$inp));
++ &movups ($inout2,&QWP(0x20,$inp));
++ &call ("_aesni_encrypt3");
++ &sub ($len,0x30);
++ &lea ($inp,&DWP(0x30,$inp));
++ &lea ($out,&DWP(0x30,$out));
++ &movups (&QWP(-0x30,$out),$inout0);
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(-0x20,$out),$inout1);
++ &mov ($rounds,$rounds_); # restore $rounds
++ &movups (&QWP(-0x10,$out),$inout2);
++ &ja (&label("ecb_enc_loop3"));
++
++&set_label("ecb_enc_tail");
++ &add ($len,0x40); # undo the -0x40 bias applied before the main loop
++ &jz (&label("ecb_ret"));
++
++ &cmp ($len,0x10);
++ &movups ($inout0,&QWP(0,$inp));
++ &je (&label("ecb_enc_one")); # exactly one block left
++ &cmp ($len,0x20);
++ &movups ($inout1,&QWP(0x10,$inp));
++ &je (&label("ecb_enc_two")); # exactly two blocks left
++ &cmp ($len,0x30);
++ &movups ($inout2,&QWP(0x20,$inp));
++ &je (&label("ecb_enc_three")); # exactly three blocks left
++ &movups ($inout3,&QWP(0x30,$inp)); # otherwise four blocks remain
++ &call ("_aesni_encrypt4");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movups (&QWP(0x30,$out),$inout3);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_enc_one",16);
++ &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
++ &movups (&QWP(0,$out),$inout0);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_enc_two",16);
++ &call ("_aesni_encrypt3");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_enc_three",16);
++ &call ("_aesni_encrypt3");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_decrypt",16);
++ &sub ($len,0x40);
++ &jbe (&label("ecb_dec_tail"));
++ &jmp (&label("ecb_dec_loop3"));
++
++&set_label("ecb_dec_loop3",16);
++ &movups ($inout0,&QWP(0,$inp));
++ &movups ($inout1,&QWP(0x10,$inp));
++ &movups ($inout2,&QWP(0x20,$inp));
++ &call ("_aesni_decrypt3");
++ &sub ($len,0x30);
++ &lea ($inp,&DWP(0x30,$inp));
++ &lea ($out,&DWP(0x30,$out));
++ &movups (&QWP(-0x30,$out),$inout0);
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(-0x20,$out),$inout1);
++ &mov ($rounds,$rounds_); # restore $rounds
++ &movups (&QWP(-0x10,$out),$inout2);
++ &ja (&label("ecb_dec_loop3"));
++
++&set_label("ecb_dec_tail");
++ &add ($len,0x40);
++ &jz (&label("ecb_ret"));
++
++ &cmp ($len,0x10);
++ &movups ($inout0,&QWP(0,$inp));
++ &je (&label("ecb_dec_one"));
++ &cmp ($len,0x20);
++ &movups ($inout1,&QWP(0x10,$inp));
++ &je (&label("ecb_dec_two"));
++ &cmp ($len,0x30);
++ &movups ($inout2,&QWP(0x20,$inp));
++ &je (&label("ecb_dec_three"));
++ &movups ($inout3,&QWP(0x30,$inp));
++ &call ("_aesni_decrypt4");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movups (&QWP(0x30,$out),$inout3);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_dec_one",16);
++ &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
++ &movups (&QWP(0,$out),$inout0); # store the single decrypted block
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_dec_two",16);
++ &call ("_aesni_decrypt3");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_dec_three",16);
++ &call ("_aesni_decrypt3");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++
++&set_label("ecb_ret");
++&function_end("aesni_ecb_encrypt");
++}
++
++# void $PREFIX_cbc_encrypt (const void *inp, void *out,
++# size_t length, const AES_KEY *key,
++# unsigned char *ivp,const int enc);
++&function_begin("${PREFIX}_cbc_encrypt");
++ &mov ($inp,&wparam(0));
++ &mov ($out,&wparam(1));
++ &mov ($len,&wparam(2));
++ &mov ($key,&wparam(3));
++ &test ($len,$len);
++ &mov ($key_,&wparam(4));
++ &jz (&label("cbc_ret"));
++
++ &cmp (&wparam(5),0);
++ &movups ($ivec,&QWP(0,$key_)); # load IV
++ &mov ($rounds,&DWP(240,$key));
++ &mov ($key_,$key); # backup $key
++ &mov ($rounds_,$rounds); # backup $rounds
++ &je (&label("cbc_decrypt"));
++
++ &movaps ($inout0,$ivec);
++ &cmp ($len,16);
++ &jb (&label("cbc_enc_tail"));
++ &sub ($len,16);
++ &jmp (&label("cbc_enc_loop"));
++
++&set_label("cbc_enc_loop",16);
++ &movups ($ivec,&QWP(0,$inp)); # $ivec is used as scratch for the next input block
++ &lea ($inp,&DWP(16,$inp));
++ &pxor ($inout0,$ivec); # $inout0 holds the IV on entry, then the previous output block
++ &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
++ &sub ($len,16); # carry from this subtraction is what jnc tests below
++ &lea ($out,&DWP(16,$out));
++ &mov ($rounds,$rounds_); # restore $rounds
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(-16,$out),$inout0);
++ &jnc (&label("cbc_enc_loop"));
++ &add ($len,16); # undo the last subtraction; non-zero means a partial block remains
++ &jnz (&label("cbc_enc_tail"));
++ &movaps ($ivec,$inout0); # last output block becomes the IV written back at cbc_ret
++ &jmp (&label("cbc_ret"));
++
++&set_label("cbc_enc_tail");
++ &mov ("ecx",$len); # zaps $rounds
++ &data_word(0xA4F3F689); # rep movsb
++ &mov ("ecx",16); # zero tail
++ &sub ("ecx",$len);
++ &xor ("eax","eax"); # zaps $len
++ &data_word(0xAAF3F689); # rep stosb
++ &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
++ &mov ($rounds,$rounds_); # restore $rounds
++ &mov ($inp,$out); # $inp and $out are the same
++ &mov ($key,$key_); # restore $key
++ &jmp (&label("cbc_enc_loop"));
++
++&set_label("cbc_decrypt",16);
++ &sub ($len,0x40);
++ &jbe (&label("cbc_dec_tail"));
++ &jmp (&label("cbc_dec_loop3"));
++
++&set_label("cbc_dec_loop3",16);
++ &movups ($inout0,&QWP(0,$inp));
++ &movups ($inout1,&QWP(0x10,$inp));
++ &movups ($inout2,&QWP(0x20,$inp));
++ &movaps ($in0,$inout0); # keep input blocks 0 and 1: they are XORed into outputs 1 and 2 below
++ &movaps ($in1,$inout1);
++ &call ("_aesni_decrypt3");
++ &sub ($len,0x30); # flags from here are consumed by ja at the bottom of the loop
++ &lea ($inp,&DWP(0x30,$inp));
++ &lea ($out,&DWP(0x30,$out));
++ &pxor ($inout0,$ivec);
++ &pxor ($inout1,$in0);
++ &movups ($ivec,&QWP(-0x10,$inp)); # third input block of this batch becomes the next IV
++ &pxor ($inout2,$in1);
++ &movups (&QWP(-0x30,$out),$inout0);
++ &mov ($rounds,$rounds_); # restore $rounds
++ &movups (&QWP(-0x20,$out),$inout1);
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(-0x10,$out),$inout2);
++ &ja (&label("cbc_dec_loop3"));
++
++&set_label("cbc_dec_tail");
++ &add ($len,0x40);
++ &jz (&label("cbc_ret"));
++
++ &movups ($inout0,&QWP(0,$inp));
++ &cmp ($len,0x10);
++ &movaps ($in0,$inout0);
++ &jbe (&label("cbc_dec_one"));
++ &movups ($inout1,&QWP(0x10,$inp));
++ &cmp ($len,0x20);
++ &movaps ($in1,$inout1);
++ &jbe (&label("cbc_dec_two"));
++ &movups ($inout2,&QWP(0x20,$inp));
++ &cmp ($len,0x30);
++ &jbe (&label("cbc_dec_three"));
++ &movups ($inout3,&QWP(0x30,$inp));
++ &call ("_aesni_decrypt4");
++ &movups ($rndkey0,&QWP(0x10,$inp));
++ &movups ($rndkey1,&QWP(0x20,$inp));
++ &pxor ($inout0,$ivec);
++ &pxor ($inout1,$in0);
++ &movups ($ivec,&QWP(0x30,$inp));
++ &movups (&QWP(0,$out),$inout0);
++ &pxor ($inout2,$rndkey0);
++ &pxor ($inout3,$rndkey1);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movaps ($inout0,$inout3);
++ &lea ($out,&DWP(0x30,$out));
++ &jmp (&label("cbc_dec_tail_collected"));
++
++&set_label("cbc_dec_one");
++ &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
++ &pxor ($inout0,$ivec); # XOR the current IV into the decrypted block
++ &movaps ($ivec,$in0); # the saved input block becomes the IV written back at cbc_ret
++ &jmp (&label("cbc_dec_tail_collected"));
++
++&set_label("cbc_dec_two");
++ &call ("_aesni_decrypt3");
++ &pxor ($inout0,$ivec);
++ &pxor ($inout1,$in0);
++ &movups (&QWP(0,$out),$inout0);
++ &movaps ($inout0,$inout1);
++ &movaps ($ivec,$in1);
++ &lea ($out,&DWP(0x10,$out));
++ &jmp (&label("cbc_dec_tail_collected"));
++
++&set_label("cbc_dec_three");
++ &call ("_aesni_decrypt3");
++ &pxor ($inout0,$ivec);
++ &pxor ($inout1,$in0);
++ &pxor ($inout2,$in1);
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movaps ($inout0,$inout2);
++ &movups ($ivec,&QWP(0x20,$inp));
++ &lea ($out,&DWP(0x20,$out));
++
++&set_label("cbc_dec_tail_collected");
++ &and ($len,15);
++ &jnz (&label("cbc_dec_tail_partial"));
++ &movups (&QWP(0,$out),$inout0);
++ &jmp (&label("cbc_ret"));
++
++&set_label("cbc_dec_tail_partial");
++ &mov ($key_,"esp");
++ &sub ("esp",16);
++ &and ("esp",-16);
++ &movaps (&QWP(0,"esp"),$inout0);
++ &mov ($inp,"esp");
++ &mov ("ecx",$len);
++ &data_word(0xA4F3F689); # rep movsb
++ &mov ("esp",$key_);
++
++&set_label("cbc_ret");
++ &mov ($key_,&wparam(4));
++ &movups (&QWP(0,$key_),$ivec); # output IV
++&function_end("${PREFIX}_cbc_encrypt");
++
++# Mechanical port from aesni-x86_64.pl.
++#
++# _aesni_set_encrypt_key is private interface,
++# input:
++# "eax" const unsigned char *userKey
++# $rounds int bits
++# $key AES_KEY *key
++# output:
++# "eax" return code
++# $rounds rounds
++
++&function_begin_B("_aesni_set_encrypt_key");
++ &test ("eax","eax");
++ &jz (&label("bad_pointer"));
++ &test ($key,$key);
++ &jz (&label("bad_pointer"));
++
++ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
++ &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
++ &lea ($key,&DWP(16,$key));
++ &cmp ($rounds,256);
++ &je (&label("14rounds"));
++ &cmp ($rounds,192);
++ &je (&label("12rounds"));
++ &cmp ($rounds,128);
++ &jne (&label("bad_keybits"));
++
++&set_label("10rounds",16);
++ &mov ($rounds,9);
++ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
++ &aeskeygenassist("xmm1","xmm0",0x01); # round 1
++ &call (&label("key_128_cold"));
++ &aeskeygenassist("xmm1","xmm0",0x2); # round 2
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x04); # round 3
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x08); # round 4
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x10); # round 5
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x20); # round 6
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x40); # round 7
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x80); # round 8
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
++ &call (&label("key_128"));
++ &aeskeygenassist("xmm1","xmm0",0x36); # round 10
++ &call (&label("key_128"));
++ &$movekey (&QWP(0,$key),"xmm0");
++ &mov (&DWP(80,$key),$rounds);
++ &xor ("eax","eax");
++ &ret();
++
++&set_label("key_128",16);
++ &$movekey (&QWP(0,$key),"xmm0");
++ &lea ($key,&DWP(16,$key));
++&set_label("key_128_cold");
++ &shufps ("xmm4","xmm0",0b00010000);
++ &pxor ("xmm0","xmm4");
++ &shufps ("xmm4","xmm0",0b10001100,);
++ &pxor ("xmm0","xmm4");
++ &pshufd ("xmm1","xmm1",0b11111111); # critical path
++ &pxor ("xmm0","xmm1");
++ &ret();
++
++&set_label("12rounds",16);
++ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
++ &mov ($rounds,11); # value stored into the key structure at the end of this section
++ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
++ &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
++ &call (&label("key_192a_cold"));
++ &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
++ &call (&label("key_192b"));
++ &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
++ &call (&label("key_192a"));
++ &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
++ &call (&label("key_192b"));
++ &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
++ &call (&label("key_192a"));
++ &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
++ &call (&label("key_192b"));
++ &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
++ &call (&label("key_192a"));
++ &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
++ &call (&label("key_192b"));
++ &$movekey (&QWP(0,$key),"xmm0"); # store the final round key
++ &mov (&DWP(48,$key),$rounds); # record the round count in the key structure
++ &xor ("eax","eax"); # eax = 0 return code
++ &ret();
++
++&set_label("key_192a",16);
++ &$movekey (&QWP(0,$key),"xmm0");
++ &lea ($key,&DWP(16,$key));
++&set_label("key_192a_cold",16);
++ &movaps ("xmm5","xmm2");
++&set_label("key_192b_warm");
++ &shufps ("xmm4","xmm0",0b00010000);
++ &movaps ("xmm3","xmm2");
++ &pxor ("xmm0","xmm4");
++ &shufps ("xmm4","xmm0",0b10001100);
++ &pslldq ("xmm3",4);
++ &pxor ("xmm0","xmm4");
++ &pshufd ("xmm1","xmm1",0b01010101); # critical path
++ &pxor ("xmm2","xmm3");
++ &pxor ("xmm0","xmm1");
++ &pshufd ("xmm3","xmm0",0b11111111);
++ &pxor ("xmm2","xmm3");
++ &ret();
++
++&set_label("key_192b",16);
++ &movaps ("xmm3","xmm0");
++ &shufps ("xmm5","xmm0",0b01000100);
++ &$movekey (&QWP(0,$key),"xmm5");
++ &shufps ("xmm3","xmm2",0b01001110);
++ &$movekey (&QWP(16,$key),"xmm3");
++ &lea ($key,&DWP(32,$key));
++ &jmp (&label("key_192b_warm"));
++
++&set_label("14rounds",16);
++ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
++ &mov ($rounds,13);
++ &lea ($key,&DWP(16,$key));
++ &$movekey (&QWP(-32,$key),"xmm0"); # round 0
++ &$movekey (&QWP(-16,$key),"xmm2"); # round 1
++ &aeskeygenassist("xmm1","xmm2",0x01); # round 2
++ &call (&label("key_256a_cold"));
++ &aeskeygenassist("xmm1","xmm0",0x01); # round 3
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x02); # round 4
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x02); # round 5
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x04); # round 6
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x04); # round 7
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x08); # round 8
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x08); # round 9
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x10); # round 10
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x10); # round 11
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x20); # round 12
++ &call (&label("key_256a"));
++ &aeskeygenassist("xmm1","xmm0",0x20); # round 13
++ &call (&label("key_256b"));
++ &aeskeygenassist("xmm1","xmm2",0x40); # round 14
++ &call (&label("key_256a"));
++ &$movekey (&QWP(0,$key),"xmm0");
++ &mov (&DWP(16,$key),$rounds);
++ &xor ("eax","eax");
++ &ret();
++
++&set_label("key_256a",16);
++ &$movekey (&QWP(0,$key),"xmm2");
++ &lea ($key,&DWP(16,$key));
++&set_label("key_256a_cold");
++ &shufps ("xmm4","xmm0",0b00010000);
++ &pxor ("xmm0","xmm4");
++ &shufps ("xmm4","xmm0",0b10001100);
++ &pxor ("xmm0","xmm4");
++ &pshufd ("xmm1","xmm1",0b11111111); # critical path
++ &pxor ("xmm0","xmm1");
++ &ret();
++
++&set_label("key_256b",16);
++ &$movekey (&QWP(0,$key),"xmm0");
++ &lea ($key,&DWP(16,$key));
++
++ &shufps ("xmm4","xmm2",0b00010000);
++ &pxor ("xmm2","xmm4");
++ &shufps ("xmm4","xmm2",0b10001100);
++ &pxor ("xmm2","xmm4");
++ &pshufd ("xmm1","xmm1",0b10101010); # critical path
++ &pxor ("xmm2","xmm1");
++ &ret();
++
++&set_label("bad_pointer",4);
++ &mov ("eax",-1);
++ &ret ();
++&set_label("bad_keybits",4);
++ &mov ("eax",-2);
++ &ret ();
++&function_end_B("_aesni_set_encrypt_key");
++
++# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
++# AES_KEY *key)
++&function_begin_B("${PREFIX}_set_encrypt_key");
++ &mov ("eax",&wparam(0));
++ &mov ($rounds,&wparam(1));
++ &mov ($key,&wparam(2));
++ &call ("_aesni_set_encrypt_key");
++ &ret ();
++&function_end_B("${PREFIX}_set_encrypt_key");
++
++# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
++# AES_KEY *key)
++&function_begin_B("${PREFIX}_set_decrypt_key");
++ &mov ("eax",&wparam(0)); # eax = userKey
++ &mov ($rounds,&wparam(1)); # $rounds = bits on entry to _aesni_set_encrypt_key
++ &mov ($key,&wparam(2));
++ &call ("_aesni_set_encrypt_key");
++ &mov ($key,&wparam(2)); # reload: the helper advances $key
++ &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
++ &test ("eax","eax");
++ &jnz (&label("dec_key_ret")); # non-zero return code is passed back unchanged
++ &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
++
++ &$movekey ("xmm0",&QWP(0,$key)); # just swap
++ &$movekey ("xmm1",&QWP(0,"eax"));
++ &$movekey (&QWP(0,"eax"),"xmm0");
++ &$movekey (&QWP(0,$key),"xmm1");
++ &lea ($key,&DWP(16,$key)); # move both cursors one round key toward the middle
++ &lea ("eax",&DWP(-16,"eax"));
++
++&set_label("dec_key_inverse");
++ &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
++ &$movekey ("xmm1",&QWP(0,"eax"));
++ &aesimc ("xmm0","xmm0");
++ &aesimc ("xmm1","xmm1");
++ &lea ($key,&DWP(16,$key));
++ &lea ("eax",&DWP(-16,"eax"));
++ &cmp ("eax",$key);
++ &$movekey (&QWP(16,"eax"),"xmm0");
++ &$movekey (&QWP(-16,$key),"xmm1");
++ &ja (&label("dec_key_inverse"));
++
++ &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
++ &aesimc ("xmm0","xmm0");
++ &$movekey (&QWP(0,$key),"xmm0");
++
++ &xor ("eax","eax"); # return success
++&set_label("dec_key_ret");
++ &ret ();
++&function_end_B("${PREFIX}_set_decrypt_key");
++&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
++
++&asm_finish();
+diff -up openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86_64.pl.aesni openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86_64.pl
+--- openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86_64.pl.aesni 2010-01-12 22:18:06.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/aes/asm/aesni-x86_64.pl 2010-01-12 22:18:06.000000000 +0100
+@@ -0,0 +1,991 @@
++#!/usr/bin/env perl
++#
++# ====================================================================
++# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++#
++# This module implements support for Intel AES-NI extension. In
++# OpenSSL context it's used with Intel engine, but can also be used as
++# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
++# details].
++
++$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
++ # generates drop-in replacement for
++ # crypto/aes/asm/aes-x86_64.pl:-)
++
++$flavour = shift;
++$output = shift;
++if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
++
++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
++die "can't locate x86_64-xlate.pl";
++
++open STDOUT,"| $^X $xlate $flavour $output";
++
++$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
++@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
++ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
++
++$code=".text\n";
++
++$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
++# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
++$inp="%rdi";
++$out="%rsi";
++$len="%rdx";
++$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
++$ivp="%r8"; # cbc
++
++$rnds_="%r10d"; # backup copy for $rounds
++$key_="%r11"; # backup copy for $key
++
++# %xmm register layout
++$inout0="%xmm0"; $inout1="%xmm1";
++$inout2="%xmm2"; $inout3="%xmm3";
++$rndkey0="%xmm4"; $rndkey1="%xmm5";
++
++$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt
++$in1="%xmm8"; $in2="%xmm9";
++\f
++# Inline version of internal aesni_[en|de]crypt1.
++#
++# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
++# cycles which take care of loop variables...
++{ my $sn;
++sub aesni_generate1 {
++my ($p,$key,$rounds)=@_;
++++$sn;
++$code.=<<___;
++ $movkey ($key),$rndkey0
++ $movkey 16($key),$rndkey1
++ lea 32($key),$key
++ pxor $rndkey0,$inout0
++.Loop_${p}1_$sn:
++ aes${p} $rndkey1,$inout0
++ dec $rounds
++ $movkey ($key),$rndkey1
++ lea 16($key),$key
++ jnz .Loop_${p}1_$sn # loop body is 16 bytes
++ aes${p}last $rndkey1,$inout0
++___
++}}
++# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
++#
++# Single-block entry points. The three arguments are bound directly to the
++# first three registers of @_4args, so the same generated body serves both
++# the Win64 and the Unix argument order. Each routine loads one 16-byte
++# block, reads the round counter from offset 240 of the key structure,
++# runs the inline aesni_generate1 sequence and stores the result.
++{ my ($inp,$out,$key) = @_4args;
++
++$code.=<<___;
++.globl ${PREFIX}_encrypt
++.type ${PREFIX}_encrypt,\@abi-omnipotent
++.align 16
++${PREFIX}_encrypt:
++ movups ($inp),$inout0 # load input
++ mov 240($key),$rounds # pull $rounds
++___
++ &aesni_generate1("enc",$key,$rounds);
++$code.=<<___;
++ movups $inout0,($out) # output
++ ret
++.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
++
++.globl ${PREFIX}_decrypt
++.type ${PREFIX}_decrypt,\@abi-omnipotent
++.align 16
++${PREFIX}_decrypt:
++ movups ($inp),$inout0 # load input
++ mov 240($key),$rounds # pull $rounds
++___
++ &aesni_generate1("dec",$key,$rounds);
++$code.=<<___;
++ movups $inout0,($out) # output
++ ret
++.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
++___
++}
++\f
++# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
++# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
++# latency is 6, it turned out that it can be scheduled only every
++# *second* cycle. Thus 3x interleave is the one providing optimal
++# utilization, i.e. when subroutine's throughput is virtually same as
++# of non-interleaved subroutine [for number of input blocks up to 3].
++# This is why it makes no sense to implement 2x subroutine. As soon
++# as/if Intel improves throughput by making it possible to schedule
++# the instructions in question *every* cycle, I would have to
++# implement 6x interleave and use it in loop...
++#
++# aesni_generate3($dir) appends the private _aesni_${dir}rypt3 subroutine
++# to $code; $dir is "enc" or "dec". The counter in $rounds is halved on
++# entry because every loop iteration applies two round keys ($rndkey1,
++# then $rndkey0) to each of the three blocks.
++sub aesni_generate3 {
++my $dir=shift;
++# As already mentioned it takes in $key and $rounds, which are *not*
++# preserved. $inout[0-2] is cipher/clear text...
++$code.=<<___;
++.type _aesni_${dir}rypt3,\@abi-omnipotent
++.align 16
++_aesni_${dir}rypt3:
++ $movkey ($key),$rndkey0
++ shr \$1,$rounds
++ $movkey 16($key),$rndkey1
++ lea 32($key),$key
++ pxor $rndkey0,$inout0
++ pxor $rndkey0,$inout1
++ pxor $rndkey0,$inout2
++
++.L${dir}_loop3:
++ aes${dir} $rndkey1,$inout0
++ $movkey ($key),$rndkey0
++ aes${dir} $rndkey1,$inout1
++ dec $rounds
++ aes${dir} $rndkey1,$inout2
++ aes${dir} $rndkey0,$inout0
++ $movkey 16($key),$rndkey1
++ aes${dir} $rndkey0,$inout1
++ lea 32($key),$key
++ aes${dir} $rndkey0,$inout2
++ jnz .L${dir}_loop3
++
++ aes${dir} $rndkey1,$inout0
++ $movkey ($key),$rndkey0
++ aes${dir} $rndkey1,$inout1
++ aes${dir} $rndkey1,$inout2
++ aes${dir}last $rndkey0,$inout0
++ aes${dir}last $rndkey0,$inout1
++ aes${dir}last $rndkey0,$inout2
++ ret
++.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
++___
++}
++# 4x interleave is implemented to improve small block performance,
++# most notably [and naturally] 4 block by ~30%. One can argue that one
++# should have implemented 5x as well, but improvement would be <20%,
++# so it's not worth it...
++#
++# aesni_generate4($dir) appends the private _aesni_${dir}rypt4 subroutine
++# to $code; $dir is "enc" or "dec". It has the same shape as the 3x
++# variant with a fourth block carried in $inout3.
++sub aesni_generate4 {
++my $dir=shift;
++# As already mentioned it takes in $key and $rounds, which are *not*
++# preserved. $inout[0-3] is cipher/clear text...
++$code.=<<___;
++.type _aesni_${dir}rypt4,\@abi-omnipotent
++.align 16
++_aesni_${dir}rypt4:
++ $movkey ($key),$rndkey0
++ shr \$1,$rounds
++ $movkey 16($key),$rndkey1
++ lea 32($key),$key
++ pxor $rndkey0,$inout0
++ pxor $rndkey0,$inout1
++ pxor $rndkey0,$inout2
++ pxor $rndkey0,$inout3
++
++.L${dir}_loop4:
++ aes${dir} $rndkey1,$inout0
++ $movkey ($key),$rndkey0
++ aes${dir} $rndkey1,$inout1
++ dec $rounds
++ aes${dir} $rndkey1,$inout2
++ aes${dir} $rndkey1,$inout3
++ aes${dir} $rndkey0,$inout0
++ $movkey 16($key),$rndkey1
++ aes${dir} $rndkey0,$inout1
++ lea 32($key),$key
++ aes${dir} $rndkey0,$inout2
++ aes${dir} $rndkey0,$inout3
++ jnz .L${dir}_loop4
++
++ aes${dir} $rndkey1,$inout0
++ $movkey ($key),$rndkey0
++ aes${dir} $rndkey1,$inout1
++ aes${dir} $rndkey1,$inout2
++ aes${dir} $rndkey1,$inout3
++ aes${dir}last $rndkey0,$inout0
++ aes${dir}last $rndkey0,$inout1
++ aes${dir}last $rndkey0,$inout2
++ aes${dir}last $rndkey0,$inout3
++ ret
++.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
++___
++}
++# Emit the private 3x and 4x interleaved helpers, in that order. The
++# decrypt variant is always generated; the encrypt variant only when the
++# script runs under the "aesni" prefix.
++foreach my $generate (\&aesni_generate3,\&aesni_generate4) {
++ $generate->("enc") if ($PREFIX eq "aesni");
++ $generate->("dec");
++}
++\f
++if ($PREFIX eq "aesni") {
++# void aesni_ecb_encrypt (const void *in, void *out,
++# size_t length, const AES_KEY *key,
++# int enc);
++#
++# Generated only when $PREFIX is "aesni". Only whole 16-byte blocks are
++# processed: a length below 16 returns at once and a trailing partial
++# block is dropped by the "and -16" mask. A non-zero 5th argument selects
++# encryption, zero selects decryption. The main loops handle three blocks
++# per iteration through _aesni_[en|de]crypt3 while more than four blocks
++# remain; the tail code then finishes the last one to four blocks.
++$code.=<<___;
++.globl aesni_ecb_encrypt
++.type aesni_ecb_encrypt,\@function,5
++.align 16
++aesni_ecb_encrypt:
++ cmp \$16,$len # check length
++ jb .Lecb_ret
++
++ mov 240($key),$rounds # pull $rounds
++ and \$-16,$len
++ mov $key,$key_ # backup $key
++ test %r8d,%r8d # 5th argument
++ mov $rounds,$rnds_ # backup $rounds
++ jz .Lecb_decrypt
++#--------------------------- ECB ENCRYPT ------------------------------#
++ sub \$0x40,$len
++ jbe .Lecb_enc_tail
++ jmp .Lecb_enc_loop3
++.align 16
++.Lecb_enc_loop3:
++ movups ($inp),$inout0
++ movups 0x10($inp),$inout1
++ movups 0x20($inp),$inout2
++ call _aesni_encrypt3
++ sub \$0x30,$len
++ lea 0x30($inp),$inp
++ lea 0x30($out),$out
++ movups $inout0,-0x30($out)
++ mov $rnds_,$rounds # restore $rounds
++ movups $inout1,-0x20($out)
++ mov $key_,$key # restore $key
++ movups $inout2,-0x10($out)
++ ja .Lecb_enc_loop3
++
++.Lecb_enc_tail:
++ add \$0x40,$len
++ jz .Lecb_ret
++
++ cmp \$0x10,$len
++ movups ($inp),$inout0
++ je .Lecb_enc_one
++ cmp \$0x20,$len
++ movups 0x10($inp),$inout1
++ je .Lecb_enc_two
++ cmp \$0x30,$len
++ movups 0x20($inp),$inout2
++ je .Lecb_enc_three
++ movups 0x30($inp),$inout3
++ call _aesni_encrypt4
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_enc_one:
++___
++ &aesni_generate1("enc",$key,$rounds);
++$code.=<<___;
++ movups $inout0,($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_enc_two:
++ call _aesni_encrypt3
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_enc_three:
++ call _aesni_encrypt3
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ jmp .Lecb_ret
++\f#--------------------------- ECB DECRYPT ------------------------------#
++.align 16
++.Lecb_decrypt:
++ sub \$0x40,$len
++ jbe .Lecb_dec_tail
++ jmp .Lecb_dec_loop3
++.align 16
++.Lecb_dec_loop3:
++ movups ($inp),$inout0
++ movups 0x10($inp),$inout1
++ movups 0x20($inp),$inout2
++ call _aesni_decrypt3
++ sub \$0x30,$len
++ lea 0x30($inp),$inp
++ lea 0x30($out),$out
++ movups $inout0,-0x30($out)
++ mov $rnds_,$rounds # restore $rounds
++ movups $inout1,-0x20($out)
++ mov $key_,$key # restore $key
++ movups $inout2,-0x10($out)
++ ja .Lecb_dec_loop3
++
++.Lecb_dec_tail:
++ add \$0x40,$len
++ jz .Lecb_ret
++
++ cmp \$0x10,$len
++ movups ($inp),$inout0
++ je .Lecb_dec_one
++ cmp \$0x20,$len
++ movups 0x10($inp),$inout1
++ je .Lecb_dec_two
++ cmp \$0x30,$len
++ movups 0x20($inp),$inout2
++ je .Lecb_dec_three
++ movups 0x30($inp),$inout3
++ call _aesni_decrypt4
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_dec_one:
++___
++ &aesni_generate1("dec",$key,$rounds);
++$code.=<<___;
++ movups $inout0,($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_dec_two:
++ call _aesni_decrypt3
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_dec_three:
++ call _aesni_decrypt3
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++
++.Lecb_ret:
++ ret
++.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
++___
++}
++\f
++# void $PREFIX_cbc_encrypt (const void *inp, void *out,
++# size_t length, const AES_KEY *key,
++# unsigned char *ivp,const int enc);
++#
++# A zero length returns immediately; a non-zero 6th argument selects
++# encryption. Encryption handles one block per iteration with the inline
++# aesni_generate1 sequence; a trailing partial block is copied to the
++# output buffer, zero-padded to 16 bytes and encrypted in place by one
++# more spin of the loop. Decryption handles three blocks per iteration
++# through _aesni_decrypt3 and copies a trailing partial block out through
++# a 16-byte stack slot. Both directions store the next IV through $ivp.
++#
++# $reserved is the %rsp-relative offset of that stack slot: 0x40 on Win64,
++# i.e. just above the four %xmm save slots of the 0x58-byte frame set up
++# at .Lcbc_decrypt, and -0x18 (below %rsp, no frame is set up) otherwise.
++$reserved = $win64?0x40:-0x18; # used in decrypt
++$code.=<<___;
++.globl ${PREFIX}_cbc_encrypt
++.type ${PREFIX}_cbc_encrypt,\@function,6
++.align 16
++${PREFIX}_cbc_encrypt:
++ test $len,$len # check length
++ jz .Lcbc_ret
++
++ mov 240($key),$rnds_ # pull $rounds
++ mov $key,$key_ # backup $key
++ test %r9d,%r9d # 6th argument
++ jz .Lcbc_decrypt
++#--------------------------- CBC ENCRYPT ------------------------------#
++ movups ($ivp),$inout0 # load iv as initial state
++ cmp \$16,$len
++ mov $rnds_,$rounds
++ jb .Lcbc_enc_tail
++ sub \$16,$len
++ jmp .Lcbc_enc_loop
++.align 16
++.Lcbc_enc_loop:
++ movups ($inp),$inout1 # load input
++ lea 16($inp),$inp
++ pxor $inout1,$inout0
++___
++ &aesni_generate1("enc",$key,$rounds);
++$code.=<<___;
++ sub \$16,$len
++ lea 16($out),$out
++ mov $rnds_,$rounds # restore $rounds
++ mov $key_,$key # restore $key
++ movups $inout0,-16($out) # store output
++ jnc .Lcbc_enc_loop
++ add \$16,$len
++ jnz .Lcbc_enc_tail
++ movups $inout0,($ivp)
++ jmp .Lcbc_ret
++
++.Lcbc_enc_tail:
++ mov $len,%rcx # zaps $key
++ xchg $inp,$out # $inp is %rsi and $out is %rdi now
++ .long 0x9066A4F3 # rep movsb
++ mov \$16,%ecx # zero tail
++ sub $len,%rcx
++ xor %eax,%eax
++ .long 0x9066AAF3 # rep stosb
++ lea -16(%rdi),%rdi # rewind $out by 1 block
++ mov $rnds_,$rounds # restore $rounds
++ mov %rdi,%rsi # $inp and $out are the same
++ mov $key_,$key # restore $key
++ xor $len,$len # len=16
++ jmp .Lcbc_enc_loop # one more spin
++\f#--------------------------- CBC DECRYPT ------------------------------#
++.align 16
++.Lcbc_decrypt:
++___
++$code.=<<___ if ($win64);
++ lea -0x58(%rsp),%rsp
++ movaps %xmm6,(%rsp)
++ movaps %xmm7,0x10(%rsp)
++ movaps %xmm8,0x20(%rsp)
++ movaps %xmm9,0x30(%rsp)
++.Lcbc_decrypt_body:
++___
++$code.=<<___;
++ movups ($ivp),$iv
++ sub \$0x40,$len
++ mov $rnds_,$rounds
++ jbe .Lcbc_dec_tail
++ jmp .Lcbc_dec_loop3
++.align 16
++.Lcbc_dec_loop3:
++ movups ($inp),$inout0
++ movups 0x10($inp),$inout1
++ movups 0x20($inp),$inout2
++ movaps $inout0,$in0
++ movaps $inout1,$in1
++ movaps $inout2,$in2
++ call _aesni_decrypt3
++ sub \$0x30,$len
++ lea 0x30($inp),$inp
++ lea 0x30($out),$out
++ pxor $iv,$inout0
++ pxor $in0,$inout1
++ movaps $in2,$iv
++ pxor $in1,$inout2
++ movups $inout0,-0x30($out)
++ mov $rnds_,$rounds # restore $rounds
++ movups $inout1,-0x20($out)
++ mov $key_,$key # restore $key
++ movups $inout2,-0x10($out)
++ ja .Lcbc_dec_loop3
++
++.Lcbc_dec_tail:
++ add \$0x40,$len
++ movups $iv,($ivp)
++ jz .Lcbc_dec_ret
++
++ movups ($inp),$inout0
++ cmp \$0x10,$len
++ movaps $inout0,$in0
++ jbe .Lcbc_dec_one
++ movups 0x10($inp),$inout1
++ cmp \$0x20,$len
++ movaps $inout1,$in1
++ jbe .Lcbc_dec_two
++ movups 0x20($inp),$inout2
++ cmp \$0x30,$len
++ movaps $inout2,$in2
++ jbe .Lcbc_dec_three
++ movups 0x30($inp),$inout3
++ call _aesni_decrypt4
++ pxor $iv,$inout0
++ movups 0x30($inp),$iv
++ pxor $in0,$inout1
++ movups $inout0,($out)
++ pxor $in1,$inout2
++ movups $inout1,0x10($out)
++ pxor $in2,$inout3
++ movups $inout2,0x20($out)
++ movaps $inout3,$inout0
++ lea 0x30($out),$out
++ jmp .Lcbc_dec_tail_collected
++.align 16
++.Lcbc_dec_one:
++___
++ &aesni_generate1("dec",$key,$rounds);
++$code.=<<___;
++ pxor $iv,$inout0
++ movaps $in0,$iv
++ jmp .Lcbc_dec_tail_collected
++.align 16
++.Lcbc_dec_two:
++ call _aesni_decrypt3
++ pxor $iv,$inout0
++ pxor $in0,$inout1
++ movups $inout0,($out)
++ movaps $in1,$iv
++ movaps $inout1,$inout0
++ lea 0x10($out),$out
++ jmp .Lcbc_dec_tail_collected
++.align 16
++.Lcbc_dec_three:
++ call _aesni_decrypt3
++ pxor $iv,$inout0
++ pxor $in0,$inout1
++ movups $inout0,($out)
++ pxor $in1,$inout2
++ movups $inout1,0x10($out)
++ movaps $in2,$iv
++ movaps $inout2,$inout0
++ lea 0x20($out),$out
++ jmp .Lcbc_dec_tail_collected
++.align 16
++.Lcbc_dec_tail_collected:
++ and \$15,$len
++ movups $iv,($ivp)
++ jnz .Lcbc_dec_tail_partial
++ movups $inout0,($out)
++ jmp .Lcbc_dec_ret
++.Lcbc_dec_tail_partial:
++ movaps $inout0,$reserved(%rsp)
++ mov $out,%rdi
++ mov $len,%rcx
++ lea $reserved(%rsp),%rsi
++ .long 0x9066A4F3 # rep movsb
++
++.Lcbc_dec_ret:
++___
++$code.=<<___ if ($win64);
++ movaps (%rsp),%xmm6
++ movaps 0x10(%rsp),%xmm7
++ movaps 0x20(%rsp),%xmm8
++ movaps 0x30(%rsp),%xmm9
++ lea 0x58(%rsp),%rsp
++___
++$code.=<<___;
++.Lcbc_ret:
++ ret
++.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
++___
++\f
++# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
++# int bits, AES_KEY *key)
++#
++# Both routines return the value the generated code leaves in %eax: 0 on
++# success, -1 when userKey or key is NULL, -2 when bits is not 128, 192 or
++# 256. The value stored at offset 240 of the key structure is 9, 11 or 13
++# for the three key sizes. The decrypt variant first calls the encrypt key
++# setup, then reverses the order of the round keys in place and applies
++# aesimc to every one except the first and the last.
++#
++# $bits is rewritten to the 32-bit name of the second argument register.
++{ my ($inp,$bits,$key) = @_4args;
++ $bits =~ s/%r/%e/;
++
++$code.=<<___;
++.globl ${PREFIX}_set_decrypt_key
++.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
++.align 16
++${PREFIX}_set_decrypt_key:
++ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
++ call _aesni_set_encrypt_key
++ shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
++ test %eax,%eax
++ jnz .Ldec_key_ret
++ lea 16($key,$bits),$inp # points at the end of key schedule
++
++ $movkey ($key),%xmm0 # just swap
++ $movkey ($inp),%xmm1
++ $movkey %xmm0,($inp)
++ $movkey %xmm1,($key)
++ lea 16($key),$key
++ lea -16($inp),$inp
++
++.Ldec_key_inverse:
++ $movkey ($key),%xmm0 # swap and inverse
++ $movkey ($inp),%xmm1
++ aesimc %xmm0,%xmm0
++ aesimc %xmm1,%xmm1
++ lea 16($key),$key
++ lea -16($inp),$inp
++ cmp $key,$inp
++ $movkey %xmm0,16($inp)
++ $movkey %xmm1,-16($key)
++ ja .Ldec_key_inverse
++
++ $movkey ($key),%xmm0 # inverse middle
++ aesimc %xmm0,%xmm0
++ $movkey %xmm0,($inp)
++.Ldec_key_ret:
++ add \$8,%rsp
++ ret
++.LSEH_end_set_decrypt_key:
++.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
++___
++\f
++# This is based on submission by
++#
++# Huang Ying <ying.huang@intel.com>
++# Vinodh Gopal <vinodh.gopal@intel.com>
++# Kahraman Akdemir
++#
++# Aggressively optimized in respect to aeskeygenassist's critical path
++# and is contained in %xmm0-5 to meet Win64 ABI requirement.
++#
++$code.=<<___;
++.globl ${PREFIX}_set_encrypt_key
++.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
++.align 16
++${PREFIX}_set_encrypt_key:
++_aesni_set_encrypt_key:
++ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
++ test $inp,$inp
++ mov \$-1,%rax
++ jz .Lenc_key_ret
++ test $key,$key
++ jz .Lenc_key_ret
++
++ movups ($inp),%xmm0 # pull first 128 bits of *userKey
++ pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0
++ lea 16($key),%rax
++ cmp \$256,$bits
++ je .L14rounds
++ cmp \$192,$bits
++ je .L12rounds
++ cmp \$128,$bits
++ jne .Lbad_keybits
++
++.L10rounds:
++ mov \$9,$bits # 10 rounds for 128-bit key
++ $movkey %xmm0,($key) # round 0
++ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
++ call .Lkey_expansion_128_cold
++ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
++ call .Lkey_expansion_128
++ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
++ call .Lkey_expansion_128
++ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
++ call .Lkey_expansion_128
++ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
++ call .Lkey_expansion_128
++ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
++ call .Lkey_expansion_128
++ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
++ call .Lkey_expansion_128
++ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
++ call .Lkey_expansion_128
++ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
++ call .Lkey_expansion_128
++ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
++ call .Lkey_expansion_128
++ $movkey %xmm0,(%rax)
++ mov $bits,80(%rax) # 240(%rdx)
++ xor %eax,%eax
++ jmp .Lenc_key_ret
++
++.align 16
++.L12rounds:
++ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
++ mov \$11,$bits # 12 rounds for 192
++ $movkey %xmm0,($key) # round 0
++ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
++ call .Lkey_expansion_192a_cold
++ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
++ call .Lkey_expansion_192b
++ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
++ call .Lkey_expansion_192a
++ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
++ call .Lkey_expansion_192b
++ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
++ call .Lkey_expansion_192a
++ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
++ call .Lkey_expansion_192b
++ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
++ call .Lkey_expansion_192a
++ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
++ call .Lkey_expansion_192b
++ $movkey %xmm0,(%rax)
++ mov $bits,48(%rax) # 240(%rdx)
++ xor %rax, %rax
++ jmp .Lenc_key_ret
++
++.align 16
++.L14rounds:
++ movups 16($inp),%xmm2 # remaning half of *userKey
++ mov \$13,$bits # 14 rounds for 256
++ lea 16(%rax),%rax
++ $movkey %xmm0,($key) # round 0
++ $movkey %xmm2,16($key) # round 1
++ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
++ call .Lkey_expansion_256a_cold
++ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
++ call .Lkey_expansion_256b
++ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
++ call .Lkey_expansion_256a
++ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
++ call .Lkey_expansion_256b
++ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
++ call .Lkey_expansion_256a
++ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
++ call .Lkey_expansion_256b
++ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
++ call .Lkey_expansion_256a
++ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
++ call .Lkey_expansion_256b
++ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
++ call .Lkey_expansion_256a
++ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
++ call .Lkey_expansion_256b
++ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
++ call .Lkey_expansion_256a
++ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
++ call .Lkey_expansion_256b
++ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
++ call .Lkey_expansion_256a
++ $movkey %xmm0,(%rax)
++ mov $bits,16(%rax) # 240(%rdx)
++ xor %rax,%rax
++ jmp .Lenc_key_ret
++
++.align 16
++.Lbad_keybits:
++ mov \$-2,%rax
++.Lenc_key_ret:
++ add \$8,%rsp
++ ret
++.LSEH_end_set_encrypt_key:
++\f
++.align 16
++.Lkey_expansion_128:
++ $movkey %xmm0,(%rax)
++ lea 16(%rax),%rax
++.Lkey_expansion_128_cold:
++ shufps \$0b00010000,%xmm0,%xmm4
++ pxor %xmm4, %xmm0
++ shufps \$0b10001100,%xmm0,%xmm4
++ pxor %xmm4, %xmm0
++ pshufd \$0b11111111,%xmm1,%xmm1 # critical path
++ pxor %xmm1,%xmm0
++ ret
++
++.align 16
++.Lkey_expansion_192a:
++ $movkey %xmm0,(%rax)
++ lea 16(%rax),%rax
++.Lkey_expansion_192a_cold:
++ movaps %xmm2, %xmm5
++.Lkey_expansion_192b_warm:
++ shufps \$0b00010000,%xmm0,%xmm4
++ movaps %xmm2,%xmm3
++ pxor %xmm4,%xmm0
++ shufps \$0b10001100,%xmm0,%xmm4
++ pslldq \$4,%xmm3
++ pxor %xmm4,%xmm0
++ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
++ pxor %xmm3,%xmm2
++ pxor %xmm1,%xmm0
++ pshufd \$0b11111111,%xmm0,%xmm3
++ pxor %xmm3,%xmm2
++ ret
++
++.align 16
++.Lkey_expansion_192b:
++ movaps %xmm0,%xmm3
++ shufps \$0b01000100,%xmm0,%xmm5
++ $movkey %xmm5,(%rax)
++ shufps \$0b01001110,%xmm2,%xmm3
++ $movkey %xmm3,16(%rax)
++ lea 32(%rax),%rax
++ jmp .Lkey_expansion_192b_warm
++
++.align 16
++.Lkey_expansion_256a:
++ $movkey %xmm2,(%rax)
++ lea 16(%rax),%rax
++.Lkey_expansion_256a_cold:
++ shufps \$0b00010000,%xmm0,%xmm4
++ pxor %xmm4,%xmm0
++ shufps \$0b10001100,%xmm0,%xmm4
++ pxor %xmm4,%xmm0
++ pshufd \$0b11111111,%xmm1,%xmm1 # critical path
++ pxor %xmm1,%xmm0
++ ret
++
++.align 16
++.Lkey_expansion_256b:
++ $movkey %xmm0,(%rax)
++ lea 16(%rax),%rax
++
++ shufps \$0b00010000,%xmm2,%xmm4
++ pxor %xmm4,%xmm2
++ shufps \$0b10001100,%xmm2,%xmm4
++ pxor %xmm4,%xmm2
++ pshufd \$0b10101010,%xmm1,%xmm1 # critical path
++ pxor %xmm1,%xmm2
++ ret
++.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
++___
++}
++\f
++# Identification string emitted after the code, followed by an alignment
++# directive for whatever is generated next.
++$code.=<<___;
++.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
++.align 64
++___
++
++# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
++# CONTEXT *context,DISPATCHER_CONTEXT *disp)
++#
++# Win64 only: structured-exception unwind handlers plus the .pdata/.xdata
++# tables that attach them to the generated routines. cbc_se_handler
++# restores %xmm6-9 and the stack pointer when the fault lies between
++# .Lcbc_decrypt_body and .Lcbc_ret, then jumps into the tail it shares
++# with ecb_se_handler (.Lcommon_seh_exit), which copies the context and
++# calls RtlVirtualUnwind. The closing .size directive belongs to
++# ecb_se_handler, the routine that ends there; cbc_se_handler already has
++# its own .size directive further up.
++if ($win64) {
++$rec="%rcx";
++$frame="%rdx";
++$context="%r8";
++$disp="%r9";
++
++$code.=<<___;
++.extern __imp_RtlVirtualUnwind
++.type cbc_se_handler,\@abi-omnipotent
++.align 16
++cbc_se_handler:
++ push %rsi
++ push %rdi
++ push %rbx
++ push %rbp
++ push %r12
++ push %r13
++ push %r14
++ push %r15
++ pushfq
++ sub \$64,%rsp
++
++ mov 152($context),%rax # pull context->Rsp
++ mov 248($context),%rbx # pull context->Rip
++
++ lea .Lcbc_decrypt(%rip),%r10
++ cmp %r10,%rbx # context->Rip<"prologue" label
++ jb .Lin_prologue
++
++ lea .Lcbc_decrypt_body(%rip),%r10
++ cmp %r10,%rbx # context->Rip<cbc_decrypt_body
++ jb .Lrestore_rax
++
++ lea .Lcbc_ret(%rip),%r10
++ cmp %r10,%rbx # context->Rip>="epilogue" label
++ jae .Lin_prologue
++
++ lea 0(%rax),%rsi # top of stack
++ lea 512($context),%rdi # &context.Xmm6
++ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
++ .long 0xa548f3fc # cld; rep movsq
++ lea 0x58(%rax),%rax # adjust stack pointer
++ jmp .Lin_prologue
++
++.Lrestore_rax:
++ mov 120($context),%rax
++.Lin_prologue:
++ mov 8(%rax),%rdi
++ mov 16(%rax),%rsi
++ mov %rax,152($context) # restore context->Rsp
++ mov %rsi,168($context) # restore context->Rsi
++ mov %rdi,176($context) # restore context->Rdi
++
++ jmp .Lcommon_seh_exit
++.size cbc_se_handler,.-cbc_se_handler
++
++.type ecb_se_handler,\@abi-omnipotent
++.align 16
++ecb_se_handler:
++ push %rsi
++ push %rdi
++ push %rbx
++ push %rbp
++ push %r12
++ push %r13
++ push %r14
++ push %r15
++ pushfq
++ sub \$64,%rsp
++
++ mov 152($context),%rax # pull context->Rsp
++ mov 8(%rax),%rdi
++ mov 16(%rax),%rsi
++ mov %rsi,168($context) # restore context->Rsi
++ mov %rdi,176($context) # restore context->Rdi
++
++.Lcommon_seh_exit:
++
++ mov 40($disp),%rdi # disp->ContextRecord
++ mov $context,%rsi # context
++ mov \$154,%ecx # sizeof(CONTEXT)
++ .long 0xa548f3fc # cld; rep movsq
++
++ mov $disp,%rsi
++ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
++ mov 8(%rsi),%rdx # arg2, disp->ImageBase
++ mov 0(%rsi),%r8 # arg3, disp->ControlPc
++ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
++ mov 40(%rsi),%r10 # disp->ContextRecord
++ lea 56(%rsi),%r11 # &disp->HandlerData
++ lea 24(%rsi),%r12 # &disp->EstablisherFrame
++ mov %r10,32(%rsp) # arg5
++ mov %r11,40(%rsp) # arg6
++ mov %r12,48(%rsp) # arg7
++ mov %rcx,56(%rsp) # arg8, (NULL)
++ call *__imp_RtlVirtualUnwind(%rip)
++
++ mov \$1,%eax # ExceptionContinueSearch
++ add \$64,%rsp
++ popfq
++ pop %r15
++ pop %r14
++ pop %r13
++ pop %r12
++ pop %rbp
++ pop %rbx
++ pop %rdi
++ pop %rsi
++ ret
++.size ecb_se_handler,.-ecb_se_handler
++
++.section .pdata
++.align 4
++ .rva .LSEH_begin_${PREFIX}_ecb_encrypt
++ .rva .LSEH_end_${PREFIX}_ecb_encrypt
++ .rva .LSEH_info_ecb
++
++ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
++ .rva .LSEH_end_${PREFIX}_cbc_encrypt
++ .rva .LSEH_info_cbc
++
++ .rva ${PREFIX}_set_decrypt_key
++ .rva .LSEH_end_set_decrypt_key
++ .rva .LSEH_info_key
++
++ .rva ${PREFIX}_set_encrypt_key
++ .rva .LSEH_end_set_encrypt_key
++ .rva .LSEH_info_key
++.section .xdata
++.align 8
++.LSEH_info_ecb:
++ .byte 9,0,0,0
++ .rva ecb_se_handler
++.LSEH_info_cbc:
++ .byte 9,0,0,0
++ .rva cbc_se_handler
++.LSEH_info_key:
++ .byte 0x01,0x04,0x01,0x00
++ .byte 0x04,0x02,0x00,0x00
++___
++}
++
++# rex(\@opcode,$dst,$src): append a REX prefix byte to the instruction
++# bytes being assembled when either register number is 8 or higher.
++# \@opcode - reference to the array of opcode bytes; modified in place
++# $dst - register number that the callers place in bits 3-5 of the
++# ModR/M byte; sets bit 0x04 of the prefix when >= 8
++# $src - register number that the callers place in bits 0-2 of the
++# ModR/M byte; sets bit 0x01 of the prefix when >= 8
++# Nothing is appended when both numbers are below 8. The prefix is built
++# in a lexical and the array is reached through the reference, so no
++# package variables (\$rex, \@opcode) are created or overwritten.
++sub rex {
++ my ($opcode,$dst,$src)=@_;
++
++ if ($dst>=8 || $src>=8) {
++ my $rex=0x40;
++ $rex|=0x04 if($dst>=8);
++ $rex|=0x01 if($src>=8);
++ push @$opcode,$rex;
++ }
++}
++
++# aesni($line): translate one AES-NI instruction with %xmm register
++# operands into an equivalent ".byte" directive, so that the output
++# assembles even when the assembler does not know the mnemonics.
++# Returns the ".byte" line for aeskeygenassist and for the mnemonics in
++# %opcodelet, undef for any other aes* mnemonic with two %xmm operands,
++# and the line unchanged when it matches neither pattern.
++sub aesni {
++ my ($insn)=@_;
++ my @bytes=(0x66); # both encodings start with the 0x66 prefix
++ my %opcodelet = (
++ "aesimc" => 0xdb,
++ "aesenc" => 0xdc, "aesenclast" => 0xdd,
++ "aesdec" => 0xde, "aesdeclast" => 0xdf
++ );
++
++ # three-operand form: immediate, source, destination
++ if ($insn=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
++ my ($imm,$src,$dst)=($2,$3,$4);
++ rex(\@bytes,$dst,$src);
++ push @bytes,0x0f,0x3a,0xdf;
++ push @bytes,0xc0|($src&7)|(($dst&7)<<3); # ModR/M
++ # a leading 0 marks hex/octal notation, anything else is decimal
++ push @bytes,(($imm=~/^0/) ? oct($imm) : $imm);
++ return ".byte\t".join(',',@bytes);
++ }
++
++ # two-operand form: source, destination
++ if ($insn=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
++ my ($mnemonic,$src,$dst)=($1,$2,$3);
++ return undef unless (defined($opcodelet{$mnemonic}));
++ rex(\@bytes,$dst,$src);
++ push @bytes,0x0f,0x38,$opcodelet{$mnemonic};
++ push @bytes,0xc0|($src&7)|(($dst&7)<<3); # ModR/M
++ return ".byte\t".join(',',@bytes);
++ }
++
++ return $insn;
++}
++
++# Post-process the collected assembly: evaluate `...` expressions, then
++# replace every AES-NI instruction line with the output of aesni().
++$code =~ s/\`([^\`]*)\`/eval($1)/gem;
++$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
++
++print $code;
++
++# STDOUT is the pipe to the translator opened at the top of the script, so
++# close() is where a failed write or a non-zero translator exit status
++# becomes visible. Report it instead of exiting successfully with a
++# truncated or missing output file.
++close STDOUT or die "error closing pipe to $xlate: $! (exit status $?)";
+diff -up openssl-1.0.0-beta4/crypto/aes/Makefile.aesni openssl-1.0.0-beta4/crypto/aes/Makefile
+--- openssl-1.0.0-beta4/crypto/aes/Makefile.aesni 2008-12-23 12:33:00.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/aes/Makefile 2010-01-12 22:18:06.000000000 +0100
+@@ -50,9 +50,13 @@ aes-ia64.s: asm/aes-ia64.S
+
+ aes-586.s: asm/aes-586.pl ../perlasm/x86asm.pl
+ $(PERL) asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
++aesni-x86.s: asm/aesni-x86.pl ../perlasm/x86asm.pl
++ $(PERL) asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+
+ aes-x86_64.s: asm/aes-x86_64.pl
+ $(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@
++aesni-x86_64.s: asm/aesni-x86_64.pl
++ $(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@
+
+ aes-sparcv9.s: asm/aes-sparcv9.pl
+ $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
+diff -up openssl-1.0.0-beta4/crypto/engine/eng_aesni.c.aesni openssl-1.0.0-beta4/crypto/engine/eng_aesni.c
+--- openssl-1.0.0-beta4/crypto/engine/eng_aesni.c.aesni 2010-01-12 22:18:06.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/engine/eng_aesni.c 2010-01-12 22:18:06.000000000 +0100
+@@ -0,0 +1,413 @@
++/*
++ * Support for Intel AES-NI instruction set
++ * Author: Huang Ying <ying.huang@intel.com>
++ *
++ * Intel AES-NI is a new set of Single Instruction Multiple Data
++ * (SIMD) instructions that are going to be introduced in the next
++ * generation of Intel processor, as of 2009. These instructions
++ * enable fast and secure data encryption and decryption, using the
++ * Advanced Encryption Standard (AES), defined by FIPS Publication
++ * number 197. The architecture introduces six instructions that
++ * offer full hardware support for AES. Four of them support high
++ * performance data encryption and decryption, and the other two
++ * instructions support the AES key expansion procedure.
++ *
++ * The white paper can be downloaded from:
++ * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
++ *
++ * This file is based on engines/e_padlock.c
++ */
++
++/* ====================================================================
++ * Copyright (c) 1999-2001 The OpenSSL Project. All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in
++ * the documentation and/or other materials provided with the
++ * distribution.
++ *
++ * 3. All advertising materials mentioning features or use of this
++ * software must display the following acknowledgment:
++ * "This product includes software developed by the OpenSSL Project
++ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
++ *
++ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
++ * endorse or promote products derived from this software without
++ * prior written permission. For written permission, please contact
++ * licensing@OpenSSL.org.
++ *
++ * 5. Products derived from this software may not be called "OpenSSL"
++ * nor may "OpenSSL" appear in their names without prior written
++ * permission of the OpenSSL Project.
++ *
++ * 6. Redistributions of any form whatsoever must retain the following
++ * acknowledgment:
++ * "This product includes software developed by the OpenSSL Project
++ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
++ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
++ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
++ * OF THE POSSIBILITY OF SUCH DAMAGE.
++ * ====================================================================
++ *
++ * This product includes cryptographic software written by Eric Young
++ * (eay@cryptsoft.com). This product includes software written by Tim
++ * Hudson (tjh@cryptsoft.com).
++ *
++ */
++
++
++#include <openssl/opensslconf.h>
++
++#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AES_NI) && !defined(OPENSSL_NO_AES)
++
++#include <stdio.h>
++#include "cryptlib.h"
++#include <openssl/dso.h>
++#include <openssl/engine.h>
++#include <openssl/evp.h>
++#include <openssl/aes.h>
++#include <openssl/err.h>
++#include <openssl/modes.h>
++
++/* AES-NI is available *ONLY* on some x86 CPUs. Not only does it
++ not exist elsewhere, the code cannot even be compiled on other
++ platforms! */
++#undef COMPILE_HW_AESNI
++#if (defined(__x86_64) || defined(__x86_64__) || \
++ defined(_M_AMD64) || defined(_M_X64) || \
++ defined(OPENSSL_IA32_SSE2)) && !defined(OPENSSL_NO_ASM)
++#define COMPILE_HW_AESNI
++static ENGINE *ENGINE_aesni (void);
++#endif
++
++/*
++ * Construct the AES-NI engine, add it to the engine list and register it.
++ * The local reference is released afterwards and the error queue is
++ * cleared. When COMPILE_HW_AESNI is not defined (non-x86 builds) the
++ * function body is empty.
++ */
++void ENGINE_load_aesni (void)
++{
++#ifdef COMPILE_HW_AESNI
++    ENGINE *engine = ENGINE_aesni();
++
++    if (engine != NULL) {
++        ENGINE_add (engine);
++        ENGINE_register_complete (engine);
++        ENGINE_free (engine);
++        ERR_clear_error ();
++    }
++#endif
++}
++
++#ifdef COMPILE_HW_AESNI
++int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
++ AES_KEY *key);
++int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
++ AES_KEY *key);
++
++void aesni_encrypt(const unsigned char *in, unsigned char *out,
++ const AES_KEY *key);
++void aesni_decrypt(const unsigned char *in, unsigned char *out,
++ const AES_KEY *key);
++
++void aesni_ecb_encrypt(const unsigned char *in,
++ unsigned char *out,
++ size_t length,
++ const AES_KEY *key,
++ int enc);
++void aesni_cbc_encrypt(const unsigned char *in,
++ unsigned char *out,
++ size_t length,
++ const AES_KEY *key,
++ unsigned char *ivec, int enc);
++
++/* Function for ENGINE detection and control */
++static int aesni_init(ENGINE *e);
++
++/* Cipher Stuff */
++static int aesni_ciphers(ENGINE *e, const EVP_CIPHER **cipher,
++ const int **nids, int nid);
++
++#define AESNI_MIN_ALIGN 16
++/*
++ * Round pointer x up to the next AESNI_MIN_ALIGN boundary (x itself when
++ * it is already aligned). The arithmetic is done in size_t rather than
++ * unsigned long: on LLP64 targets such as Win64, which this file builds
++ * for (see the _M_AMD64/_M_X64 tests above), unsigned long is 32 bits
++ * wide and the cast would truncate the pointer.
++ */
++#define AESNI_ALIGN(x) \
++ ((void *)(((size_t)(x)+AESNI_MIN_ALIGN-1)&~(size_t)(AESNI_MIN_ALIGN-1)))
++
++/* Engine names */
++static const char aesni_id[] = "aesni",
++ aesni_name[] = "Intel AES-NI engine",
++ no_aesni_name[] = "Intel AES-NI engine (no-aesni)";
++
++/* ===== Engine "management" functions ===== */
++
++#if defined(_WIN32)
++typedef unsigned __int64 IA32CAP;
++#else
++typedef unsigned long long IA32CAP;
++#endif
++
++/*
++ * Prepare the ENGINE structure for registration.
++ *
++ * Sets the id, the name and the init callback. The cipher table is only
++ * attached when bit 57 of the capability word is set; otherwise the engine
++ * is given the "(no-aesni)" name and offers no ciphers.
++ * Returns 1 on success, 0 as soon as any ENGINE_set_* call fails.
++ */
++static int
++aesni_bind_helper(ENGINE *e)
++{
++    int have_aesni;
++
++    /* Use the cached capability word if it is wide enough to hold bit 57,
++     * otherwise query the CPU directly. */
++    if (sizeof(OPENSSL_ia32cap_P) > 4)
++        have_aesni = (OPENSSL_ia32cap_P >> 57) & 1;
++    else {
++        IA32CAP OPENSSL_ia32_cpuid(void);
++        have_aesni = (OPENSSL_ia32_cpuid() >> 57) & 1;
++    }
++
++    if (!ENGINE_set_id(e, aesni_id))
++        return 0;
++    if (!ENGINE_set_name(e, have_aesni ? aesni_name : no_aesni_name))
++        return 0;
++    if (!ENGINE_set_init_function(e, aesni_init))
++        return 0;
++    if (have_aesni && !ENGINE_set_ciphers (e, aesni_ciphers))
++        return 0;
++
++    /* Everything looks good */
++    return 1;
++}
++
++/*
++ * Constructor: allocate a new ENGINE and bind the AES-NI settings to it.
++ * Returns the engine, or NULL when allocation or binding fails (the
++ * partly initialised engine is freed in the latter case).
++ */
++static ENGINE *
++ENGINE_aesni(void)
++{
++    ENGINE *eng = ENGINE_new();
++
++    if (eng != NULL && !aesni_bind_helper(eng)) {
++        ENGINE_free(eng);
++        eng = NULL;
++    }
++
++    return eng;
++}
++
++/*
++ * Check availability of the engine.
++ *
++ * Init callback installed by aesni_bind_helper() through
++ * ENGINE_set_init_function(). It performs no checks of its own: the
++ * argument is unused and the function always returns 1.
++ */
++static int
++aesni_init(ENGINE *e)
++{
++ return 1;
++}
++
++#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
++#define NID_aes_128_cfb NID_aes_128_cfb128 /* supply the short name when only the *128 NID exists; same pattern below */
++#endif
++
++#if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb)
++#define NID_aes_128_ofb NID_aes_128_ofb128
++#endif
++
++#if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb)
++#define NID_aes_192_cfb NID_aes_192_cfb128
++#endif
++
++#if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb)
++#define NID_aes_192_ofb NID_aes_192_ofb128
++#endif
++
++#if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb)
++#define NID_aes_256_cfb NID_aes_256_cfb128
++#endif
++
++#if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb)
++#define NID_aes_256_ofb NID_aes_256_ofb128
++#endif
++
++/* NIDs this engine implements; aesni_ciphers() returns the list when no cipher is requested. */
++static int aesni_cipher_nids[] = {
++ NID_aes_128_ecb,
++ NID_aes_128_cbc,
++ NID_aes_128_cfb,
++ NID_aes_128_ofb,
++
++ NID_aes_192_ecb,
++ NID_aes_192_cbc,
++ NID_aes_192_cfb,
++ NID_aes_192_ofb,
++
++ NID_aes_256_ecb,
++ NID_aes_256_cbc,
++ NID_aes_256_cfb,
++ NID_aes_256_ofb,
++};
++static int aesni_cipher_nids_num = /* element count of the table above */
++ (sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0]));
++
++typedef struct
++{
++ AES_KEY ks;
++ unsigned int _pad1[3]; /* presumably slack so AESNI_ALIGN() can round cipher_data up to 16 bytes -- confirm */
++} AESNI_KEY;
++
++/* Expand user_key into the AESNI_ALIGN()ed key schedule that lives in
++ * ctx->cipher_data.  CFB and OFB always get an encryption schedule;
++ * other modes follow enc.  Returns 1, or 0 after raising an EVP error. */
++static int
++aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key,
++ const unsigned char *iv, int enc)
++{
++ AES_KEY *ks = AESNI_ALIGN(ctx->cipher_data);
++ const unsigned long mode = ctx->cipher->flags & EVP_CIPH_MODE;
++ const int bits = ctx->key_len * 8;
++ int rc;
++
++ if (enc || mode == EVP_CIPH_CFB_MODE || mode == EVP_CIPH_OFB_MODE)
++ rc = aesni_set_encrypt_key(user_key, bits, ks);
++ else
++ rc = aesni_set_decrypt_key(user_key, bits, ks);
++ if (rc >= 0)
++ return 1;
++ EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
++ return 0;
++}
++
++static int aesni_cipher_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t inl)
++{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); /* key schedule, address rounded up by AESNI_ALIGN */
++ aesni_ecb_encrypt(in, out, inl, key, ctx->encrypt); /* direction taken from ctx->encrypt */
++ return 1; /* unconditional success */
++}
++static int aesni_cipher_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t inl)
++{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); /* key schedule, address rounded up by AESNI_ALIGN */
++ aesni_cbc_encrypt(in, out, inl, key,
++ ctx->iv, ctx->encrypt); /* IV comes from ctx->iv, direction from ctx->encrypt */
++ return 1; /* unconditional success */
++}
++static int aesni_cipher_cfb(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t inl)
++{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); /* key schedule, address rounded up by AESNI_ALIGN */
++ CRYPTO_cfb128_encrypt(in, out, inl, key, ctx->iv,
++ &ctx->num, ctx->encrypt,
++ (block128_f)aesni_encrypt); /* block function handed to CRYPTO_cfb128_encrypt */
++ return 1; /* unconditional success */
++}
++static int aesni_cipher_ofb(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t inl)
++{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); /* key schedule, address rounded up by AESNI_ALIGN */
++ CRYPTO_ofb128_encrypt(in, out, inl, key, ctx->iv,
++ &ctx->num, (block128_f)aesni_encrypt); /* no direction argument is passed for OFB */
++ return 1; /* unconditional success */
++}
++
++#define AES_BLOCK_SIZE 16
++
++#define EVP_CIPHER_block_size_ECB AES_BLOCK_SIZE /* picked by DECLARE_AES_EVP via EVP_CIPHER_block_size_##umode */
++#define EVP_CIPHER_block_size_CBC AES_BLOCK_SIZE
++#define EVP_CIPHER_block_size_OFB 1 /* OFB and CFB declare a block size of 1 */
++#define EVP_CIPHER_block_size_CFB 1
++
++/* DECLARE_AES_EVP(ksize,lmode,umode) defines the static EVP_CIPHER
++ aesni_<ksize>_<lmode>, wired to aesni_init_key and aesni_cipher_<lmode>. */
++#define DECLARE_AES_EVP(ksize,lmode,umode) \
++static const EVP_CIPHER aesni_##ksize##_##lmode = { \
++ NID_aes_##ksize##_##lmode, \
++ EVP_CIPHER_block_size_##umode, \
++ ksize / 8, \
++ AES_BLOCK_SIZE, \
++ 0 | EVP_CIPH_##umode##_MODE, \
++ aesni_init_key, \
++ aesni_cipher_##lmode, \
++ NULL, \
++ sizeof(AESNI_KEY), \
++ EVP_CIPHER_set_asn1_iv, \
++ EVP_CIPHER_get_asn1_iv, \
++ NULL, \
++ NULL \
++}
++
++DECLARE_AES_EVP(128,ecb,ECB);
++DECLARE_AES_EVP(128,cbc,CBC);
++DECLARE_AES_EVP(128,cfb,CFB);
++DECLARE_AES_EVP(128,ofb,OFB);
++
++DECLARE_AES_EVP(192,ecb,ECB);
++DECLARE_AES_EVP(192,cbc,CBC);
++DECLARE_AES_EVP(192,cfb,CFB);
++DECLARE_AES_EVP(192,ofb,OFB);
++
++DECLARE_AES_EVP(256,ecb,ECB);
++DECLARE_AES_EVP(256,cbc,CBC);
++DECLARE_AES_EVP(256,cfb,CFB);
++DECLARE_AES_EVP(256,ofb,OFB);
++
++static int
++aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
++ const int **nids, int nid)
++{
++ /* Cipher selector.  cipher == NULL: publish the nid table via *nids and return its length ... */
++ if (!cipher) {
++ *nids = aesni_cipher_nids;
++ return aesni_cipher_nids_num;
++ }
++
++ /* ... otherwise store the aesni_* EVP_CIPHER matching nid in *cipher and return 1 */
++ switch (nid) {
++ case NID_aes_128_ecb:
++ *cipher = &aesni_128_ecb;
++ break;
++ case NID_aes_128_cbc:
++ *cipher = &aesni_128_cbc;
++ break;
++ case NID_aes_128_cfb:
++ *cipher = &aesni_128_cfb;
++ break;
++ case NID_aes_128_ofb:
++ *cipher = &aesni_128_ofb;
++ break;
++
++ case NID_aes_192_ecb:
++ *cipher = &aesni_192_ecb;
++ break;
++ case NID_aes_192_cbc:
++ *cipher = &aesni_192_cbc;
++ break;
++ case NID_aes_192_cfb:
++ *cipher = &aesni_192_cfb;
++ break;
++ case NID_aes_192_ofb:
++ *cipher = &aesni_192_ofb;
++ break;
++
++ case NID_aes_256_ecb:
++ *cipher = &aesni_256_ecb;
++ break;
++ case NID_aes_256_cbc:
++ *cipher = &aesni_256_cbc;
++ break;
++ case NID_aes_256_cfb:
++ *cipher = &aesni_256_cfb;
++ break;
++ case NID_aes_256_ofb:
++ *cipher = &aesni_256_ofb;
++ break;
++
++ default:
++ /* Unsupported NID: clear *cipher and return 0 */
++ *cipher = NULL;
++ return 0;
++ }
++
++ return 1; /* *cipher was set by one of the cases above */
++}
++
++#endif /* COMPILE_HW_AESNI */
++#endif /* !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AESNI) && !defined(OPENSSL_NO_AES) */
+diff -up openssl-1.0.0-beta4/crypto/engine/eng_all.c.aesni openssl-1.0.0-beta4/crypto/engine/eng_all.c
+--- openssl-1.0.0-beta4/crypto/engine/eng_all.c.aesni 2010-01-07 23:38:31.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/engine/eng_all.c 2010-01-12 22:18:06.000000000 +0100
+@@ -85,6 +85,9 @@ void ENGINE_load_builtin_engines(void)
+ #if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV))
+ ENGINE_load_cryptodev();
+ #endif
++#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AESNI)
++ ENGINE_load_aesni();
++#endif
+ ENGINE_load_dynamic();
+ #ifndef OPENSSL_NO_STATIC_ENGINE
+ #ifndef OPENSSL_NO_HW
+diff -up openssl-1.0.0-beta4/crypto/engine/engine.h.aesni openssl-1.0.0-beta4/crypto/engine/engine.h
+--- openssl-1.0.0-beta4/crypto/engine/engine.h.aesni 2010-01-07 23:38:30.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/engine/engine.h 2010-01-12 22:18:06.000000000 +0100
+@@ -342,6 +342,7 @@ void ENGINE_load_gost(void);
+ #endif
+ #endif
+ void ENGINE_load_cryptodev(void);
++void ENGINE_load_aesni(void);
+ void ENGINE_load_builtin_engines(void);
+
+ /* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation
+diff -up openssl-1.0.0-beta4/crypto/engine/Makefile.aesni openssl-1.0.0-beta4/crypto/engine/Makefile
+--- openssl-1.0.0-beta4/crypto/engine/Makefile.aesni 2008-06-04 13:01:29.000000000 +0200
++++ openssl-1.0.0-beta4/crypto/engine/Makefile 2010-01-12 22:18:06.000000000 +0100
+@@ -21,12 +21,14 @@ LIBSRC= eng_err.c eng_lib.c eng_list.c e
+ eng_table.c eng_pkey.c eng_fat.c eng_all.c \
+ tb_rsa.c tb_dsa.c tb_ecdsa.c tb_dh.c tb_ecdh.c tb_rand.c tb_store.c \
+ tb_cipher.c tb_digest.c tb_pkmeth.c tb_asnmth.c \
+- eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c
++ eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c \
++ eng_aesni.c
+ LIBOBJ= eng_err.o eng_lib.o eng_list.o eng_init.o eng_ctrl.o \
+ eng_table.o eng_pkey.o eng_fat.o eng_all.o \
+ tb_rsa.o tb_dsa.o tb_ecdsa.o tb_dh.o tb_ecdh.o tb_rand.o tb_store.o \
+ tb_cipher.o tb_digest.o tb_pkmeth.o tb_asnmth.o \
+- eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o
++ eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o \
++ eng_aesni.o
+
+ SRC= $(LIBSRC)
+
+diff -up openssl-1.0.0-beta4/crypto/evp/evp_err.c.aesni openssl-1.0.0-beta4/crypto/evp/evp_err.c
+--- openssl-1.0.0-beta4/crypto/evp/evp_err.c.aesni 2010-01-07 23:38:31.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/evp/evp_err.c 2010-01-12 22:18:06.000000000 +0100
+@@ -1,6 +1,6 @@
+ /* crypto/evp/evp_err.c */
+ /* ====================================================================
+- * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved.
++ * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+@@ -70,6 +70,7 @@
+
+ static ERR_STRING_DATA EVP_str_functs[]=
+ {
++{ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"},
+ {ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"},
+ {ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY), "CAMELLIA_INIT_KEY"},
+ {ERR_FUNC(EVP_F_D2I_PKEY), "D2I_PKEY"},
+@@ -85,7 +86,7 @@ static ERR_STRING_DATA EVP_str_functs[]=
+ {ERR_FUNC(EVP_F_EVP_DIGESTINIT_EX), "EVP_DigestInit_ex"},
+ {ERR_FUNC(EVP_F_EVP_ENCRYPTFINAL_EX), "EVP_EncryptFinal_ex"},
+ {ERR_FUNC(EVP_F_EVP_MD_CTX_COPY_EX), "EVP_MD_CTX_copy_ex"},
+-{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_SIZE"},
++{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_size"},
+ {ERR_FUNC(EVP_F_EVP_OPENINIT), "EVP_OpenInit"},
+ {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD), "EVP_PBE_alg_add"},
+ {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD_TYPE), "EVP_PBE_alg_add_type"},
+diff -up openssl-1.0.0-beta4/crypto/evp/evp.h.aesni openssl-1.0.0-beta4/crypto/evp/evp.h
+--- openssl-1.0.0-beta4/crypto/evp/evp.h.aesni 2010-01-07 23:38:31.000000000 +0100
++++ openssl-1.0.0-beta4/crypto/evp/evp.h 2010-01-12 22:18:06.000000000 +0100
+@@ -1162,6 +1162,7 @@ void ERR_load_EVP_strings(void);
+ /* Error codes for the EVP functions. */
+
+ /* Function codes. */
++#define EVP_F_AESNI_INIT_KEY 163
+ #define EVP_F_AES_INIT_KEY 133
+ #define EVP_F_CAMELLIA_INIT_KEY 159
+ #define EVP_F_D2I_PKEY 100
+diff -up openssl-1.0.0-beta4/test/test_aesni.aesni openssl-1.0.0-beta4/test/test_aesni
+--- openssl-1.0.0-beta4/test/test_aesni.aesni 2010-01-12 22:18:06.000000000 +0100
++++ openssl-1.0.0-beta4/test/test_aesni 2010-01-12 22:18:06.000000000 +0100
+@@ -0,0 +1,69 @@
++#!/bin/sh
++# Usage: test_aesni /path/to/openssl -- encrypt/decrypt round-trip test for the "aesni" engine.
++PROG=$1
++
++if [ -x "$PROG" ]; then
++ VERSION=`"$PROG" version`
++ case "$VERSION" in
++ OpenSSL*) ;;
++ *) echo "$PROG is not OpenSSL executable"
++ exit 1 ;;
++ esac
++else
++ echo "$PROG is not executable"
++ exit 1;
++fi
++# When the engine is missing or reports "no-aesni", print a notice and still exit 0.
++if "$PROG" engine aesni | grep -v no-aesni; then
++ # Reference digest of the test payload (the openssl binary itself).
++ HASH=`cat "$PROG" | "$PROG" dgst -hex`
++
++ AES_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \
++ aes-128-cbc aes-192-cbc aes-256-cbc \
++ aes-128-cfb aes-192-cfb aes-256-cfb \
++ aes-128-ofb aes-192-ofb aes-256-ofb"
++ BUFSIZE="16 32 48 64 80 96 128 144 999"
++
++ nerr=0
++
++ for alg in $AES_ALGS; do
++ echo $alg
++ for bufsize in $BUFSIZE; do # encrypt with -engine aesni, decrypt without it
++ TEST=`( cat "$PROG" | \
++ "$PROG" enc -e -k "$HASH" -$alg -bufsize $bufsize -engine aesni | \
++ "$PROG" enc -d -k "$HASH" -$alg | \
++ "$PROG" dgst -hex ) 2>/dev/null`
++ if [ "$TEST" != "$HASH" ]; then
++ echo "-$alg/$bufsize encrypt test failed"
++ nerr=`expr $nerr + 1`
++ fi
++ done
++ for bufsize in $BUFSIZE; do # encrypt without the engine, decrypt with -engine aesni
++ TEST=`( cat "$PROG" | \
++ "$PROG" enc -e -k "$HASH" -$alg | \
++ "$PROG" enc -d -k "$HASH" -$alg -bufsize $bufsize -engine aesni | \
++ "$PROG" dgst -hex ) 2>/dev/null`
++ if [ "$TEST" != "$HASH" ]; then
++ echo "-$alg/$bufsize decrypt test failed"
++ nerr=`expr $nerr + 1`
++ fi
++ done
++ TEST=`( cat "$PROG" | \
++ "$PROG" enc -e -k "$HASH" -$alg -engine aesni | \
++ "$PROG" enc -d -k "$HASH" -$alg -engine aesni | \
++ "$PROG" dgst -hex ) 2>/dev/null`
++ if [ "$TEST" != "$HASH" ]; then
++ echo "-$alg en/decrypt test failed"
++ nerr=`expr $nerr + 1`
++ fi
++ done
++
++ if [ $nerr -gt 0 ]; then
++ echo "AESNI engine test failed."
++ exit 1;
++ fi
++else
++ echo "AESNI engine is not available"
++fi
++
++exit 0
diff --git a/openssl.spec b/openssl.spec
index 49af8f2..2724b5e 100644
--- a/openssl.spec
+++ b/openssl.spec
@@ -23,7 +23,7 @@
Summary: A general purpose cryptography library with TLS implementation
Name: openssl
Version: 1.0.0
-Release: 0.17.%{beta}%{?dist}
+Release: 0.18.%{beta}%{?dist}
# We remove certain patented algorithms from the openssl source tarball
# with the hobble-openssl script which is included below.
Source: openssl-%{version}-%{beta}-usa.tar.bz2
@@ -72,6 +72,7 @@ Patch65: openssl-1.0.0-beta4-dtls-reneg.patch
Patch66: openssl-1.0.0-beta4-backports2.patch
Patch67: openssl-1.0.0-beta4-reneg-scsv.patch
Patch68: openssl-1.0.0-beta4-tls-comp.patch
+Patch69: openssl-1.0.0-beta4-aesni.patch
License: OpenSSL
Group: System Environment/Libraries
@@ -160,6 +161,7 @@ from other formats to the formats used by the OpenSSL toolkit.
%patch66 -p1 -b .backports2
%patch67 -p1 -b .scsv
%patch68 -p1 -b .tls-comp
+%patch69 -p1 -b .aesni
# Modify the various perl scripts to reference perl in the right location.
perl util/perlpath.pl `dirname %{__perl}`
@@ -408,10 +410,13 @@ rm -rf $RPM_BUILD_ROOT/%{_libdir}/fipscanister.*
%postun -p /sbin/ldconfig
%changelog
+* Wed Jan 13 2010 Tomas Mraz <tmraz@redhat.com> 1.0.0-0.18.beta4
+- add support for Intel AES-NI
+
* Thu Jan 7 2010 Tomas Mraz <tmraz@redhat.com> 1.0.0-0.17.beta4
- upstream fix compression handling on session resumption
- various null checks and other small fixes from upstream
-- upstream changes for the renegotiation info according to the latest draft
+- upstream changes for the renegotiation info according to the latest draft
* Mon Nov 23 2009 Tomas Mraz <tmraz@redhat.com> 1.0.0-0.16.beta4
- fix non-fips mingw build (patch by Kalev Lember)
reply other threads:[~2026-06-09 12:42 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=178100892320.1.15210935346111379515.rpms-openssl-7f0747ce733f@fedoraproject.org \
--to=git-commits@fedoraproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox