From 226ae36af51105cd21a5d2bdcc21e9f4062f14bd Mon Sep 17 00:00:00 2001 From: Nikos Mavrogiannopoulos Date: Sat, 14 Dec 2013 13:00:55 +0100 Subject: [PATCH] Added Mike Hamburg's SSSE3 AES implementation. --- cfg.mk | 50 +- devel/perlasm/aes-ssse3-x86.pl | 902 ++++++++++++++++ devel/perlasm/aes-ssse3-x86.pl.license | 1 + devel/perlasm/aes-ssse3-x86_64.pl | 1206 ++++++++++++++++++++++ devel/perlasm/aes-ssse3-x86_64.pl.license | 1 + devel/perlasm/aesni-x86.pl.license | 1 + devel/perlasm/aesni-x86_64.pl.license | 1 + devel/perlasm/cbc.pl.license | 1 + devel/perlasm/cpuid-x86.pl.license | 1 + devel/perlasm/cpuid-x86_64.pl.license | 1 + devel/perlasm/e_padlock-x86.pl.license | 1 + devel/perlasm/e_padlock-x86_64.pl.license | 1 + devel/perlasm/ghash-x86.pl.license | 1 + devel/perlasm/ghash-x86_64.pl.license | 1 + devel/perlasm/license-gnutls.txt | 3 +- devel/perlasm/license-vpaes.txt | 12 + devel/perlasm/license.txt | 2 +- devel/perlasm/md5-x86_64.pl.license | 1 + devel/perlasm/openssl-cpuid-x86.pl.license | 1 + devel/perlasm/ppc-xlate.pl.license | 1 + devel/perlasm/sha1-ssse3-x86.pl.license | 1 + devel/perlasm/sha1-ssse3-x86_64.pl.license | 1 + devel/perlasm/sha256-ssse3-x86.pl.license | 1 + devel/perlasm/sha512-ssse3-x86.pl.license | 1 + devel/perlasm/sha512-ssse3-x86_64.pl.license | 1 + lib/accelerated/x86/aes-x86.c | 84 +- lib/accelerated/x86/aes-x86.h | 6 + lib/accelerated/x86/coff/aes-ssse3-x86.s | 662 ++++++++++++ lib/accelerated/x86/coff/aes-ssse3-x86_64.s | 1137 ++++++++++++++++++++ lib/accelerated/x86/coff/aesni-x86.s | 2 +- lib/accelerated/x86/coff/aesni-x86_64.s | 2 +- lib/accelerated/x86/coff/cpuid-x86.s | 3 +- lib/accelerated/x86/coff/cpuid-x86_64.s | 3 +- lib/accelerated/x86/coff/e_padlock-x86.s | 2 +- lib/accelerated/x86/coff/e_padlock-x86_64.s | 2 +- lib/accelerated/x86/coff/ghash-x86_64.s | 2 +- lib/accelerated/x86/coff/sha1-ssse3-x86.s | 2 +- lib/accelerated/x86/coff/sha1-ssse3-x86_64.s | 2 +- lib/accelerated/x86/coff/sha256-ssse3-x86.s | 2 +- 
lib/accelerated/x86/coff/sha512-ssse3-x86.s | 2 +- lib/accelerated/x86/coff/sha512-ssse3-x86_64.s | 2 +- lib/accelerated/x86/elf/aes-ssse3-x86.s | 675 ++++++++++++ lib/accelerated/x86/elf/aes-ssse3-x86_64.s | 841 +++++++++++++++ lib/accelerated/x86/elf/aesni-x86.s | 2 +- lib/accelerated/x86/elf/aesni-x86_64.s | 2 +- lib/accelerated/x86/elf/cpuid-x86.s | 3 +- lib/accelerated/x86/elf/cpuid-x86_64.s | 3 +- lib/accelerated/x86/elf/e_padlock-x86.s | 2 +- lib/accelerated/x86/elf/e_padlock-x86_64.s | 2 +- lib/accelerated/x86/elf/ghash-x86_64.s | 2 +- lib/accelerated/x86/elf/sha1-ssse3-x86.s | 2 +- lib/accelerated/x86/elf/sha1-ssse3-x86_64.s | 2 +- lib/accelerated/x86/elf/sha256-ssse3-x86.s | 2 +- lib/accelerated/x86/elf/sha512-ssse3-x86.s | 2 +- lib/accelerated/x86/elf/sha512-ssse3-x86_64.s | 2 +- lib/accelerated/x86/files.mk | 12 +- lib/accelerated/x86/license.txt | 11 + lib/accelerated/x86/macosx/aes-ssse3-x86.s | 649 ++++++++++++ lib/accelerated/x86/macosx/aes-ssse3-x86_64.s | 841 +++++++++++++++ lib/accelerated/x86/macosx/aesni-x86.s | 2 +- lib/accelerated/x86/macosx/aesni-x86_64.s | 2 +- lib/accelerated/x86/macosx/cpuid-x86.s | 3 +- lib/accelerated/x86/macosx/cpuid-x86_64.s | 3 +- lib/accelerated/x86/macosx/e_padlock-x86.s | 2 +- lib/accelerated/x86/macosx/e_padlock-x86_64.s | 2 +- lib/accelerated/x86/macosx/ghash-x86_64.s | 2 +- lib/accelerated/x86/macosx/sha1-ssse3-x86.s | 2 +- lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s | 2 +- lib/accelerated/x86/macosx/sha256-ssse3-x86.s | 2 +- lib/accelerated/x86/macosx/sha512-ssse3-x86.s | 2 +- lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s | 2 +- 71 files changed, 7104 insertions(+), 86 deletions(-) create mode 100644 devel/perlasm/aes-ssse3-x86.pl create mode 120000 devel/perlasm/aes-ssse3-x86.pl.license create mode 100644 devel/perlasm/aes-ssse3-x86_64.pl create mode 120000 devel/perlasm/aes-ssse3-x86_64.pl.license create mode 120000 devel/perlasm/aesni-x86.pl.license create mode 120000 
devel/perlasm/aesni-x86_64.pl.license create mode 120000 devel/perlasm/cbc.pl.license create mode 120000 devel/perlasm/cpuid-x86.pl.license create mode 120000 devel/perlasm/cpuid-x86_64.pl.license create mode 120000 devel/perlasm/e_padlock-x86.pl.license create mode 120000 devel/perlasm/e_padlock-x86_64.pl.license create mode 120000 devel/perlasm/ghash-x86.pl.license create mode 120000 devel/perlasm/ghash-x86_64.pl.license create mode 100644 devel/perlasm/license-vpaes.txt create mode 120000 devel/perlasm/md5-x86_64.pl.license create mode 120000 devel/perlasm/openssl-cpuid-x86.pl.license create mode 120000 devel/perlasm/ppc-xlate.pl.license create mode 120000 devel/perlasm/sha1-ssse3-x86.pl.license create mode 120000 devel/perlasm/sha1-ssse3-x86_64.pl.license create mode 120000 devel/perlasm/sha256-ssse3-x86.pl.license create mode 120000 devel/perlasm/sha512-ssse3-x86.pl.license create mode 120000 devel/perlasm/sha512-ssse3-x86_64.pl.license create mode 100644 lib/accelerated/x86/coff/aes-ssse3-x86.s create mode 100644 lib/accelerated/x86/coff/aes-ssse3-x86_64.s create mode 100644 lib/accelerated/x86/elf/aes-ssse3-x86.s create mode 100644 lib/accelerated/x86/elf/aes-ssse3-x86_64.s create mode 100644 lib/accelerated/x86/macosx/aes-ssse3-x86.s create mode 100644 lib/accelerated/x86/macosx/aes-ssse3-x86_64.s diff --git a/cfg.mk b/cfg.mk index 8959d6e..6350071 100644 --- a/cfg.mk +++ b/cfg.mk @@ -176,6 +176,8 @@ ASM_SOURCES_XXX := \ lib/accelerated/x86/XXX/sha256-ssse3-x86.s \ lib/accelerated/x86/XXX/sha512-ssse3-x86.s \ lib/accelerated/x86/XXX/sha512-ssse3-x86_64.s \ + lib/accelerated/x86/XXX/aes-ssse3-x86.s \ + lib/accelerated/x86/XXX/aes-ssse3-x86_64.s ASM_SOURCES_ELF := $(subst XXX,elf,$(ASM_SOURCES_XXX)) ASM_SOURCES_COFF := $(subst XXX,coff,$(ASM_SOURCES_XXX)) @@ -187,10 +189,10 @@ asm-sources-clean: rm -f $(ASM_SOURCES_ELF) $(ASM_SOURCES_COFF) $(ASM_SOURCES_MACOSX) lib/accelerated/x86/files.mk X86_FILES=XXX/aesni-x86.s XXX/cpuid-x86.s XXX/e_padlock-x86.s 
XXX/sha1-ssse3-x86.s \ - XXX/sha256-ssse3-x86.s XXX/sha512-ssse3-x86.s + XXX/sha256-ssse3-x86.s XXX/sha512-ssse3-x86.s XXX/aes-ssse3-x86.s X86_64_FILES=XXX/aesni-x86_64.s XXX/cpuid-x86_64.s XXX/e_padlock-x86_64.s XXX/ghash-x86_64.s \ - XXX/sha1-ssse3-x86_64.s XXX/sha512-ssse3-x86_64.s + XXX/sha1-ssse3-x86_64.s XXX/sha512-ssse3-x86_64.s XXX/aes-ssse3-x86_64.s X86_FILES_ELF := $(subst XXX,elf,$(X86_FILES)) X86_FILES_COFF := $(subst XXX,coff,$(X86_FILES)) @@ -199,7 +201,7 @@ X86_64_FILES_ELF := $(subst XXX,elf,$(X86_64_FILES)) X86_64_FILES_COFF := $(subst XXX,coff,$(X86_64_FILES)) X86_64_FILES_MACOSX := $(subst XXX,macosx,$(X86_64_FILES)) -lib/accelerated/x86/files.mk: +lib/accelerated/x86/files.mk: $(ASM_SOURCES_ELF) echo X86_FILES_ELF=$(X86_FILES_ELF) > $@.tmp echo X86_FILES_COFF=$(X86_FILES_COFF) >> $@.tmp echo X86_FILES_MACOSX=$(X86_FILES_MACOSX) >> $@.tmp @@ -208,56 +210,30 @@ lib/accelerated/x86/files.mk: echo X86_64_FILES_MACOSX=$(X86_64_FILES_MACOSX) >> $@.tmp mv $@.tmp $@ -# CPUID is handled differently (other license) -lib/accelerated/x86/elf/cpuid-%.s: devel/perlasm/cpuid-%.pl $(objects) - cat devel/perlasm/license-gnutls.txt > $@ - perl $< elf >> $@ - echo "" >> $@ - echo ".section .note.GNU-stack,\"\",%progbits" >> $@ - -lib/accelerated/x86/coff/cpuid-x86.s: devel/perlasm/cpuid-x86.pl $(objects) - cat devel/perlasm/license-gnutls.txt > $@ - perl $< coff >> $@ - echo "" >> $@ - echo ".section .note.GNU-stack,\"\",%progbits" >> $@ - -lib/accelerated/x86/coff/cpuid-x86_64.s: devel/perlasm/cpuid-x86_64.pl $(objects) - cat devel/perlasm/license-gnutls.txt > $@ - perl $< mingw64 >> $@ - echo "" >> $@ - echo ".section .note.GNU-stack,\"\",%progbits" >> $@ - -lib/accelerated/x86/macosx/cpuid-%.s: devel/perlasm/cpuid-%.pl $(objects) - cat devel/perlasm/license-gnutls.txt > $@ - perl $< macosx >> $@ - echo "" >> $@ - echo ".section .note.GNU-stack,\"\",%progbits" >> $@ - - # Appro's code -lib/accelerated/x86/elf/%.s: devel/perlasm/%.pl $(objects) - cat 
devel/perlasm/license.txt > $@ +lib/accelerated/x86/elf/%.s: devel/perlasm/%.pl + cat $^.license > $@ perl $< elf >> $@ echo "" >> $@ echo ".section .note.GNU-stack,\"\",%progbits" >> $@ sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@ -lib/accelerated/x86/coff/%-x86.s: devel/perlasm/%-x86.pl $(objects) - cat devel/perlasm/license.txt > $@ +lib/accelerated/x86/coff/%-x86.s: devel/perlasm/%-x86.pl + cat $^.license > $@ perl $< coff >> $@ echo "" >> $@ echo ".section .note.GNU-stack,\"\",%progbits" >> $@ sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@ -lib/accelerated/x86/coff/%-x86_64.s: devel/perlasm/%-x86_64.pl $(objects) - cat devel/perlasm/license.txt > $@ +lib/accelerated/x86/coff/%-x86_64.s: devel/perlasm/%-x86_64.pl + cat $^.license > $@ perl $< mingw64 >> $@ echo "" >> $@ echo ".section .note.GNU-stack,\"\",%progbits" >> $@ sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@ -lib/accelerated/x86/macosx/%.s: devel/perlasm/%.pl $(objects) - cat devel/perlasm/license.txt > $@ +lib/accelerated/x86/macosx/%.s: devel/perlasm/%.pl + cat $^.license > $@ perl $< macosx >> $@ echo "" >> $@ echo ".section .note.GNU-stack,\"\",%progbits" >> $@ diff --git a/devel/perlasm/aes-ssse3-x86.pl b/devel/perlasm/aes-ssse3-x86.pl new file mode 100644 index 0000000..bacf42c --- /dev/null +++ b/devel/perlasm/aes-ssse3-x86.pl @@ -0,0 +1,902 @@ +#!/usr/bin/env perl + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for +# aes-586.pl. 
"Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-586.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86.pl column - [also +# large-block CBC] encrypt/decrypt. +# +# aes-586.pl vpaes-x86.pl +# +# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) +# Nehalem 27.9/40.4/18.1 10.2/11.9 +# Atom 70.7/92.1/60.1 61.1/75.4(***) +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. +# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +28%/64% improvement on Core 2 +# and +15% on Atom (as implied, over "hyper-threading-safe" +# code path). 
+# +# + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); + +$PREFIX="vpaes"; + +my ($round, $base, $magic, $key, $const, $inp, $out)= + ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); + +&static_label("_vpaes_consts"); +&static_label("_vpaes_schedule_low_round"); + +&set_label("_vpaes_consts",64); +$k_inv=-0x30; # inv, inva + &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); + &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); + +$k_s0F=-0x10; # s0F + &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); + +$k_ipt=0x00; # input transform (lo, hi) + &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); + &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); + +$k_sb1=0x20; # sb1u, sb1t + &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); + &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); +$k_sb2=0x40; # sb2u, sb2t + &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); + &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); +$k_sbo=0x60; # sbou, sbot + &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); + &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); + +$k_mc_forward=0x80; # mc_forward + &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); + &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); + &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); + &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); + +$k_mc_backward=0xc0; # mc_backward + &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); + &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); + &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); + &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); + +$k_sr=0x100; # sr + &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); + &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); + &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); 
+ &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); + +$k_rcon=0x140; # rcon + &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); + +$k_s63=0x150; # s63: all equal to 0x63 transformed + &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); + +$k_opt=0x160; # output transform + &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); + &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); + +$k_deskew=0x180; # deskew tables: inverts the sbox's "skew" + &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); + &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); +## +## Decryption stuff +## Key schedule constants +## +$k_dksd=0x1a0; # decryption key schedule: invskew x*D + &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); + &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); +$k_dksb=0x1c0; # decryption key schedule: invskew x*B + &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); + &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); +$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 + &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); + &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); +$k_dks9=0x200; # decryption key schedule: invskew x*9 + &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); + &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); + +## +## Decryption stuff +## Round function constants +## +$k_dipt=0x220; # decryption input transform + &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); + &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); + +$k_dsb9=0x240; # decryption sbox output *9*u, *9*t + &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); + &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); +$k_dsbd=0x260; # decryption sbox output *D*u, *D*t + &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); + &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); +$k_dsbb=0x280; # decryption sbox output *B*u, *B*t + 
&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); + &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); +$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t + &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); + &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); +$k_dsbo=0x2c0; # decryption sbox final output + &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); + &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); +&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); +&align (64); + +&function_begin_B("_vpaes_preheat"); + &add ($const,&DWP(0,"esp")); + &movdqa ("xmm7",&QWP($k_inv,$const)); + &movdqa ("xmm6",&QWP($k_s0F,$const)); + &ret (); +&function_end_B("_vpaes_preheat"); + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm6-%xmm7 as in _vpaes_preheat +## (%edx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx +## +## +&function_begin_B("_vpaes_encrypt_core"); + &mov ($magic,16); + &mov ($round,&DWP(240,$key)); + &movdqa ("xmm1","xmm6") + &movdqa ("xmm2",&QWP($k_ipt,$const)); + &pandn ("xmm1","xmm0"); + &pand ("xmm0","xmm6"); + &movdqu ("xmm5",&QWP(0,$key)); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP($k_ipt+16,$const)); + &pxor ("xmm2","xmm5"); + &psrld ("xmm1",4); + &add ($key,16); + &pshufb ("xmm0","xmm1"); + &lea ($base,&DWP($k_mc_backward,$const)); + &pxor ("xmm0","xmm2"); + &jmp (&label("enc_entry")); + + +&set_label("enc_loop",16); + # middle of middle round + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t + &pshufb ("xmm4","xmm2"); # 4 = sb1u + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u + &pxor ("xmm0","xmm4"); # 0 = A + &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] + &pshufb ("xmm5","xmm2"); # 4 = sb2u + &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 
: sb2t + &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] + &pshufb ("xmm2","xmm3"); # 2 = sb2t + &movdqa ("xmm3","xmm0"); # 3 = A + &pxor ("xmm2","xmm5"); # 2 = 2A + &pshufb ("xmm0","xmm1"); # 0 = B + &add ($key,16); # next key + &pxor ("xmm0","xmm2"); # 0 = 2A+B + &pshufb ("xmm3","xmm4"); # 3 = D + &add ($magic,16); # next mc + &pxor ("xmm3","xmm0"); # 3 = 2A+B+D + &pshufb ("xmm0","xmm1"); # 0 = 2B+C + &and ($magic,0x30); # ... mod 4 + &sub ($round,1); # nr-- + &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D + +&set_label("enc_entry"); + # top of round + &movdqa ("xmm1","xmm6"); # 1 : i + &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k + &pandn ("xmm1","xmm0"); # 1 = i<<4 + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm6"); # 0 = k + &pshufb ("xmm5","xmm0"); # 2 = a/k + &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pxor ("xmm0","xmm1"); # 0 = j + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &pxor ("xmm2","xmm0"); # 2 = io + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &movdqu ("xmm5",&QWP(0,$key)); + &pxor ("xmm3","xmm1"); # 3 = jo + &jnz (&label("enc_loop")); + + # middle of last round + &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo + &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 + &pshufb ("xmm4","xmm2"); # 4 = sbou + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] + &pxor ("xmm0","xmm4"); # 0 = A + &pshufb ("xmm0","xmm1"); + &ret (); +&function_end_B("_vpaes_encrypt_core"); + +## +## Decryption core +## +## Same API as encryption core. 
+## +&function_begin_B("_vpaes_decrypt_core"); + &lea ($base,&DWP($k_dsbd,$const)); + &mov ($round,&DWP(240,$key)); + &movdqa ("xmm1","xmm6"); + &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); + &pandn ("xmm1","xmm0"); + &mov ($magic,$round); + &psrld ("xmm1",4) + &movdqu ("xmm5",&QWP(0,$key)); + &shl ($magic,4); + &pand ("xmm0","xmm6"); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); + &xor ($magic,0x30); + &pshufb ("xmm0","xmm1"); + &and ($magic,0x30); + &pxor ("xmm2","xmm5"); + &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); + &pxor ("xmm0","xmm2"); + &add ($key,16); + &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); + &jmp (&label("dec_entry")); + +&set_label("dec_loop",16); +## +## Inverse mix columns +## + &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u + &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t + &pshufb ("xmm4","xmm2"); # 4 = sb9u + &pshufb ("xmm1","xmm3"); # 0 = sb9t + &pxor ("xmm0","xmm4"); + &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu + &pxor ("xmm0","xmm1"); # 0 = ch + &movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt + + &pshufb ("xmm4","xmm2"); # 4 = sbdu + &pshufb ("xmm0","xmm5"); # MC ch + &pshufb ("xmm1","xmm3"); # 0 = sbdt + &pxor ("xmm0","xmm4"); # 4 = ch + &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu + &pxor ("xmm0","xmm1"); # 0 = ch + &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt + + &pshufb ("xmm4","xmm2"); # 4 = sbbu + &pshufb ("xmm0","xmm5"); # MC ch + &pshufb ("xmm1","xmm3"); # 0 = sbbt + &pxor ("xmm0","xmm4"); # 4 = ch + &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu + &pxor ("xmm0","xmm1"); # 0 = ch + &movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet + + &pshufb ("xmm4","xmm2"); # 4 = sbeu + &pshufb ("xmm0","xmm5"); # MC ch + &pshufb ("xmm1","xmm3"); # 0 = sbet + &pxor ("xmm0","xmm4"); # 4 = ch + &add ($key,16); # next round key + &palignr("xmm5","xmm5",12); + &pxor ("xmm0","xmm1"); # 0 = ch + &sub ($round,1); # nr-- + +&set_label("dec_entry"); + # top of round + &movdqa ("xmm1","xmm6"); # 1 : i + &movdqa 
("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &pandn ("xmm1","xmm0"); # 1 = i<<4 + &pand ("xmm0","xmm6"); # 0 = k + &psrld ("xmm1",4); # 1 = i + &pshufb ("xmm2","xmm0"); # 2 = a/k + &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pxor ("xmm0","xmm1"); # 0 = j + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &pxor ("xmm2","xmm0"); # 2 = io + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &movdqu ("xmm0",&QWP(0,$key)); + &pxor ("xmm3","xmm1"); # 3 = jo + &jnz (&label("dec_loop")); + + # middle of last round + &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou + &pshufb ("xmm4","xmm2"); # 4 = sbou + &pxor ("xmm4","xmm0"); # 4 = sb1u + k + &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot + &movdqa ("xmm2",&QWP(0,$magic)); + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = A + &pshufb ("xmm0","xmm2"); + &ret (); +&function_end_B("_vpaes_decrypt_core"); + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +&function_begin_B("_vpaes_schedule_core"); + &add ($const,&DWP(0,"esp")); + &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) + &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon + + # input transform + &movdqa ("xmm3","xmm0"); + &lea ($base,&DWP($k_ipt,$const)); + &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 + &call ("_vpaes_schedule_transform"); + &movdqa ("xmm7","xmm0"); + + &test ($out,$out); + &jnz (&label("schedule_am_decrypting")); + + # encrypting, output zeroth round key after transform + &movdqu (&QWP(0,$key),"xmm0"); + &jmp (&label("schedule_go")); + +&set_label("schedule_am_decrypting"); + # decrypting, output zeroth round key after shiftrows + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb 
("xmm3","xmm1"); + &movdqu (&QWP(0,$key),"xmm3"); + &xor ($magic,0x30); + +&set_label("schedule_go"); + &cmp ($round,192); + &ja (&label("schedule_256")); + &je (&label("schedule_192")); + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +&set_label("schedule_128"); + &mov ($round,10); + +&set_label("loop_schedule_128"); + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); # write output + &jmp (&label("loop_schedule_128")); + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +&set_label("schedule_192",16); + &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) + &call ("_vpaes_schedule_transform"); # input transform + &movdqa ("xmm6","xmm0"); # save short part + &pxor ("xmm4","xmm4"); # clear 4 + &movhlps("xmm6","xmm4"); # clobber low side with zeros + &mov ($round,4); + +&set_label("loop_schedule_192"); + &call ("_vpaes_schedule_round"); + &palignr("xmm0","xmm6",8); + &call ("_vpaes_schedule_mangle"); # save key n + &call ("_vpaes_schedule_192_smear"); + &call ("_vpaes_schedule_mangle"); # save key n+1 + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); # save key n+2 + &call ("_vpaes_schedule_192_smear"); + &jmp (&label("loop_schedule_192")); + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. 
+## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +&set_label("schedule_256",16); + &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) + &call ("_vpaes_schedule_transform"); # input transform + &mov ($round,7); + +&set_label("loop_schedule_256"); + &call ("_vpaes_schedule_mangle"); # output low result + &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 + + # high round + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); + + # low round. swap xmm7 and xmm6 + &pshufd ("xmm0","xmm0",0xFF); + &movdqa (&QWP(20,"esp"),"xmm7"); + &movdqa ("xmm7","xmm6"); + &call ("_vpaes_schedule_low_round"); + &movdqa ("xmm7",&QWP(20,"esp")); + + &jmp (&label("loop_schedule_256")); + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... 
jumps to cleanup and exits +## +&set_label("schedule_mangle_last",16); + # schedule last round key from xmm0 + &lea ($base,&DWP($k_deskew,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_last_dec")); + + # encrypting + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm0","xmm1"); # output permute + &lea ($base,&DWP($k_opt,$const)); # prepare to output transform + &add ($key,32); + +&set_label("schedule_mangle_last_dec"); + &add ($key,-16); + &pxor ("xmm0",&QWP($k_s63,$const)); + &call ("_vpaes_schedule_transform"); # output transform + &movdqu (&QWP(0,$key),"xmm0"); # save last key + + # cleanup + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); + &ret (); +&function_end_B("_vpaes_schedule_core"); + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +&function_begin_B("_vpaes_schedule_192_smear"); + &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0 + &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a + &pxor ("xmm6","xmm1"); # -> c+d c 0 0 + &pxor ("xmm1","xmm1"); + &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a + &movdqa ("xmm0","xmm6"); + &movhlps("xmm6","xmm1"); # clobber low side with zeros + &ret (); +&function_end_B("_vpaes_schedule_192_smear"); + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. 
+## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm5. +## +&function_begin_B("_vpaes_schedule_round"); + # extract rcon from xmm8 + &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 + &pxor ("xmm1","xmm1"); + &palignr("xmm1","xmm2",15); + &palignr("xmm2","xmm2",15); + &pxor ("xmm7","xmm1"); + + # rotate + &pshufd ("xmm0","xmm0",0xFF); + &palignr("xmm0","xmm0",1); + + # fall through... + &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 + + # low round: same as high round, but no rotation and no rcon. +&set_label("_vpaes_schedule_low_round"); + # smear xmm7 + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",4); + &pxor ("xmm7","xmm1"); + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",8); + &pxor ("xmm7","xmm1"); + &pxor ("xmm7",&QWP($k_s63,$const)); + + # subbyte + &movdqa ("xmm4",&QWP($k_s0F,$const)); + &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j + &movdqa ("xmm1","xmm4"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm4"); # 0 = k + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm2","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm5"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm5"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm5"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm5"); # 3 : 1/jak + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou + &pshufb ("xmm4","xmm2"); # 4 = sbou + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = sbox output + + # add in smeared stuff + &pxor ("xmm0","xmm7"); + &movdqa ("xmm7","xmm0"); + &ret (); +&function_end_B("_vpaes_schedule_round"); + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%ebx) 
+## +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +&function_begin_B("_vpaes_schedule_transform"); + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); + &pand ("xmm0","xmm2"); + &movdqa ("xmm2",&QWP(0,$base)); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP(16,$base)); + &pshufb ("xmm0","xmm1"); + &pxor ("xmm0","xmm2"); + &ret (); +&function_end_B("_vpaes_schedule_transform"); + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. +## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%edx), and increments or decrements it +## Keeps track of round number mod 4 in %ecx +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +&function_begin_B("_vpaes_schedule_mangle"); + &movdqa ("xmm4","xmm0"); # save xmm0 for later + &movdqa ("xmm5",&QWP($k_mc_forward,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_dec")); + + # encrypting + &add ($key,16); + &pxor ("xmm4",&QWP($k_s63,$const)); + &pshufb ("xmm4","xmm5"); + &movdqa ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + + &jmp (&label("schedule_mangle_both")); + +&set_label("schedule_mangle_dec",16); + # inverse mix columns + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &lea ($inp,&DWP($k_dksd,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm4"); + &psrld ("xmm1",4); # 1 = hi + &pand ("xmm4","xmm2"); # 4 = lo + + &movdqa ("xmm2",&QWP(0,$inp)); + &pshufb ("xmm2","xmm4"); + &movdqa ("xmm3",&QWP(0x10,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x20,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x30,$inp)); + 
&pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x40,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x50,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x60,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x70,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + + &add ($key,-16); + +&set_label("schedule_mangle_both"); + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &add ($magic,-16); + &and ($magic,0x30); + &movdqu (&QWP(0,$key),"xmm3"); + &ret (); +&function_end_B("_vpaes_schedule_mangle"); + +# +# Interface to OpenSSL +# +&function_begin("${PREFIX}_set_encrypt_key"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($round,&wparam(1)); # bits + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &mov ($base,$round); + &shr ($base,5); + &add ($base,5); + &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; + &mov ($magic,0x30); + &mov ($out,0); + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_schedule_core"); +&set_label("pic_point"); + + &mov ("esp",&DWP(48,"esp")); + &xor ("eax","eax"); +&function_end("${PREFIX}_set_encrypt_key"); + +&function_begin("${PREFIX}_set_decrypt_key"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($round,&wparam(1)); # bits + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &mov ($base,$round); + &shr ($base,5); + &add ($base,5); + &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; + &shl ($base,4); + &lea ($key,&DWP(16,$key,$base)); + + &mov ($out,1); + &mov ($magic,$round); + &shr ($magic,1); + &and ($magic,32); + &xor ($magic,32); # nbist==192?0:32; + + &lea 
($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_schedule_core"); +&set_label("pic_point"); + + &mov ("esp",&DWP(48,"esp")); + &xor ("eax","eax"); +&function_end("${PREFIX}_set_decrypt_key"); + +&function_begin("${PREFIX}_encrypt"); + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($out,&wparam(1)); # out + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); # save original esp + + &movdqu ("xmm0",&QWP(0,$inp)); # load one input block + &call ("_vpaes_encrypt_core"); + &movdqu (&QWP(0,$out),"xmm0"); # store the result block + + &mov ("esp",&DWP(48,"esp")); # restore original esp +&function_end("${PREFIX}_encrypt"); + +&function_begin("${PREFIX}_decrypt"); + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($out,&wparam(1)); # out + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); # save original esp + + &movdqu ("xmm0",&QWP(0,$inp)); # load one input block + &call ("_vpaes_decrypt_core"); + &movdqu (&QWP(0,$out),"xmm0"); # store the result block + + &mov ("esp",&DWP(48,"esp")); # restore original esp +&function_end("${PREFIX}_decrypt"); + +&function_begin("${PREFIX}_cbc_encrypt"); + &mov ($inp,&wparam(0)); # inp + &mov ($out,&wparam(1)); # out + &mov ($round,&wparam(2)); # len + &mov ($key,&wparam(3)); # key + &sub ($round,16); # len -= 16; carry is set when len < 16 + &jc (&label("cbc_abort")); # less than one full block: do nothing + &lea ($base,&DWP(-56,"esp")); + &mov ($const,&wparam(4)); # ivp + &and ($base,-16); + &mov ($magic,&wparam(5)); # enc + &xchg ($base,"esp"); # alloca + &movdqu ("xmm1",&QWP(0,$const)); # load IV + &sub ($out,$inp); # out -= inp: output is addressed relative to $inp in the loops + &mov (&DWP(48,"esp"),$base); # save original esp + + &mov (&DWP(0,"esp"),$out); # save out + &mov (&DWP(4,"esp"),$key); # save key + &mov (&DWP(8,"esp"),$const); # save ivp + &mov ($out,$round); # $out works as $len + + &lea
($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &cmp ($magic,0); + &je (&label("cbc_dec_loop")); + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_enc_loop",16); + &movdqu ("xmm0",&QWP(0,$inp)); # load input + &pxor ("xmm0","xmm1"); # inp^=iv + &call ("_vpaes_encrypt_core"); + &mov ($base,&DWP(0,"esp")); # restore out + &mov ($key,&DWP(4,"esp")); # restore key + &movdqa ("xmm1","xmm0"); + &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output + &lea ($inp,&DWP(16,$inp)); + &sub ($out,16); + &jnc (&label("cbc_enc_loop")); + &jmp (&label("cbc_done")); + +&set_label("cbc_dec_loop",16); + &movdqu ("xmm0",&QWP(0,$inp)); # load input + &movdqa (&QWP(16,"esp"),"xmm1"); # save IV + &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV + &call ("_vpaes_decrypt_core"); + &mov ($base,&DWP(0,"esp")); # restore out + &mov ($key,&DWP(4,"esp")); # restore key + &pxor ("xmm0",&QWP(16,"esp")); # out^=iv + &movdqa ("xmm1",&QWP(32,"esp")); # load next IV + &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output + &lea ($inp,&DWP(16,$inp)); + &sub ($out,16); + &jnc (&label("cbc_dec_loop")); + +&set_label("cbc_done"); + &mov ($base,&DWP(8,"esp")); # restore ivp + &mov ("esp",&DWP(48,"esp")); + &movdqu (&QWP(0,$base),"xmm1"); # write IV +&set_label("cbc_abort"); +&function_end("${PREFIX}_cbc_encrypt"); + +&asm_finish(); diff --git a/devel/perlasm/aes-ssse3-x86.pl.license b/devel/perlasm/aes-ssse3-x86.pl.license new file mode 120000 index 0000000..155c831 --- /dev/null +++ b/devel/perlasm/aes-ssse3-x86.pl.license @@ -0,0 +1 @@ +license-vpaes.txt \ No newline at end of file diff --git a/devel/perlasm/aes-ssse3-x86_64.pl b/devel/perlasm/aes-ssse3-x86_64.pl new file mode 100644 index 0000000..212394b --- /dev/null +++ b/devel/perlasm/aes-ssse3-x86_64.pl @@ -0,0 +1,1206 @@ +#!/usr/bin/env perl + +###################################################################### +## Constant-time SSSE3 AES core implementation. 
+## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Interface to OpenSSL as "almost" drop-in replacement for +# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-x86_64.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86_64.pl column - +# [also large-block CBC] encrypt/decrypt. +# +# aes-x86_64.pl vpaes-x86_64.pl +# +# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) +# Nehalem 29.6/40.3/14.6 10.0/11.8 +# Atom 57.3/74.2/32.1 60.9/77.2(***) +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. +# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +36%/62% improvement on Core 2 +# (as implied, over "hyper-threading-safe" code path). 
+# +# + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$PREFIX="vpaes"; + +$code.=<<___; +.text + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,\@abi-omnipotent +.align 16 +_vpaes_encrypt_core: + mov %rdx, %r9 + mov \$16, %r11 + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa .Lk_ipt(%rip), %xmm2 # iptlo + pandn %xmm0, %xmm1 + movdqu (%r9), %xmm5 # round0 key + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + pshufb %xmm0, %xmm2 + movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi + pshufb %xmm1, %xmm0 + pxor %xmm5, %xmm2 + add \$16, %r9 + pxor %xmm2, %xmm0 + lea .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + # middle of middle round + movdqa %xmm13, %xmm4 # 4 : sb1u + movdqa %xmm12, %xmm0 # 0 : sb1t + pshufb %xmm2, %xmm4 # 4 = sb1u + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm5, %xmm4 # 4 = sb1u + k + movdqa %xmm15, %xmm5 # 4 : sb2u + pxor %xmm4, %xmm0 # 0 = A + movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + pshufb %xmm2, %xmm5 # 4 = sb2u + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + movdqa %xmm14, %xmm2 # 2 : sb2t + pshufb %xmm3, %xmm2 # 2 = sb2t + movdqa %xmm0, %xmm3 # 3 = A + pxor %xmm5, %xmm2 # 2 = 2A + pshufb %xmm1, %xmm0 # 0 = B + add \$16, %r9 # next key + pxor %xmm2, %xmm0 # 0 = 2A+B + pshufb %xmm4, %xmm3 # 3 = D + add \$16, %r11 # next mc + pxor %xmm0, %xmm3 # 3 = 2A+B+D + pshufb 
%xmm1, %xmm0 # 0 = 2B+C + and \$0x30, %r11 # ... mod 4 + sub \$1,%rax # nr-- + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D + +.Lenc_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + movdqa %xmm11, %xmm5 # 2 : a/k + pandn %xmm0, %xmm1 # 1 = i<<4 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + pshufb %xmm0, %xmm5 # 2 = a/k + movdqa %xmm10, %xmm3 # 3 : 1/i + pxor %xmm1, %xmm0 # 0 = j + pshufb %xmm1, %xmm3 # 3 = 1/i + movdqa %xmm10, %xmm4 # 4 : 1/j + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k + pshufb %xmm0, %xmm4 # 4 = 1/j + movdqa %xmm10, %xmm2 # 2 : 1/iak + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k + pshufb %xmm3, %xmm2 # 2 = 1/iak + movdqa %xmm10, %xmm3 # 3 : 1/jak + pxor %xmm0, %xmm2 # 2 = io + pshufb %xmm4, %xmm3 # 3 = 1/jak + movdqu (%r9), %xmm5 + pxor %xmm1, %xmm3 # 3 = jo + jnz .Lenc_loop + + # middle of last round + movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + pshufb %xmm2, %xmm4 # 4 = sbou + pxor %xmm5, %xmm4 # 4 = sb1u + k + pshufb %xmm3, %xmm0 # 0 = sb1t + movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + pxor %xmm4, %xmm0 # 0 = A + pshufb %xmm1, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +## +## Decryption core +## +## Same API as encryption core. 
+## +.type _vpaes_decrypt_core,\@abi-omnipotent +.align 16 +_vpaes_decrypt_core: + mov %rdx, %r9 # load key + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa .Lk_dipt(%rip), %xmm2 # iptlo + pandn %xmm0, %xmm1 + mov %rax, %r11 + psrld \$4, %xmm1 + movdqu (%r9), %xmm5 # round0 key + shl \$4, %r11 + pand %xmm9, %xmm0 + pshufb %xmm0, %xmm2 + movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi + xor \$0x30, %r11 + lea .Lk_dsbd(%rip),%r10 + pshufb %xmm1, %xmm0 + and \$0x30, %r11 + pxor %xmm5, %xmm2 + movdqa .Lk_mc_forward+48(%rip), %xmm5 + pxor %xmm2, %xmm0 + add \$16, %r9 + add %r10, %r11 + jmp .Ldec_entry + +.align 16 +.Ldec_loop: +## +## Inverse mix columns +## + movdqa -0x20(%r10),%xmm4 # 4 : sb9u + movdqa -0x10(%r10),%xmm1 # 0 : sb9t + pshufb %xmm2, %xmm4 # 4 = sb9u + pshufb %xmm3, %xmm1 # 0 = sb9t + pxor %xmm4, %xmm0 + movdqa 0x00(%r10),%xmm4 # 4 : sbdu + pxor %xmm1, %xmm0 # 0 = ch + movdqa 0x10(%r10),%xmm1 # 0 : sbdt + + pshufb %xmm2, %xmm4 # 4 = sbdu + pshufb %xmm5, %xmm0 # MC ch + pshufb %xmm3, %xmm1 # 0 = sbdt + pxor %xmm4, %xmm0 # 4 = ch + movdqa 0x20(%r10),%xmm4 # 4 : sbbu + pxor %xmm1, %xmm0 # 0 = ch + movdqa 0x30(%r10),%xmm1 # 0 : sbbt + + pshufb %xmm2, %xmm4 # 4 = sbbu + pshufb %xmm5, %xmm0 # MC ch + pshufb %xmm3, %xmm1 # 0 = sbbt + pxor %xmm4, %xmm0 # 4 = ch + movdqa 0x40(%r10),%xmm4 # 4 : sbeu + pxor %xmm1, %xmm0 # 0 = ch + movdqa 0x50(%r10),%xmm1 # 0 : sbet + + pshufb %xmm2, %xmm4 # 4 = sbeu + pshufb %xmm5, %xmm0 # MC ch + pshufb %xmm3, %xmm1 # 0 = sbet + pxor %xmm4, %xmm0 # 4 = ch + add \$16, %r9 # next round key + palignr \$12, %xmm5, %xmm5 + pxor %xmm1, %xmm0 # 0 = ch + sub \$1,%rax # nr-- + +.Ldec_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + pandn %xmm0, %xmm1 # 1 = i<<4 + movdqa %xmm11, %xmm2 # 2 : a/k + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + pshufb %xmm0, %xmm2 # 2 = a/k + movdqa %xmm10, %xmm3 # 3 : 1/i + pxor %xmm1, %xmm0 # 0 = j + pshufb %xmm1, %xmm3 # 3 = 1/i + movdqa %xmm10, %xmm4 # 4 : 1/j + pxor %xmm2, %xmm3 # 3 = iak = 1/i 
+ a/k + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + movdqa %xmm10, %xmm3 # 3 : 1/jak + pxor %xmm0, %xmm2 # 2 = io + pshufb %xmm4, %xmm3 # 3 = 1/jak + movdqu (%r9), %xmm0 + pxor %xmm1, %xmm3 # 3 = jo + jnz .Ldec_loop + + # middle of last round + movdqa 0x60(%r10), %xmm4 # 4 : sbou + pshufb %xmm2, %xmm4 # 4 = sbou + pxor %xmm0, %xmm4 # 4 = sb1u + k + movdqa 0x70(%r10), %xmm0 # 0 : sbot + movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = A + pshufb %xmm2, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_schedule_core,\@abi-omnipotent +.align 16 +_vpaes_schedule_core: + # rdi = key + # rsi = size in bits + # rdx = buffer + # rcx = direction. 0=encrypt, 1=decrypt + + call _vpaes_preheat # load the tables + movdqa .Lk_rcon(%rip), %xmm8 # load rcon + movdqu (%rdi), %xmm0 # load key (unaligned) + + # input transform + movdqa %xmm0, %xmm3 # keep the untransformed key; the decrypt path below stores it + lea .Lk_ipt(%rip), %r11 + call _vpaes_schedule_transform + movdqa %xmm0, %xmm7 + + lea .Lk_sr(%rip),%r10 + test %rcx, %rcx + jnz .Lschedule_am_decrypting + + # encrypting, output zeroth round key after transform + movdqu %xmm0, (%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + # decrypting, output zeroth round key after shiftrows + movdqa (%r8,%r10),%xmm1 # r8 = caller-supplied offset into .Lk_sr (r10) + pshufb %xmm1, %xmm3 + movdqu %xmm3, (%rdx) + xor \$0x30, %r8 + +.Lschedule_go: + cmp \$192, %esi + ja .Lschedule_256 + je .Lschedule_192 + # 128: fall through + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines.
+## +.Lschedule_128: + mov \$10, %esi + +.Loop_schedule_128: + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle # write output + jmp .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 16 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + call _vpaes_schedule_transform # input transform + movdqa %xmm0, %xmm6 # save short part + pxor %xmm4, %xmm4 # clear 4 + movhlps %xmm4, %xmm6 # clobber low side with zeros + mov \$4, %esi + +.Loop_schedule_192: + call _vpaes_schedule_round + palignr \$8,%xmm6,%xmm0 + call _vpaes_schedule_mangle # save key n + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle # save key n+1 + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle # save key n+2 + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. 
+## +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + call _vpaes_schedule_transform # input transform + mov \$7, %esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle # output low result + movdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + # high round + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + # low round. swap xmm7 and xmm6 + pshufd \$0xFF, %xmm0, %xmm0 + movdqa %xmm7, %xmm5 + movdqa %xmm6, %xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5, %xmm7 + + jmp .Loop_schedule_256 + + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 16 +.Lschedule_mangle_last: + # schedule last round key from xmm0 + lea .Lk_deskew(%rip),%r11 # prepare to deskew + test %rcx, %rcx + jnz .Lschedule_mangle_last_dec + + # encrypting + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1, %xmm0 # output permute + lea .Lk_opt(%rip), %r11 # prepare to output transform + add \$32, %rdx + +.Lschedule_mangle_last_dec: + add \$-16, %rdx + pxor .Lk_s63(%rip), %xmm0 + call _vpaes_schedule_transform # output transform + movdqu %xmm0, (%rdx) # save last key + + # cleanup + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. 
+## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,\@abi-omnipotent +.align 16 +_vpaes_schedule_192_smear: + pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + pxor %xmm1, %xmm6 # -> c+d c 0 0 + pxor %xmm1, %xmm1 + pxor %xmm0, %xmm6 # -> b+c+d b+c b a + movdqa %xmm6, %xmm0 + movhlps %xmm1, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,\@abi-omnipotent +.align 16 +_vpaes_schedule_round: + # extract rcon from xmm8 + pxor %xmm1, %xmm1 + palignr \$15, %xmm8, %xmm1 + palignr \$15, %xmm8, %xmm8 + pxor %xmm1, %xmm7 + + # rotate + pshufd \$0xFF, %xmm0, %xmm0 + palignr \$1, %xmm0, %xmm0 + + # fall through... + + # low round: same as high round, but no rotation and no rcon. 
+_vpaes_schedule_low_round: + # smear xmm7 + movdqa %xmm7, %xmm1 + pslldq \$4, %xmm7 + pxor %xmm1, %xmm7 + movdqa %xmm7, %xmm1 + pslldq \$8, %xmm7 + pxor %xmm1, %xmm7 + pxor .Lk_s63(%rip), %xmm7 + + # subbytes + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm2 # 2 : a/k + pshufb %xmm0, %xmm2 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + movdqa %xmm13, %xmm4 # 4 : sbou + pshufb %xmm2, %xmm4 # 4 = sbou + movdqa %xmm12, %xmm0 # 0 : sbot + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = sbox output + + # add in smeared stuff + pxor %xmm7, %xmm0 + movdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,\@abi-omnipotent +.align 16 +_vpaes_schedule_transform: + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + movdqa (%r11), %xmm2 # lo + pshufb %xmm0, %xmm2 + movdqa 16(%r11), %xmm0 # hi + pshufb %xmm1, %xmm0 + pxor %xmm2, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,\@abi-omnipotent +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0, %xmm4 # save xmm0 for later + movdqa .Lk_mc_forward(%rip),%xmm5 + test %rcx, %rcx + jnz .Lschedule_mangle_dec + + # encrypting + add \$16, %rdx + pxor .Lk_s63(%rip),%xmm4 + pshufb %xmm5, %xmm4 + movdqa %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + + jmp .Lschedule_mangle_both +.align 16 +.Lschedule_mangle_dec: + # inverse mix columns + lea .Lk_dksd(%rip),%r11 + movdqa %xmm9, %xmm1 + pandn %xmm4, %xmm1 + psrld \$4, %xmm1 # 1 = hi + pand %xmm9, %xmm4 # 4 = lo + + movdqa 0x00(%r11), %xmm2 + pshufb %xmm4, %xmm2 + movdqa 0x10(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x20(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x30(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x40(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x50(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x60(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x70(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + + add \$-16, %rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1,%xmm3 + add \$-16, %r8 + and \$0x30, %r8 + movdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +# +# Interface to OpenSSL +# +.globl ${PREFIX}_set_encrypt_key +.type ${PREFIX}_set_encrypt_key,\@function,3 +.align 16 +${PREFIX}_set_encrypt_key: +___ +$code.=<<___ if 
($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lenc_key_body: +___ +$code.=<<___; + mov %esi,%eax + shr \$5,%eax + add \$5,%eax + mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov \$0,%ecx + mov \$0x30,%r8d + call _vpaes_schedule_core +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lenc_key_epilogue: +___ +$code.=<<___; + xor %eax,%eax + ret +.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key + +.globl ${PREFIX}_set_decrypt_key +.type ${PREFIX}_set_decrypt_key,\@function,3 +.align 16 +${PREFIX}_set_decrypt_key: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Ldec_key_body: +___ +$code.=<<___; + mov %esi,%eax + shr \$5,%eax + add \$5,%eax + mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + shl \$4,%eax + lea 16(%rdx,%rax),%rdx + + mov \$1,%ecx + mov %esi,%r8d + shr \$1,%r8d + and \$32,%r8d + xor \$32,%r8d # nbits==192?0:32 + call _vpaes_schedule_core +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 
0xb8(%rsp),%rsp +.Ldec_key_epilogue: +___ +$code.=<<___; + xor %eax,%eax + ret +.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key + +.globl ${PREFIX}_encrypt +.type ${PREFIX}_encrypt,\@function,3 +.align 16 +${PREFIX}_encrypt: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lenc_body: +___ +$code.=<<___; + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lenc_epilogue: +___ +$code.=<<___; + ret +.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt + +.globl ${PREFIX}_decrypt +.type ${PREFIX}_decrypt,\@function,3 +.align 16 +${PREFIX}_decrypt: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Ldec_body: +___ +$code.=<<___; + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Ldec_epilogue: +___ +$code.=<<___; + ret 
+.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt +___ +{ +my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); +# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +$code.=<<___; +.globl ${PREFIX}_cbc_encrypt +.type ${PREFIX}_cbc_encrypt,\@function,6 +.align 16 +${PREFIX}_cbc_encrypt: + xchg $key,$len # key -> %rdx (where the cores read the schedule), len -> %rcx +___ +($len,$key)=($key,$len); +$code.=<<___; + sub \$16,$len + jc .Lcbc_abort # fewer than 16 bytes: nothing to do +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lcbc_body: +___ +$code.=<<___; + movdqu ($ivp),%xmm6 # load IV + sub $inp,$out # out -= inp; stores below use (out,inp) addressing + call _vpaes_preheat + cmp \$0,${enc}d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movdqu ($inp),%xmm0 + pxor %xmm6,%xmm0 # input block ^= IV + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 # ciphertext becomes the next IV + movdqu %xmm0,($out,$inp) + lea 16($inp),$inp + sub \$16,$len + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.align 16 +.Lcbc_dec_loop: + movdqu ($inp),%xmm0 + movdqa %xmm0,%xmm7 # keep the ciphertext block as the next IV + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 # decrypted block ^= IV + movdqa %xmm7,%xmm6 + movdqu %xmm0,($out,$inp) + lea 16($inp),$inp + sub \$16,$len + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,($ivp) # save IV +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lcbc_epilogue: +___ +$code.=<<___; +.Lcbc_abort: + ret +.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt +___ +} +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_preheat,\@abi-omnipotent +.align 16 +_vpaes_preheat: + lea .Lk_s0F(%rip), %r10 + movdqa -0x20(%r10), %xmm10 # .Lk_inv + movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 + movdqa 0x00(%r10), %xmm9 # .Lk_s0F + movdqa 0x30(%r10), %xmm13 # .Lk_sb1 + movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 + movdqa 0x50(%r10), %xmm15 # .Lk_sb2 + movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 + ret +.size _vpaes_preheat,.-_vpaes_preheat +######################################################## +## ## +## Constants ## +## ## +######################################################## +.type _vpaes_consts,\@object +.align 64 +_vpaes_consts: +.Lk_inv: # inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: # s0F + .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: # input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: # sb1u, sb1t + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: # sb2u, sb2t + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: # sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: # mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward:# mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: # sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: # rcon + 
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: # s63: all equal to 0x63 transformed + .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: # output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: # deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +## +## Decryption stuff +## Key schedule constants +## +.Lk_dksd: # decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: # decryption key schedule: invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: # decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: # decryption key schedule: invskew x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +## +## Decryption stuff +## Round function constants +## +.Lk_dipt: # decryption input transform + .quad 0x0F505B040B545F00, 0x154A411E114E451A + .quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: # decryption sbox output *9*u, *9*t + .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 + .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: # decryption sbox output *D*u, *D*t + .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 + .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: # decryption sbox output *B*u, *B*t + .quad 0xD022649296B44200, 0x602646F6B0F2D404 + .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: # decryption sbox output *E*u, *E*t + .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 + .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: # decryption sbox final output + .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D + .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.asciz "Vector Permutaion AES for 
x86_64/SSSE3, Mike Hamburg (Stanford University)" +.align 64 +.size _vpaes_consts,.-_vpaes_consts +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp # scratch area; arg5..arg8 of RtlVirtualUnwind go at 32..56(%rsp) + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 16(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xb8(%rax),%rax # adjust stack pointer + +.Lin_prologue: + mov 8(%rax),%rdi # presumably the slot where the function entry saved rdi -- TODO confirm against x86_64-xlate.pl + mov 16(%rax),%rsi # presumably the matching rsi slot -- TODO confirm + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + +
mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_${PREFIX}_set_encrypt_key + .rva .LSEH_end_${PREFIX}_set_encrypt_key + .rva .LSEH_info_${PREFIX}_set_encrypt_key + + .rva .LSEH_begin_${PREFIX}_set_decrypt_key + .rva .LSEH_end_${PREFIX}_set_decrypt_key + .rva .LSEH_info_${PREFIX}_set_decrypt_key + + .rva .LSEH_begin_${PREFIX}_encrypt + .rva .LSEH_end_${PREFIX}_encrypt + .rva .LSEH_info_${PREFIX}_encrypt + + .rva .LSEH_begin_${PREFIX}_decrypt + .rva .LSEH_end_${PREFIX}_decrypt + .rva .LSEH_info_${PREFIX}_decrypt + + .rva .LSEH_begin_${PREFIX}_cbc_encrypt + .rva .LSEH_end_${PREFIX}_cbc_encrypt + .rva .LSEH_info_${PREFIX}_cbc_encrypt + +.section .xdata +.align 8 +.LSEH_info_${PREFIX}_set_encrypt_key: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_set_decrypt_key: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_encrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc_body,.Lenc_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_decrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec_body,.Ldec_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_cbc_encrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/devel/perlasm/aes-ssse3-x86_64.pl.license b/devel/perlasm/aes-ssse3-x86_64.pl.license new file mode 120000 index 0000000..155c831 --- /dev/null +++ b/devel/perlasm/aes-ssse3-x86_64.pl.license @@ -0,0 +1 @@ +license-vpaes.txt \ No newline at end of file diff --git a/devel/perlasm/aesni-x86.pl.license b/devel/perlasm/aesni-x86.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/aesni-x86.pl.license @@ -0,0 
+1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/aesni-x86_64.pl.license b/devel/perlasm/aesni-x86_64.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/aesni-x86_64.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/cbc.pl.license b/devel/perlasm/cbc.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/cbc.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/cpuid-x86.pl.license b/devel/perlasm/cpuid-x86.pl.license new file mode 120000 index 0000000..6879a72 --- /dev/null +++ b/devel/perlasm/cpuid-x86.pl.license @@ -0,0 +1 @@ +license-gnutls.txt \ No newline at end of file diff --git a/devel/perlasm/cpuid-x86_64.pl.license b/devel/perlasm/cpuid-x86_64.pl.license new file mode 120000 index 0000000..6879a72 --- /dev/null +++ b/devel/perlasm/cpuid-x86_64.pl.license @@ -0,0 +1 @@ +license-gnutls.txt \ No newline at end of file diff --git a/devel/perlasm/e_padlock-x86.pl.license b/devel/perlasm/e_padlock-x86.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/e_padlock-x86.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/e_padlock-x86_64.pl.license b/devel/perlasm/e_padlock-x86_64.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/e_padlock-x86_64.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/ghash-x86.pl.license b/devel/perlasm/ghash-x86.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/ghash-x86.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/ghash-x86_64.pl.license b/devel/perlasm/ghash-x86_64.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/ghash-x86_64.pl.license @@ -0,0 +1 @@ +license.txt \ No newline 
at end of file diff --git a/devel/perlasm/license-gnutls.txt b/devel/perlasm/license-gnutls.txt index 3c3bacc..70bded0 100644 --- a/devel/perlasm/license-gnutls.txt +++ b/devel/perlasm/license-gnutls.txt @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/devel/perlasm/license-vpaes.txt b/devel/perlasm/license-vpaes.txt new file mode 100644 index 0000000..b1c4569 --- /dev/null +++ b/devel/perlasm/license-vpaes.txt @@ -0,0 +1,12 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +# +# *** This file is auto-generated *** +# diff --git a/devel/perlasm/license.txt b/devel/perlasm/license.txt index 718ea8d..748d2c1 100644 --- a/devel/perlasm/license.txt +++ b/devel/perlasm/license.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/devel/perlasm/md5-x86_64.pl.license b/devel/perlasm/md5-x86_64.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/md5-x86_64.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/openssl-cpuid-x86.pl.license b/devel/perlasm/openssl-cpuid-x86.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/openssl-cpuid-x86.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/ppc-xlate.pl.license b/devel/perlasm/ppc-xlate.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/ppc-xlate.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/sha1-ssse3-x86.pl.license b/devel/perlasm/sha1-ssse3-x86.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/sha1-ssse3-x86.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/sha1-ssse3-x86_64.pl.license b/devel/perlasm/sha1-ssse3-x86_64.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/sha1-ssse3-x86_64.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/sha256-ssse3-x86.pl.license b/devel/perlasm/sha256-ssse3-x86.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/sha256-ssse3-x86.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/sha512-ssse3-x86.pl.license b/devel/perlasm/sha512-ssse3-x86.pl.license new file mode 120000 index 0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/sha512-ssse3-x86.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/devel/perlasm/sha512-ssse3-x86_64.pl.license b/devel/perlasm/sha512-ssse3-x86_64.pl.license new file mode 120000 index 
0000000..cd301a4 --- /dev/null +++ b/devel/perlasm/sha512-ssse3-x86_64.pl.license @@ -0,0 +1 @@ +license.txt \ No newline at end of file diff --git a/lib/accelerated/x86/aes-x86.c b/lib/accelerated/x86/aes-x86.c index c50836a..9e00df5 100644 --- a/lib/accelerated/x86/aes-x86.c +++ b/lib/accelerated/x86/aes-x86.c @@ -118,7 +118,7 @@ static void aes_deinit(void *_ctx) gnutls_free(_ctx); } -static const gnutls_crypto_cipher_st cipher_struct = { +static const gnutls_crypto_cipher_st aesni_struct = { .init = aes_cipher_init, .setkey = aes_cipher_setkey, .setiv = aes_setiv, @@ -127,6 +127,59 @@ static const gnutls_crypto_cipher_st cipher_struct = { .deinit = aes_deinit, }; +static int +aes_ssse3_cipher_setkey(void *_ctx, const void *userkey, size_t keysize) +{ + struct aes_ctx *ctx = _ctx; + int ret; + + if (ctx->enc) + ret = + vpaes_set_encrypt_key(userkey, keysize * 8, + ALIGN16(&ctx->expanded_key)); + else + ret = + vpaes_set_decrypt_key(userkey, keysize * 8, + ALIGN16(&ctx->expanded_key)); + + if (ret != 0) + return gnutls_assert_val(GNUTLS_E_ENCRYPTION_FAILED); + + return 0; +} + +static int +aes_ssse3_encrypt(void *_ctx, const void *src, size_t src_size, + void *dst, size_t dst_size) +{ + struct aes_ctx *ctx = _ctx; + + vpaes_cbc_encrypt(src, dst, src_size, ALIGN16(&ctx->expanded_key), + ctx->iv, 1); + return 0; +} + +static int +aes_ssse3_decrypt(void *_ctx, const void *src, size_t src_size, + void *dst, size_t dst_size) +{ + struct aes_ctx *ctx = _ctx; + + vpaes_cbc_encrypt(src, dst, src_size, ALIGN16(&ctx->expanded_key), + ctx->iv, 0); + + return 0; +} + +static const gnutls_crypto_cipher_st aes_ssse3_struct = { + .init = aes_cipher_init, + .setkey = aes_ssse3_cipher_setkey, + .setiv = aes_setiv, + .encrypt = aes_ssse3_encrypt, + .decrypt = aes_ssse3_decrypt, + .deinit = aes_deinit, +}; + static unsigned check_optimized_aes(void) { return (_gnutls_x86_cpuid_s[2] & 0x2000000); @@ -172,7 +225,28 @@ void register_x86_crypto(void) if (check_ssse3()) { 
_gnutls_debug_log("Intel SSSE3 was detected\n"); - + + ret = + gnutls_crypto_single_cipher_register + (GNUTLS_CIPHER_AES_128_CBC, 79, &aes_ssse3_struct); + if (ret < 0) { + gnutls_assert(); + } + + ret = + gnutls_crypto_single_cipher_register + (GNUTLS_CIPHER_AES_192_CBC, 79, &aes_ssse3_struct); + if (ret < 0) { + gnutls_assert(); + } + + ret = + gnutls_crypto_single_cipher_register + (GNUTLS_CIPHER_AES_256_CBC, 79, &aes_ssse3_struct); + if (ret < 0) { + gnutls_assert(); + } + ret = gnutls_crypto_single_digest_register(GNUTLS_DIG_SHA1, 80, @@ -253,21 +327,21 @@ void register_x86_crypto(void) _gnutls_debug_log("Intel AES accelerator was detected\n"); ret = gnutls_crypto_single_cipher_register - (GNUTLS_CIPHER_AES_128_CBC, 80, &cipher_struct); + (GNUTLS_CIPHER_AES_128_CBC, 80, &aesni_struct); if (ret < 0) { gnutls_assert(); } ret = gnutls_crypto_single_cipher_register - (GNUTLS_CIPHER_AES_192_CBC, 80, &cipher_struct); + (GNUTLS_CIPHER_AES_192_CBC, 80, &aesni_struct); if (ret < 0) { gnutls_assert(); } ret = gnutls_crypto_single_cipher_register - (GNUTLS_CIPHER_AES_256_CBC, 80, &cipher_struct); + (GNUTLS_CIPHER_AES_256_CBC, 80, &aesni_struct); if (ret < 0) { gnutls_assert(); } diff --git a/lib/accelerated/x86/aes-x86.h b/lib/accelerated/x86/aes-x86.h index 379dbe6..2c81bb3 100644 --- a/lib/accelerated/x86/aes-x86.h +++ b/lib/accelerated/x86/aes-x86.h @@ -36,6 +36,12 @@ void aesni_ctr32_encrypt_blocks(const unsigned char *in, const void *key, const unsigned char *ivec); +int vpaes_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); +int vpaes_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); +void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const AES_KEY *key, unsigned char *ivec, int enc); +void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key); +void vpaes_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key); extern const gnutls_crypto_cipher_st 
aes_gcm_struct; diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86.s b/lib/accelerated/x86/coff/aes-ssse3-x86.s new file mode 100644 index 0000000..6894b14 --- /dev/null +++ b/lib/accelerated/x86/coff/aes-ssse3-x86.s @@ -0,0 +1,662 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +# +# *** This file is auto-generated *** +# +.file "vpaes-x86.s" +.text +.align 64 +.L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 
2213177600,1597421020,4103937655,675398315 +.long 2749646592,4273543773,1511898873,121693092 +.long 3040248576,1103263732,2871565598,1608280554 +.long 2236667136,2588920351,482954393,64377734 +.long 3069987328,291237287,2117370568,3650299247 +.long 533321216,3573750986,2572112006,1401264716 +.long 1339849704,2721158661,548607111,3445553514 +.long 2128193280,3054596040,2183486460,1257083700 +.long 655635200,1165381986,3923443150,2344132524 +.long 190078720,256924420,290342170,357187870 +.long 1610966272,2263057382,4103205268,309794674 +.long 2592527872,2233205587,1335446729,3402964816 +.long 3973531904,3225098121,3002836325,1918774430 +.long 3870401024,2102906079,2284471353,4117666579 +.long 617007872,1021508343,366931923,691083277 +.long 2528395776,3491914898,2968704004,1613121270 +.long 3445188352,3247741094,844474987,4093578302 +.long 651481088,1190302358,1689581232,574775300 +.long 4289380608,206939853,2555985458,2489840491 +.long 2130264064,327674451,3566485037,3349835193 +.long 2470714624,316102159,3636825756,3393945945 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 64 +.def __vpaes_preheat; .scl 3; .type 32; .endef +.align 16 +__vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.def __vpaes_encrypt_core; .scl 3; .type 32; .endef +.align 16 +__vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 +.byte 102,15,56,0,208 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx +.byte 102,15,56,0,193 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp .L000enc_entry +.align 16 +.L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 +.byte 
102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addl $16,%edx + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addl $16,%ecx + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +.L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz .L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.def __vpaes_decrypt_core; .scl 3; .type 32; .endef +.align 16 +__vpaes_decrypt_core: + leal 608(%ebp),%ebx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa -64(%ebx),%xmm2 + pandn %xmm0,%xmm1 + movl %eax,%ecx + psrld $4,%xmm1 + movdqu (%edx),%xmm5 + shll $4,%ecx + pand %xmm6,%xmm0 +.byte 102,15,56,0,208 + movdqa -48(%ebx),%xmm0 + xorl $48,%ecx +.byte 102,15,56,0,193 + andl $48,%ecx + pxor %xmm5,%xmm2 + movdqa 176(%ebp),%xmm5 + pxor %xmm2,%xmm0 + addl $16,%edx + leal -352(%ebx,%ecx,1),%ecx + jmp .L002dec_entry +.align 16 +.L003dec_loop: + movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 +.byte 
102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addl $16,%edx +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax +.L002dec_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + psrld $4,%xmm1 +.byte 102,15,56,0,208 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm7,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 + jnz .L003dec_loop + movdqa 96(%ebx),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%ebx),%xmm0 + movdqa (%ecx),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + ret +.def __vpaes_schedule_core; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call __vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz .L004schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp .L005schedule_go +.L004schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%edx) + xorl $48,%ecx +.L005schedule_go: + cmpl $192,%eax + ja .L006schedule_256 + je .L007schedule_192 +.L008schedule_128: + movl $10,%eax +.L009loop_schedule_128: + call __vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call __vpaes_schedule_mangle + jmp .L009loop_schedule_128 +.align 16 +.L007schedule_192: + movdqu 8(%esi),%xmm0 + call __vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%eax +.L011loop_schedule_192: + call __vpaes_schedule_round +.byte 102,15,58,15,198,8 + call __vpaes_schedule_mangle + 
call __vpaes_schedule_192_smear + call __vpaes_schedule_mangle + call __vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + jmp .L011loop_schedule_192 +.align 16 +.L006schedule_256: + movdqu 16(%esi),%xmm0 + call __vpaes_schedule_transform + movl $7,%eax +.L012loop_schedule_256: + call __vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call __vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call __vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call .L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp .L012loop_schedule_256 +.align 16 +.L010schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz .L013schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,193 + leal 352(%ebp),%ebx + addl $32,%edx +.L013schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call __vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.def __vpaes_schedule_192_smear; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.def __vpaes_schedule_round; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 +.byte 102,15,58,15,202,15 +.byte 102,15,58,15,210,15 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + movdqa %xmm2,8(%esp) +.L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand 
%xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 +.byte 102,15,56,0,226 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.def __vpaes_schedule_transform; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%ebx),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.def __vpaes_schedule_mangle; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz .L014schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + jmp .L015schedule_mangle_both +.align 16 +.L014schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal 416(%ebp),%esi + movdqa %xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 32(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 64(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 96(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + addl $-16,%edx +.L015schedule_mangle_both: + movdqa 
256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.globl _vpaes_set_encrypt_key +.def _vpaes_set_encrypt_key; .scl 2; .type 32; .endef +.align 16 +_vpaes_set_encrypt_key: +.L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal .L_vpaes_consts+0x30-.L016pic_point,%ebp + call __vpaes_schedule_core +.L016pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_set_decrypt_key +.def _vpaes_set_decrypt_key; .scl 2; .type 32; .endef +.align 16 +_vpaes_set_decrypt_key: +.L_vpaes_set_decrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + shll $4,%ebx + leal 16(%edx,%ebx,1),%edx + movl $1,%edi + movl %eax,%ecx + shrl $1,%ecx + andl $32,%ecx + xorl $32,%ecx + leal .L_vpaes_consts+0x30-.L017pic_point,%ebp + call __vpaes_schedule_core +.L017pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_encrypt +.def _vpaes_encrypt; .scl 2; .type 32; .endef +.align 16 +_vpaes_encrypt: +.L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal .L_vpaes_consts+0x30-.L018pic_point,%ebp + call __vpaes_preheat +.L018pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx 
+ popl %ebp + ret +.globl _vpaes_decrypt +.def _vpaes_decrypt; .scl 2; .type 32; .endef +.align 16 +_vpaes_decrypt: +.L_vpaes_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal .L_vpaes_consts+0x30-.L019pic_point,%ebp + call __vpaes_preheat +.L019pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_decrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_cbc_encrypt +.def _vpaes_cbc_encrypt; .scl 2; .type 32; .endef +.align 16 +_vpaes_cbc_encrypt: +.L_vpaes_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + subl $16,%eax + jc .L020cbc_abort + leal -56(%esp),%ebx + movl 36(%esp),%ebp + andl $-16,%ebx + movl 40(%esp),%ecx + xchgl %esp,%ebx + movdqu (%ebp),%xmm1 + subl %esi,%edi + movl %ebx,48(%esp) + movl %edi,(%esp) + movl %edx,4(%esp) + movl %ebp,8(%esp) + movl %eax,%edi + leal .L_vpaes_consts+0x30-.L021pic_point,%ebp + call __vpaes_preheat +.L021pic_point: + cmpl $0,%ecx + je .L022cbc_dec_loop + jmp .L023cbc_enc_loop +.align 16 +.L023cbc_enc_loop: + movdqu (%esi),%xmm0 + pxor %xmm1,%xmm0 + call __vpaes_encrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + movdqa %xmm0,%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc .L023cbc_enc_loop + jmp .L024cbc_done +.align 16 +.L022cbc_dec_loop: + movdqu (%esi),%xmm0 + movdqa %xmm1,16(%esp) + movdqa %xmm0,32(%esp) + call __vpaes_decrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + pxor 16(%esp),%xmm0 + movdqa 32(%esp),%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc .L022cbc_dec_loop +.L024cbc_done: + movl 8(%esp),%ebx + movl 48(%esp),%esp + movdqu %xmm1,(%ebx) +.L020cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + 
+.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s new file mode 100644 index 0000000..f8dbd26 --- /dev/null +++ b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s @@ -0,0 +1,1137 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +# +# *** This file is auto-generated *** +# +.text + + + + + + + + + + + + + + + + +.def _vpaes_encrypt_core; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_encrypt_core: + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.p2align 4 +.Lenc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $48,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +.Lenc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa 
%xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz .Lenc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + .byte 0xf3,0xc3 + + + + + + + +.def _vpaes_decrypt_core; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_decrypt_core: + movq %rdx,%r9 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_dipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movq %rax,%r11 + psrld $4,%xmm1 + movdqu (%r9),%xmm5 + shlq $4,%r11 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_dipt+16(%rip),%xmm0 + xorq $48,%r11 + leaq .Lk_dsbd(%rip),%r10 +.byte 102,15,56,0,193 + andq $48,%r11 + pxor %xmm5,%xmm2 + movdqa .Lk_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp .Ldec_entry + +.p2align 4 +.Ldec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addq $16,%r9 +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax + +.Ldec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm0 + 
pxor %xmm1,%xmm3 + jnz .Ldec_loop + + + movdqa 96(%r10),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + .byte 0xf3,0xc3 + + + + + + + +.def _vpaes_schedule_core; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_core: + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + testq %rcx,%rcx + jnz .Lschedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%rdx) + xorq $48,%r8 + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + je .Lschedule_192 + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + + + + + + +.p2align 4 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +.Loop_schedule_192: + call _vpaes_schedule_round +.byte 102,15,58,15,198,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + + + + + + + + + + + +.p2align 4 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + 
jmp .Loop_schedule_256 + + + + + + + + + + + + +.p2align 4 +.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + testq %rcx,%rcx + jnz .Lschedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + +.def _vpaes_schedule_192_smear; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + + + + + +.def _vpaes_schedule_round; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_round: + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + .byte 0xf3,0xc3 + + + + + + + + + + + +.def _vpaes_schedule_transform; .scl 3; .type 32; .endef +.p2align 4 
+_vpaes_schedule_transform: + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + + + + + + + + + + +.def _vpaes_schedule_mangle; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz .Lschedule_mangle_dec + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + + jmp .Lschedule_mangle_both +.p2align 4 +.Lschedule_mangle_dec: + + leaq .Lk_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 32(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 64(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 96(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $48,%r8 + movdqu %xmm3,(%rdx) + .byte 0xf3,0xc3 + + + + + +.globl vpaes_set_encrypt_key +.def vpaes_set_encrypt_key; .scl 2; .type 32; .endef +.p2align 4 +vpaes_set_encrypt_key: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_set_encrypt_key: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps 
%xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Lenc_key_body: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $48,%r8d + call _vpaes_schedule_core + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Lenc_key_epilogue: + xorl %eax,%eax + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_set_encrypt_key: + +.globl vpaes_set_decrypt_key +.def vpaes_set_decrypt_key; .scl 2; .type 32; .endef +.p2align 4 +vpaes_set_decrypt_key: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_set_decrypt_key: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Ldec_key_body: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl $32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Ldec_key_epilogue: + xorl %eax,%eax + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_set_decrypt_key: + +.globl vpaes_encrypt +.def vpaes_encrypt; .scl 2; .type 32; .endef +.p2align 4 +vpaes_encrypt: + movq %rdi,8(%rsp) + 
movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_encrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Lenc_body: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Lenc_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_encrypt: + +.globl vpaes_decrypt +.def vpaes_decrypt; .scl 2; .type 32; .endef +.p2align 4 +vpaes_decrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_decrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Ldec_body: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Ldec_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_decrypt: +.globl vpaes_cbc_encrypt +.def vpaes_cbc_encrypt; .scl 2; .type 32; .endef +.p2align 4 +vpaes_cbc_encrypt: + movq %rdi,8(%rsp) + movq 
%rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_cbc_encrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Lcbc_body: + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.p2align 4 +.Lcbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.p2align 4 +.Lcbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,(%r8) + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Lcbc_epilogue: +.Lcbc_abort: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_cbc_encrypt: + + + + + + +.def _vpaes_preheat; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_preheat: + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + .byte 0xf3,0xc3 + + + + + + + +.p2align 6 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 
0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +.Lk_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +.Lk_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 
0x725E2C9EB2FBA565 +.Lk_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.p2align 6 + + +.def se_handler; .scl 3; .type 32; .endef +.p2align 4 +se_handler: + pushq %rsi + pushq %rdi + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + subq $64,%rsp + + movq 120(%r8),%rax + movq 248(%r8),%rbx + + movq 8(%r9),%rsi + movq 56(%r9),%r11 + + movl 0(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jb .Lin_prologue + + movq 152(%r8),%rax + + movl 4(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jae .Lin_prologue + + leaq 16(%rax),%rsi + leaq 512(%r8),%rdi + movl $20,%ecx +.long 0xa548f3fc + leaq 184(%rax),%rax + +.Lin_prologue: + movq 8(%rax),%rdi + movq 16(%rax),%rsi + movq %rax,152(%r8) + movq %rsi,168(%r8) + movq %rdi,176(%r8) + + movq 40(%r9),%rdi + movq %r8,%rsi + movl $154,%ecx +.long 0xa548f3fc + + movq %r9,%rsi + xorq %rcx,%rcx + movq 8(%rsi),%rdx + movq 0(%rsi),%r8 + movq 16(%rsi),%r9 + movq 40(%rsi),%r10 + leaq 56(%rsi),%r11 + leaq 24(%rsi),%r12 + movq %r10,32(%rsp) + movq %r11,40(%rsp) + movq %r12,48(%rsp) + movq %rcx,56(%rsp) + call *__imp_RtlVirtualUnwind(%rip) + + movl $1,%eax + addq $64,%rsp + popfq + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + popq %rdi + popq %rsi + .byte 0xf3,0xc3 + + +.section .pdata +.p2align 2 +.rva .LSEH_begin_vpaes_set_encrypt_key +.rva 
.LSEH_end_vpaes_set_encrypt_key +.rva .LSEH_info_vpaes_set_encrypt_key + +.rva .LSEH_begin_vpaes_set_decrypt_key +.rva .LSEH_end_vpaes_set_decrypt_key +.rva .LSEH_info_vpaes_set_decrypt_key + +.rva .LSEH_begin_vpaes_encrypt +.rva .LSEH_end_vpaes_encrypt +.rva .LSEH_info_vpaes_encrypt + +.rva .LSEH_begin_vpaes_decrypt +.rva .LSEH_end_vpaes_decrypt +.rva .LSEH_info_vpaes_decrypt + +.rva .LSEH_begin_vpaes_cbc_encrypt +.rva .LSEH_end_vpaes_cbc_encrypt +.rva .LSEH_info_vpaes_cbc_encrypt + +.section .xdata +.p2align 3 +.LSEH_info_vpaes_set_encrypt_key: +.byte 9,0,0,0 +.rva se_handler +.rva .Lenc_key_body,.Lenc_key_epilogue +.LSEH_info_vpaes_set_decrypt_key: +.byte 9,0,0,0 +.rva se_handler +.rva .Ldec_key_body,.Ldec_key_epilogue +.LSEH_info_vpaes_encrypt: +.byte 9,0,0,0 +.rva se_handler +.rva .Lenc_body,.Lenc_epilogue +.LSEH_info_vpaes_decrypt: +.byte 9,0,0,0 +.rva se_handler +.rva .Ldec_body,.Ldec_epilogue +.LSEH_info_vpaes_cbc_encrypt: +.byte 9,0,0,0 +.rva se_handler +.rva .Lcbc_body,.Lcbc_epilogue + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/coff/aesni-x86.s b/lib/accelerated/x86/coff/aesni-x86.s index 1970712..9c982a2 100644 --- a/lib/accelerated/x86/coff/aesni-x86.s +++ b/lib/accelerated/x86/coff/aesni-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/aesni-x86_64.s b/lib/accelerated/x86/coff/aesni-x86_64.s index 85b5108..30b8223 100644 --- a/lib/accelerated/x86/coff/aesni-x86_64.s +++ b/lib/accelerated/x86/coff/aesni-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/cpuid-x86.s b/lib/accelerated/x86/coff/cpuid-x86.s index f35cfba..9931ff0 100644 --- a/lib/accelerated/x86/coff/cpuid-x86.s +++ b/lib/accelerated/x86/coff/cpuid-x86.s @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/lib/accelerated/x86/coff/cpuid-x86_64.s b/lib/accelerated/x86/coff/cpuid-x86_64.s index 033df92..3add190 100644 --- a/lib/accelerated/x86/coff/cpuid-x86_64.s +++ b/lib/accelerated/x86/coff/cpuid-x86_64.s @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/lib/accelerated/x86/coff/e_padlock-x86.s b/lib/accelerated/x86/coff/e_padlock-x86.s index d51d62f..328e646 100644 --- a/lib/accelerated/x86/coff/e_padlock-x86.s +++ b/lib/accelerated/x86/coff/e_padlock-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/e_padlock-x86_64.s b/lib/accelerated/x86/coff/e_padlock-x86_64.s index 14c62fd..6b73825 100644 --- a/lib/accelerated/x86/coff/e_padlock-x86_64.s +++ b/lib/accelerated/x86/coff/e_padlock-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s index 951ee89..d61c82b 100644 --- a/lib/accelerated/x86/coff/ghash-x86_64.s +++ b/lib/accelerated/x86/coff/ghash-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86.s b/lib/accelerated/x86/coff/sha1-ssse3-x86.s index 9bd41a0..450f574 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s index 75868a4..98fd50d 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha256-ssse3-x86.s b/lib/accelerated/x86/coff/sha256-ssse3-x86.s index 6fe2774..117b2bd 100644 --- a/lib/accelerated/x86/coff/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha256-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86.s b/lib/accelerated/x86/coff/sha512-ssse3-x86.s index 79098da..d68eeff 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s index bbb2661..dd80574 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86.s b/lib/accelerated/x86/elf/aes-ssse3-x86.s new file mode 100644 index 0000000..3aa2212 --- /dev/null +++ b/lib/accelerated/x86/elf/aes-ssse3-x86.s @@ -0,0 +1,675 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. 
+# +# *** This file is auto-generated *** +# +.file "vpaes-x86.s" +.text +.align 64 +.L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 2213177600,1597421020,4103937655,675398315 +.long 2749646592,4273543773,1511898873,121693092 +.long 3040248576,1103263732,2871565598,1608280554 +.long 2236667136,2588920351,482954393,64377734 +.long 3069987328,291237287,2117370568,3650299247 +.long 533321216,3573750986,2572112006,1401264716 +.long 1339849704,2721158661,548607111,3445553514 +.long 2128193280,3054596040,2183486460,1257083700 +.long 655635200,1165381986,3923443150,2344132524 +.long 190078720,256924420,290342170,357187870 +.long 1610966272,2263057382,4103205268,309794674 +.long 
2592527872,2233205587,1335446729,3402964816 +.long 3973531904,3225098121,3002836325,1918774430 +.long 3870401024,2102906079,2284471353,4117666579 +.long 617007872,1021508343,366931923,691083277 +.long 2528395776,3491914898,2968704004,1613121270 +.long 3445188352,3247741094,844474987,4093578302 +.long 651481088,1190302358,1689581232,574775300 +.long 4289380608,206939853,2555985458,2489840491 +.long 2130264064,327674451,3566485037,3349835193 +.long 2470714624,316102159,3636825756,3393945945 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 64 +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.size _vpaes_preheat,.-_vpaes_preheat +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 +.byte 102,15,56,0,208 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx +.byte 102,15,56,0,193 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp .L000enc_entry +.align 16 +.L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addl $16,%edx + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addl $16,%ecx + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +.L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + 
psrld $4,%xmm1 + pand %xmm6,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz .L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core +.type _vpaes_decrypt_core,@function +.align 16 +_vpaes_decrypt_core: + leal 608(%ebp),%ebx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa -64(%ebx),%xmm2 + pandn %xmm0,%xmm1 + movl %eax,%ecx + psrld $4,%xmm1 + movdqu (%edx),%xmm5 + shll $4,%ecx + pand %xmm6,%xmm0 +.byte 102,15,56,0,208 + movdqa -48(%ebx),%xmm0 + xorl $48,%ecx +.byte 102,15,56,0,193 + andl $48,%ecx + pxor %xmm5,%xmm2 + movdqa 176(%ebp),%xmm5 + pxor %xmm2,%xmm0 + addl $16,%edx + leal -352(%ebx,%ecx,1),%ecx + jmp .L002dec_entry +.align 16 +.L003dec_loop: + movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addl $16,%edx +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax +.L002dec_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + psrld $4,%xmm1 +.byte 102,15,56,0,208 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 
102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm7,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 + jnz .L003dec_loop + movdqa 96(%ebx),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%ebx),%xmm0 + movdqa (%ecx),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz .L004schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp .L005schedule_go +.L004schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%edx) + xorl $48,%ecx +.L005schedule_go: + cmpl $192,%eax + ja .L006schedule_256 + je .L007schedule_192 +.L008schedule_128: + movl $10,%eax +.L009loop_schedule_128: + call _vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call _vpaes_schedule_mangle + jmp .L009loop_schedule_128 +.align 16 +.L007schedule_192: + movdqu 8(%esi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%eax +.L011loop_schedule_192: + call _vpaes_schedule_round +.byte 102,15,58,15,198,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .L011loop_schedule_192 +.align 16 +.L006schedule_256: + movdqu 16(%esi),%xmm0 + call _vpaes_schedule_transform + movl $7,%eax +.L012loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call _vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last 
+ call _vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call .L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp .L012loop_schedule_256 +.align 16 +.L010schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz .L013schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,193 + leal 352(%ebp),%ebx + addl $32,%edx +.L013schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core +.type _vpaes_schedule_192_smear,@function +.align 16 +_vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 +.byte 102,15,58,15,202,15 +.byte 102,15,58,15,210,15 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + movdqa %xmm2,8(%esp) +.L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 +.byte 102,15,56,0,226 + movdqa 48(%ebp),%xmm0 +.byte 
102,15,56,0,195 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%ebx),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz .L014schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + jmp .L015schedule_mangle_both +.align 16 +.L014schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal 416(%ebp),%esi + movdqa %xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 32(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 64(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 96(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + addl $-16,%edx +.L015schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: +.L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + 
pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal .L_vpaes_consts+0x30-.L016pic_point,%ebp + call _vpaes_schedule_core +.L016pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin +.globl vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,@function +.align 16 +vpaes_set_decrypt_key: +.L_vpaes_set_decrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + shll $4,%ebx + leal 16(%edx,%ebx,1),%edx + movl $1,%edi + movl %eax,%ecx + shrl $1,%ecx + andl $32,%ecx + xorl $32,%ecx + leal .L_vpaes_consts+0x30-.L017pic_point,%ebp + call _vpaes_schedule_core +.L017pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin +.globl vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: +.L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal .L_vpaes_consts+0x30-.L018pic_point,%ebp + call _vpaes_preheat +.L018pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call _vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_encrypt,.-.L_vpaes_encrypt_begin +.globl vpaes_decrypt +.type vpaes_decrypt,@function +.align 16 +vpaes_decrypt: +.L_vpaes_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl 
%edi + leal .L_vpaes_consts+0x30-.L019pic_point,%ebp + call _vpaes_preheat +.L019pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call _vpaes_decrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_decrypt,.-.L_vpaes_decrypt_begin +.globl vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,@function +.align 16 +vpaes_cbc_encrypt: +.L_vpaes_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + subl $16,%eax + jc .L020cbc_abort + leal -56(%esp),%ebx + movl 36(%esp),%ebp + andl $-16,%ebx + movl 40(%esp),%ecx + xchgl %esp,%ebx + movdqu (%ebp),%xmm1 + subl %esi,%edi + movl %ebx,48(%esp) + movl %edi,(%esp) + movl %edx,4(%esp) + movl %ebp,8(%esp) + movl %eax,%edi + leal .L_vpaes_consts+0x30-.L021pic_point,%ebp + call _vpaes_preheat +.L021pic_point: + cmpl $0,%ecx + je .L022cbc_dec_loop + jmp .L023cbc_enc_loop +.align 16 +.L023cbc_enc_loop: + movdqu (%esi),%xmm0 + pxor %xmm1,%xmm0 + call _vpaes_encrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + movdqa %xmm0,%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc .L023cbc_enc_loop + jmp .L024cbc_done +.align 16 +.L022cbc_dec_loop: + movdqu (%esi),%xmm0 + movdqa %xmm1,16(%esp) + movdqa %xmm0,32(%esp) + call _vpaes_decrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + pxor 16(%esp),%xmm0 + movdqa 32(%esp),%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc .L022cbc_dec_loop +.L024cbc_done: + movl 8(%esp),%ebx + movl 48(%esp),%esp + movdqu %xmm1,(%ebx) +.L020cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s 
b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s new file mode 100644 index 0000000..bef787b --- /dev/null +++ b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s @@ -0,0 +1,841 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +# +# *** This file is auto-generated *** +# +.text + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $48,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +.Lenc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz .Lenc_loop + + + 
movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + .byte 0xf3,0xc3 +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + + + + + + +.type _vpaes_decrypt_core,@function +.align 16 +_vpaes_decrypt_core: + movq %rdx,%r9 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_dipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movq %rax,%r11 + psrld $4,%xmm1 + movdqu (%r9),%xmm5 + shlq $4,%r11 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_dipt+16(%rip),%xmm0 + xorq $48,%r11 + leaq .Lk_dsbd(%rip),%r10 +.byte 102,15,56,0,193 + andq $48,%r11 + pxor %xmm5,%xmm2 + movdqa .Lk_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp .Ldec_entry + +.align 16 +.Ldec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addq $16,%r9 +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax + +.Ldec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 + jnz .Ldec_loop + + + movdqa 96(%r10),%xmm4 +.byte 102,15,56,0,226 + 
pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + .byte 0xf3,0xc3 +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + + + + + + +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + testq %rcx,%rcx + jnz .Lschedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%rdx) + xorq $48,%r8 + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + je .Lschedule_192 + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + + + + + + +.align 16 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +.Loop_schedule_192: + call _vpaes_schedule_round +.byte 102,15,58,15,198,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + + + + + + + + + + + +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp .Loop_schedule_256 + + + + + + + + + + + + +.align 16 
+.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + testq %rcx,%rcx + jnz .Lschedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 +.size _vpaes_schedule_core,.-_vpaes_schedule_core + + + + + + + + + + + + + + + +.type _vpaes_schedule_192_smear,@function +.align 16 +_vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + .byte 0xf3,0xc3 +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + .byte 0xf3,0xc3 +.size _vpaes_schedule_round,.-_vpaes_schedule_round + + + + + + + + + + 
+.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + .byte 0xf3,0xc3 +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz .Lschedule_mangle_dec + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + + jmp .Lschedule_mangle_both +.align 16 +.Lschedule_mangle_dec: + + leaq .Lk_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 32(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 64(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 96(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $48,%r8 + movdqu %xmm3,(%rdx) + .byte 0xf3,0xc3 +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + + + + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $48,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 
0xf3,0xc3 +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,@function +.align 16 +vpaes_set_decrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl $32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + +.globl vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 +.size vpaes_encrypt,.-vpaes_encrypt + +.globl vpaes_decrypt +.type vpaes_decrypt,@function +.align 16 +vpaes_decrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 +.size vpaes_decrypt,.-vpaes_decrypt +.globl vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,@function +.align 16 +vpaes_cbc_encrypt: + xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.align 16 +.Lcbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,(%r8) +.Lcbc_abort: + .byte 0xf3,0xc3 +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + + + + + + +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 
80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + .byte 0xf3,0xc3 +.size _vpaes_preheat,.-_vpaes_preheat + + + + + +.type _vpaes_consts,@object +.align 64 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +.Lk_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: 
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +.Lk_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 64 +.size _vpaes_consts,.-_vpaes_consts + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/elf/aesni-x86.s b/lib/accelerated/x86/elf/aesni-x86.s index 5f07797..5ee5549 100644 --- a/lib/accelerated/x86/elf/aesni-x86.s +++ b/lib/accelerated/x86/elf/aesni-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/aesni-x86_64.s b/lib/accelerated/x86/elf/aesni-x86_64.s index d3734a6..d073cb2 100644 --- a/lib/accelerated/x86/elf/aesni-x86_64.s +++ b/lib/accelerated/x86/elf/aesni-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved.
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/cpuid-x86.s b/lib/accelerated/x86/elf/cpuid-x86.s index a9db647..83a6d23 100644 --- a/lib/accelerated/x86/elf/cpuid-x86.s +++ b/lib/accelerated/x86/elf/cpuid-x86.s @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/lib/accelerated/x86/elf/cpuid-x86_64.s b/lib/accelerated/x86/elf/cpuid-x86_64.s index 41a0061..626c8f6 100644 --- a/lib/accelerated/x86/elf/cpuid-x86_64.s +++ b/lib/accelerated/x86/elf/cpuid-x86_64.s @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/lib/accelerated/x86/elf/e_padlock-x86.s b/lib/accelerated/x86/elf/e_padlock-x86.s index 2199255..0b8fc28 100644 --- a/lib/accelerated/x86/elf/e_padlock-x86.s +++ b/lib/accelerated/x86/elf/e_padlock-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/e_padlock-x86_64.s b/lib/accelerated/x86/elf/e_padlock-x86_64.s index 2ac113d..da5547f 100644 --- a/lib/accelerated/x86/elf/e_padlock-x86_64.s +++ b/lib/accelerated/x86/elf/e_padlock-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/ghash-x86_64.s b/lib/accelerated/x86/elf/ghash-x86_64.s index 9755951..a2c26f9 100644 --- a/lib/accelerated/x86/elf/ghash-x86_64.s +++ b/lib/accelerated/x86/elf/ghash-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/sha1-ssse3-x86.s b/lib/accelerated/x86/elf/sha1-ssse3-x86.s index e2f22e7..8c40615 100644 --- a/lib/accelerated/x86/elf/sha1-ssse3-x86.s +++ b/lib/accelerated/x86/elf/sha1-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s index 149edef..a61d7f6 100644 --- a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/sha256-ssse3-x86.s b/lib/accelerated/x86/elf/sha256-ssse3-x86.s index 81470f5..212962a 100644 --- a/lib/accelerated/x86/elf/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/elf/sha256-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86.s b/lib/accelerated/x86/elf/sha512-ssse3-x86.s index 088a0fa..7fa849a 100644 --- a/lib/accelerated/x86/elf/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/elf/sha512-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s index ea1915d..7808a1b 100644 --- a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/files.mk b/lib/accelerated/x86/files.mk index c22d758..7f2ba3e 100644 --- a/lib/accelerated/x86/files.mk +++ b/lib/accelerated/x86/files.mk @@ -1,6 +1,6 @@ -X86_FILES_ELF=elf/aesni-x86.s elf/cpuid-x86.s elf/e_padlock-x86.s elf/sha1-ssse3-x86.s elf/sha256-ssse3-x86.s elf/sha512-ssse3-x86.s -X86_FILES_COFF=coff/aesni-x86.s coff/cpuid-x86.s coff/e_padlock-x86.s coff/sha1-ssse3-x86.s coff/sha256-ssse3-x86.s coff/sha512-ssse3-x86.s -X86_FILES_MACOSX=macosx/aesni-x86.s macosx/cpuid-x86.s macosx/e_padlock-x86.s macosx/sha1-ssse3-x86.s macosx/sha256-ssse3-x86.s macosx/sha512-ssse3-x86.s -X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/e_padlock-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s -X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/e_padlock-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s -X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/e_padlock-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s 
macosx/sha512-ssse3-x86_64.s +X86_FILES_ELF=elf/aesni-x86.s elf/cpuid-x86.s elf/e_padlock-x86.s elf/sha1-ssse3-x86.s elf/sha256-ssse3-x86.s elf/sha512-ssse3-x86.s elf/aes-ssse3-x86.s +X86_FILES_COFF=coff/aesni-x86.s coff/cpuid-x86.s coff/e_padlock-x86.s coff/sha1-ssse3-x86.s coff/sha256-ssse3-x86.s coff/sha512-ssse3-x86.s coff/aes-ssse3-x86.s +X86_FILES_MACOSX=macosx/aesni-x86.s macosx/cpuid-x86.s macosx/e_padlock-x86.s macosx/sha1-ssse3-x86.s macosx/sha256-ssse3-x86.s macosx/sha512-ssse3-x86.s macosx/aes-ssse3-x86.s +X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/e_padlock-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s +X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/e_padlock-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s +X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/e_padlock-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s diff --git a/lib/accelerated/x86/license.txt b/lib/accelerated/x86/license.txt index 929ddd5..a50b3a0 100755 --- a/lib/accelerated/x86/license.txt +++ b/lib/accelerated/x86/license.txt @@ -41,3 +41,14 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. 
diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86.s b/lib/accelerated/x86/macosx/aes-ssse3-x86.s new file mode 100644 index 0000000..2f49887 --- /dev/null +++ b/lib/accelerated/x86/macosx/aes-ssse3-x86.s @@ -0,0 +1,649 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +# +# *** This file is auto-generated *** +# +.file "vpaes-x86.s" +.text +.align 6,0x90 +L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 
2213177600,1597421020,4103937655,675398315 +.long 2749646592,4273543773,1511898873,121693092 +.long 3040248576,1103263732,2871565598,1608280554 +.long 2236667136,2588920351,482954393,64377734 +.long 3069987328,291237287,2117370568,3650299247 +.long 533321216,3573750986,2572112006,1401264716 +.long 1339849704,2721158661,548607111,3445553514 +.long 2128193280,3054596040,2183486460,1257083700 +.long 655635200,1165381986,3923443150,2344132524 +.long 190078720,256924420,290342170,357187870 +.long 1610966272,2263057382,4103205268,309794674 +.long 2592527872,2233205587,1335446729,3402964816 +.long 3973531904,3225098121,3002836325,1918774430 +.long 3870401024,2102906079,2284471353,4117666579 +.long 617007872,1021508343,366931923,691083277 +.long 2528395776,3491914898,2968704004,1613121270 +.long 3445188352,3247741094,844474987,4093578302 +.long 651481088,1190302358,1689581232,574775300 +.long 4289380608,206939853,2555985458,2489840491 +.long 2130264064,327674451,3566485037,3349835193 +.long 2470714624,316102159,3636825756,3393945945 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 6,0x90 +.align 4 +__vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.align 4 +__vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 +.byte 102,15,56,0,208 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx +.byte 102,15,56,0,193 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp L000enc_entry +.align 4,0x90 +L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + pxor %xmm4,%xmm0 + 
movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addl $16,%edx + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addl $16,%ecx + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.align 4 +__vpaes_decrypt_core: + leal 608(%ebp),%ebx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa -64(%ebx),%xmm2 + pandn %xmm0,%xmm1 + movl %eax,%ecx + psrld $4,%xmm1 + movdqu (%edx),%xmm5 + shll $4,%ecx + pand %xmm6,%xmm0 +.byte 102,15,56,0,208 + movdqa -48(%ebx),%xmm0 + xorl $48,%ecx +.byte 102,15,56,0,193 + andl $48,%ecx + pxor %xmm5,%xmm2 + movdqa 176(%ebp),%xmm5 + pxor %xmm2,%xmm0 + addl $16,%edx + leal -352(%ebx,%ecx,1),%ecx + jmp L002dec_entry +.align 4,0x90 +L003dec_loop: + movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 +.byte 
102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addl $16,%edx +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax +L002dec_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + psrld $4,%xmm1 +.byte 102,15,56,0,208 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm7,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 + jnz L003dec_loop + movdqa 96(%ebx),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%ebx),%xmm0 + movdqa (%ecx),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + ret +.align 4 +__vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call __vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz L004schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp L005schedule_go +L004schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%edx) + xorl $48,%ecx +L005schedule_go: + cmpl $192,%eax + ja L006schedule_256 + je L007schedule_192 +L008schedule_128: + movl $10,%eax +L009loop_schedule_128: + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + jmp L009loop_schedule_128 +.align 4,0x90 +L007schedule_192: + movdqu 8(%esi),%xmm0 + call __vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%eax +L011loop_schedule_192: + call __vpaes_schedule_round +.byte 102,15,58,15,198,8 + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + call __vpaes_schedule_mangle + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + jmp 
L011loop_schedule_192 +.align 4,0x90 +L006schedule_256: + movdqu 16(%esi),%xmm0 + call __vpaes_schedule_transform + movl $7,%eax +L012loop_schedule_256: + call __vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp L012loop_schedule_256 +.align 4,0x90 +L010schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz L013schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,193 + leal 352(%ebp),%ebx + addl $32,%edx +L013schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call __vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.align 4 +__vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.align 4 +__vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 +.byte 102,15,58,15,202,15 +.byte 102,15,58,15,210,15 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + movdqa %xmm2,8(%esp) +L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + 
movdqa 32(%ebp),%xmm4 +.byte 102,15,56,0,226 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.align 4 +__vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%ebx),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.align 4 +__vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz L014schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + jmp L015schedule_mangle_both +.align 4,0x90 +L014schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal 416(%ebp),%esi + movdqa %xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 32(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 64(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 96(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + addl $-16,%edx +L015schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.globl _vpaes_set_encrypt_key +.align 4 +_vpaes_set_encrypt_key: +L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl 
%ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal L_vpaes_consts+0x30-L016pic_point,%ebp + call __vpaes_schedule_core +L016pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_set_decrypt_key +.align 4 +_vpaes_set_decrypt_key: +L_vpaes_set_decrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + shll $4,%ebx + leal 16(%edx,%ebx,1),%edx + movl $1,%edi + movl %eax,%ecx + shrl $1,%ecx + andl $32,%ecx + xorl $32,%ecx + leal L_vpaes_consts+0x30-L017pic_point,%ebp + call __vpaes_schedule_core +L017pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_encrypt +.align 4 +_vpaes_encrypt: +L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal L_vpaes_consts+0x30-L018pic_point,%ebp + call __vpaes_preheat +L018pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_decrypt +.align 4 +_vpaes_decrypt: +L_vpaes_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal L_vpaes_consts+0x30-L019pic_point,%ebp + call __vpaes_preheat +L019pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_decrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_cbc_encrypt +.align 4 +_vpaes_cbc_encrypt: +L_vpaes_cbc_encrypt_begin: + pushl %ebp + pushl 
%ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + subl $16,%eax + jc L020cbc_abort + leal -56(%esp),%ebx + movl 36(%esp),%ebp + andl $-16,%ebx + movl 40(%esp),%ecx + xchgl %esp,%ebx + movdqu (%ebp),%xmm1 + subl %esi,%edi + movl %ebx,48(%esp) + movl %edi,(%esp) + movl %edx,4(%esp) + movl %ebp,8(%esp) + movl %eax,%edi + leal L_vpaes_consts+0x30-L021pic_point,%ebp + call __vpaes_preheat +L021pic_point: + cmpl $0,%ecx + je L022cbc_dec_loop + jmp L023cbc_enc_loop +.align 4,0x90 +L023cbc_enc_loop: + movdqu (%esi),%xmm0 + pxor %xmm1,%xmm0 + call __vpaes_encrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + movdqa %xmm0,%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc L023cbc_enc_loop + jmp L024cbc_done +.align 4,0x90 +L022cbc_dec_loop: + movdqu (%esi),%xmm0 + movdqa %xmm1,16(%esp) + movdqa %xmm0,32(%esp) + call __vpaes_decrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + pxor 16(%esp),%xmm0 + movdqa 32(%esp),%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc L022cbc_dec_loop +L024cbc_done: + movl 8(%esp),%ebx + movl 48(%esp),%esp + movdqu %xmm1,(%ebx) +L020cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s new file mode 100644 index 0000000..7705bda --- /dev/null +++ b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s @@ -0,0 +1,841 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. 
+# +# *** This file is auto-generated *** +# +.text + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core: + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa L$k_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa L$k_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc_entry + +.p2align 4 +L$enc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $48,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +L$enc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz L$enc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + .byte 0xf3,0xc3 + + + + + + + + +.p2align 4 +_vpaes_decrypt_core: + movq %rdx,%r9 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa L$k_dipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movq %rax,%r11 + psrld $4,%xmm1 + movdqu (%r9),%xmm5 + shlq $4,%r11 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa L$k_dipt+16(%rip),%xmm0 + xorq 
$48,%r11 + leaq L$k_dsbd(%rip),%r10 +.byte 102,15,56,0,193 + andq $48,%r11 + pxor %xmm5,%xmm2 + movdqa L$k_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp L$dec_entry + +.p2align 4 +L$dec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addq $16,%r9 +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax + +L$dec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 + jnz L$dec_loop + + + movdqa 96(%r10),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + .byte 0xf3,0xc3 + + + + + + + + +.p2align 4 +_vpaes_schedule_core: + + + + + + call _vpaes_preheat + movdqa L$k_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq L$k_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq L$k_sr(%rip),%r10 + testq %rcx,%rcx + jnz L$schedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp L$schedule_go + +L$schedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + 
movdqu %xmm3,(%rdx) + xorq $48,%r8 + +L$schedule_go: + cmpl $192,%esi + ja L$schedule_256 + je L$schedule_192 + + + + + + + + + + +L$schedule_128: + movl $10,%esi + +L$oop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + jmp L$oop_schedule_128 + + + + + + + + + + + + + + + + +.p2align 4 +L$schedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +L$oop_schedule_192: + call _vpaes_schedule_round +.byte 102,15,58,15,198,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp L$oop_schedule_192 + + + + + + + + + + + +.p2align 4 +L$schedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +L$oop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp L$oop_schedule_256 + + + + + + + + + + + + +.p2align 4 +L$schedule_mangle_last: + + leaq L$k_deskew(%rip),%r11 + testq %rcx,%rcx + jnz L$schedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq L$k_opt(%rip),%r11 + addq $32,%rdx + +L$schedule_mangle_last_dec: + addq $-16,%rdx + pxor L$k_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor 
%xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_round: + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor L$k_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + .byte 0xf3,0xc3 + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_transform: + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa L$k_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz L$schedule_mangle_dec + + + addq $16,%rdx + pxor L$k_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + + jmp L$schedule_mangle_both +.p2align 4 +L$schedule_mangle_dec: + + leaq L$k_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 
102,15,56,0,221 + + movdqa 32(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 64(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 96(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +L$schedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $48,%r8 + movdqu %xmm3,(%rdx) + .byte 0xf3,0xc3 + + + + + +.globl _vpaes_set_encrypt_key + +.p2align 4 +_vpaes_set_encrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $48,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 + + +.globl _vpaes_set_decrypt_key + +.p2align 4 +_vpaes_set_decrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl $32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 + + +.globl _vpaes_encrypt + +.p2align 4 +_vpaes_encrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 + + +.globl _vpaes_decrypt + +.p2align 4 +_vpaes_decrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 + +.globl _vpaes_cbc_encrypt + +.p2align 4 +_vpaes_cbc_encrypt: + xchgq %rcx,%rdx + subq $16,%rcx + jc L$cbc_abort + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je L$cbc_dec_loop + jmp L$cbc_enc_loop +.p2align 4 +L$cbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc L$cbc_enc_loop + jmp L$cbc_done 
+.p2align 4 +L$cbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc L$cbc_dec_loop +L$cbc_done: + movdqu %xmm6,(%r8) +L$cbc_abort: + .byte 0xf3,0xc3 + + + + + + + + +.p2align 4 +_vpaes_preheat: + leaq L$k_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + .byte 0xf3,0xc3 + + + + + + + +.p2align 6 +_vpaes_consts: +L$k_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +L$k_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +L$k_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +L$k_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +L$k_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +L$k_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +L$k_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +L$k_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +L$k_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +L$k_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +L$k_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +L$k_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + 
+L$k_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +L$k_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +L$k_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +L$k_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +L$k_dks9: +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +L$k_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +L$k_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +L$k_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +L$k_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +L$k_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +L$k_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.p2align 6 + + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/macosx/aesni-x86.s b/lib/accelerated/x86/macosx/aesni-x86.s index 4cb2d98..58d58d6 100644 --- a/lib/accelerated/x86/macosx/aesni-x86.s +++ b/lib/accelerated/x86/macosx/aesni-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/aesni-x86_64.s b/lib/accelerated/x86/macosx/aesni-x86_64.s index 85f26e0..10c168e 100644 --- a/lib/accelerated/x86/macosx/aesni-x86_64.s +++ b/lib/accelerated/x86/macosx/aesni-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/cpuid-x86.s b/lib/accelerated/x86/macosx/cpuid-x86.s index 978b232..874dca4 100644 --- a/lib/accelerated/x86/macosx/cpuid-x86.s +++ b/lib/accelerated/x86/macosx/cpuid-x86.s @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/lib/accelerated/x86/macosx/cpuid-x86_64.s b/lib/accelerated/x86/macosx/cpuid-x86_64.s index cf8fea9..e98a630 100644 --- a/lib/accelerated/x86/macosx/cpuid-x86_64.s +++ b/lib/accelerated/x86/macosx/cpuid-x86_64.s @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/lib/accelerated/x86/macosx/e_padlock-x86.s b/lib/accelerated/x86/macosx/e_padlock-x86.s index 4bdadb9..049c730 100644 --- a/lib/accelerated/x86/macosx/e_padlock-x86.s +++ b/lib/accelerated/x86/macosx/e_padlock-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/e_padlock-x86_64.s b/lib/accelerated/x86/macosx/e_padlock-x86_64.s index 29723b3..c6a6200 100644 --- a/lib/accelerated/x86/macosx/e_padlock-x86_64.s +++ b/lib/accelerated/x86/macosx/e_padlock-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/ghash-x86_64.s b/lib/accelerated/x86/macosx/ghash-x86_64.s index 2f5ac65..a400ade 100644 --- a/lib/accelerated/x86/macosx/ghash-x86_64.s +++ b/lib/accelerated/x86/macosx/ghash-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/sha1-ssse3-x86.s b/lib/accelerated/x86/macosx/sha1-ssse3-x86.s index 0e0c719..f03312e 100644 --- a/lib/accelerated/x86/macosx/sha1-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/sha1-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s index 88bf435..7e1118f 100644 --- a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/sha256-ssse3-x86.s b/lib/accelerated/x86/macosx/sha256-ssse3-x86.s index d6cf6cb..0e863d5 100644 --- a/lib/accelerated/x86/macosx/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/sha256-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86.s index cb097f1..2294b4a 100644 --- a/lib/accelerated/x86/macosx/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s index a845708..99102b5 100644 --- a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov +# Copyright (c) 2011-2013, Andy Polyakov # All rights reserved. # # Redistribution and use in source and binary forms, with or without -- 2.1.4