From 9567d93c07f87ecb5c8560b7a45125de28710bc1 Mon Sep 17 00:00:00 2001 From: Nikos Mavrogiannopoulos Date: Mon, 19 Mar 2012 22:55:14 +0100 Subject: [PATCH] updated openssl code --- NEWS | 2 + devel/perlasm/e_padlock-x86.pl | 104 +++++++-- devel/perlasm/e_padlock-x86_64.pl | 178 +++++++++++----- devel/perlasm/ghash-x86.pl | 28 +-- lib/accelerated/x86/README | 4 +- lib/accelerated/x86/coff/padlock-x86-64-coff.s | 162 ++++++++++---- lib/accelerated/x86/coff/padlock-x86-coff.s | 232 ++++++++++++++------ lib/accelerated/x86/elf/padlock-x86-64.s | 162 ++++++++++---- lib/accelerated/x86/license.txt | 2 +- lib/accelerated/x86/macosx/padlock-x86-64-macosx.s | 162 ++++++++++---- lib/accelerated/x86/macosx/padlock-x86-macosx.s | 234 +++++++++++++++------ 11 files changed, 924 insertions(+), 346 deletions(-) diff --git a/NEWS b/NEWS index 27a258c..93fa1ab 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,8 @@ See the end for copying conditions. ** certtool: Avoid a Y2K38 bug when generating certificates. Patch by Robert Millan. +** libgnutls: Updated assembler files. + ** libgnutls: Time in generated certificates is stored as GeneralizedTime instead of UTCTime (which only stores 2 digits of a year). diff --git a/devel/perlasm/e_padlock-x86.pl b/devel/perlasm/e_padlock-x86.pl index 7a52528..71ecad3 100644 --- a/devel/perlasm/e_padlock-x86.pl +++ b/devel/perlasm/e_padlock-x86.pl @@ -37,7 +37,7 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); -%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata +%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 $ctx="edx"; @@ -188,10 +188,6 @@ my ($mode,$opcode) = @_; &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); - if ($PADLOCK_MARGIN{$mode}) { - &cmp ($len,$PADLOCK_MARGIN{$mode}); - &jbe (&label("${mode}_short")); - } &test (&DWP(0,$ctx),1<<5); # align bit in control word &jnz (&label("${mode}_aligned")); &test ($out,0x0f); @@ -212,7 +208,27 @@ my ($mode,$opcode) = @_; &neg ("eax"); &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK &lea ("esp",&DWP(0,"eax","ebp")); # alloca + &mov ("eax",$PADLOCK_CHUNK); + &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK + &mov ("eax","ebp"); + &and ("ebp",-16); &and ("esp",-16); + &mov (&DWP(16,"ebp"),"eax"); + if ($PADLOCK_PREFETCH{$mode}) { + &cmp ($len,$chunk); + &ja (&label("${mode}_loop")); + &mov ("eax",$inp); # check if prefetch crosses page + &cmp ("ebp","esp"); + &cmove ("eax",$out); + &add ("eax",$len); + &neg ("eax"); + &and ("eax",0xfff); # distance to page boundary + &cmp ("eax",$PADLOCK_PREFETCH{$mode}); + &mov ("eax",-$PADLOCK_PREFETCH{$mode}); + &cmovae ("eax",$chunk); # mask=distance128, cbc=>64, ctr32=>64); # prefetch errata +%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; @@ -285,17 +285,6 @@ padlock_${mode}_encrypt: lea 16($ctx),$ctx # control word xor %eax,%eax xor %ebx,%ebx -___ -# Formally speaking correct condtion is $len<=$margin and $inp+$margin -# crosses page boundary [and next page is unreadable]. But $inp can -# be unaligned in which case data can be copied to $out if latter is -# aligned, in which case $out+$margin has to be checked. Covering all -# cases appears more complicated than just copying short input... 
-$code.=<<___ if ($PADLOCK_MARGIN{$mode}); - cmp \$$PADLOCK_MARGIN{$mode},$len - jbe .L${mode}_short -___ -$code.=<<___; testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out @@ -315,6 +304,8 @@ $code.=<<___; neg %rax and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK lea (%rax,%rbp),%rsp + mov \$$PADLOCK_CHUNK,%rax + cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK ___ $code.=<<___ if ($mode eq "ctr32"); .L${mode}_reenter: @@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32"); bswap %eax neg %eax and \$`$PADLOCK_CHUNK/16-1`,%eax - jz .L${mode}_loop + mov \$$PADLOCK_CHUNK,$chunk shl \$4,%eax + cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK + cmovbe $len,$chunk +___ +$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); + cmp $chunk,$len + ja .L${mode}_loop + mov $inp,%rax # check if prefetch crosses page + cmp %rsp,%rbp + cmove $out,%rax + add $len,%rax + neg %rax + and \$0xfff,%rax # distance to page boundary + cmp \$$PADLOCK_PREFETCH{$mode},%rax + mov \$-$PADLOCK_PREFETCH{$mode},%rax + cmovae $chunk,%rax # mask=distance2-5 times faster than gcc-generated code. To # anchor it to something else SHA1 assembler processes one byte in -# 11-13 cycles on contemporary x86 cores. As for choice of MMX in -# particular, see comment at the end of the file... +# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE +# in particular, see comment at the end of the file... # May 2010 # @@ -331,7 +331,7 @@ if (!$x86only) {{{ &static_label("rem_4bit"); -if (0) {{ # "May" MMX version is kept for reference... +if (!$sse2) {{ # pure-MMX "May" version... $S=12; # shift factor for rem_4bit @@ -1273,13 +1273,6 @@ my ($Xhi,$Xi)=@_; &set_label("bswap",64); &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial -}} # $sse2 - -&set_label("rem_4bit",64); - &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); - &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); - &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); - &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); &set_label("rem_8bit",64); &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); @@ -1313,6 +1306,13 @@ my ($Xhi,$Xi)=@_; &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); +}} # $sse2 + +&set_label("rem_4bit",64); + &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); + &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); + &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); + &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); }}} # !$x86only &asciz("GHASH for x86, CRYPTOGAMS by "); diff --git a/lib/accelerated/x86/README b/lib/accelerated/x86/README index 0dd5cb9..ca3c546 100644 --- a/lib/accelerated/x86/README +++ b/lib/accelerated/x86/README @@ -1,4 +1,4 @@ -The AES-NI and Padlock implementation by Andy Polyakov is not part of the -GnuTLS library, but is used with GnuTLS. Its license is included in +The AES-NI and Padlock implementation by Andy Polyakov are not part of the +GnuTLS library, but is used with GnuTLS. Their license is included in license.txt. 
diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s index b69b332..9f658ee 100644 --- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s +++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s @@ -354,8 +354,6 @@ padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe .Lecb_short testl $32,(%rdx) jnz .Lecb_aligned testq $15,%rdi @@ -375,6 +373,21 @@ padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lecb_unaligned_tail jmp .Lecb_loop .p2align 4 .Lecb_loop: @@ -404,8 +417,8 @@ padlock_ecb_encrypt: testq $15,%rdi jz .Lecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lecb_out_aligned: @@ -415,9 +428,26 @@ padlock_ecb_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lecb_loop - + jz .Lecb_break + cmpq %rbx,%rcx + jae .Lecb_loop +.Lecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lecb_loop +.p2align 4 +.Lecb_break: + cmpq %rbp,%rsp je .Lecb_done pxor %xmm0,%xmm0 @@ -431,26 +461,39 @@ padlock_ecb_encrypt: .Lecb_done: leaq (%rbp),%rsp jmp .Lecb_exit -.p2align 4 -.Lecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lecb_loop + .p2align 4 .Lecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz .Lecb_exit + +.Lecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lecb_loop .Lecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -489,8 +532,6 @@ padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe .Lcbc_short testl $32,(%rdx) jnz .Lcbc_aligned testq $15,%rdi @@ -510,6 +551,21 @@ padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lcbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lcbc_unaligned_tail jmp .Lcbc_loop .p2align 4 .Lcbc_loop: @@ -541,8 +597,8 @@ padlock_cbc_encrypt: testq $15,%rdi jz .Lcbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lcbc_out_aligned: @@ -552,9 +608,26 @@ padlock_cbc_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lcbc_loop - + jz .Lcbc_break + cmpq %rbx,%rcx + jae .Lcbc_loop +.Lcbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + 
movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lcbc_loop +.p2align 4 +.Lcbc_break: + cmpq %rbp,%rsp je .Lcbc_done pxor %xmm0,%xmm0 @@ -568,28 +641,41 @@ padlock_cbc_encrypt: .Lcbc_done: leaq (%rbp),%rsp jmp .Lcbc_exit -.p2align 4 -.Lcbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lcbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lcbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lcbc_loop + .p2align 4 .Lcbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lcbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz .Lcbc_exit + +.Lcbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lcbc_loop .Lcbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s index b068083..69eb468 100644 --- a/lib/accelerated/x86/coff/padlock-x86-coff.s +++ b/lib/accelerated/x86/coff/padlock-x86-coff.s @@ -180,16 +180,14 @@ _padlock_ecb_encrypt: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe .L006ecb_short testl $32,(%edx) - jnz .L007ecb_aligned + jnz .L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz .L007ecb_aligned + jnz .L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -201,10 +199,28 @@ _padlock_ecb_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp .L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L008ecb_unaligned_tail + jmp .L007ecb_loop .align 16 -.L008ecb_loop: +.L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -229,8 +245,8 @@ _padlock_ecb_encrypt: testl $15,%edi jz .L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi .L010ecb_out_aligned: @@ -240,43 +256,75 @@ _padlock_ecb_encrypt: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L008ecb_loop + jz .L011ecb_break + cmpl %ebx,%ecx + jae .L007ecb_loop +.L008ecb_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.align 16 +.L011ecb_break: cmpl %ebp,%esp - je .L011ecb_done + je .L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L012ecb_bzero: +.L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L012ecb_bzero -.L011ecb_done: + ja .L013ecb_bzero +.L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L013ecb_exit + jmp .L014ecb_exit .align 16 -.L006ecb_short: +.L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L014ecb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - 
movaps %xmm0,-16(%esp,%ebx,1) - ja .L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L008ecb_loop -.align 16 -.L007ecb_aligned: + cmpl $128,%ebp + movl $127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -.L013ecb_exit: + testl %ebp,%ebp + jz .L014ecb_exit +.L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.L014ecb_exit: movl $1,%eax leal 4(%esp),%esp .L004ecb_abort: @@ -299,19 +347,17 @@ _padlock_cbc_encrypt: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz .L015cbc_abort + jnz .L016cbc_abort testl $15,%ecx - jnz .L015cbc_abort + jnz .L016cbc_abort leal .Lpadlock_saved_context,%eax pushfl cld call __padlock_verify_ctx -.L016cbc_pic_point: +.L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe .L017cbc_short testl $32,(%edx) jnz .L018cbc_aligned testl $15,%edi @@ -331,7 +377,25 @@ _padlock_cbc_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L020cbc_unaligned_tail jmp .L019cbc_loop .align 16 .L019cbc_loop: @@ -343,13 +407,13 @@ _padlock_cbc_encrypt: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz .L020cbc_inp_aligned + jz .L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -.L020cbc_inp_aligned: +.L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -359,61 +423,93 @@ _padlock_cbc_encrypt: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz .L021cbc_out_aligned + jz .L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -.L021cbc_out_aligned: +.L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L019cbc_loop + jz .L023cbc_break + cmpl %ebx,%ecx + jae .L019cbc_loop +.L020cbc_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.align 16 +.L023cbc_break: cmpl %ebp,%esp - je .L022cbc_done + je .L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L023cbc_bzero: +.L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L023cbc_bzero -.L022cbc_done: + ja .L025cbc_bzero +.L024cbc_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L024cbc_exit -.align 16 -.L017cbc_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L025cbc_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L019cbc_loop + jmp .L026cbc_exit .align 16 .L018cbc_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp + xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael 
%eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L027cbc_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,208 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -.L024cbc_exit: + testl %ebp,%ebp + jz .L026cbc_exit +.L027cbc_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.L026cbc_exit: movl $1,%eax leal 4(%esp),%esp -.L015cbc_abort: +.L016cbc_abort: popl %edi popl %esi popl %ebx @@ -437,10 +533,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne .L026ret + jne .L028ret addl $4,184(%ecx) movl $0,%eax -.L026ret: +.L028ret: ret .globl _padlock_sha1_oneshot .def _padlock_sha1_oneshot; .scl 2; .type 32; .endef diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s index bf5f626..4709ac2 100644 --- a/lib/accelerated/x86/elf/padlock-x86-64.s +++ b/lib/accelerated/x86/elf/padlock-x86-64.s @@ -276,8 +276,6 @@ padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe .Lecb_short testl $32,(%rdx) jnz .Lecb_aligned testq $15,%rdi @@ -297,6 +295,21 @@ padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lecb_unaligned_tail jmp .Lecb_loop .align 16 .Lecb_loop: @@ -326,8 +339,8 @@ padlock_ecb_encrypt: testq $15,%rdi jz .Lecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lecb_out_aligned: @@ -337,9 +350,26 @@ padlock_ecb_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lecb_loop - + jz .Lecb_break + cmpq %rbx,%rcx + jae .Lecb_loop +.Lecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lecb_loop +.align 16 +.Lecb_break: + cmpq %rbp,%rsp je .Lecb_done pxor %xmm0,%xmm0 @@ -353,26 +383,39 @@ padlock_ecb_encrypt: .Lecb_done: leaq (%rbp),%rsp jmp .Lecb_exit -.align 16 -.Lecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lecb_loop + .align 16 .Lecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz .Lecb_exit + +.Lecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lecb_loop .Lecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -400,8 +443,6 @@ padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe .Lcbc_short testl $32,(%rdx) jnz .Lcbc_aligned testq $15,%rdi @@ -421,6 
+462,21 @@ padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lcbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lcbc_unaligned_tail jmp .Lcbc_loop .align 16 .Lcbc_loop: @@ -452,8 +508,8 @@ padlock_cbc_encrypt: testq $15,%rdi jz .Lcbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lcbc_out_aligned: @@ -463,9 +519,26 @@ padlock_cbc_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lcbc_loop - + jz .Lcbc_break + cmpq %rbx,%rcx + jae .Lcbc_loop +.Lcbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lcbc_loop +.align 16 +.Lcbc_break: + cmpq %rbp,%rsp je .Lcbc_done pxor %xmm0,%xmm0 @@ -479,28 +552,41 @@ padlock_cbc_encrypt: .Lcbc_done: leaq (%rbp),%rsp jmp .Lcbc_exit -.align 16 -.Lcbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lcbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lcbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lcbc_loop + .align 16 .Lcbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lcbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz .Lcbc_exit + +.Lcbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lcbc_loop .Lcbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/license.txt b/lib/accelerated/x86/license.txt index c87ba42..929ddd5 100755 --- a/lib/accelerated/x86/license.txt +++ b/lib/accelerated/x86/license.txt @@ -5,7 +5,7 @@ CRYPTOGAMS licenses depending on where you obtain it. For further details see http://www.openssl.org/~appro/cryptogams/. ==================================================================== -Copyright (c) 2006, CRYPTOGAMS by +Copyright (c) 2006-2012, CRYPTOGAMS by All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s index 9b912f9..dbd89da 100644 --- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s +++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s @@ -276,8 +276,6 @@ _padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe L$ecb_short testl $32,(%rdx) jnz L$ecb_aligned testq $15,%rdi @@ -297,6 +295,21 @@ _padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja L$ecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$ecb_unaligned_tail jmp L$ecb_loop .p2align 4 L$ecb_loop: @@ -326,8 +339,8 @@ L$ecb_inp_aligned: testq $15,%rdi jz L$ecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$ecb_out_aligned: @@ -337,9 +350,26 @@ L$ecb_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz L$ecb_loop - + jz L$ecb_break + cmpq %rbx,%rcx + jae L$ecb_loop +L$ecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$ecb_loop +.p2align 4 +L$ecb_break: + cmpq %rbp,%rsp je L$ecb_done pxor %xmm0,%xmm0 @@ -353,26 +383,39 @@ L$ecb_bzero: L$ecb_done: leaq (%rbp),%rsp jmp L$ecb_exit -.p2align 4 -L$ecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$ecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$ecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$ecb_loop + .p2align 4 L$ecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$ecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz L$ecb_exit + +L$ecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$ecb_loop L$ecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -400,8 +443,6 @@ _padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe L$cbc_short testl $32,(%rdx) jnz L$cbc_aligned testq $15,%rdi @@ -421,6 +462,21 @@ _padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja L$cbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$cbc_unaligned_tail jmp L$cbc_loop .p2align 4 L$cbc_loop: @@ -452,8 +508,8 @@ L$cbc_inp_aligned: testq $15,%rdi jz L$cbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi + shrq $3,%rcx .byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$cbc_out_aligned: @@ -463,9 +519,26 @@ L$cbc_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz L$cbc_loop - + jz L$cbc_break + cmpq %rbx,%rcx + jae L$cbc_loop +L$cbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + 
subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$cbc_loop +.p2align 4 +L$cbc_break: + cmpq %rbp,%rsp je L$cbc_done pxor %xmm0,%xmm0 @@ -479,28 +552,41 @@ L$cbc_bzero: L$cbc_done: leaq (%rbp),%rsp jmp L$cbc_exit -.p2align 4 -L$cbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$cbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$cbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$cbc_loop + .p2align 4 L$cbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$cbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx .byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz L$cbc_exit + +L$cbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$cbc_loop L$cbc_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s index 02b427e..40cfce9 100644 --- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s +++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s @@ -174,16 +174,14 @@ L005ecb_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe L006ecb_short testl $32,(%edx) - jnz L007ecb_aligned + jnz L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz L007ecb_aligned + jnz L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -195,10 +193,28 @@ L005ecb_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz L008ecb_unaligned_tail + jmp L007ecb_loop .align 4,0x90 -L008ecb_loop: +L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -223,8 +239,8 @@ L009ecb_inp_aligned: testl $15,%edi jz L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi L010ecb_out_aligned: @@ -234,43 +250,75 @@ L010ecb_out_aligned: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L008ecb_loop + jz L011ecb_break + cmpl %ebx,%ecx + jae L007ecb_loop +L008ecb_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L007ecb_loop +.align 4,0x90 +L011ecb_break: cmpl %ebp,%esp - je L011ecb_done + je L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -L012ecb_bzero: +L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L012ecb_bzero -L011ecb_done: + ja L013ecb_bzero +L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L013ecb_exit + jmp L014ecb_exit .align 4,0x90 -L006ecb_short: +L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L014ecb_short_copy: - movups 
(%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L008ecb_loop -.align 4,0x90 -L007ecb_aligned: + cmpl $128,%ebp + movl $127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -L013ecb_exit: + testl %ebp,%ebp + jz L014ecb_exit +L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L007ecb_loop +L014ecb_exit: movl $1,%eax leal 4(%esp),%esp L004ecb_abort: @@ -292,19 +340,17 @@ L_padlock_cbc_encrypt_begin: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz L015cbc_abort + jnz L016cbc_abort testl $15,%ecx - jnz L015cbc_abort - leal Lpadlock_saved_context-L016cbc_pic_point,%eax + jnz L016cbc_abort + leal Lpadlock_saved_context-L017cbc_pic_point,%eax pushfl cld call __padlock_verify_ctx -L016cbc_pic_point: +L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe L017cbc_short testl $32,(%edx) jnz L018cbc_aligned testl $15,%edi @@ -324,7 +370,25 @@ L016cbc_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz L020cbc_unaligned_tail jmp L019cbc_loop .align 4,0x90 L019cbc_loop: @@ -336,13 +400,13 @@ L019cbc_loop: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz L020cbc_inp_aligned + jz L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -L020cbc_inp_aligned: +L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -352,61 +416,93 @@ L020cbc_inp_aligned: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz L021cbc_out_aligned + jz L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -L021cbc_out_aligned: +L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L019cbc_loop + jz L023cbc_break + cmpl %ebx,%ecx + jae L019cbc_loop +L020cbc_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L019cbc_loop +.align 4,0x90 +L023cbc_break: cmpl %ebp,%esp - je L022cbc_done + je L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -L023cbc_bzero: +L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L023cbc_bzero -L022cbc_done: + ja L025cbc_bzero +L024cbc_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L024cbc_exit -.align 4,0x90 -L017cbc_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L025cbc_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L019cbc_loop + jmp L026cbc_exit .align 4,0x90 L018cbc_aligned: + leal 
(%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp + xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz L027cbc_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,208 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -L024cbc_exit: + testl %ebp,%ebp + jz L026cbc_exit +L027cbc_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L019cbc_loop +L026cbc_exit: movl $1,%eax leal 4(%esp),%esp -L015cbc_abort: +L016cbc_abort: popl %edi popl %esi popl %ebx @@ -428,10 +524,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne L026ret + jne L028ret addl $4,184(%ecx) movl $0,%eax -L026ret: +L028ret: ret .globl _padlock_sha1_oneshot .align 4 -- 2.1.4
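
Note on the change above: the prefetch-errata workaround in this patch drops the old unconditional .L*_short copy path and instead checks, per chunk, whether the Padlock engine's hardware prefetch (the %PADLOCK_PREFETCH values, 128 bytes for ECB and 64 for CBC here) could run past the end of the input or output buffer into the next 4096-byte page; only then is the tail bounced through the on-stack scratch area. The core of the check is the "distance to page boundary" computation visible in the assembler. The following is a minimal C sketch of that test only, not the patch's code; PAGE_SIZE, prefetch and padlock_tail_needs_bounce are illustrative names that do not appear in the assembler.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch of the page-boundary test behind the Padlock
 * prefetch-errata workaround.  "prefetch" corresponds to the
 * %PADLOCK_PREFETCH values in the perlasm (128 for ECB, 64 for CBC);
 * the function name and signature are hypothetical.
 */
#define PAGE_SIZE 4096

static int padlock_tail_needs_bounce(const void *buf, size_t len,
                                     size_t prefetch)
{
    uintptr_t end = (uintptr_t)buf + len;

    /* distance from the end of the buffer to the next page boundary,
     * i.e. the "neg; and $0xfff" sequence in the assembler */
    uintptr_t dist = ((uintptr_t)0 - end) & (uintptr_t)(PAGE_SIZE - 1);

    /* if the engine may prefetch further than that distance, it could
     * touch an unmapped page, so the tail must be processed from a
     * bounce buffer on the stack instead of the caller's memory */
    return dist < prefetch;
}

In the assembler the same comparison is folded into a cmp/cmovae/and sequence that either keeps the full chunk length or masks it down, falling through to the new .L*_unaligned_tail / .L*_aligned_tail paths when the remaining tail has to go through the scratch buffer.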