# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Assembler helpers for Padlock engine. See also e_padlock-x86.pl for details.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
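# The chunking code below relies on the constraints stated above.  A
# minimal, purely illustrative sanity check (not part of the original
# module) could assert them at generation time:
die "PADLOCK_CHUNK out of spec"
    if ($PADLOCK_CHUNK < 32 || $PADLOCK_CHUNK > 2**20 ||
        ($PADLOCK_CHUNK & ($PADLOCK_CHUNK-1)));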
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx"); # Unix order

.globl padlock_capability
.type padlock_capability,\@abi-omnipotent
cmp \$`"0x".unpack("H*",'tneC')`,%ebx
cmp \$`"0x".unpack("H*",'Hrua')`,%edx
cmp \$`"0x".unpack("H*",'slua')`,%ecx
or \$0x10,%eax # set Nano bit#4
.size padlock_capability,.-padlock_capability
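# Note: the cmp immediates above spell the "CentaurHauls" CPUID vendor
# string; each 4-byte slice is written reversed ('tneC' -> "Cent") so that
# unpack("H*",...) yields the dword in the byte order CPUID returns it in
# %ebx, %edx and %ecx.  Illustration (from a shell, any perl will do):
#   perl -e 'print unpack("H*","tneC")'    # prints 746e6543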
.globl padlock_key_bswap
.type padlock_key_bswap,\@abi-omnipotent,0
.size padlock_key_bswap,.-padlock_key_bswap

.globl padlock_verify_context
.type padlock_verify_context,\@abi-omnipotent
padlock_verify_context:
lea .Lpadlock_saved_context(%rip),%rax
call _padlock_verify_ctx
.size padlock_verify_context,.-padlock_verify_context

.type _padlock_verify_ctx,\@abi-omnipotent
.size _padlock_verify_ctx,.-_padlock_verify_ctx

.globl padlock_reload_key
.type padlock_reload_key,\@abi-omnipotent
.size padlock_reload_key,.-padlock_reload_key

.globl padlock_aes_block
.type padlock_aes_block,\@function,3
lea 32($ctx),%rbx # key
lea 16($ctx),$ctx # control word
.byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb
.size padlock_aes_block,.-padlock_aes_block
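# PadLock xcrypt* register convention (per VIA's programming interface,
# noted here for reference): rSI = input, rDI = output, rBX = expanded
# key, rDX = control word, rCX = block count, and rAX = IV pointer for
# the chained modes.  padlock_aes_block processes a single ECB block,
# hence the control word at ctx+16 and the key at ctx+32 loaded above.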
.globl padlock_xstore
.type padlock_xstore,\@function,2
.byte 0x0f,0xa7,0xc0 # xstore
.size padlock_xstore,.-padlock_xstore

.globl padlock_sha1_oneshot
.type padlock_sha1_oneshot,\@function,3
padlock_sha1_oneshot:
mov %rdi,%rdx # put aside %rdi
movups (%rdi),%xmm0 # copy-in context
.byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
movups %xmm0,(%rdx) # copy-out context
.size padlock_sha1_oneshot,.-padlock_sha1_oneshot
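# The xsha* helpers below all follow the same pattern as the routine
# above: the caller's digest state is copied in with unaligned movups,
# staged in an aligned stack buffer that the hash instruction updates in
# place (per VIA's interface the state lives at ES:rDI, the data at
# DS:rSI), and copied back out through the pointer saved in %rdx.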
.globl padlock_sha1_blocks
.type padlock_sha1_blocks,\@function,3
mov %rdi,%rdx # put aside %rdi
movups (%rdi),%xmm0 # copy-in context
.byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
movups %xmm0,(%rdx) # copy-out context
.size padlock_sha1_blocks,.-padlock_sha1_blocks

.globl padlock_sha256_oneshot
.type padlock_sha256_oneshot,\@function,3
padlock_sha256_oneshot:
mov %rdi,%rdx # put aside %rdi
movups (%rdi),%xmm0 # copy-in context
movups 16(%rdi),%xmm1
movaps %xmm1,16(%rsp)
.byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
movaps 16(%rsp),%xmm1
movups %xmm0,(%rdx) # copy-out context
movups %xmm1,16(%rdx)
.size padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl padlock_sha256_blocks
.type padlock_sha256_blocks,\@function,3
padlock_sha256_blocks:
mov %rdi,%rdx # put aside %rdi
movups (%rdi),%xmm0 # copy-in context
movups 16(%rdi),%xmm1
movaps %xmm1,16(%rsp)
.byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
movaps 16(%rsp),%xmm1
movups %xmm0,(%rdx) # copy-out context
movups %xmm1,16(%rdx)
.size padlock_sha256_blocks,.-padlock_sha256_blocks

.globl padlock_sha512_blocks
.type padlock_sha512_blocks,\@function,3
padlock_sha512_blocks:
mov %rdi,%rdx # put aside %rdi
movups (%rdi),%xmm0 # copy-in context
movups 16(%rdi),%xmm1
movups 32(%rdi),%xmm2
movups 48(%rdi),%xmm3
movaps %xmm1,16(%rsp)
movaps %xmm2,32(%rsp)
movaps %xmm3,48(%rsp)
.byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512
movaps 16(%rsp),%xmm1
movaps 32(%rsp),%xmm2
movaps 48(%rsp),%xmm3
movups %xmm0,(%rdx) # copy-out context
movups %xmm1,16(%rdx)
movups %xmm2,32(%rdx)
movups %xmm3,48(%rdx)
.size padlock_sha512_blocks,.-padlock_sha512_blocks
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#	struct padlock_cipher_data *ctx, size_t len);
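# For reference (a hypothetical C-side view, names assumed rather than
# taken from this file): ctx points at a structure whose layout the
# offsets below imply -- IV at offset 0, control word at offset 16,
# expanded key at offset 32 -- and len is a byte count that is a
# multiple of the AES block size.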
.globl padlock_${mode}_encrypt
.type padlock_${mode}_encrypt,\@function,4
padlock_${mode}_encrypt:
lea .Lpadlock_saved_context(%rip),%rax
call _padlock_verify_ctx
lea 16($ctx),$ctx # control word
# Formally speaking, the correct condition is $len<=$margin and $inp+$margin
# crossing a page boundary [with the next page unreadable]. But $inp can
# be unaligned, in which case the data may be copied to $out if the latter
# is aligned, in which case $out+$margin has to be checked instead. Covering
# all cases appears more complicated than simply copying short input...
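# (Concretely, for ecb with its 128-byte margin: a 100-byte input sitting
# in the last 128 bytes of a page with the next page unmapped could be
# over-read by the engine, so any input of at most margin bytes is bounced
# through a stack buffer instead of being special-cased.)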
$code.=<<___ if ($PADLOCK_MARGIN{$mode});
cmp \$$PADLOCK_MARGIN{$mode},$len
testl \$`1<<5`,($ctx) # align bit in control word
jnz .L${mode}_aligned
setz %al # !out_misaligned
setz %bl # !inp_misaligned
jnz .L${mode}_aligned
mov \$$PADLOCK_CHUNK,$chunk
not %rax # out_misaligned?-1:0
cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
and $chunk,%rax # out_misaligned?chunk:0
and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
and \$`$PADLOCK_CHUNK/16-1`,%eax
cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
cmp $len,$chunk # ctr32 artefact
cmova $len,$chunk # ctr32 artefact
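# The clamps above are part of the ctr32 workaround: a single xcrypt
# call is never allowed to step the 32-bit counter across a PADLOCK_CHUNK
# worth of blocks (nor past the remaining length), so the counter can be
# re-read and, if need be, corrected between calls (see the no_corr path
# below).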
mov $out,%r8 # save parameters
test \$0x0f,$out # out_misaligned
test \$0x0f,$inp # inp_misaligned
jz .L${mode}_inp_aligned
.byte 0xf3,0x48,0xa5 # rep movsq
.L${mode}_inp_aligned:
lea -16($ctx),%rax # ivp
lea 16($ctx),%rbx # key
.byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
$code.=<<___ if ($mode !~ /ecb|ctr/);
movdqa %xmm0,-16($ctx) # copy [or refresh] iv
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
test \$0xffff0000,%eax
jnz .L${mode}_no_corr
mov %r8,$out # restore parameters
jz .L${mode}_out_aligned
.byte 0xf3,0x48,0xa5 # rep movsq
.L${mode}_out_aligned:
mov \$$PADLOCK_CHUNK,$chunk
$code.=<<___ if ($PADLOCK_MARGIN{$mode});
.L${mode}_short_copy:
movups ($inp,$chunk),%xmm0
lea 16($chunk),$chunk
movaps %xmm0,-16(%rsp,$chunk)
ja .L${mode}_short_copy
jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
mov \$`16*0x10000`,$chunk
jz .L${mode}_aligned_loop
cmova %rax,$chunk # don't let counter cross 2^16
jmp .L${mode}_aligned_loop
.L${mode}_aligned_loop:
mov $len,%r10 # save parameters
lea -16($ctx),%rax # ivp
lea 16($ctx),%rbx # key
shr \$4,$len # len/=AES_BLOCK_SIZE
.byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
$code.=<<___ if ($mode !~ /ecb|ctr/);
movdqa %xmm0,-16($ctx) # copy [or refresh] iv
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
mov %r11,$chunk # restore parameters
mov \$`16*0x10000`,$chunk
jnz .L${mode}_aligned_loop
.size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
#&generate_mode("cfb",0xe0);
#&generate_mode("ofb",0xe8);
#&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
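# The second argument to generate_mode is the final byte of the xcrypt*
# encoding (0F A7 /r): 0xc8 = xcryptecb, 0xd0 = xcryptcbc, 0xd8 = xcryptctr,
# 0xe0 = xcryptcfb, 0xe8 = xcryptofb.  It is emitted verbatim in the
# ".byte 0xf3,0x0f,0xa7,opcode" sequences above.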
.asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.Lpadlock_saved_context:

$code =~ s/\`([^\`]*)\`/eval($1)/gem;