# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#     * Redistributions of source code must retain copyright notices,
#      this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above
#      copyright notice, this list of conditions and the following
#      disclaimer in the documentation and/or other materials
#      provided with the distribution.
#
#     * Neither the name of the Andy Polyakov nor the names of its
#      copyright holder and contributors may be used to endorse or
#      promote products derived from this software without specific
#      prior written permission.
#
# ALTERNATIVELY, provided that this notice is retained in full, this
# product may be distributed under the terms of the GNU General Public
# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
# those given above.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *** This file is auto-generated ***
#
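# GHASH for x86_64 (GCM message authentication). The *_4bit routines
# below are the table-driven code path; the *_clmul routines use the
# PCLMULQDQ carry-less multiply, emitted as raw .byte sequences so the
# code assembles even where the assembler lacks the mnemonic.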
.text

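# gcm_gmult_4bit(Xi, Htable): multiply the 16-byte block at Xi (%rdi)
# by the hash key using the 4-bit lookup table at Htable (%rsi),
# reducing via .Lrem_4bit, and write the result back to Xi.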
.globl  gcm_gmult_4bit
.type   gcm_gmult_4bit,@function
.align  16
gcm_gmult_4bit:
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
.Lgmult_prologue:

        movzbq  15(%rdi),%r8
        leaq    .Lrem_4bit(%rip),%r11
        xorq    %rax,%rax
        xorq    %rbx,%rbx
        movb    %r8b,%al
        movb    %r8b,%bl
        shlb    $4,%al
        movq    $14,%rcx
        movq    8(%rsi,%rax,1),%r8
        movq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        movq    %r8,%rdx
        jmp     .Loop1

.align  16
.Loop1:
        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        movb    (%rdi,%rcx,1),%al
        shrq    $4,%r9
        xorq    8(%rsi,%rbx,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rbx,1),%r9
        movb    %al,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        shlb    $4,%al
        xorq    %r10,%r8
        decq    %rcx
        js      .Lbreak1

        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rax,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        xorq    %r10,%r8
        jmp     .Loop1

.align  16
.Lbreak1:
        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rax,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        xorq    %r10,%r8

        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rbx,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rbx,1),%r9
        xorq    %r10,%r8
        xorq    (%r11,%rdx,8),%r9

        bswapq  %r8
        bswapq  %r9
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)

        movq    16(%rsp),%rbx
        leaq    24(%rsp),%rsp
.Lgmult_epilogue:
        .byte   0xf3,0xc3
.size   gcm_gmult_4bit,.-gcm_gmult_4bit
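# gcm_ghash_4bit(Xi, Htable, inp, len): fold len bytes at inp (%rdx)
# into Xi (%rdi). The prologue builds a pre-shifted copy of Htable
# (%rsi) on the stack; .Louter_loop then consumes one 16-byte block
# per iteration, reducing via the .Lrem_8bit table.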
.globl  gcm_ghash_4bit
.type   gcm_ghash_4bit,@function
.align  16
gcm_ghash_4bit:
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        subq    $280,%rsp
.Lghash_prologue:
        movq    %rdx,%r14
        movq    %rcx,%r15
        subq    $-128,%rsi
        leaq    16+128(%rsp),%rbp
        xorl    %edx,%edx
        movq    0+0-128(%rsi),%r8
        movq    0+8-128(%rsi),%rax
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    16+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    16+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,0(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,0(%rbp)
        movq    32+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,0-128(%rbp)
        movq    32+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,1(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,8(%rbp)
        movq    48+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,8-128(%rbp)
        movq    48+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,2(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,16(%rbp)
        movq    64+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,16-128(%rbp)
        movq    64+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,3(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,24(%rbp)
        movq    80+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,24-128(%rbp)
        movq    80+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,4(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,32(%rbp)
        movq    96+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,32-128(%rbp)
        movq    96+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,5(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,40(%rbp)
        movq    112+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,40-128(%rbp)
        movq    112+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,6(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,48(%rbp)
        movq    128+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,48-128(%rbp)
        movq    128+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,7(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,56(%rbp)
        movq    144+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,56-128(%rbp)
        movq    144+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,8(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,64(%rbp)
        movq    160+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,64-128(%rbp)
        movq    160+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,9(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,72(%rbp)
        movq    176+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,72-128(%rbp)
        movq    176+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,10(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,80(%rbp)
        movq    192+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,80-128(%rbp)
        movq    192+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,11(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,88(%rbp)
        movq    208+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,88-128(%rbp)
        movq    208+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,12(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,96(%rbp)
        movq    224+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,96-128(%rbp)
        movq    224+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,13(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,104(%rbp)
        movq    240+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,104-128(%rbp)
        movq    240+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,14(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,112(%rbp)
        shlb    $4,%dl
        movq    %rax,112-128(%rbp)
        shlq    $60,%r10
        movb    %dl,15(%rsp)
        orq     %r10,%rbx
        movq    %r9,120(%rbp)
        movq    %rbx,120-128(%rbp)
        addq    $-128,%rsi
        movq    8(%rdi),%r8
        movq    0(%rdi),%r9
        addq    %r14,%r15
        leaq    .Lrem_8bit(%rip),%r11
        jmp     .Louter_loop
.align  16
.Louter_loop:
        xorq    (%r14),%r9
        movq    8(%r14),%rdx
        leaq    16(%r14),%r14
        xorq    %r8,%rdx
        movq    %r9,(%rdi)
        movq    %rdx,8(%rdi)
        shrq    $32,%rdx
        xorq    %rax,%rax
        roll    $8,%edx
        movb    %dl,%al
        movzbl  %dl,%ebx
        shlb    $4,%al
        shrl    $4,%ebx
        roll    $8,%edx
        movq    8(%rsi,%rax,1),%r8
        movq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        xorq    %r8,%r12
        movq    %r9,%r10
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    8(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    4(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    0(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        andl    $240,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    -4(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        movzwq  (%r11,%r12,2),%r12
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        shlq    $48,%r12
        xorq    %r10,%r8
        xorq    %r12,%r9
        movzbq  %r8b,%r13
        shrq    $4,%r8
        movq    %r9,%r10
        shlb    $4,%r13b
        shrq    $4,%r9
        xorq    8(%rsi,%rcx,1),%r8
        movzwq  (%r11,%r13,2),%r13
        shlq    $60,%r10
        xorq    (%rsi,%rcx,1),%r9
        xorq    %r10,%r8
        shlq    $48,%r13
        bswapq  %r8
        xorq    %r13,%r9
        bswapq  %r9
        cmpq    %r15,%r14
        jb      .Louter_loop
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)

        leaq    280(%rsp),%rsi
        movq    0(%rsi),%r15
        movq    8(%rsi),%r14
        movq    16(%rsi),%r13
        movq    24(%rsi),%r12
        movq    32(%rsi),%rbp
        movq    40(%rsi),%rbx
        leaq    48(%rsi),%rsp
.Lghash_epilogue:
        .byte   0xf3,0xc3
.size   gcm_ghash_4bit,.-gcm_ghash_4bit
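# gcm_init_clmul(Htable, H): preprocess the raw hash key at %rsi for
# the PCLMULQDQ path and store the adjusted H and its square at (%rdi)
# and 16(%rdi). The .byte 102,15,58,68,... sequences encode pclmulqdq.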
.globl  gcm_init_clmul
.type   gcm_init_clmul,@function
.align  16
gcm_init_clmul:
        movdqu  (%rsi),%xmm2
        pshufd  $78,%xmm2,%xmm2


        pshufd  $255,%xmm2,%xmm4
        movdqa  %xmm2,%xmm3
        psllq   $1,%xmm2
        pxor    %xmm5,%xmm5
        psrlq   $63,%xmm3
        pcmpgtd %xmm4,%xmm5
        pslldq  $8,%xmm3
        por     %xmm3,%xmm2


        pand    .L0x1c2_polynomial(%rip),%xmm5
        pxor    %xmm5,%xmm2


        movdqa  %xmm2,%xmm0
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pshufd  $78,%xmm2,%xmm4
        pxor    %xmm0,%xmm3
        pxor    %xmm2,%xmm4
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,220,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $5,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm4
        pslldq  $8,%xmm0
        psrldq  $8,%xmm4
        pxor    %xmm3,%xmm0
        pxor    %xmm4,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
        movdqu  %xmm2,(%rdi)
        movdqu  %xmm0,16(%rdi)
        .byte   0xf3,0xc3
.size   gcm_init_clmul,.-gcm_init_clmul
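# gcm_gmult_clmul(Xi, Htable): one GHASH multiplication of Xi (%rdi)
# by H at (%rsi) using pclmulqdq; .byte 102,15,56,0,197 is pshufb
# against .Lbswap_mask, byte-reversing Xi on load and store.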
.globl  gcm_gmult_clmul
.type   gcm_gmult_clmul,@function
.align  16
gcm_gmult_clmul:
        movdqu  (%rdi),%xmm0
        movdqa  .Lbswap_mask(%rip),%xmm5
        movdqu  (%rsi),%xmm2
.byte   102,15,56,0,197
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pshufd  $78,%xmm2,%xmm4
        pxor    %xmm0,%xmm3
        pxor    %xmm2,%xmm4
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,220,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $5,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm4
        pslldq  $8,%xmm0
        psrldq  $8,%xmm4
        pxor    %xmm3,%xmm0
        pxor    %xmm4,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
.byte   102,15,56,0,197
        movdqu  %xmm0,(%rdi)
        .byte   0xf3,0xc3
.size   gcm_gmult_clmul,.-gcm_gmult_clmul
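# gcm_ghash_clmul(Xi, Htable, inp, len): PCLMULQDQ GHASH over len bytes
# at inp (%rdx). .Lmod_loop processes two blocks per iteration using H
# (%xmm2) and H^2 (%xmm8); .Leven_tail/.Lodd_tail handle the remainder.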
.globl  gcm_ghash_clmul
.type   gcm_ghash_clmul,@function
.align  16
gcm_ghash_clmul:
        movdqa  .Lbswap_mask(%rip),%xmm5

        movdqu  (%rdi),%xmm0
        movdqu  (%rsi),%xmm2
.byte   102,15,56,0,197

        subq    $16,%rcx
        jz      .Lodd_tail

        movdqu  16(%rsi),%xmm8




        movdqu  (%rdx),%xmm3
        movdqu  16(%rdx),%xmm6
.byte   102,15,56,0,221
.byte   102,15,56,0,245
        pxor    %xmm3,%xmm0
        movdqa  %xmm6,%xmm7
        pshufd  $78,%xmm6,%xmm3
        pshufd  $78,%xmm2,%xmm4
        pxor    %xmm6,%xmm3
        pxor    %xmm2,%xmm4
.byte   102,15,58,68,242,0
.byte   102,15,58,68,250,17
.byte   102,15,58,68,220,0
        pxor    %xmm6,%xmm3
        pxor    %xmm7,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm7
        pxor    %xmm4,%xmm6
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pshufd  $78,%xmm8,%xmm4
        pxor    %xmm0,%xmm3
        pxor    %xmm8,%xmm4

        leaq    32(%rdx),%rdx
        subq    $32,%rcx
        jbe     .Leven_tail

.Lmod_loop:
.byte   102,65,15,58,68,192,0
.byte   102,65,15,58,68,200,17
.byte   102,15,58,68,220,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0
        movdqu  (%rdx),%xmm3
        pxor    %xmm6,%xmm0
        pxor    %xmm7,%xmm1

        movdqu  16(%rdx),%xmm6
.byte   102,15,56,0,221
.byte   102,15,56,0,245

        movdqa  %xmm6,%xmm7
        pshufd  $78,%xmm6,%xmm9
        pshufd  $78,%xmm2,%xmm10
        pxor    %xmm6,%xmm9
        pxor    %xmm2,%xmm10
        pxor    %xmm3,%xmm1

        movdqa  %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $5,%xmm0
        pxor    %xmm3,%xmm0
.byte   102,15,58,68,242,0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm4
        pslldq  $8,%xmm0
        psrldq  $8,%xmm4
        pxor    %xmm3,%xmm0
        pxor    %xmm4,%xmm1

.byte   102,15,58,68,250,17
        movdqa  %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0

.byte   102,69,15,58,68,202,0
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pshufd  $78,%xmm8,%xmm4
        pxor    %xmm0,%xmm3
        pxor    %xmm8,%xmm4

        pxor    %xmm6,%xmm9
        pxor    %xmm7,%xmm9
        movdqa  %xmm9,%xmm10
        psrldq  $8,%xmm9
        pslldq  $8,%xmm10
        pxor    %xmm9,%xmm7
        pxor    %xmm10,%xmm6

        leaq    32(%rdx),%rdx
        subq    $32,%rcx
        ja      .Lmod_loop

.Leven_tail:
.byte   102,65,15,58,68,192,0
.byte   102,65,15,58,68,200,17
.byte   102,15,58,68,220,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm6,%xmm0
        pxor    %xmm7,%xmm1

        movdqa  %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $5,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm4
        pslldq  $8,%xmm0
        psrldq  $8,%xmm4
        pxor    %xmm3,%xmm0
        pxor    %xmm4,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
        testq   %rcx,%rcx
        jnz     .Ldone

.Lodd_tail:
        movdqu  (%rdx),%xmm3
.byte   102,15,56,0,221
        pxor    %xmm3,%xmm0
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pshufd  $78,%xmm2,%xmm4
        pxor    %xmm0,%xmm3
        pxor    %xmm2,%xmm4
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,220,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $5,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm4
        pslldq  $8,%xmm0
        psrldq  $8,%xmm4
        pxor    %xmm3,%xmm0
        pxor    %xmm4,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm0
.Ldone:
.byte   102,15,56,0,197
        movdqu  %xmm0,(%rdi)
        .byte   0xf3,0xc3
.LSEH_end_gcm_ghash_clmul:
.size   gcm_ghash_clmul,.-gcm_ghash_clmul
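# Constants: .Lbswap_mask (byte reversal), .L0x1c2_polynomial (GHASH
# reduction polynomial), and the .Lrem_4bit/.Lrem_8bit remainder tables
# used by the non-CLMUL code. The trailing .byte string spells
# "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>".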
.align  64
.Lbswap_mask:
.byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.align  64
.type   .Lrem_4bit,@object
.Lrem_4bit:
.long   0,0,0,471859200,0,943718400,0,610271232
.long   0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long   0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long   0,2441084928,0,2376073216,0,2847932416,0,3051356160
.type   .Lrem_8bit,@object
.Lrem_8bit:
.value  0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value  0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value  0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value  0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value  0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value  0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value  0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value  0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value  0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value  0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value  0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value  0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value  0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value  0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value  0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value  0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value  0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value  0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value  0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value  0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value  0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value  0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value  0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value  0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value  0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value  0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value  0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value  0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value  0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value  0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value  0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value  0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.byte   71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  64

.section .note.GNU-stack,"",%progbits