/*
# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:

#     * Redistributions of source code must retain copyright notices,
#      this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above
#      copyright notice, this list of conditions and the following
#      disclaimer in the documentation and/or other materials
#      provided with the distribution.
#
#     * Neither the name of the Andy Polyakov nor the names of its
#      copyright holder and contributors may be used to endorse or
#      promote products derived from this software without specific
#      prior written permission.
#
# ALTERNATIVELY, provided that this notice is retained in full, this
# product may be distributed under the terms of the GNU General Public
# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
# those given above.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *** This file is auto-generated ***
#
*/
.text

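/*
 * gcm_gmult_4bit: one GHASH multiplication, Xi <- Xi * H, using the
 * 4-bit table-driven method and the L$rem_4bit reduction constants.
 * Arguments (SysV AMD64 ABI): %rdi = Xi, the 16-byte hash value;
 * %rsi = Htable, the 256-byte table of multiples of H.  Argument
 * names follow the OpenSSL gcm128 convention from which this
 * CRYPTOGAMS module is generated.
 */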
.globl  _gcm_gmult_4bit

.p2align        4
_gcm_gmult_4bit:
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
L$gmult_prologue:

        movzbq  15(%rdi),%r8
        leaq    L$rem_4bit(%rip),%r11
        xorq    %rax,%rax
        xorq    %rbx,%rbx
        movb    %r8b,%al
        movb    %r8b,%bl
        shlb    $4,%al
        movq    $14,%rcx
        movq    8(%rsi,%rax,1),%r8
        movq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        movq    %r8,%rdx
        jmp     L$oop1

.p2align        4
L$oop1:
        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        movb    (%rdi,%rcx,1),%al
        shrq    $4,%r9
        xorq    8(%rsi,%rbx,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rbx,1),%r9
        movb    %al,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        shlb    $4,%al
        xorq    %r10,%r8
        decq    %rcx
        js      L$break1

        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rax,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        xorq    %r10,%r8
        jmp     L$oop1

.p2align        4
L$break1:
        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rax,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        xorq    %r10,%r8

        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rbx,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rbx,1),%r9
        xorq    %r10,%r8
        xorq    (%r11,%rdx,8),%r9

        bswapq  %r8
        bswapq  %r9
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)

        movq    16(%rsp),%rbx
        leaq    24(%rsp),%rsp
L$gmult_epilogue:
        .byte   0xf3,0xc3

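/*
 * gcm_ghash_4bit: GHASH over a whole buffer.  %rdi = Xi, %rsi = Htable,
 * %rdx = input pointer, %rcx = length in bytes (a multiple of 16).
 * The prologue builds a rearranged, pre-shifted copy of Htable on the
 * stack so that the outer loop can consume the input a byte at a time
 * with the L$rem_8bit reduction table.
 */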
.globl  _gcm_ghash_4bit

.p2align        4
_gcm_ghash_4bit:
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        subq    $280,%rsp
L$ghash_prologue:
        movq    %rdx,%r14
        movq    %rcx,%r15
        subq    $-128,%rsi
        leaq    16+128(%rsp),%rbp
        xorl    %edx,%edx
        movq    0+0-128(%rsi),%r8
        movq    0+8-128(%rsi),%rax
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    16+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    16+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,0(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,0(%rbp)
        movq    32+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,0-128(%rbp)
        movq    32+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,1(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,8(%rbp)
        movq    48+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,8-128(%rbp)
        movq    48+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,2(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,16(%rbp)
        movq    64+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,16-128(%rbp)
        movq    64+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,3(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,24(%rbp)
        movq    80+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,24-128(%rbp)
        movq    80+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,4(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,32(%rbp)
        movq    96+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,32-128(%rbp)
        movq    96+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,5(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,40(%rbp)
        movq    112+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,40-128(%rbp)
        movq    112+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,6(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,48(%rbp)
        movq    128+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,48-128(%rbp)
        movq    128+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,7(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,56(%rbp)
        movq    144+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,56-128(%rbp)
        movq    144+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,8(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,64(%rbp)
        movq    160+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,64-128(%rbp)
        movq    160+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,9(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,72(%rbp)
        movq    176+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,72-128(%rbp)
        movq    176+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,10(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,80(%rbp)
        movq    192+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,80-128(%rbp)
        movq    192+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,11(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,88(%rbp)
        movq    208+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,88-128(%rbp)
        movq    208+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,12(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,96(%rbp)
        movq    224+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,96-128(%rbp)
        movq    224+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,13(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,104(%rbp)
        movq    240+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,104-128(%rbp)
        movq    240+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,14(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,112(%rbp)
        shlb    $4,%dl
        movq    %rax,112-128(%rbp)
        shlq    $60,%r10
        movb    %dl,15(%rsp)
        orq     %r10,%rbx
        movq    %r9,120(%rbp)
        movq    %rbx,120-128(%rbp)
        addq    $-128,%rsi
        movq    8(%rdi),%r8
        movq    0(%rdi),%r9
        addq    %r14,%r15
        leaq    L$rem_8bit(%rip),%r11
        jmp     L$outer_loop
.p2align        4
L$outer_loop:
        xorq    (%r14),%r9
        movq    8(%r14),%rdx
        leaq    16(%r14),%r14
        xorq    %r8,%rdx
        movq    %r9,(%rdi)
        movq    %rdx,8(%rdi)
        shrq    $32,%rdx
        xorq    %rax,%rax
        roll    $8,%edx
        movb    %dl,%al
        movzbl  %dl,%ebx
        shlb    $4,%al
        shrl    $4,%ebx
        roll    $8,%edx
        movq    8(%rsi,%rax,1),%r8
        movq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        xorq    %r8,%r12
        movq    %r9,%r10
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    8(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    4(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    0(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        andl    $240,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    -4(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        movzwq  (%r11,%r12,2),%r12
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        shlq    $48,%r12
        xorq    %r10,%r8
        xorq    %r12,%r9
        movzbq  %r8b,%r13
        shrq    $4,%r8
        movq    %r9,%r10
        shlb    $4,%r13b
        shrq    $4,%r9
        xorq    8(%rsi,%rcx,1),%r8
        movzwq  (%r11,%r13,2),%r13
        shlq    $60,%r10
        xorq    (%rsi,%rcx,1),%r9
        xorq    %r10,%r8
        shlq    $48,%r13
        bswapq  %r8
        xorq    %r13,%r9
        bswapq  %r9
        cmpq    %r15,%r14
        jb      L$outer_loop
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)

        leaq    280(%rsp),%rsi
        movq    0(%rsi),%r15
        movq    8(%rsi),%r14
        movq    16(%rsi),%r13
        movq    24(%rsi),%r12
        movq    32(%rsi),%rbp
        movq    40(%rsi),%rbx
        leaq    48(%rsi),%rsp
L$ghash_epilogue:
        .byte   0xf3,0xc3

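/*
 * gcm_init_clmul: precompute the table for the PCLMULQDQ-based paths.
 * %rdi = Htable (output), %rsi = H.  The input H is preprocessed
 * (64-bit halves swapped, then doubled modulo the polynomial at
 * L$0x1c2_polynomial), after which H^2, H^3 and H^4 are computed;
 * the Karatsuba pre-XORed halves are stored alongside.  The .byte
 * sequences in this and the following routines are hand-encoded SSE
 * instructions for the benefit of older assemblers: 102,15,58,68,...
 * is pclmulqdq and 102,15,56,0,... is pshufb (possibly with REX
 * prefixes mixed in); .byte 0xf3,0xc3 is "rep ret".
 */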
.globl  _gcm_init_clmul

.p2align        4
_gcm_init_clmul:
L$_init_clmul:
        movdqu  (%rsi),%xmm2
        pshufd  $78,%xmm2,%xmm2


        pshufd  $255,%xmm2,%xmm4
        movdqa  %xmm2,%xmm3
        psllq   $1,%xmm2
        pxor    %xmm5,%xmm5
        psrlq   $63,%xmm3
        pcmpgtd %xmm4,%xmm5
        pslldq  $8,%xmm3
        por     %xmm3,%xmm2


        pand    L$0x1c2_polynomial(%rip),%xmm5
        pxor    %xmm5,%xmm2


        pshufd  $78,%xmm2,%xmm6
        movdqa  %xmm2,%xmm0
        pxor    %xmm2,%xmm6
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,222,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        pshufd  $78,%xmm2,%xmm3
        pshufd  $78,%xmm0,%xmm4
        pxor    %xmm2,%xmm3
        movdqu  %xmm2,0(%rdi)
        pxor    %xmm0,%xmm4
        movdqu  %xmm0,16(%rdi)
.byte   102,15,58,15,227,8
        movdqu  %xmm4,32(%rdi)
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,222,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        movdqa  %xmm0,%xmm5
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,222,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        pshufd  $78,%xmm5,%xmm3
        pshufd  $78,%xmm0,%xmm4
        pxor    %xmm5,%xmm3
        movdqu  %xmm5,48(%rdi)
        pxor    %xmm0,%xmm4
        movdqu  %xmm0,64(%rdi)
.byte   102,15,58,15,227,8
        movdqu  %xmm4,80(%rdi)
        .byte   0xf3,0xc3

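/*
 * gcm_gmult_clmul: one GHASH multiplication via carry-less multiply.
 * %rdi = Xi, %rsi = Htable.  Three pclmulqdq (Karatsuba) produce the
 * 256-bit product, which the shift-and-XOR sequence below reduces
 * modulo the GHASH polynomial.
 */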
.globl  _gcm_gmult_clmul

.p2align        4
_gcm_gmult_clmul:
L$_gmult_clmul:
        movdqu  (%rdi),%xmm0
        movdqa  L$bswap_mask(%rip),%xmm5
        movdqu  (%rsi),%xmm2
        movdqu  32(%rsi),%xmm4
.byte   102,15,56,0,197
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,220,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
.byte   102,15,56,0,197
        movdqu  %xmm0,(%rdi)
        .byte   0xf3,0xc3

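/*
 * gcm_ghash_clmul: PCLMULQDQ-based GHASH over a whole buffer.
 * %rdi = Xi, %rsi = Htable, %rdx = input pointer, %rcx = length.
 * While at least four blocks remain they are processed together
 * against H^4..H (L$mod4_loop); shorter inputs and tails fall
 * through to the two-block (L$mod_loop/L$even_tail) and one-block
 * (L$odd_tail) paths.
 */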
.globl  _gcm_ghash_clmul

.p2align        5
_gcm_ghash_clmul:
L$_ghash_clmul:
        movdqa  L$bswap_mask(%rip),%xmm5
        movq    $11547335547999543296,%rax

        movdqu  (%rdi),%xmm0
        movdqu  (%rsi),%xmm2
        movdqu  32(%rsi),%xmm10
.byte   102,15,56,0,197

        subq    $16,%rcx
        jz      L$odd_tail

        movdqu  16(%rsi),%xmm9
        cmpq    $48,%rcx
        jb      L$skip4x

        subq    $48,%rcx
        movdqu  48(%rsi),%xmm14
        movdqu  64(%rsi),%xmm15




        movdqu  48(%rdx),%xmm6
        movdqu  32(%rdx),%xmm11
.byte   102,15,56,0,245
.byte   102,68,15,56,0,221
        movdqa  %xmm6,%xmm8
        pshufd  $78,%xmm6,%xmm7
        pxor    %xmm6,%xmm7
.byte   102,15,58,68,242,0
.byte   102,68,15,58,68,194,17
.byte   102,65,15,58,68,250,0

        movdqa  %xmm11,%xmm13
        pshufd  $78,%xmm11,%xmm12
        pxor    %xmm11,%xmm12
.byte   102,69,15,58,68,217,0
.byte   102,69,15,58,68,233,17
        xorps   %xmm11,%xmm6
.byte   102,69,15,58,68,226,16
        xorps   %xmm13,%xmm8
        movups  80(%rsi),%xmm10
        xorps   %xmm12,%xmm7

        movdqu  16(%rdx),%xmm11
        movdqu  0(%rdx),%xmm3
.byte   102,68,15,56,0,221
.byte   102,15,56,0,221
        movdqa  %xmm11,%xmm13
        pshufd  $78,%xmm11,%xmm12
        pxor    %xmm3,%xmm0
        pxor    %xmm11,%xmm12
.byte   102,69,15,58,68,222,0
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,69,15,58,68,238,17
        xorps   %xmm11,%xmm6
.byte   102,69,15,58,68,226,0
        xorps   %xmm13,%xmm8

        leaq    64(%rdx),%rdx
        subq    $64,%rcx
        jc      L$tail4x

        jmp     L$mod4_loop
.p2align        5
L$mod4_loop:
.byte   102,65,15,58,68,199,0
        xorps   %xmm12,%xmm7
        movdqu  48(%rdx),%xmm11
.byte   102,68,15,56,0,221
.byte   102,65,15,58,68,207,17
        xorps   %xmm6,%xmm0
        movdqu  32(%rdx),%xmm6
        movdqa  %xmm11,%xmm13
        pshufd  $78,%xmm11,%xmm12
.byte   102,65,15,58,68,218,16
        xorps   %xmm8,%xmm1
        pxor    %xmm11,%xmm12
.byte   102,15,56,0,245
        movups  32(%rsi),%xmm10
.byte   102,68,15,58,68,218,0
        xorps   %xmm7,%xmm3
        movdqa  %xmm6,%xmm8
        pshufd  $78,%xmm6,%xmm7

        pxor    %xmm0,%xmm3
        pxor    %xmm6,%xmm7
        pxor    %xmm1,%xmm3
        movdqa  %xmm3,%xmm4
        pslldq  $8,%xmm3
.byte   102,68,15,58,68,234,17
        psrldq  $8,%xmm4
        pxor    %xmm3,%xmm0
        movdqa  L$7_mask(%rip),%xmm3
        pxor    %xmm4,%xmm1
.byte   102,72,15,110,224

        pand    %xmm0,%xmm3
.byte   102,15,56,0,227
.byte   102,69,15,58,68,226,0
        pxor    %xmm0,%xmm4
        psllq   $57,%xmm4
        movdqa  %xmm4,%xmm3
        pslldq  $8,%xmm4
.byte   102,65,15,58,68,241,0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1
        movdqu  0(%rdx),%xmm3

        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
.byte   102,69,15,58,68,193,17
        xorps   %xmm11,%xmm6
        movdqu  16(%rdx),%xmm11
.byte   102,68,15,56,0,221
.byte   102,65,15,58,68,250,16
        xorps   %xmm13,%xmm8
        movups  80(%rsi),%xmm10
.byte   102,15,56,0,221
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0

        movdqa  %xmm11,%xmm13
        pxor    %xmm12,%xmm7
        pshufd  $78,%xmm11,%xmm12
        pxor    %xmm11,%xmm12
.byte   102,69,15,58,68,222,0
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1
        psrlq   $1,%xmm0
.byte   102,69,15,58,68,238,17
        xorps   %xmm11,%xmm6
        pxor    %xmm1,%xmm0

.byte   102,69,15,58,68,226,0
        xorps   %xmm13,%xmm8

        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3

        leaq    64(%rdx),%rdx
        subq    $64,%rcx
        jnc     L$mod4_loop

L$tail4x:
.byte   102,65,15,58,68,199,0
        xorps   %xmm12,%xmm7
.byte   102,65,15,58,68,207,17
        xorps   %xmm6,%xmm0
.byte   102,65,15,58,68,218,16
        xorps   %xmm8,%xmm1
        pxor    %xmm0,%xmm1
        pxor    %xmm7,%xmm3

        pxor    %xmm1,%xmm3
        pxor    %xmm0,%xmm1

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        addq    $64,%rcx
        jz      L$done
        movdqu  32(%rsi),%xmm10
        subq    $16,%rcx
        jz      L$odd_tail
L$skip4x:





        movdqu  (%rdx),%xmm3
        movdqu  16(%rdx),%xmm6
.byte   102,15,56,0,221
.byte   102,15,56,0,245
        pxor    %xmm3,%xmm0

        movdqa  %xmm6,%xmm8
        pshufd  $78,%xmm6,%xmm3
        pxor    %xmm6,%xmm3
.byte   102,15,58,68,242,0
.byte   102,68,15,58,68,194,17
.byte   102,65,15,58,68,218,0

        leaq    32(%rdx),%rdx
        subq    $32,%rcx
        jbe     L$even_tail
        jmp     L$mod_loop

.p2align        5
L$mod_loop:
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm4
        pxor    %xmm0,%xmm4

.byte   102,65,15,58,68,193,0
.byte   102,65,15,58,68,201,17
.byte   102,65,15,58,68,226,16

        pxor    %xmm6,%xmm0
        pxor    %xmm8,%xmm1
        movdqu  (%rdx),%xmm8
.byte   102,68,15,56,0,197
        movdqu  16(%rdx),%xmm6

        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3
        pxor    %xmm8,%xmm1
        pxor    %xmm3,%xmm4
.byte   102,15,56,0,245
        movdqa  %xmm4,%xmm3
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm6,%xmm8

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
.byte   102,15,58,68,242,0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1
        pshufd  $78,%xmm8,%xmm3
        pxor    %xmm8,%xmm3

.byte   102,68,15,58,68,194,17
        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
.byte   102,65,15,58,68,218,0
        pxor    %xmm1,%xmm0

        leaq    32(%rdx),%rdx
        subq    $32,%rcx
        ja      L$mod_loop

L$even_tail:
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm4
        pxor    %xmm0,%xmm4

.byte   102,65,15,58,68,193,0
.byte   102,65,15,58,68,201,17
.byte   102,65,15,58,68,226,16

        pxor    %xmm6,%xmm0
        pxor    %xmm8,%xmm1
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3
        pxor    %xmm3,%xmm4
        movdqa  %xmm4,%xmm3
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        testq   %rcx,%rcx
        jnz     L$done

L$odd_tail:
        movdqu  (%rdx),%xmm3
.byte   102,15,56,0,221
        pxor    %xmm3,%xmm0
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,65,15,58,68,218,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
L$done:
.byte   102,15,56,0,197
        movdqu  %xmm0,(%rdi)
        .byte   0xf3,0xc3

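/*
 * In this build the AVX entry points simply alias the CLMUL
 * implementations above.
 */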
.globl  _gcm_init_avx

.p2align        5
_gcm_init_avx:
        jmp     L$_init_clmul

.globl  _gcm_gmult_avx

.p2align        5
_gcm_gmult_avx:
        jmp     L$_gmult_clmul

.globl  _gcm_ghash_avx

.p2align        5
_gcm_ghash_avx:
        jmp     L$_ghash_clmul

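/*
 * Constants: L$bswap_mask is a pshufb mask that byte-reverses a
 * 128-bit value; L$0x1c2_polynomial encodes the GHASH reduction
 * polynomial; L$7_mask and L$7_mask_poly are auxiliary masks for
 * the deferred reduction in L$mod4_loop.
 */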
.p2align        6
L$bswap_mask:
.byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
L$7_mask:
.long   7,0,7,0
L$7_mask_poly:
.long   7,0,450,0
.p2align        6

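/*
 * Reduction tables for the table-driven routines: L$rem_4bit serves
 * gcm_gmult_4bit, L$rem_8bit serves gcm_ghash_4bit.
 */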
L$rem_4bit:
.long   0,0,0,471859200,0,943718400,0,610271232
.long   0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long   0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long   0,2441084928,0,2376073216,0,2847932416,0,3051356160

L$rem_8bit:
.value  0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value  0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value  0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value  0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value  0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value  0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value  0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value  0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value  0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value  0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value  0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value  0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value  0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value  0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value  0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value  0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value  0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value  0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value  0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value  0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value  0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value  0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value  0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value  0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value  0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value  0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value  0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value  0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value  0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value  0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value  0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value  0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

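/* ASCII: "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>" */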
.byte   71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align        6