# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#     * Redistributions of source code must retain copyright notices,
#      this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above
#      copyright notice, this list of conditions and the following
#      disclaimer in the documentation and/or other materials
#      provided with the distribution.
#
#     * Neither the name of the Andy Polyakov nor the names of its
#      copyright holder and contributors may be used to endorse or
#      promote products derived from this software without specific
#      prior written permission.
#
# ALTERNATIVELY, provided that this notice is retained in full, this
# product may be distributed under the terms of the GNU General Public
# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
# those given above.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *** This file is auto-generated ***
#
.text

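# gcm_gmult_4bit(Xi, Htable): multiply the 16-byte hash value Xi by H
# in GF(2^128) with the 4-bit table method; Htable holds 16 precomputed
# multiples of H and .Lrem_4bit folds the four bits shifted out at each
# step back into the result.  The stores to 8(%rsp)/16(%rsp) and the
# loads from %rcx/%rdx translate the Win64 calling convention into the
# SysV register layout (%rdi/%rsi) that the body was written for.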
.globl  gcm_gmult_4bit
.def    gcm_gmult_4bit; .scl 2; .type 32;       .endef
.p2align        4
gcm_gmult_4bit:
        movq    %rdi,8(%rsp)
        movq    %rsi,16(%rsp)
        movq    %rsp,%rax
.LSEH_begin_gcm_gmult_4bit:
        movq    %rcx,%rdi
        movq    %rdx,%rsi

        pushq   %rbx
        pushq   %rbp
        pushq   %r12
.Lgmult_prologue:

        movzbq  15(%rdi),%r8
        leaq    .Lrem_4bit(%rip),%r11
        xorq    %rax,%rax
        xorq    %rbx,%rbx
        movb    %r8b,%al
        movb    %r8b,%bl
        shlb    $4,%al
        movq    $14,%rcx
        movq    8(%rsi,%rax,1),%r8
        movq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        movq    %r8,%rdx
        jmp     .Loop1

.p2align        4
.Loop1:
        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        movb    (%rdi,%rcx,1),%al
        shrq    $4,%r9
        xorq    8(%rsi,%rbx,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rbx,1),%r9
        movb    %al,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        shlb    $4,%al
        xorq    %r10,%r8
        decq    %rcx
        js      .Lbreak1

        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rax,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        xorq    %r10,%r8
        jmp     .Loop1

.p2align        4
.Lbreak1:
        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rax,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rax,1),%r9
        andb    $240,%bl
        xorq    (%r11,%rdx,8),%r9
        movq    %r8,%rdx
        xorq    %r10,%r8

        shrq    $4,%r8
        andq    $15,%rdx
        movq    %r9,%r10
        shrq    $4,%r9
        xorq    8(%rsi,%rbx,1),%r8
        shlq    $60,%r10
        xorq    (%rsi,%rbx,1),%r9
        xorq    %r10,%r8
        xorq    (%r11,%rdx,8),%r9

        bswapq  %r8
        bswapq  %r9
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)

        movq    16(%rsp),%rbx
        leaq    24(%rsp),%rsp
.Lgmult_epilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
.LSEH_end_gcm_gmult_4bit:
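# gcm_ghash_4bit(Xi, Htable, inp, len): hash a whole buffer, computing
# Xi = (Xi ^ block) * H for each 16-byte block of inp.  The long
# prologue below copies Htable to the stack in a pre-shifted layout
# (256 bytes at %rbp plus a 16-byte nibble table at %rsp) so that
# .Louter_loop can process input a byte at a time against the wider
# .Lrem_8bit reduction table.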
.globl  gcm_ghash_4bit
.def    gcm_ghash_4bit; .scl 2; .type 32;       .endef
.p2align        4
gcm_ghash_4bit:
        movq    %rdi,8(%rsp)
        movq    %rsi,16(%rsp)
        movq    %rsp,%rax
.LSEH_begin_gcm_ghash_4bit:
        movq    %rcx,%rdi
        movq    %rdx,%rsi
        movq    %r8,%rdx
        movq    %r9,%rcx

        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        subq    $280,%rsp
.Lghash_prologue:
        movq    %rdx,%r14
        movq    %rcx,%r15
        subq    $-128,%rsi
        leaq    16+128(%rsp),%rbp
        xorl    %edx,%edx
        movq    0+0-128(%rsi),%r8
        movq    0+8-128(%rsi),%rax
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    16+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    16+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,0(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,0(%rbp)
        movq    32+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,0-128(%rbp)
        movq    32+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,1(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,8(%rbp)
        movq    48+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,8-128(%rbp)
        movq    48+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,2(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,16(%rbp)
        movq    64+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,16-128(%rbp)
        movq    64+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,3(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,24(%rbp)
        movq    80+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,24-128(%rbp)
        movq    80+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,4(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,32(%rbp)
        movq    96+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,32-128(%rbp)
        movq    96+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,5(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,40(%rbp)
        movq    112+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,40-128(%rbp)
        movq    112+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,6(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,48(%rbp)
        movq    128+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,48-128(%rbp)
        movq    128+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,7(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,56(%rbp)
        movq    144+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,56-128(%rbp)
        movq    144+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,8(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,64(%rbp)
        movq    160+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,64-128(%rbp)
        movq    160+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,9(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,72(%rbp)
        movq    176+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,72-128(%rbp)
        movq    176+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,10(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,80(%rbp)
        movq    192+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,80-128(%rbp)
        movq    192+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,11(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,88(%rbp)
        movq    208+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,88-128(%rbp)
        movq    208+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,12(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,96(%rbp)
        movq    224+0-128(%rsi),%r8
        shlb    $4,%dl
        movq    %rax,96-128(%rbp)
        movq    224+8-128(%rsi),%rax
        shlq    $60,%r10
        movb    %dl,13(%rsp)
        orq     %r10,%rbx
        movb    %al,%dl
        shrq    $4,%rax
        movq    %r8,%r10
        shrq    $4,%r8
        movq    %r9,104(%rbp)
        movq    240+0-128(%rsi),%r9
        shlb    $4,%dl
        movq    %rbx,104-128(%rbp)
        movq    240+8-128(%rsi),%rbx
        shlq    $60,%r10
        movb    %dl,14(%rsp)
        orq     %r10,%rax
        movb    %bl,%dl
        shrq    $4,%rbx
        movq    %r9,%r10
        shrq    $4,%r9
        movq    %r8,112(%rbp)
        shlb    $4,%dl
        movq    %rax,112-128(%rbp)
        shlq    $60,%r10
        movb    %dl,15(%rsp)
        orq     %r10,%rbx
        movq    %r9,120(%rbp)
        movq    %rbx,120-128(%rbp)
        addq    $-128,%rsi
        movq    8(%rdi),%r8
        movq    0(%rdi),%r9
        addq    %r14,%r15
        leaq    .Lrem_8bit(%rip),%r11
        jmp     .Louter_loop
.p2align        4
.Louter_loop:
        xorq    (%r14),%r9
        movq    8(%r14),%rdx
        leaq    16(%r14),%r14
        xorq    %r8,%rdx
        movq    %r9,(%rdi)
        movq    %rdx,8(%rdi)
        shrq    $32,%rdx
        xorq    %rax,%rax
        roll    $8,%edx
        movb    %dl,%al
        movzbl  %dl,%ebx
        shlb    $4,%al
        shrl    $4,%ebx
        roll    $8,%edx
        movq    8(%rsi,%rax,1),%r8
        movq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        xorq    %r8,%r12
        movq    %r9,%r10
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    8(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    4(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    0(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        shrl    $4,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r12,2),%r12
        movzbl  %dl,%ebx
        shlb    $4,%al
        movzbq  (%rsp,%rcx,1),%r13
        shrl    $4,%ebx
        shlq    $48,%r12
        xorq    %r8,%r13
        movq    %r9,%r10
        xorq    %r12,%r9
        shrq    $8,%r8
        movzbq  %r13b,%r13
        shrq    $8,%r9
        xorq    -128(%rbp,%rcx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rcx,8),%r9
        roll    $8,%edx
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        movb    %dl,%al
        xorq    %r10,%r8
        movzwq  (%r11,%r13,2),%r13
        movzbl  %dl,%ecx
        shlb    $4,%al
        movzbq  (%rsp,%rbx,1),%r12
        andl    $240,%ecx
        shlq    $48,%r13
        xorq    %r8,%r12
        movq    %r9,%r10
        xorq    %r13,%r9
        shrq    $8,%r8
        movzbq  %r12b,%r12
        movl    -4(%rdi),%edx
        shrq    $8,%r9
        xorq    -128(%rbp,%rbx,8),%r8
        shlq    $56,%r10
        xorq    (%rbp,%rbx,8),%r9
        movzwq  (%r11,%r12,2),%r12
        xorq    8(%rsi,%rax,1),%r8
        xorq    (%rsi,%rax,1),%r9
        shlq    $48,%r12
        xorq    %r10,%r8
        xorq    %r12,%r9
        movzbq  %r8b,%r13
        shrq    $4,%r8
        movq    %r9,%r10
        shlb    $4,%r13b
        shrq    $4,%r9
        xorq    8(%rsi,%rcx,1),%r8
        movzwq  (%r11,%r13,2),%r13
        shlq    $60,%r10
        xorq    (%rsi,%rcx,1),%r9
        xorq    %r10,%r8
        shlq    $48,%r13
        bswapq  %r8
        xorq    %r13,%r9
        bswapq  %r9
        cmpq    %r15,%r14
        jb      .Louter_loop
        movq    %r8,8(%rdi)
        movq    %r9,(%rdi)

        leaq    280(%rsp),%rsi
        movq    0(%rsi),%r15
        movq    8(%rsi),%r14
        movq    16(%rsi),%r13
        movq    24(%rsi),%r12
        movq    32(%rsi),%rbp
        movq    40(%rsi),%rbx
        leaq    48(%rsi),%rsp
.Lghash_epilogue:
        movq    8(%rsp),%rdi
        movq    16(%rsp),%rsi
        .byte   0xf3,0xc3
.LSEH_end_gcm_ghash_4bit:
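# gcm_init_clmul(Htable, Xi): PCLMULQDQ key setup.  H is loaded, its
# halves swapped (pshufd $78), and multiplied by x modulo the GHASH
# polynomial (the pcmpgtd/pand .L0x1c2_polynomial pair propagates the
# bit carried out at the top); repeated multiplications then fill
# Htable with H, H^2, H^3 and H^4, plus pre-XORed Karatsuba halves at
# offsets 32 and 80.  The .byte runs in this file are hand-encoded
# SSSE3/PCLMULQDQ instructions (102,15,58,68,... is pclmulqdq), kept
# as raw bytes so the file assembles with older toolchains; the two
# directly below are sub $0x18,%rsp and movaps %xmm6,(%rsp), saving
# the xmm6 register that is non-volatile on Win64.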
.globl  gcm_init_clmul
.def    gcm_init_clmul; .scl 2; .type 32;       .endef
.p2align        4
gcm_init_clmul:
.L_init_clmul:
.LSEH_begin_gcm_init_clmul:

.byte   0x48,0x83,0xec,0x18
.byte   0x0f,0x29,0x34,0x24
        movdqu  (%rdx),%xmm2
        pshufd  $78,%xmm2,%xmm2


        pshufd  $255,%xmm2,%xmm4
        movdqa  %xmm2,%xmm3
        psllq   $1,%xmm2
        pxor    %xmm5,%xmm5
        psrlq   $63,%xmm3
        pcmpgtd %xmm4,%xmm5
        pslldq  $8,%xmm3
        por     %xmm3,%xmm2


        pand    .L0x1c2_polynomial(%rip),%xmm5
        pxor    %xmm5,%xmm2


        pshufd  $78,%xmm2,%xmm6
        movdqa  %xmm2,%xmm0
        pxor    %xmm2,%xmm6
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,222,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        pshufd  $78,%xmm2,%xmm3
        pshufd  $78,%xmm0,%xmm4
        pxor    %xmm2,%xmm3
        movdqu  %xmm2,0(%rcx)
        pxor    %xmm0,%xmm4
        movdqu  %xmm0,16(%rcx)
.byte   102,15,58,15,227,8
        movdqu  %xmm4,32(%rcx)
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,222,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        movdqa  %xmm0,%xmm5
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,222,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        pshufd  $78,%xmm5,%xmm3
        pshufd  $78,%xmm0,%xmm4
        pxor    %xmm5,%xmm3
        movdqu  %xmm5,48(%rcx)
        pxor    %xmm0,%xmm4
        movdqu  %xmm0,64(%rcx)
.byte   102,15,58,15,227,8
        movdqu  %xmm4,80(%rcx)
        movaps  (%rsp),%xmm6
        leaq    24(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
        .byte   0xf3,0xc3

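# gcm_gmult_clmul(Xi, Htable): single-block Xi = Xi * H.  One Karatsuba
# carry-less multiply (the three pclmulqdq .byte sequences) followed by
# the two-phase reduction modulo x^128 + x^7 + x^2 + x + 1; the pshufb
# through .Lbswap_mask (.byte 102,15,56,0,197) converts Xi to and from
# bit-reflected form on entry and exit.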
.globl  gcm_gmult_clmul
.def    gcm_gmult_clmul;        .scl 2; .type 32;       .endef
.p2align        4
gcm_gmult_clmul:
.L_gmult_clmul:
        movdqu  (%rcx),%xmm0
        movdqa  .Lbswap_mask(%rip),%xmm5
        movdqu  (%rdx),%xmm2
        movdqu  32(%rdx),%xmm4
.byte   102,15,56,0,197
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,15,58,68,220,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
.byte   102,15,56,0,197
        movdqu  %xmm0,(%rcx)
        .byte   0xf3,0xc3

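# gcm_ghash_clmul(Xi, Htable, inp, len): bulk GHASH.  Input blocks are
# taken four at a time in .Lmod4_loop and multiplied against the
# H^1..H^4 powers prepared by gcm_init_clmul, so a single reduction
# covers four blocks; .Lskip4x/.Lmod_loop handle remaining pairs and
# .Lodd_tail the final block.  The .byte prologue below saves the
# Win64 non-volatile registers xmm6-xmm15, restored by the movaps
# sequence at .Ldone.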
.globl  gcm_ghash_clmul
.def    gcm_ghash_clmul;        .scl 2; .type 32;       .endef
.p2align        5
gcm_ghash_clmul:
.L_ghash_clmul:
        leaq    -136(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:

.byte   0x48,0x8d,0x60,0xe0
.byte   0x0f,0x29,0x70,0xe0
.byte   0x0f,0x29,0x78,0xf0
.byte   0x44,0x0f,0x29,0x00
.byte   0x44,0x0f,0x29,0x48,0x10
.byte   0x44,0x0f,0x29,0x50,0x20
.byte   0x44,0x0f,0x29,0x58,0x30
.byte   0x44,0x0f,0x29,0x60,0x40
.byte   0x44,0x0f,0x29,0x68,0x50
.byte   0x44,0x0f,0x29,0x70,0x60
.byte   0x44,0x0f,0x29,0x78,0x70
        movdqa  .Lbswap_mask(%rip),%xmm5
        movq    $11547335547999543296,%rax

        movdqu  (%rcx),%xmm0
        movdqu  (%rdx),%xmm2
        movdqu  32(%rdx),%xmm10
.byte   102,15,56,0,197

        subq    $16,%r9
        jz      .Lodd_tail

        movdqu  16(%rdx),%xmm9
        cmpq    $48,%r9
        jb      .Lskip4x

        subq    $48,%r9
        movdqu  48(%rdx),%xmm14
        movdqu  64(%rdx),%xmm15




        movdqu  48(%r8),%xmm6
        movdqu  32(%r8),%xmm11
.byte   102,15,56,0,245
.byte   102,68,15,56,0,221
        movdqa  %xmm6,%xmm8
        pshufd  $78,%xmm6,%xmm7
        pxor    %xmm6,%xmm7
.byte   102,15,58,68,242,0
.byte   102,68,15,58,68,194,17
.byte   102,65,15,58,68,250,0

        movdqa  %xmm11,%xmm13
        pshufd  $78,%xmm11,%xmm12
        pxor    %xmm11,%xmm12
.byte   102,69,15,58,68,217,0
.byte   102,69,15,58,68,233,17
        xorps   %xmm11,%xmm6
.byte   102,69,15,58,68,226,16
        xorps   %xmm13,%xmm8
        movups  80(%rdx),%xmm10
        xorps   %xmm12,%xmm7

        movdqu  16(%r8),%xmm11
        movdqu  0(%r8),%xmm3
.byte   102,68,15,56,0,221
.byte   102,15,56,0,221
        movdqa  %xmm11,%xmm13
        pshufd  $78,%xmm11,%xmm12
        pxor    %xmm3,%xmm0
        pxor    %xmm11,%xmm12
.byte   102,69,15,58,68,222,0
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,69,15,58,68,238,17
        xorps   %xmm11,%xmm6
.byte   102,69,15,58,68,226,0
        xorps   %xmm13,%xmm8

        leaq    64(%r8),%r8
        subq    $64,%r9
        jc      .Ltail4x

        jmp     .Lmod4_loop
.p2align        5
.Lmod4_loop:
.byte   102,65,15,58,68,199,0
        xorps   %xmm12,%xmm7
        movdqu  48(%r8),%xmm11
.byte   102,68,15,56,0,221
.byte   102,65,15,58,68,207,17
        xorps   %xmm6,%xmm0
        movdqu  32(%r8),%xmm6
        movdqa  %xmm11,%xmm13
        pshufd  $78,%xmm11,%xmm12
.byte   102,65,15,58,68,218,16
        xorps   %xmm8,%xmm1
        pxor    %xmm11,%xmm12
.byte   102,15,56,0,245
        movups  32(%rdx),%xmm10
.byte   102,68,15,58,68,218,0
        xorps   %xmm7,%xmm3
        movdqa  %xmm6,%xmm8
        pshufd  $78,%xmm6,%xmm7

        pxor    %xmm0,%xmm3
        pxor    %xmm6,%xmm7
        pxor    %xmm1,%xmm3
        movdqa  %xmm3,%xmm4
        pslldq  $8,%xmm3
.byte   102,68,15,58,68,234,17
        psrldq  $8,%xmm4
        pxor    %xmm3,%xmm0
        movdqa  .L7_mask(%rip),%xmm3
        pxor    %xmm4,%xmm1
.byte   102,72,15,110,224

        pand    %xmm0,%xmm3
.byte   102,15,56,0,227
.byte   102,69,15,58,68,226,0
        pxor    %xmm0,%xmm4
        psllq   $57,%xmm4
        movdqa  %xmm4,%xmm3
        pslldq  $8,%xmm4
.byte   102,65,15,58,68,241,0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1
        movdqu  0(%r8),%xmm3

        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
.byte   102,69,15,58,68,193,17
        xorps   %xmm11,%xmm6
        movdqu  16(%r8),%xmm11
.byte   102,68,15,56,0,221
.byte   102,65,15,58,68,250,16
        xorps   %xmm13,%xmm8
        movups  80(%rdx),%xmm10
.byte   102,15,56,0,221
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0

        movdqa  %xmm11,%xmm13
        pxor    %xmm12,%xmm7
        pshufd  $78,%xmm11,%xmm12
        pxor    %xmm11,%xmm12
.byte   102,69,15,58,68,222,0
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1
        psrlq   $1,%xmm0
.byte   102,69,15,58,68,238,17
        xorps   %xmm11,%xmm6
        pxor    %xmm1,%xmm0

.byte   102,69,15,58,68,226,0
        xorps   %xmm13,%xmm8

        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3

        leaq    64(%r8),%r8
        subq    $64,%r9
        jnc     .Lmod4_loop

.Ltail4x:
.byte   102,65,15,58,68,199,0
        xorps   %xmm12,%xmm7
.byte   102,65,15,58,68,207,17
        xorps   %xmm6,%xmm0
.byte   102,65,15,58,68,218,16
        xorps   %xmm8,%xmm1
        pxor    %xmm0,%xmm1
        pxor    %xmm7,%xmm3

        pxor    %xmm1,%xmm3
        pxor    %xmm0,%xmm1

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        addq    $64,%r9
        jz      .Ldone
        movdqu  32(%rdx),%xmm10
        subq    $16,%r9
        jz      .Lodd_tail
.Lskip4x:





        movdqu  (%r8),%xmm3
        movdqu  16(%r8),%xmm6
.byte   102,15,56,0,221
.byte   102,15,56,0,245
        pxor    %xmm3,%xmm0

        movdqa  %xmm6,%xmm8
        pshufd  $78,%xmm6,%xmm3
        pxor    %xmm6,%xmm3
.byte   102,15,58,68,242,0
.byte   102,68,15,58,68,194,17
.byte   102,65,15,58,68,218,0

        leaq    32(%r8),%r8
        subq    $32,%r9
        jbe     .Leven_tail
        jmp     .Lmod_loop

.p2align        5
.Lmod_loop:
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm4
        pxor    %xmm0,%xmm4

.byte   102,65,15,58,68,193,0
.byte   102,65,15,58,68,201,17
.byte   102,65,15,58,68,226,16

        pxor    %xmm6,%xmm0
        pxor    %xmm8,%xmm1
        movdqu  (%r8),%xmm8
.byte   102,68,15,56,0,197
        movdqu  16(%r8),%xmm6

        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3
        pxor    %xmm8,%xmm1
        pxor    %xmm3,%xmm4
.byte   102,15,56,0,245
        movdqa  %xmm4,%xmm3
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm6,%xmm8

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
.byte   102,15,58,68,242,0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1
        pshufd  $78,%xmm8,%xmm3
        pxor    %xmm8,%xmm3

.byte   102,68,15,58,68,194,17
        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
.byte   102,65,15,58,68,218,0
        pxor    %xmm1,%xmm0

        leaq    32(%r8),%r8
        subq    $32,%r9
        ja      .Lmod_loop

.Leven_tail:
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm4
        pxor    %xmm0,%xmm4

.byte   102,65,15,58,68,193,0
.byte   102,65,15,58,68,201,17
.byte   102,65,15,58,68,226,16

        pxor    %xmm6,%xmm0
        pxor    %xmm8,%xmm1
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3
        pxor    %xmm3,%xmm4
        movdqa  %xmm4,%xmm3
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
        testq   %r9,%r9
        jnz     .Ldone

.Lodd_tail:
        movdqu  (%r8),%xmm3
.byte   102,15,56,0,221
        pxor    %xmm3,%xmm0
        movdqa  %xmm0,%xmm1
        pshufd  $78,%xmm0,%xmm3
        pxor    %xmm0,%xmm3
.byte   102,15,58,68,194,0
.byte   102,15,58,68,202,17
.byte   102,65,15,58,68,218,0
        pxor    %xmm0,%xmm3
        pxor    %xmm1,%xmm3

        movdqa  %xmm3,%xmm4
        psrldq  $8,%xmm3
        pslldq  $8,%xmm4
        pxor    %xmm3,%xmm1
        pxor    %xmm4,%xmm0

        movdqa  %xmm0,%xmm4
        movdqa  %xmm0,%xmm3
        psllq   $5,%xmm0
        pxor    %xmm0,%xmm3
        psllq   $1,%xmm0
        pxor    %xmm3,%xmm0
        psllq   $57,%xmm0
        movdqa  %xmm0,%xmm3
        pslldq  $8,%xmm0
        psrldq  $8,%xmm3
        pxor    %xmm4,%xmm0
        pxor    %xmm3,%xmm1


        movdqa  %xmm0,%xmm4
        psrlq   $1,%xmm0
        pxor    %xmm4,%xmm1
        pxor    %xmm0,%xmm4
        psrlq   $5,%xmm0
        pxor    %xmm4,%xmm0
        psrlq   $1,%xmm0
        pxor    %xmm1,%xmm0
.Ldone:
.byte   102,15,56,0,197
        movdqu  %xmm0,(%rcx)
        movaps  (%rsp),%xmm6
        movaps  16(%rsp),%xmm7
        movaps  32(%rsp),%xmm8
        movaps  48(%rsp),%xmm9
        movaps  64(%rsp),%xmm10
        movaps  80(%rsp),%xmm11
        movaps  96(%rsp),%xmm12
        movaps  112(%rsp),%xmm13
        movaps  128(%rsp),%xmm14
        movaps  144(%rsp),%xmm15
        leaq    168(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
        .byte   0xf3,0xc3

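# The *_avx entry points are plain aliases in this build: each one
# tail-jumps to the corresponding CLMUL routine above, so a dispatcher
# that selects the AVX path on newer CPUs still runs the SSE/PCLMULQDQ
# code.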
.globl  gcm_init_avx
.def    gcm_init_avx;   .scl 2; .type 32;       .endef
.p2align        5
gcm_init_avx:
        jmp     .L_init_clmul

.globl  gcm_gmult_avx
.def    gcm_gmult_avx;  .scl 2; .type 32;       .endef
.p2align        5
gcm_gmult_avx:
        jmp     .L_gmult_clmul

.globl  gcm_ghash_avx
.def    gcm_ghash_avx;  .scl 2; .type 32;       .endef
.p2align        5
gcm_ghash_avx:
        jmp     .L_ghash_clmul

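# Constant data: .Lbswap_mask is the pshufb byte-reversal mask,
# .L0x1c2_polynomial the GHASH reduction constant (0xc2 in the top
# byte), and .Lrem_4bit/.Lrem_8bit the remainder tables used by the
# table-driven routines.  The .byte string after the tables is the
# CRYPTOGAMS banner "GHASH for x86_64, CRYPTOGAMS by
# <appro@openssl.org>".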
.p2align        6
.Lbswap_mask:
.byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long   7,0,7,0
.L7_mask_poly:
.long   7,0,450,0
.p2align        6

.Lrem_4bit:
.long   0,0,0,471859200,0,943718400,0,610271232
.long   0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long   0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long   0,2441084928,0,2376073216,0,2847932416,0,3051356160

.Lrem_8bit:
.value  0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value  0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value  0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value  0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value  0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value  0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value  0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value  0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value  0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value  0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value  0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value  0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value  0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value  0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value  0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value  0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value  0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value  0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value  0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value  0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value  0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value  0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value  0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value  0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value  0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value  0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value  0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value  0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value  0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value  0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value  0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value  0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.byte   71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align        6

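# Windows structured exception handling: the .pdata entries at the end
# of the file map each LSEH_begin/LSEH_end range to its unwind info in
# .xdata.  For the two 4-bit routines that info names se_handler, which
# copies the CONTEXT record (the .long 0xa548f3fc encodes cld; rep
# movsq), restores the registers the prologue pushed, and finishes via
# RtlVirtualUnwind; the CLMUL routines use compact .xdata unwind codes
# instead.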
.def    se_handler;     .scl 3; .type 32;       .endef
.p2align        4
se_handler:
        pushq   %rsi
        pushq   %rdi
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushfq
        subq    $64,%rsp

        movq    120(%r8),%rax
        movq    248(%r8),%rbx

        movq    8(%r9),%rsi
        movq    56(%r9),%r11

        movl    0(%r11),%r10d
        leaq    (%rsi,%r10,1),%r10
        cmpq    %r10,%rbx
        jb      .Lin_prologue

        movq    152(%r8),%rax

        movl    4(%r11),%r10d
        leaq    (%rsi,%r10,1),%r10
        cmpq    %r10,%rbx
        jae     .Lin_prologue

        leaq    24(%rax),%rax

        movq    -8(%rax),%rbx
        movq    -16(%rax),%rbp
        movq    -24(%rax),%r12
        movq    %rbx,144(%r8)
        movq    %rbp,160(%r8)
        movq    %r12,216(%r8)

.Lin_prologue:
        movq    8(%rax),%rdi
        movq    16(%rax),%rsi
        movq    %rax,152(%r8)
        movq    %rsi,168(%r8)
        movq    %rdi,176(%r8)

        movq    40(%r9),%rdi
        movq    %r8,%rsi
        movl    $154,%ecx
.long   0xa548f3fc

        movq    %r9,%rsi
        xorq    %rcx,%rcx
        movq    8(%rsi),%rdx
        movq    0(%rsi),%r8
        movq    16(%rsi),%r9
        movq    40(%rsi),%r10
        leaq    56(%rsi),%r11
        leaq    24(%rsi),%r12
        movq    %r10,32(%rsp)
        movq    %r11,40(%rsp)
        movq    %r12,48(%rsp)
        movq    %rcx,56(%rsp)
        call    *__imp_RtlVirtualUnwind(%rip)

        movl    $1,%eax
        addq    $64,%rsp
        popfq
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbp
        popq    %rbx
        popq    %rdi
        popq    %rsi
        .byte   0xf3,0xc3


.section        .pdata
.p2align        2
.rva    .LSEH_begin_gcm_gmult_4bit
.rva    .LSEH_end_gcm_gmult_4bit
.rva    .LSEH_info_gcm_gmult_4bit

.rva    .LSEH_begin_gcm_ghash_4bit
.rva    .LSEH_end_gcm_ghash_4bit
.rva    .LSEH_info_gcm_ghash_4bit

.rva    .LSEH_begin_gcm_init_clmul
.rva    .LSEH_end_gcm_init_clmul
.rva    .LSEH_info_gcm_init_clmul

.rva    .LSEH_begin_gcm_ghash_clmul
.rva    .LSEH_end_gcm_ghash_clmul
.rva    .LSEH_info_gcm_ghash_clmul
.section        .xdata
.p2align        3
.LSEH_info_gcm_gmult_4bit:
.byte   9,0,0,0
.rva    se_handler
.rva    .Lgmult_prologue,.Lgmult_epilogue
.LSEH_info_gcm_ghash_4bit:
.byte   9,0,0,0
.rva    se_handler
.rva    .Lghash_prologue,.Lghash_epilogue
.LSEH_info_gcm_init_clmul:
.byte   0x01,0x08,0x03,0x00
.byte   0x08,0x68,0x00,0x00
.byte   0x04,0x22,0x00,0x00
.LSEH_info_gcm_ghash_clmul:
.byte   0x01,0x33,0x16,0x00
.byte   0x33,0xf8,0x09,0x00
.byte   0x2e,0xe8,0x08,0x00
.byte   0x29,0xd8,0x07,0x00
.byte   0x24,0xc8,0x06,0x00
.byte   0x1f,0xb8,0x05,0x00
.byte   0x1a,0xa8,0x04,0x00
.byte   0x15,0x98,0x03,0x00
.byte   0x10,0x88,0x02,0x00
.byte   0x0c,0x78,0x01,0x00
.byte   0x08,0x68,0x00,0x00
.byte   0x04,0x01,0x15,0x00

.section .note.GNU-stack,"",%progbits