/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

.data
POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2


.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8
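/*
 * A hypothetical C view of the scratch area reserved by the offsets above
 * (the code below indexes it directly by byte offset from the aligned
 * %rsp, not through a struct; this is illustration only):
 *
 *	struct gcm_hash_keys {
 *		u8 hash_key[16];	// HashKey << 1 mod poly
 *		u8 hash_key_2[16];	// HashKey^2 << 1 mod poly
 *		u8 hash_key_3[16];	// HashKey^3 << 1 mod poly
 *		u8 hash_key_4[16];	// HashKey^4 << 1 mod poly
 *		u8 hash_key_k[16];	// high ^ low 64 bits of HashKey (Karatsuba)
 *		u8 hash_key_2_k[16];	// same for HashKey^2
 *		u8 hash_key_3_k[16];	// same for HashKey^3
 *		u8 hash_key_4_k[16];	// same for HashKey^4
 *	};				// sizeof == VARIABLE_OFFSET (16*8)
 */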

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm
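/*
 * For reference: the PCLMULQDQ + Karatsuba sequence above is algebraically
 * equivalent to the bit-serial GHASH multiplication of NIST SP 800-38D.
 * A plain C model (illustration only, not the code the kernel runs; it uses
 * the standard GCM bit ordering, while the asm operates on byte-reflected
 * values and takes HashKey pre-shifted left by one, as described above):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void ghash_gfmul(uint8_t z[16], const uint8_t x[16],
 *				const uint8_t y[16])
 *	{
 *		uint8_t v[16], r[16] = { 0 };
 *		int i, j, lsb;
 *
 *		memcpy(v, y, 16);
 *		for (i = 0; i < 128; i++) {
 *			if (x[i / 8] & (0x80 >> (i % 8)))	// bit i of x
 *				for (j = 0; j < 16; j++)
 *					r[j] ^= v[j];
 *			lsb = v[15] & 1;			// x^127 coefficient
 *			for (j = 15; j > 0; j--)		// v = v * x
 *				v[j] = (v[j] >> 1) | (v[j - 1] << 7);
 *			v[0] >>= 1;
 *			if (lsb)
 *				v[0] ^= 0xe1;	// reduce by x^128+x^7+x^2+x+1
 *		}
 *		memcpy(z, r, 16);
 *	}
 */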

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/
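/*
 * In C terms the callers (aesni_gcm_enc/dec below) select which
 * INITIAL_BLOCKS variant to run roughly as follows (sketch only; the asm
 * computes the same thing with "and $-16" / "and $(3<<4)" on the byte
 * count):
 *
 *	full_blocks        = plaintext_len / 16;	// b = floor(a/16)
 *	num_initial_blocks = full_blocks % 4;		// 0, 1, 2 or 3
 */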

.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation

	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	pshufb	   SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
	xor	   %r11, %r11 # initialise the data pointer offset as zero

        # start AES for num_initial_blocks blocks

	mov	   %arg5, %rax                      # %rax = *Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
	pshufb	   SHUF_MASK(%rip), \XMM0
.if \i_seq != 0
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0                 # INCR Y0
	movdqa	   \XMM0, %xmm\index
	pshufb	   SHUF_MASK(%rip), %xmm\index      # perform a 16 byte swap
.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
	movaps 0x10(%rdi), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 1
.endr
.irpc index, \i_seq
	movaps 0x20(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 2
.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 3
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 4
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 5
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 6
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 7
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 8
.endr
.irpc index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 9
.endr
.irpc index, \i_seq
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index         # Round 10
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
.if \operation == dec
	movdqa     \TMP1, %xmm\index
.endif
	pshufb	   SHUF_MASK(%rip), %xmm\index
		# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey^i
*/
	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM1
	pshufb	   SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM2
	pshufb	   SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM3
	pshufb	   SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
	paddd	   ONE(%rip), \XMM0              # INCR Y0
	movdqa	   \XMM0, \XMM4
	pshufb	   SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
	pxor	   16*0(%arg1), \XMM1
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps 0xa0(%arg1), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
.if \operation == dec
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM1
.endif
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
.if \operation == dec
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM2
.endif
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
.if \operation == dec
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM3
.endif
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
.if \operation == dec
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM4
.else
	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
.endif
	add	   $64, %r11
	pshufb	   SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	pshufb	   SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
	pshufb	   SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
	pshufb	   SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

        # multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	pshufb	  SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	pshufb	  SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
	pshufb	  SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
	pshufb	  SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	movaps 0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1           # Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
.if \operation == dec
	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
	movdqa    \TMP3, \XMM1
.endif
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
.if \operation == dec
	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM2
.endif
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
.if \operation == dec
	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM3
.endif
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
.if \operation == dec
	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM4
.else
    movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
    movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
    movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
    movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
.endif
	pshufb	  SHUF_MASK(%rip), \XMM1       # perform a 16 byte swap
	pshufb	  SHUF_MASK(%rip), \XMM2       # perform a 16 byte swap
	pshufb	  SHUF_MASK(%rip), \XMM3       # perform a 16 byte swap
	pshufb	  SHUF_MASK(%rip), \XMM4       # perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

        # first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

        # second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5

	pxor	  \XMM5, \XMM1
.endm
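/*
 * Each of the four per-block multiplies above uses the Karatsuba trick:
 * three 64x64 carry-less multiplies (PCLMULQDQ) instead of four, with the
 * HashKey_i_k values supplying the pre-computed (b1 ^ b0) operand.  As a
 * rough C sketch, assuming a hypothetical clmul64() that returns the
 * 128-bit carry-less product of two 64-bit halves (illustration only):
 *
 *	hi  = clmul64(a1, b1);			// PCLMULQDQ 0x11
 *	lo  = clmul64(a0, b0);			// PCLMULQDQ 0x00
 *	mid = clmul64(a1 ^ a0, b1 ^ b0);	// PCLMULQDQ on the _k value
 *	mid ^= hi ^ lo;				// = a1*b0 ^ a0*b1
 *	// the 256-bit product is hi:lo with mid folded in at a 64-bit
 *	// offset, and is then reduced modulo the GHASH polynomial
 */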

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa    \XMMDst, \TMP2
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed left shifting << 31
	pslld     $30, \TMP3                # packed left shifting << 30
	pslld     $25, \TMP4                # packed left shifting << 25
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP7
	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
	pxor      \TMP2, \XMMDst

        # second phase of the reduction
	movdqa    \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed right shift >> 1
	psrld     $2, \TMP3                 # packed right shift >> 2
	psrld     $7, \TMP4                 # packed right shift >> 7
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	pxor      \TMP7, \TMP2
	pxor      \TMP2, \XMMDst
	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
.endm

/* Encrypt a single block */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	(%arg1), \XMM0
        movaps 16(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 32(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 48(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 64(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 80(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 96(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 112(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 128(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 144(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
        movaps 160(%arg1), \TMP1
	AESENCLAST	\TMP1, \XMM0
.endm
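/*
 * Note: this helper always walks 11 round keys (offsets 0x00-0xa0), i.e.
 * it assumes an AES-128 schedule, matching the "first set of 11 keys"
 * assumption documented for aesni_gcm_enc/dec below.  As a C-style model
 * (illustration only; aes_round()/aes_final_round() are hypothetical
 * stand-ins for AESENC/AESENCLAST):
 *
 *	state ^= rk[0];					// round 0 whitening
 *	for (i = 1; i <= 9; i++)
 *		state = aes_round(state, rk[i]);	// AESENC
 *	state = aes_final_round(state, rk[10]);		// AESENCLAST
 */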


/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/

ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp                        # align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
	pshufb	SHUF_MASK(%rip), %xmm13

# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

        # Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
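/*
 * In C-like terms the shift/reduce sequence above computes (sketch only):
 *
 *	carry = H >> 127;	// bit shifted out of the 128-bit value
 *	H     = H << 1;		// the psllq/psrlq/pslldq/psrldq/por above
 *	if (carry)
 *		H ^= POLY;	// conditional reduction by the field poly
 *
 * The pshufd/pcmpeqd/pand sequence builds the "if (carry)" branchlessly:
 * it turns the carry into an all-ones/all-zeroes mask and ANDs it with
 * POLY before the final XOR.
 */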


        # Decrypt first few blocks

	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	and $(3<<4), %r12
	jz _initial_num_blocks_is_0_decrypt
	cmp $(2<<4), %r12
	jb _initial_num_blocks_is_1_decrypt
	je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13				# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

        # Handle the last <16 byte block separately

	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
	pshufb SHUF_MASK(%rip), %xmm0
	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
	sub $16, %r11
	add %r13, %r11
	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
	lea SHIFT_MASK+16(%rip), %r12
	sub %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
	pshufb %xmm2, %xmm1            # right shift 16-%r13 bytes
	movdqa  %xmm1, %xmm2
	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
	pand    %xmm1, %xmm2
	pshufb SHUF_MASK(%rip),%xmm2
	pxor %xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	          # GHASH computation for the last <16 byte block
	sub %r13, %r11
	add $16, %r11

        # output %r13 bytes
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al,  (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	shl	$3, %arg4		  # len(C) in bits (*8)
	movq	%arg4, %xmm1
	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	         # final GHASH computation
	pshufb	SHUF_MASK(%rip), %xmm8
	mov	%arg5, %rax		  # %rax = *Y0
	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_decrypt:
	mov	arg9, %r10                # %r10 = authTag
	mov	arg10, %r11               # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$12, %r11
	je	_T_12_decrypt
_T_8_decrypt:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_decrypt
_T_12_decrypt:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
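/*
 * The iv argument documented above is the RFC4106 pre-counter block j0.
 * A caller would typically assemble it like this before invoking
 * aesni_gcm_dec/enc (sketch; the buffer must be 16-byte aligned and the
 * field names are illustrative):
 *
 *	u8 j0[16];
 *
 *	memcpy(j0, salt, 4);		// 4-byte salt from the SA
 *	memcpy(j0 + 4, esp_iv, 8);	// 8-byte IV from the ESP payload
 *	j0[12] = 0;
 *	j0[13] = 0;
 *	j0[14] = 0;
 *	j0[15] = 1;			// trailing 32-bit counter starts at 1
 */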


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                         AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13
	pshufb	SHUF_MASK(%rip), %xmm13

# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

        # reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
	and	$-16, %r13
	mov	%r13, %r12

        # Encrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

         # Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
	pshufb SHUF_MASK(%rip), %xmm0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	sub $16, %r11
	add %r13, %r11
	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
	lea SHIFT_MASK+16(%rip), %r12
	sub %r13, %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
	pshufb	%xmm2, %xmm1            # shift right 16-r13 byte
	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0

	pshufb	SHUF_MASK(%rip),%xmm0
	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11
	pshufb SHUF_MASK(%rip), %xmm0
	# shuffle xmm0 back to output as ciphertext

        # Output %r13 bytes
	movq %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_encrypt
	mov %rax, (%arg2 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	movq %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_encrypt:
	mov %al,  (%arg2, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
	shl	$3, %r12
	movd	%r12d, %xmm15       # len(A) in %xmm15
	shl	$3, %arg4               # len(C) in bits (*8)
	movq	%arg4, %xmm1
	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation

	pshufb	SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
	mov	%arg5, %rax		       # %rax  = *Y0
	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_encrypt:
	mov	arg9, %r10                     # %r10 = authTag
	mov	arg10, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$12, %r11
	je	_T_12_encrypt
_T_8_encrypt:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_encrypt
_T_12_encrypt:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
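/*
 * C-level view of the two GCM entry points above (a usage sketch; the
 * real callers live in aesni-intel_glue.c and bracket these calls with
 * kernel_fpu_begin()/kernel_fpu_end()):
 *
 *	void aesni_gcm_enc(void *aes_ctx, u8 *out, const u8 *in,
 *			   u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *			   const u8 *aad, u64 aad_len,
 *			   u8 *auth_tag, u64 auth_tag_len);
 *	void aesni_gcm_dec(void *aes_ctx, u8 *out, const u8 *in,
 *			   u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *			   const u8 *aad, u64 aad_len,
 *			   u8 *auth_tag, u64 auth_tag_len);
 *
 *	aesni_gcm_enc(aes_ctx, dst, src, len, j0, hash_subkey, aad, 8,
 *		      tag, 16);
 *	// on the decrypt side the caller compares the tag produced by
 *	// aesni_gcm_dec against the received tag before using the output
 */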



_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret

.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret

.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret

.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
#ifndef __x86_64__
	pushl KEYP
	movl 8(%esp), KEYP		# ctx
	movl 12(%esp), UKEYP		# in_key
	movl 16(%esp), %edx		# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	ret

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret

/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
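/*
 * The KLEN comparisons above derive the round count from the key length
 * stored at offset 480 of the context (16, 24 or 32 bytes).  Equivalent C
 * sketch:
 *
 *	switch (key_len) {
 *	case 16: rounds = 10; break;	// AES-128, "jb .Lenc128"
 *	case 24: rounds = 12; break;	// AES-192, "je .Lenc192"
 *	default: rounds = 14; break;	// AES-256, falls through
 *	}
 */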

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret

/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
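	# mirrors _aesni_enc1, but walks the inverse key schedule built in
	# .Ldec_key (KEYP already points at key_dec) using AESDEC/AESDECLAST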
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
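	# bulk path: encrypt four blocks per iteration via _aesni_enc4; less
	# than 64 bytes falls through to the one-block loop below, and any
	# remainder shorter than 16 bytes is left untouched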
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
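	# CBC encryption is inherently serial: each plaintext block is XORed
	# into the running chaining value (the IV, then the previous
	# ciphertext block), so there is no four-block variant here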
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
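	# IN3/IN4 exist only on x86-64; on x86-32 the last two ciphertext
	# blocks reuse IN1/IN2 (overwriting the first two), and the
	# overwritten chaining values are re-read from INP after _aesni_dec4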
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor (INP), STATE2
	pxor 0x10(INP), STATE3
	pxor IN1, STATE4
	movaps IN2, IV
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret

#ifdef __x86_64__
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
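	# INC gets the constant 1 in its low qword; TCTR_LOW mirrors the
	# low 64 bits of CTR in a GPR so _aesni_inc can detect carry cheaply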
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
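	# paddq only adds INC to the low qword of CTR.  TCTR_LOW tracks the
	# same low 64 bits in a GPR; when it wraps (carry set), INC is
	# shifted up so a second paddq propagates the carry into the high
	# qword, then shifted back down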
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
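	# generate four successive counter blocks (IV is kept big endian,
	# CTR little endian), encrypt them in parallel with _aesni_enc4 and
	# XOR the resulting keystream with the four input blocks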
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	ret
#endif