poly1305-c64xplus.pl 8.5 KB
Newer Older
C
code4lala 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Poly1305 hash for C64x+.
#
# October 2015
#
# Performance is [incredible for a 32-bit processor] 1.82 cycles per
# processed byte. Comparison to compiler-generated code is problematic,
# because results were observed to vary from 2.1 to 7.6 cpb depending
# on compiler's ability to inline small functions. Compiler also
# disables interrupts for some reason, thus making interrupt response
# time dependent on input length. This module on the other hand is free
# from such limitation.

$output=pop;
open STDOUT,">$output";

($CTXA,$INPB,$LEN,$PADBIT)=("A4","B4","A6","B6");
($H0,$H1,$H2,$H3,$H4,$H4a)=("A8","B8","A10","B10","B2",$LEN);
($D0,$D1,$D2,$D3)=         ("A9","B9","A11","B11");
($R0,$R1,$R2,$R3,$S1,$S2,$S3,$S3b)=("A0","B0","A1","B1","A12","B12","A13","B13");
($THREE,$R0b,$S2a)=("B7","B5","A5");

$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	poly1305_init,_poly1305_init
	.asg	poly1305_blocks,_poly1305_blocks
	.asg	poly1305_emit,_poly1305_emit
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.LITTLE_ENDIAN
	.asg	MV,SWAP2
	.asg	MV.L,SWAP4
	.endif

	.global	_poly1305_init
_poly1305_init:
	.asmfunc
	LDNDW	*${INPB}[0],B17:B16	; load key material
	LDNDW	*${INPB}[1],A17:A16

||	ZERO	B9:B8
||	MVK	-1,B0
	STDW	B9:B8,*${CTXA}[0]	; initialize h1:h0
||	SHRU	B0,4,B0			; 0x0fffffff
||	MVK	-4,B1
	STDW	B9:B8,*${CTXA}[1]	; initialize h3:h2
||	AND	B0,B1,B1		; 0x0ffffffc
	STW	B8,*${CTXA}[4]		; initialize h4

	.if	.BIG_ENDIAN
	SWAP2	B16,B17
||	SWAP2	B17,B16
	SWAP2	A16,A17
||	SWAP2	A17,A16
	SWAP4	B16,B16
||	SWAP4	A16,A16
	SWAP4	B17,B17
||	SWAP4	A17,A17
	.endif

	AND	B16,B0,B20		; r0 = key[0] & 0x0fffffff
||	AND	B17,B1,B22		; r1 = key[1] & 0x0ffffffc
||	EXTU	B17,4,6,B16		; r1>>2
	AND	A16,B1,B21		; r2 = key[2] & 0x0ffffffc
||	AND	A17,B1,A23		; r3 = key[3] & 0x0ffffffc
||	BNOP	RA
	SHRU	B21,2,B18
||	ADD	B22,B16,B16		; s1 = r1 + r1>>2

	STDW	B21:B20,*${CTXA}[3]	; save r2:r0
||	ADD	B21,B18,B18		; s2 = r2 + r2>>2
||	SHRU	A23,2,B17
||	MV	A23,B23
	STDW	B23:B22,*${CTXA}[4]	; save r3:r1
||	ADD	B23,B17,B19		; s3 = r3 + r3>>2
||	ADD	B23,B17,B17		; s3 = r3 + r3>>2
	STDW	B17:B16,*${CTXA}[5]	; save s3:s1
	STDW	B19:B18,*${CTXA}[6]	; save s3:s2
||	ZERO	A4			; return 0
	.endasmfunc

	.global	_poly1305_blocks
	.align	32
_poly1305_blocks:
	.asmfunc	stack_usage(40)
	SHRU	$LEN,4,A2		; A2 is loop counter, number of blocks
  [!A2]	BNOP	RA			; no data
|| [A2]	STW	FP,*SP--(40)		; save frame pointer and alloca(40)
|| [A2]	MV	SP,FP
   [A2]	STDW	B13:B12,*SP[4]		; ABI says so
|| [A2]	MV	$CTXA,$S3b		; borrow $S3b
   [A2]	STDW	B11:B10,*SP[3]
|| [A2]	STDW	A13:A12,*FP[-3]
   [A2]	STDW	A11:A10,*FP[-4]

|| [A2]	LDDW	*${S3b}[0],B25:B24	; load h1:h0
   [A2]	LDNW	*${INPB}++[4],$D0	; load inp[0]
   [A2]	LDNW	*${INPB}[-3],$D1	; load inp[1]

	LDDW	*${CTXA}[1],B29:B28	; load h3:h2, B28 is h2
	LDNW	*${INPB}[-2],$D2	; load inp[2]
	LDNW	*${INPB}[-1],$D3	; load inp[3]

	LDDW	*${CTXA}[3],$R2:$R0	; load r2:r0
||	LDDW	*${S3b}[4],$R3:$R1	; load r3:r1
||	SWAP2	$D0,$D0

	LDDW	*${CTXA}[5],$S3:$S1	; load s3:s1
||	LDDW	*${S3b}[6],$S3b:$S2	; load s3:s2
||	SWAP4	$D0,$D0
||	SWAP2	$D1,$D1

	ADDU	$D0,B24,$D0:$H0		; h0+=inp[0]
||	ADD	$D0,B24,B27		; B-copy of h0+inp[0]
||	SWAP4	$D1,$D1
	ADDU	$D1,B25,$D1:$H1		; h1+=inp[1]
||	MVK	3,$THREE
||	SWAP2	$D2,$D2
	LDW	*${CTXA}[4],$H4		; load h4
||	SWAP4	$D2,$D2
||	MV	B29,B30			; B30 is h3
	MV	$R0,$R0b

loop?:
	MPY32U	$H0,$R0,A17:A16
||	MPY32U	B27,$R1,B17:B16		; MPY32U	$H0,$R1,B17:B16
||	ADDU	$D0,$D1:$H1,B25:B24	; ADDU		$D0,$D1:$H1,$D1:$H1
||	ADDU	$D2,B28,$D2:$H2		; h2+=inp[2]
||	SWAP2	$D3,$D3
	MPY32U	$H0,$R2,A19:A18
||	MPY32U	B27,$R3,B19:B18		; MPY32U	$H0,$R3,B19:B18
||	ADD	$D0,$H1,A24		; A-copy of B24
||	SWAP4	$D3,$D3
|| [A2]	SUB	A2,1,A2			; decrement loop counter

	MPY32U	A24,$S3,A21:A20		; MPY32U	$H1,$S3,A21:A20
||	MPY32U	B24,$R0b,B21:B20	; MPY32U	$H1,$R0,B21:B20
||	ADDU	B25,$D2:$H2,$D2:$H2	; ADDU		$D1,$D2:$H2,$D2:$H2
||	ADDU	$D3,B30,$D3:$H3		; h3+=inp[3]
||	ADD	B25,$H2,B25		; B-copy of $H2
	MPY32U	A24,$R1,A23:A22		; MPY32U	$H1,$R1,A23:A22
||	MPY32U	B24,$R2,B23:B22		; MPY32U	$H1,$R2,B23:B22

	MPY32U	$H2,$S2,A25:A24
||	MPY32U	B25,$S3b,B25:B24	; MPY32U	$H2,$S3,B25:B24
||	ADDU	$D2,$D3:$H3,$D3:$H3
||	ADD	$PADBIT,$H4,$H4		; h4+=padbit
	MPY32U	$H2,$R0,A27:A26
||	MPY32U	$H2,$R1,B27:B26
||	ADD	$D3,$H4,$H4
||	MV	$S2,$S2a

	MPY32U	$H3,$S1,A29:A28
||	MPY32U	$H3,$S2,B29:B28
||	ADD	A21,A17,A21		; start accumulating "d3:d0"
||	ADD	B21,B17,B21
||	ADDU	A20,A16,A17:A16
||	ADDU	B20,B16,B17:B16
|| [A2]	LDNW	*${INPB}++[4],$D0	; load inp[0]
	MPY32U	$H3,$S3,A31:A30
||	MPY32U	$H3,$R0b,B31:B30
||	ADD	A23,A19,A23
||	ADD	B23,B19,B23
||	ADDU	A22,A18,A19:A18
||	ADDU	B22,B18,B19:B18
|| [A2]	LDNW	*${INPB}[-3],$D1	; load inp[1]

	MPY32	$H4,$S1,B20
||	MPY32	$H4,$S2a,A20
||	ADD	A25,A21,A21
||	ADD	B25,B21,B21
||	ADDU	A24,A17:A16,A17:A16
||	ADDU	B24,B17:B16,B17:B16
|| [A2]	LDNW	*${INPB}[-2],$D2	; load inp[2]
	MPY32	$H4,$S3b,B22
||	ADD	A27,A23,A23
||	ADD	B27,B23,B23
||	ADDU	A26,A19:A18,A19:A18
||	ADDU	B26,B19:B18,B19:B18
|| [A2]	LDNW	*${INPB}[-1],$D3	; load inp[3]

	MPY32	$H4,$R0b,$H4
||	ADD	A29,A21,A21		; final hi("d0")
||	ADD	B29,B21,B21		; final hi("d1")
||	ADDU	A28,A17:A16,A17:A16	; final lo("d0")
||	ADDU	B28,B17:B16,B17:B16
	ADD	A31,A23,A23		; final hi("d2")
||	ADD	B31,B23,B23		; final hi("d3")
||	ADDU	A30,A19:A18,A19:A18
||	ADDU	B30,B19:B18,B19:B18
	ADDU	B20,B17:B16,B17:B16	; final lo("d1")
||	ADDU	A20,A19:A18,A19:A18	; final lo("d2")
	ADDU	B22,B19:B18,B19:B18	; final lo("d3")

||	ADD	A17,A21,A21		; "flatten" "d3:d0"
	MV	A19,B29			; move to avoid cross-path stalls
	ADDU	A21,B17:B16,B27:B26	; B26 is h1
	ADD	B21,B27,B27
||	DMV	B29,A18,B29:B28		; move to avoid cross-path stalls
	ADDU	B27,B29:B28,B29:B28	; B28 is h2
|| [A2]	SWAP2	$D0,$D0
	ADD	A23,B29,B29
|| [A2]	SWAP4	$D0,$D0
	ADDU	B29,B19:B18,B31:B30	; B30 is h3
	ADD	B23,B31,B31
||	MV	A16,B24			; B24 is h0
|| [A2]	SWAP2	$D1,$D1
	ADD	B31,$H4,$H4
|| [A2]	SWAP4	$D1,$D1

	SHRU	$H4,2,B16		; last reduction step
||	AND	$H4,$THREE,$H4
	ADDAW	B16,B16,B16		; 5*(h4>>2)
|| [A2]	BNOP	loop?

	ADDU	B24,B16,B25:B24		; B24 is h0
|| [A2]	SWAP2	$D2,$D2
	ADDU	B26,B25,B27:B26		; B26 is h1
|| [A2]	SWAP4	$D2,$D2
	ADDU	B28,B27,B29:B28		; B28 is h2
|| [A2]	ADDU	$D0,B24,$D0:$H0		; h0+=inp[0]
|| [A2]	ADD	$D0,B24,B27		; B-copy of h0+inp[0]
	ADDU	B30,B29,B31:B30		; B30 is h3
	ADD	B31,$H4,$H4
|| [A2]	ADDU	$D1,B26,$D1:$H1		; h1+=inp[1]
;;===== branch to loop? is taken here

	LDDW	*FP[-4],A11:A10		; ABI says so
	LDDW	*FP[-3],A13:A12
||	LDDW	*SP[3],B11:B10
	LDDW	*SP[4],B13:B12
||	MV	B26,B25
||	BNOP	RA
	LDW	*++SP(40),FP		; restore frame pointer
||	MV	B30,B29
	STDW	B25:B24,*${CTXA}[0]	; save h1:h0
	STDW	B29:B28,*${CTXA}[1]	; save h3:h2
	STW	$H4,*${CTXA}[4]		; save h4
	NOP	1
	.endasmfunc
___
{
my ($MAC,$NONCEA,$NONCEB)=($INPB,$LEN,$PADBIT);

$code.=<<___;
	.global	_poly1305_emit
	.align	32
_poly1305_emit:
	.asmfunc
	LDDW	*${CTXA}[0],A17:A16	; load h1:h0
	LDDW	*${CTXA}[1],A19:A18	; load h3:h2
	LDW	*${CTXA}[4],A20		; load h4
	MV	$NONCEA,$NONCEB

	MVK	5,A22			; compare to modulus
	ADDU	A16,A22,A23:A22
||	LDW	*${NONCEA}[0],A8
||	LDW	*${NONCEB}[1],B8
	ADDU	A17,A23,A25:A24
||	LDW	*${NONCEA}[2],A9
||	LDW	*${NONCEB}[3],B9
	ADDU	A19,A25,A27:A26
	ADDU	A19,A27,A29:A28
	ADD	A20,A29,A29

	SHRU	A29,2,A2		; check for overflow in 130-th bit

   [A2]	MV	A22,A16			; select
|| [A2]	MV	A24,A17
   [A2]	MV	A26,A18
|| [A2]	MV	A28,A19

||	ADDU	A8,A16,A23:A22		; accumulate nonce
	ADDU	B8,A17,A25:A24
||	SWAP2	A22,A22
	ADDU	A23,A25:A24,A25:A24
	ADDU	A9,A18,A27:A26
||	SWAP2	A24,A24
	ADDU	A25,A27:A26,A27:A26
||	ADD	B9,A19,A28
	ADD	A27,A28,A28
||	SWAP2	A26,A26

	.if	.BIG_ENDIAN
	SWAP2	A28,A28
||	SWAP4	A22,A22
||	SWAP4	A24,B24
	SWAP4	A26,A26
	SWAP4	A28,A28
||	MV	B24,A24
	.endif

	BNOP	RA,1
	STNW	A22,*${MAC}[0]		; write the result
	STNW	A24,*${MAC}[1]
	STNW	A26,*${MAC}[2]
	STNW	A28,*${MAC}[3]
	.endasmfunc
___
}
$code.=<<___;
	.sect	.const
	.cstring "Poly1305 for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

print $code;