// IA-64 assembler module; the source is run through the C preprocessor
// (see the #if/#define lines below) before being assembled.
.explicit
.text
.ident	"ia64.S, Version 2.1"
.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
//
// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
// different from Itanium to this module viewpoint. Most notably, is it
// "wider" than Itanium? Can you experience loop scalability as
// discussed in commentary sections? Not really:-( Itanium2 has 6
// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
// spin twice as fast, as I need 8 IALU ports. Amount of floating point
// ports is the same, i.e. 2, while I need 4. In other words, to this
// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
// essentially different in respect to this module, and a re-tune was
// required. Well, because some instruction latencies have changed. Most
// noticeably those intensively used:
//
//			Itanium	Itanium2
//	ldf8		9	6		L2 hit
//	ld8		2	1		L1 hit
//	getf		2	5
//	xma[->getf]	7[+1]	4[+0]
//	add[->st8]	1[+1]	1[+0]
//
// What does it mean? You might ratiocinate that the original code
// should run just faster... Because sum of latencies is smaller...
// Wrong! Note that getf latency increased. This means that if a loop is
// scheduled for lower latency (as they were), then it will suffer from
// stall condition and the code will therefore turn anti-scalable, e.g.
// original bn_mul_words spun at 5*n or 2.5 times slower than expected
// on Itanium2! What to do? Reschedule loops for Itanium2? But then
// Itanium would exhibit anti-scalability. So I've chosen to reschedule
// for worst latency for every instruction aiming for best *all-round*
// performance.

// Q.	How much faster does it get?
// A.	Here is the output from 'openssl speed rsa dsa' for vanilla
//	0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
//	Linux 7.1 2.96-81):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
//	rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
//	rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
//	rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0035s   0.0043s    288.3    234.8
//	dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
//
//	And here is similar output but for this assembler
//	implementation:-)
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
//	rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
//	rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
//	rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0012s   0.0013s    891.9    756.6
//	dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
//	
//	Yes, you may argue that it's not fair comparison as it's
//	possible to craft the C implementation with BN_UMULT_HIGH
//	inline assembler macro. But of course! Here is the output
//	with the macro:
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
//	rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
//	rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
//	rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0016s   0.0020s    613.1    510.5
//	dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
//
//	My code is still way faster, huh:-) And I believe that even
//	higher performance can be achieved. Note that as keys get
//	longer, performance gain is larger. Why? According to the
//	profiler there is another player in the field, namely
//	BN_from_montgomery consuming larger and larger portion of CPU
//	time as keysize decreases. I therefore consider putting effort
//	to assembler implementation of the following routine:
//
//	void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
//	{
//	int      i,j;
//	BN_ULONG v;
//
//	for (i=0; i<nl; i++)
//		{
//		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
//		nrp++;
//		rp++;
//		if (((nrp[-1]+=v)&BN_MASK2) < v)
//			for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
//		}
//	}
//
//	It might as well be beneficial to implement even combaX
//	variants, as it appears as it can literally unleash the
//	performance (see comment section to bn_mul_comba8 below).
//
//	And finally for your reference the output for 0.9.6a compiled
//	with SGIcc version 0.01.0-12 (keep in mind that for the moment
//	of this writing it's not possible to convince SGIcc to use
//	BN_UMULT_HIGH inline assembler macro, yet the code is fast,
//	i.e. for a compiler generated one:-):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
//	rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
//	rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
//	rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0018s   0.0022s    547.3    459.6
//	dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
//
//	Oh! Benchmarks were performed on 733MHz Lion-class Itanium
//	system running Redhat Linux 7.1 (very special thanks to Ray
//	McCaffity of Williams Communications for providing an account).
//
// Q.	What's the heck with 'rum 1<<5' at the end of every function?
// A.	Well, by clearing the "upper FP registers written" bit of the
//	User Mask I want to excuse the kernel from preserving upper
//	(f32-f127) FP register bank over process context switch, thus
//	minimizing bus bandwidth consumption during the switch (i.e.
//	after PKI operation completes and the program is off doing
//	something else like bulk symmetric encryption). Having said
//	this, I also want to point out that it might be good idea
//	to compile the whole toolkit (as well as majority of the
//	programs for that matter) with -mfixed-range=f32-f127 command
//	line option. No, it doesn't prevent the compiler from writing
//	to upper bank, but at least discourages to do so. If you don't
//	like the idea you have the option to compile the module with
//	-Drum=nop.m in command line.
//

// ADDP is used to copy pointer arguments into working registers.
// On HP-UX in 32-bit (non-_LP64) mode pointers arrive as 32-bit values,
// so addp4 is used to form a proper 64-bit address; everywhere else a
// plain add suffices.
#if defined(_HPUX_SOURCE) && !defined(_LP64)
#define	ADDP	addp4
#else
#define	ADDP	add
#endif

154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
#if 1
//
// bn_[add|sub]_words routines.
//
// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
// compress the epilogue and get down to 2*n+6, but at the cost of
// scalability (the neat feature of this implementation is that it
// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
// I consider that the epilogue is short enough as it is to trade tiny
// performance loss on Itanium for scalability.
//
// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
// rp[i] = ap[i] + bp[i] with carry propagation, for i in [0,num).
//
// In:	r32=rp, r33=ap, r34=bp, r35=num (tested as a 32-bit signed int)
// Out:	r8 = carry out of the last word (0 or 1); 0 when num<=0
// Uses:	r2 (ar.pfs copy made by alloc), r3 (saved ar.lc), r9 (saved pr),
//	r10 (num-1), r14/r15/r16 (running rp/ap/bp), rotating r32-r47,
//	rotating predicates.
// ar.lc and pr are restored from r3/r9 before returning.
// NOTE(review): pr is saved in r9 without a matching ".save pr,r9"
// unwind directive (bn_mul_add_words has one) - confirm whether the
// unwind description is intentionally incomplete here.
.global	bn_add_words#
.proc	bn_add_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_add_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;	// p6 = (num <= 0)
{ .mfb;	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;	// nothing to do

	.save	ar.lc,r3
{ .mib;	sub		r10=r35,r0,1		// loop count = num-1
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
					}
	.body
// r32-r47 rotate inside the loop, so the pointer arguments are moved
// to static registers r14-r16 first.
{ .mib;	ADDP		r14=0,r32		// rp
	mov		r9=pr		};;
{ .mii;	ADDP		r15=0,r33		// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}	// 6 pipeline stages, p16..p21
{ .mib;	ADDP		r16=0,r34		// bp
	mov		pr.rot=1<<16	};;	// p16=1, other rotating preds=0

// Per-word pipeline (register names shift by one at every br.ctop):
//   p16 load b and a; p18 sum=a+b; p19 carry-out = (sum < a) -> p56;
//   "p20" step: if the previous word carried (p58), bump sum and OR
//   (sum was all ones) into this word's carry (p57); p21 store sum.
.L_bn_add_words_ctop:
{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	add		r39=r37,r34
	(p19)	cmp.ltu.unc	p56,p0=r40,r38	}
{ .mfb;	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=-1,r41	  // (p20)
	(p58)	add		r41=1,r41	} // (p20)
{ .mfb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_add_words_ctop	};;
.L_bn_add_words_cend:

// p59 here is the (rotated) carry out of the most significant word.
{ .mii;
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,0x1ffff	// restore all predicates
	mov		ar.lc=r3	}
{ .mbb;	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_add_words#

//
// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
// rp[i] = ap[i] - bp[i] with borrow propagation, for i in [0,num).
//
// In:	r32=rp, r33=ap, r34=bp, r35=num (tested as a 32-bit signed int)
// Out:	r8 = borrow out of the last word (0 or 1); 0 when num<=0
// Uses:	r2 (ar.pfs copy made by alloc), r3 (saved ar.lc), r9 (saved pr),
//	r10 (num-1), r14/r15/r16 (running rp/ap/bp), rotating r32-r47,
//	rotating predicates.
// ar.lc and pr are restored from r3/r9 before returning.
// NOTE(review): pr is saved in r9 without a matching ".save pr,r9"
// unwind directive (bn_mul_add_words has one) - confirm whether the
// unwind description is intentionally incomplete here.
.global	bn_sub_words#
.proc	bn_sub_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_sub_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;	// p6 = (num <= 0)
{ .mfb;	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;	// nothing to do

	.save	ar.lc,r3
{ .mib;	sub		r10=r35,r0,1		// loop count = num-1
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
					}
	.body
// r32-r47 rotate inside the loop, so the pointer arguments are moved
// to static registers r14-r16 first.
{ .mib;	ADDP		r14=0,r32		// rp
	mov		r9=pr		};;
{ .mii;	ADDP		r15=0,r33		// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}	// 6 pipeline stages, p16..p21
{ .mib;	ADDP		r16=0,r34		// bp
	mov		pr.rot=1<<16	};;	// p16=1, other rotating preds=0

// Per-word pipeline (register names shift by one at every br.ctop):
//   p16 load b and a; p18 diff=a-b; p19 borrow-out = (diff > a) -> p56;
//   "p20" step: if the previous word borrowed (p58), decrement diff and
//   OR (diff was zero) into this word's borrow (p57); p21 store diff.
.L_bn_sub_words_ctop:
{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	sub		r39=r37,r34
	(p19)	cmp.gtu.unc	p56,p0=r40,r38	}
{ .mfb;	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=0,r41	  // (p20)
	(p58)	add		r41=-1,r41	} // (p20)
{ .mbb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.b		0x0
	br.ctop.sptk	.L_bn_sub_words_ctop	};;
.L_bn_sub_words_cend:

// p59 here is the (rotated) borrow out of the most significant word.
{ .mii;
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,0x1ffff	// restore all predicates
	mov		ar.lc=r3	}
{ .mbb;	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sub_words#
#endif

// Build-time switch: change "#if 0" to "#if 1" to define XMA_TEMPTATION,
// which makes bn_mul_words use its alternative xma-based loop instead
// of the production loop. Left disabled by default.
#if 0
#define XMA_TEMPTATION
#endif

#if 1
//
// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
// rp[i] = low64(ap[i]*w + carry), carry = high part, for i in [0,num).
//
// In:	r32=rp, r33=ap, r34=num (tested as a 32-bit signed int), r35=w
// Out:	r8 = final carry word; 0 when num<=0
// Uses:	r2 (ar.pfs copy made by alloc), r3 (saved ar.lc), r9 (saved pr),
//	r10 (num-1), f8 (w), and in the production path r14/r15 (running
//	rp/ap), rotating r32-r47, rotating FP registers and predicates.
// ar.lc and pr are restored from r3/r9 before returning; um.mfh is
// cleared with "rum 1<<5" (see the Q&A in the file header).
// NOTE(review): pr is saved in r9 without a matching ".save pr,r9"
// unwind directive (bn_mul_add_words has one) - confirm.
.global	bn_mul_words#
.proc	bn_mul_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_mul_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
#ifdef XMA_TEMPTATION
{ .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
#else
{ .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
#endif
{ .mib;	mov		r8=r0			// return value
	cmp4.le		p6,p0=r34,r0		// p6 = (num <= 0)
(p6)	br.ret.spnt.many	b0		};;

	.save	ar.lc,r3
{ .mii;	sub	r10=r34,r0,1			// loop count = num-1
	mov	r3=ar.lc
	mov	r9=pr			};;

	.body
{ .mib;	setf.sig	f8=r35	// w
	mov		pr.rot=0x800001<<16
			// ------^----- serves as (p50) at first (p27)
	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
					}

#ifndef XMA_TEMPTATION

// r32-r47 rotate inside the loop, so rp/ap are moved to static r14/r15.
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mmi;	mov		r40=0		// serves as r35 at first (p27)
	mov		ar.ec=13	};;

// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
// bypass L1 cache and L2 latency is actually best-case scenario for
// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
// would give us ~5% in *overall* performance improvement on "wider"
// IA-64, but would hurt Itanium for about same because of longer
// epilogue. As it's a matter of few percents in either case I've
// chosen to trade the scalability for development time (you can see
// this very instruction sequence in bn_mul_add_words loop which in
// turn is scalable).
.L_bn_mul_words_ctop:
{ .mfi;	(p25)	getf.sig	r36=f52			// low
	(p21)	xmpy.lu		f48=f37,f8
	(p28)	cmp.ltu		p54,p50=r41,r39	}
{ .mfi;	(p16)	ldf8		f32=[r15],8
	(p21)	xmpy.hu		f40=f37,f8
	(p0)	nop.i		0x0		};;
{ .mii;	(p25)	getf.sig	r32=f44			// high
	.pred.rel	"mutex",p50,p54
	(p50)	add		r40=r38,r35		// (p27)
	(p54)	add		r40=r38,r35,1	}	// (p27)
{ .mfb;	(p28)	st8		[r14]=r41,8
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

// Return the last high word, plus one if a carry is still pending.
{ .mii;	nop.m		0x0
.pred.rel	"mutex",p51,p55
(p51)	add		r8=r36,r0
(p55)	add		r8=r36,r0,1	}
{ .mfb;	nop.m	0x0
	nop.f	0x0
	nop.b	0x0			}

#else	// XMA_TEMPTATION

// NOTE(review): this path addresses memory through r32/r33 directly,
// without the ADDP conversion used by the production path - confirm
// before enabling it on HP-UX ILP32.
	setf.sig	f37=r0	// serves as carry at (p18) tick
	mov		ar.lc=r10
	mov		ar.ec=5;;

// Most of you examining this code very likely wonder why in the name
// of Intel the following loop is commented out? Indeed, it looks so
// neat that you find it hard to believe that it's something wrong
// with it, right? The catch is that every iteration depends on the
// result from previous one and the latter isn't available instantly.
// The loop therefore spins at the latency of xma minus 1, or in other
// words at 6*(n+4) ticks:-( Compare to the "production" loop above
// that runs in 2*(n+12) where the low latency problem is worked around
// by moving the dependency to one-tick latent integer ALU. Note that
// "distance" between ldf8 and xma is not latency of ldf8, but the
// *difference* between xma and ldf8 latencies.
.L_bn_mul_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p18)	xma.hu		f38=f34,f8,f39	}
{ .mfb;	(p20)	stf8		[r32]=f37,8
	(p18)	xma.lu		f35=f34,f8,f39
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

	getf.sig	r8=f41		// the return value

#endif	// XMA_TEMPTATION

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff	// restore all predicates
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_mul_words#
#endif

#if 1
//
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
// rp[i] = low64(rp[i] + ap[i]*w + carry), carry = high part,
// for i in [0,num).
//
// In:	r32=rp, r33=ap, r34=num (tested as a 32-bit signed int), r35=w
// Out:	r8 = final carry word; 0 when num<=0
// Uses:	r2 (ar.pfs copy made by alloc), r3 (saved ar.lc), r9 (saved pr),
//	r10 (num-1), f8 (w), r14 (rp store pointer), r15 (ap load
//	pointer), r16 (rp load pointer), rotating r32-r39, rotating FP
//	registers and predicates.
// ar.lc and pr are restored from r3/r9 before returning; um.mfh is
// cleared with "rum 1<<5" (see the Q&A in the file header).
.global	bn_mul_add_words#
.proc	bn_mul_add_words#
.align	64
.skip	48	// makes the loop body aligned at 64-byte boundary
bn_mul_add_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
	.save	ar.lc,r3
	.save	pr,r9
{ .mmi;	alloc		r2=ar.pfs,4,4,0,8
	cmp4.le		p6,p0=r34,r0		// p6 = (num <= 0)
	mov		r3=ar.lc	};;
{ .mib;	mov		r8=r0		// return value
	sub		r10=r34,r0,1	// loop count = num-1
(p6)	br.ret.spnt.many	b0	};;

	.body
{ .mib;	setf.sig	f8=r35		// w
	mov		r9=pr
	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
					}
// r32-r39 rotate inside the loop, so the pointers are moved to static
// registers; rp is kept twice, once for loads (r16), once for stores (r14).
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mii;	ADDP		r16=0,r32	// rp copy
	mov		pr.rot=0x2001<<16
			// ------^----- serves as (p40) at first (p27)
	mov		ar.ec=11	};;

// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
// Itanium 2. Yes, unlike previous versions it scales:-) Previous
// version was performing *all* additions in IALU and was starving
// for those even on Itanium 2. In this version one addition is
// moved to FPU and is folded with multiplication. This is at cost
// of propagating the result from previous call to this subroutine
// to L2 cache... In other words negligible even for shorter keys.
// *Overall* performance improvement [over previous version] varies
// from 11 to 22 percent depending on key length.
.L_bn_mul_add_words_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p23)	getf.sig	r36=f45			// low
	(p20)	xma.lu		f42=f36,f8,f50		// low
	(p40)	add		r39=r39,r35	}	// (p27)
{ .mfi;	(p16)	ldf8		f32=[r15],8		// *(ap++)
	(p20)	xma.hu		f36=f36,f8,f50		// high
	(p42)	add		r39=r39,r35,1	};;	// (p27)
{ .mmi;	(p24)	getf.sig	r32=f40			// high
	(p16)	ldf8		f46=[r16],8		// *(rp1++)
	(p40)	cmp.ltu		p41,p39=r39,r35	}	// (p27)
{ .mib;	(p26)	st8		[r14]=r39,8		// *(rp2++)
	(p42)	cmp.leu		p41,p39=r39,r35		// (p27)
	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
.L_bn_mul_add_words_cend:

// Return the last high word, plus one if a carry is still pending.
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		r8=r35,r0
(p42)	add		r8=r35,r0,1
	mov		pr=r9,0x1ffff	}	// restore all predicates
{ .mib;	rum		1<<5		// clear um.mfh
	mov		ar.lc=r3
	br.ret.sptk.many	b0	};;
.endp	bn_mul_add_words#
#endif

#if 1
//
// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//
// rp[2*i] = low64(ap[i]^2), rp[2*i+1] = high64(ap[i]^2),
// for i in [0,num).
//
// In:	r32=rp, r33=ap, r34=num (sign-extended from 32 bits on entry)
// Out:	none per the C prototype (r8 is nevertheless cleared)
// Uses:	r2 (ar.pfs copy made by alloc), r3 (saved ar.lc), r9 (saved pr),
//	r10 (num-1), r32 (store pointer for low halves), r34 (store
//	pointer for high halves, rp+8), r33 (load pointer), rotating FP
//	registers and predicates. No general registers rotate here.
// ar.lc and pr are restored from r3/r9 before returning; um.mfh is
// cleared with "rum 1<<5" (see the Q&A in the file header).
// NOTE(review): pr is saved in r9 without a matching ".save pr,r9"
// unwind directive (bn_mul_add_words has one) - confirm.
.global	bn_sqr_words#
.proc	bn_sqr_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_sqr_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,3,0,0,0
	sxt4		r34=r34		};;
{ .mii;	cmp.le		p6,p0=r34,r0		// p6 = (num <= 0)
	mov		r8=r0		}	// return value
{ .mfb;	ADDP		r32=0,r32
	nop.f		0x0
(p6)	br.ret.spnt.many	b0	};;

	.save	ar.lc,r3
{ .mii;	sub	r10=r34,r0,1			// loop count = num-1
	mov	r3=ar.lc
	mov	r9=pr			};;

	.body
{ .mib;	ADDP		r33=0,r33
	mov		pr.rot=1<<16		// p16=1, other rotating preds=0
	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
					}
{ .mii;	add		r34=8,r32		// r34 = &rp[1]; num no longer needed
	mov		ar.lc=r10
	mov		ar.ec=18	};;

// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
// possible to compress the epilogue (I'm getting tired to write this
// comment over and over) and get down to 2*n+16 at the cost of
// scalability. The decision will very likely be reconsidered after the
// benchmark program is profiled. I.e. if performance gain on Itanium
// will appear larger than loss on "wider" IA-64, then the loop should
// be explicitly split and the epilogue compressed.
.L_bn_sqr_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p25)	xmpy.lu		f42=f41,f41
	(p0)	nop.i		0x0		}
{ .mib;	(p33)	stf8		[r32]=f50,16	// low half, stride 2 words
	(p0)	nop.i		0x0
	(p0)	nop.b		0x0		}
{ .mfi;	(p0)	nop.m		0x0
	(p25)	xmpy.hu		f52=f41,f41
	(p0)	nop.i		0x0		}
{ .mib;	(p33)	stf8		[r34]=f60,16	// high half, stride 2 words
	(p0)	nop.i		0x0
	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
.L_bn_sqr_words_cend:

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff	// restore all predicates
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sqr_words#
#endif

#if 1
525
// Apparently we win nothing by implementing special bn_sqr_comba8.
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549
// Yes, it is possible to reduce the number of multiplications by
// almost factor of two, but then the amount of additions would
// increase by factor of two (as we would have to perform those
// otherwise performed by xma ourselves). Normally we would trade
// anyway as multiplications are way more expensive, but not this
// time... Multiplication kernel is fully pipelined and as we drain
// one 128-bit multiplication result per clock cycle multiplications
// are effectively as inexpensive as additions. Special implementation
// might become of interest for "wider" IA-64 implementation as you'll
// be able to get through the multiplication phase faster (there won't
// be any stall issues as discussed in the commentary section below and
// you therefore will be able to employ all 4 FP units)... But these
// Itanium days it's simply too hard to justify the effort so I just
// drop down to bn_mul_comba8 code:-)
//
// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
//
// Squaring is done by the multiplication code: this stub sets up the
// registers as bn_mul_comba8 would for b == a and then jumps to
// .L_cheat_entry_point8 inside bn_mul_comba8, which also performs the
// return.
//
// In:	r32=r, r33=a
// Sets:	r34=a (stands in for bn_mul_comba8's third argument b),
//	r14=a+8, r15=a+16, r16=a+24, r17=b+8, r18=b+16
.global	bn_sqr_comba8#
.proc	bn_sqr_comba8#
.align	64
bn_sqr_comba8:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
// 32-bit HP-UX: convert both pointer arguments with addp4 first.
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
	addp4	r33=0,r33
	addp4	r32=0,r32		};;
{ .mii;
#else
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
#endif
	mov	r34=r33			// b = a
	add	r14=8,r33		};;
	.body
{ .mii;	add	r17=8,r34
	add	r15=16,r33
	add	r18=16,r34		}
{ .mfb;	add	r16=24,r33
	br	.L_cheat_entry_point8	};;
.endp	bn_sqr_comba8#
567 568 569 570 571 572 573 574 575
#endif

#if 1
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
576 577 578 579
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
// result from getf.sig. I do nothing about it at this point for
// reasons depicted below.
//
580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621
// However! It should be noted that even 160 ticks is darn good result
// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
// C version (compiled with gcc with inline assembler). I really
// kicked compiler's butt here, didn't I? Yeah! This brings us to the
// following statement. It's damn shame that this routine isn't called
// very often nowadays! According to the profiler most CPU time is
// consumed by bn_mul_add_words called from BN_from_montgomery. In
// order to estimate what we're missing, I've compared the performance
// of this routine against "traditional" implementation, i.e. against
// following routine:
//
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
// {	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
//	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
//	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
//	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
//	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
//	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
//	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
//	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
// }
//
// The one below is over 8 times faster than the one above:-( Even
// more reasons to "combafy" bn_mul_add_mont...
//
// And yes, this routine really made me wish there were an optimizing
// assembler! It also feels like it deserves a dedication.
//
//	To my wife for being there and to my kids...
//
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
//
#define	carry1	r14
#define	carry2	r15
#define	carry3	r34
.global	bn_mul_comba8#
.proc	bn_mul_comba8#
.align	64
bn_mul_comba8:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
622
#if defined(_HPUX_SOURCE) && !defined(_LP64)
623
{ .mii;	alloc	r2=ar.pfs,3,0,0,0
A
Andy Polyakov 已提交
624 625 626 627 628 629
	addp4	r33=0,r33
	addp4	r34=0,r34		};;
{ .mii;	addp4	r32=0,r32
#else
{ .mii;	alloc   r2=ar.pfs,3,0,0,0
#endif
630 631 632
	add	r14=8,r33
	add	r17=8,r34		}
	.body
633
{ .mii;	add	r15=16,r33
634
	add	r18=16,r34
635
	add	r16=24,r33		}
636
.L_cheat_entry_point8:
637
{ .mmi;	add	r19=24,r34
638

639
	ldf8	f32=[r33],32		};;
640

641 642 643 644 645
{ .mmi;	ldf8	f120=[r34],32
	ldf8	f121=[r17],32		}
{ .mmi;	ldf8	f122=[r18],32
	ldf8	f123=[r19],32		};;
{ .mmi;	ldf8	f124=[r34]
646
	ldf8	f125=[r17]		}
647
{ .mmi;	ldf8	f126=[r18]
648 649
	ldf8	f127=[r19]		}

650
{ .mmi;	ldf8	f33=[r14],32
651
	ldf8	f34=[r15],32		}
652
{ .mmi;	ldf8	f35=[r16],32;;
653
	ldf8	f36=[r33]		}
654
{ .mmi;	ldf8	f37=[r14]
655
	ldf8	f38=[r15]		}
656
{ .mfi;	ldf8	f39=[r16]
657 658 659 660 661
// -------\ Entering multiplier's heaven /-------
// ------------\                    /------------
// -----------------\          /-----------------
// ----------------------\/----------------------
		xma.hu	f41=f32,f120,f0		}
662 663 664 665 666 667 668 669 670 671 672 673 674 675 676
{ .mfi;		xma.lu	f40=f32,f120,f0		};; // (*)
{ .mfi;		xma.hu	f51=f32,f121,f0		}
{ .mfi;		xma.lu	f50=f32,f121,f0		};;
{ .mfi;		xma.hu	f61=f32,f122,f0		}
{ .mfi;		xma.lu	f60=f32,f122,f0		};;
{ .mfi;		xma.hu	f71=f32,f123,f0		}
{ .mfi;		xma.lu	f70=f32,f123,f0		};;
{ .mfi;		xma.hu	f81=f32,f124,f0		}
{ .mfi;		xma.lu	f80=f32,f124,f0		};;
{ .mfi;		xma.hu	f91=f32,f125,f0		}
{ .mfi;		xma.lu	f90=f32,f125,f0		};;
{ .mfi;		xma.hu	f101=f32,f126,f0	}
{ .mfi;		xma.lu	f100=f32,f126,f0	};;
{ .mfi;		xma.hu	f111=f32,f127,f0	}
{ .mfi;		xma.lu	f110=f32,f127,f0	};;//
677 678 679 680 681 682 683 684 685 686 687
// (*)	You can argue that splitting at every second bundle would
//	prevent "wider" IA-64 implementations from achieving the peak
//	performance. Well, not really... The catch is that if you
//	intend to keep 4 FP units busy by splitting at every fourth
//	bundle and thus perform these 16 multiplications in 4 ticks,
//	the first bundle *below* would stall because the result from
//	the first xma bundle *above* won't be available for another 3
//	ticks (if not more, being an optimist, I assume that "wider"
//	implementation will have same latency:-). This stall will hold
//	you back and the performance would be as if every second bundle
//	were split *anyway*...
688
{ .mfi;	getf.sig	r16=f40
689 690
		xma.hu	f42=f33,f120,f41
	add		r33=8,r32		}
691 692
{ .mfi;		xma.lu	f41=f33,f120,f41	};;
{ .mfi;	getf.sig	r24=f50
693
		xma.hu	f52=f33,f121,f51	}
694 695
{ .mfi;		xma.lu	f51=f33,f121,f51	};;
{ .mfi;	st8		[r32]=r16,16
696
		xma.hu	f62=f33,f122,f61	}
697 698 699 700 701 702 703 704 705 706 707
{ .mfi;		xma.lu	f61=f33,f122,f61	};;
{ .mfi;		xma.hu	f72=f33,f123,f71	}
{ .mfi;		xma.lu	f71=f33,f123,f71	};;
{ .mfi;		xma.hu	f82=f33,f124,f81	}
{ .mfi;		xma.lu	f81=f33,f124,f81	};;
{ .mfi;		xma.hu	f92=f33,f125,f91	}
{ .mfi;		xma.lu	f91=f33,f125,f91	};;
{ .mfi;		xma.hu	f102=f33,f126,f101	}
{ .mfi;		xma.lu	f101=f33,f126,f101	};;
{ .mfi;		xma.hu	f112=f33,f127,f111	}
{ .mfi;		xma.lu	f111=f33,f127,f111	};;//
708
//-------------------------------------------------//
709
{ .mfi;	getf.sig	r25=f41
710
		xma.hu	f43=f34,f120,f42	}
711 712
{ .mfi;		xma.lu	f42=f34,f120,f42	};;
{ .mfi;	getf.sig	r16=f60
713
		xma.hu	f53=f34,f121,f52	}
714 715
{ .mfi;		xma.lu	f52=f34,f121,f52	};;
{ .mfi;	getf.sig	r17=f51
716 717
		xma.hu	f63=f34,f122,f62
	add		r25=r25,r24		}
718
{ .mfi;		xma.lu	f62=f34,f122,f62
719
	mov		carry1=0		};;
720
{ .mfi;	cmp.ltu		p6,p0=r25,r24
721
		xma.hu	f73=f34,f123,f72	}
722 723
{ .mfi;		xma.lu	f72=f34,f123,f72	};;
{ .mfi;	st8		[r33]=r25,16
724 725
		xma.hu	f83=f34,f124,f82
(p6)	add		carry1=1,carry1		}
726 727 728 729 730 731 732
{ .mfi;		xma.lu	f82=f34,f124,f82	};;
{ .mfi;		xma.hu	f93=f34,f125,f92	}
{ .mfi;		xma.lu	f92=f34,f125,f92	};;
{ .mfi;		xma.hu	f103=f34,f126,f102	}
{ .mfi;		xma.lu	f102=f34,f126,f102	};;
{ .mfi;		xma.hu	f113=f34,f127,f112	}
{ .mfi;		xma.lu	f112=f34,f127,f112	};;//
733
//-------------------------------------------------//
734
{ .mfi;	getf.sig	r18=f42
735 736
		xma.hu	f44=f35,f120,f43
	add		r17=r17,r16		}
737 738
{ .mfi;		xma.lu	f43=f35,f120,f43	};;
{ .mfi;	getf.sig	r24=f70
739
		xma.hu	f54=f35,f121,f53	}
740
{ .mfi;	mov		carry2=0
741
		xma.lu	f53=f35,f121,f53	};;
742
{ .mfi;	getf.sig	r25=f61
743 744
		xma.hu	f64=f35,f122,f63
	cmp.ltu		p7,p0=r17,r16		}
745
{ .mfi;	add		r18=r18,r17
746
		xma.lu	f63=f35,f122,f63	};;
747
{ .mfi;	getf.sig	r26=f52
748 749
		xma.hu	f74=f35,f123,f73
(p7)	add		carry2=1,carry2		}
750
{ .mfi;	cmp.ltu		p7,p0=r18,r17
751 752
		xma.lu	f73=f35,f123,f73
	add		r18=r18,carry1		};;
753
{ .mfi;
754 755
		xma.hu	f84=f35,f124,f83
(p7)	add		carry2=1,carry2		}
756
{ .mfi;	cmp.ltu		p7,p0=r18,carry1
757
		xma.lu	f83=f35,f124,f83	};;
758
{ .mfi;	st8		[r32]=r18,16
759 760
		xma.hu	f94=f35,f125,f93
(p7)	add		carry2=1,carry2		}
761 762 763 764 765
{ .mfi;		xma.lu	f93=f35,f125,f93	};;
{ .mfi;		xma.hu	f104=f35,f126,f103	}
{ .mfi;		xma.lu	f103=f35,f126,f103	};;
{ .mfi;		xma.hu	f114=f35,f127,f113	}
{ .mfi;	mov		carry1=0
766
		xma.lu	f113=f35,f127,f113
767
	add		r25=r25,r24		};;//
768
//-------------------------------------------------//
769
{ .mfi;	getf.sig	r27=f43
770 771
		xma.hu	f45=f36,f120,f44
	cmp.ltu		p6,p0=r25,r24		}
772
{ .mfi;		xma.lu	f44=f36,f120,f44	
773
	add		r26=r26,r25		};;
774
{ .mfi;	getf.sig	r16=f80
775 776
		xma.hu	f55=f36,f121,f54
(p6)	add		carry1=1,carry1		}
777 778
{ .mfi;		xma.lu	f54=f36,f121,f54	};;
{ .mfi;	getf.sig	r17=f71
779 780
		xma.hu	f65=f36,f122,f64
	cmp.ltu		p6,p0=r26,r25		}
781
{ .mfi;		xma.lu	f64=f36,f122,f64
782
	add		r27=r27,r26		};;
783
{ .mfi;	getf.sig	r18=f62
784 785
		xma.hu	f75=f36,f123,f74
(p6)	add		carry1=1,carry1		}
786
{ .mfi;	cmp.ltu		p6,p0=r27,r26
787 788
		xma.lu	f74=f36,f123,f74
	add		r27=r27,carry2		};;
789
{ .mfi;	getf.sig	r19=f53
790 791
		xma.hu	f85=f36,f124,f84
(p6)	add		carry1=1,carry1		}
792
{ .mfi;		xma.lu	f84=f36,f124,f84
793
	cmp.ltu		p6,p0=r27,carry2	};;
794
{ .mfi;	st8		[r33]=r27,16
795 796
		xma.hu	f95=f36,f125,f94
(p6)	add		carry1=1,carry1		}
797 798 799
{ .mfi;		xma.lu	f94=f36,f125,f94	};;
{ .mfi;		xma.hu	f105=f36,f126,f104	}
{ .mfi;	mov		carry2=0
800 801
		xma.lu	f104=f36,f126,f104
	add		r17=r17,r16		};;
802
{ .mfi;		xma.hu	f115=f36,f127,f114
803
	cmp.ltu		p7,p0=r17,r16		}
804 805
{ .mfi;		xma.lu	f114=f36,f127,f114
	add		r18=r18,r17		};;//
806
//-------------------------------------------------//
807
{ .mfi;	getf.sig	r20=f44
808 809
		xma.hu	f46=f37,f120,f45
(p7)	add		carry2=1,carry2		}
810
{ .mfi;	cmp.ltu		p7,p0=r18,r17
811 812
		xma.lu	f45=f37,f120,f45
	add		r19=r19,r18		};;
813
{ .mfi;	getf.sig	r24=f90
814
		xma.hu	f56=f37,f121,f55	}
815 816
{ .mfi;		xma.lu	f55=f37,f121,f55	};;
{ .mfi;	getf.sig	r25=f81
817 818
		xma.hu	f66=f37,f122,f65
(p7)	add		carry2=1,carry2		}
819
{ .mfi;	cmp.ltu		p7,p0=r19,r18
820 821
		xma.lu	f65=f37,f122,f65
	add		r20=r20,r19		};;
822
{ .mfi;	getf.sig	r26=f72
823 824
		xma.hu	f76=f37,f123,f75
(p7)	add		carry2=1,carry2		}
825
{ .mfi;	cmp.ltu		p7,p0=r20,r19
826 827
		xma.lu	f75=f37,f123,f75
	add		r20=r20,carry1		};;
828
{ .mfi;	getf.sig	r27=f63
829 830
		xma.hu	f86=f37,f124,f85
(p7)	add		carry2=1,carry2		}
831
{ .mfi;		xma.lu	f85=f37,f124,f85
832
	cmp.ltu		p7,p0=r20,carry1	};;
833
{ .mfi;	getf.sig	r28=f54
834 835
		xma.hu	f96=f37,f125,f95
(p7)	add		carry2=1,carry2		}
836
{ .mfi;	st8		[r32]=r20,16
837
		xma.lu	f95=f37,f125,f95	};;
838 839
{ .mfi;		xma.hu	f106=f37,f126,f105	}
{ .mfi;	mov		carry1=0
840 841
		xma.lu	f105=f37,f126,f105
	add		r25=r25,r24		};;
842
{ .mfi;		xma.hu	f116=f37,f127,f115
843
	cmp.ltu		p6,p0=r25,r24		}
844 845
{ .mfi;		xma.lu	f115=f37,f127,f115
	add		r26=r26,r25		};;//
846
//-------------------------------------------------//
847
{ .mfi;	getf.sig	r29=f45
848 849
		xma.hu	f47=f38,f120,f46
(p6)	add		carry1=1,carry1		}
850
{ .mfi;	cmp.ltu		p6,p0=r26,r25
851 852
		xma.lu	f46=f38,f120,f46
	add		r27=r27,r26		};;
853
{ .mfi;	getf.sig	r16=f100
854 855
		xma.hu	f57=f38,f121,f56
(p6)	add		carry1=1,carry1		}
856
{ .mfi;	cmp.ltu		p6,p0=r27,r26
857 858
		xma.lu	f56=f38,f121,f56
	add		r28=r28,r27		};;
859
{ .mfi;	getf.sig	r17=f91
860 861
		xma.hu	f67=f38,f122,f66
(p6)	add		carry1=1,carry1		}
862
{ .mfi;	cmp.ltu		p6,p0=r28,r27
863 864
		xma.lu	f66=f38,f122,f66
	add		r29=r29,r28		};;
865
{ .mfi;	getf.sig	r18=f82
866 867
		xma.hu	f77=f38,f123,f76
(p6)	add		carry1=1,carry1		}
868
{ .mfi;	cmp.ltu		p6,p0=r29,r28
869 870
		xma.lu	f76=f38,f123,f76
	add		r29=r29,carry2		};;
871
{ .mfi;	getf.sig	r19=f73
872 873
		xma.hu	f87=f38,f124,f86
(p6)	add		carry1=1,carry1		}
874
{ .mfi;		xma.lu	f86=f38,f124,f86
875
	cmp.ltu		p6,p0=r29,carry2	};;
876
{ .mfi;	getf.sig	r20=f64
877 878
		xma.hu	f97=f38,f125,f96
(p6)	add		carry1=1,carry1		}
879
{ .mfi;	st8		[r33]=r29,16
880
		xma.lu	f96=f38,f125,f96	};;
881
{ .mfi;	getf.sig	r21=f55
882
		xma.hu	f107=f38,f126,f106	}
883
{ .mfi;	mov		carry2=0
884 885
		xma.lu	f106=f38,f126,f106
	add		r17=r17,r16		};;
886
{ .mfi;		xma.hu	f117=f38,f127,f116
887
	cmp.ltu		p7,p0=r17,r16		}
888 889
{ .mfi;		xma.lu	f116=f38,f127,f116
	add		r18=r18,r17		};;//
890
//-------------------------------------------------//
891
{ .mfi;	getf.sig	r22=f46
892 893
		xma.hu	f48=f39,f120,f47
(p7)	add		carry2=1,carry2		}
894
{ .mfi;	cmp.ltu		p7,p0=r18,r17
895 896
		xma.lu	f47=f39,f120,f47
	add		r19=r19,r18		};;
897
{ .mfi;	getf.sig	r24=f110
898 899
		xma.hu	f58=f39,f121,f57
(p7)	add		carry2=1,carry2		}
900
{ .mfi;	cmp.ltu		p7,p0=r19,r18
901 902
		xma.lu	f57=f39,f121,f57
	add		r20=r20,r19		};;
903
{ .mfi;	getf.sig	r25=f101
904 905
		xma.hu	f68=f39,f122,f67
(p7)	add		carry2=1,carry2		}
906
{ .mfi;	cmp.ltu		p7,p0=r20,r19
907 908
		xma.lu	f67=f39,f122,f67
	add		r21=r21,r20		};;
909
{ .mfi;	getf.sig	r26=f92
910 911
		xma.hu	f78=f39,f123,f77
(p7)	add		carry2=1,carry2		}
912
{ .mfi;	cmp.ltu		p7,p0=r21,r20
913 914
		xma.lu	f77=f39,f123,f77
	add		r22=r22,r21		};;
915
{ .mfi;	getf.sig	r27=f83
916 917
		xma.hu	f88=f39,f124,f87
(p7)	add		carry2=1,carry2		}
918
{ .mfi;	cmp.ltu		p7,p0=r22,r21
919 920
		xma.lu	f87=f39,f124,f87
	add		r22=r22,carry1		};;
921
{ .mfi;	getf.sig	r28=f74
922 923
		xma.hu	f98=f39,f125,f97
(p7)	add		carry2=1,carry2		}
924
{ .mfi;		xma.lu	f97=f39,f125,f97
925
	cmp.ltu		p7,p0=r22,carry1	};;
926
{ .mfi;	getf.sig	r29=f65
927 928
		xma.hu	f108=f39,f126,f107
(p7)	add		carry2=1,carry2		}
929
{ .mfi;	st8		[r32]=r22,16
930
		xma.lu	f107=f39,f126,f107	};;
931
{ .mfi;	getf.sig	r30=f56
932
		xma.hu	f118=f39,f127,f117	}
933
{ .mfi;		xma.lu	f117=f39,f127,f117	};;//
934 935 936
//-------------------------------------------------//
// Leaving multiplier's heaven... Quite a ride, huh?

937
{ .mii;	getf.sig	r31=f47
938 939
	add		r25=r25,r24
	mov		carry1=0		};;
940
{ .mii;		getf.sig	r16=f111
941 942
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
943 944
{ .mfb;		getf.sig	r17=f102	}
{ .mii;
945 946 947
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
948 949
{ .mfb;	nop.m	0x0				}
{ .mii;
950 951 952
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r28=r28,r27		};;
953
{ .mii;		getf.sig	r18=f93
954 955
		add		r17=r17,r16
		mov		carry3=0	}
956
{ .mii;
957 958 959
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,r27
	add		r29=r29,r28		};;
960
{ .mii;		getf.sig	r19=f84
961
		cmp.ltu		p7,p0=r17,r16	}
962
{ .mii;
963 964 965
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r29,r28
	add		r30=r30,r29		};;
966
{ .mii;		getf.sig	r20=f75
967
		add		r18=r18,r17	}
968
{ .mii;
969 970 971
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r30,r29
	add		r31=r31,r30		};;
972 973
{ .mfb;		getf.sig	r21=f66		}
{ .mii;	(p7)	add		carry3=1,carry3
974 975
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	}
976 977
{ .mfb;	nop.m	0x0				}
{ .mii;
978 979 980
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r31,r30
	add		r31=r31,carry2		};;
981 982
{ .mfb;		getf.sig	r22=f57		}
{ .mii;	(p7)	add		carry3=1,carry3
983 984
		cmp.ltu		p7,p0=r19,r18
		add		r20=r20,r19	}
985 986
{ .mfb;	nop.m	0x0				}
{ .mii;
987 988
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r31,carry2	};;
989 990
{ .mfb;		getf.sig	r23=f48		}
{ .mii;	(p7)	add		carry3=1,carry3
991 992
		cmp.ltu		p7,p0=r20,r19
		add		r21=r21,r20	}
993
{ .mii;
994
(p6)	add		carry1=1,carry1		}
995
{ .mfb;	st8		[r33]=r31,16		};;
996

997 998
{ .mfb;	getf.sig	r24=f112		}
{ .mii;	(p7)	add		carry3=1,carry3
999 1000
		cmp.ltu		p7,p0=r21,r20
		add		r22=r22,r21	};;
1001 1002
{ .mfb;	getf.sig	r25=f103		}
{ .mii;	(p7)	add		carry3=1,carry3
1003 1004
		cmp.ltu		p7,p0=r22,r21
		add		r23=r23,r22	};;
1005 1006
{ .mfb;	getf.sig	r26=f94			}
{ .mii;	(p7)	add		carry3=1,carry3
1007 1008
		cmp.ltu		p7,p0=r23,r22
		add		r23=r23,carry1	};;
1009 1010
{ .mfb;	getf.sig	r27=f85			}
{ .mii;	(p7)	add		carry3=1,carry3
1011
		cmp.ltu		p7,p8=r23,carry1};;
1012
{ .mii;	getf.sig	r28=f76
1013 1014
	add		r25=r25,r24
	mov		carry1=0		}
1015
{ .mii;		st8		[r32]=r23,16
1016 1017 1018
	(p7)	add		carry2=1,carry3
	(p8)	add		carry2=0,carry3	};;

1019 1020
{ .mfb;	nop.m	0x0				}
{ .mii;	getf.sig	r29=f67
1021 1022
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
1023 1024
{ .mfb;	getf.sig	r30=f58			}
{ .mii;
1025 1026 1027
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
1028 1029
{ .mfb;		getf.sig	r16=f113	}
{ .mii;
1030 1031 1032
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r28=r28,r27		};;
1033 1034
{ .mfb;		getf.sig	r17=f104	}
{ .mii;
1035 1036 1037
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,r27
	add		r29=r29,r28		};;
1038 1039
{ .mfb;		getf.sig	r18=f95		}
{ .mii;
1040 1041 1042
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r29,r28
	add		r30=r30,r29		};;
1043
{ .mii;		getf.sig	r19=f86
1044 1045
		add		r17=r17,r16
		mov		carry3=0	}
1046
{ .mii;
1047 1048 1049
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r30,r29
	add		r30=r30,carry2		};;
1050
{ .mii;		getf.sig	r20=f77
1051 1052
		cmp.ltu		p7,p0=r17,r16
		add		r18=r18,r17	}
1053
{ .mii;
1054 1055
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r30,carry2	};;
1056 1057
{ .mfb;		getf.sig	r21=f68		}
{ .mii;	st8		[r33]=r30,16
1058 1059
(p6)	add		carry1=1,carry1		};;

1060 1061
{ .mfb;	getf.sig	r24=f114		}
{ .mii;	(p7)	add		carry3=1,carry3
1062 1063
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	};;
1064 1065
{ .mfb;	getf.sig	r25=f105		}
{ .mii;	(p7)	add		carry3=1,carry3
1066 1067
		cmp.ltu		p7,p0=r19,r18
		add		r20=r20,r19	};;
1068 1069
{ .mfb;	getf.sig	r26=f96			}
{ .mii;	(p7)	add		carry3=1,carry3
1070 1071
		cmp.ltu		p7,p0=r20,r19
		add		r21=r21,r20	};;
1072 1073
{ .mfb;	getf.sig	r27=f87			}
{ .mii;	(p7)	add		carry3=1,carry3
1074 1075
		cmp.ltu		p7,p0=r21,r20
		add		r21=r21,carry1	};;
1076
{ .mib;	getf.sig	r28=f78			
1077
	add		r25=r25,r24		}
1078
{ .mib;	(p7)	add		carry3=1,carry3
1079
		cmp.ltu		p7,p8=r21,carry1};;
1080
{ .mii;		st8		[r32]=r21,16
1081 1082 1083
	(p7)	add		carry2=1,carry3
	(p8)	add		carry2=0,carry3	}

1084
{ .mii;	mov		carry1=0
1085 1086
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
1087 1088
{ .mfb;		getf.sig	r16=f115	}
{ .mii;
1089 1090 1091
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
1092 1093
{ .mfb;		getf.sig	r17=f106	}
{ .mii;
1094 1095 1096
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r28=r28,r27		};;
1097 1098
{ .mfb;		getf.sig	r18=f97		}
{ .mii;
1099 1100 1101
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,r27
	add		r28=r28,carry2		};;
1102
{ .mib;		getf.sig	r19=f88
1103
		add		r17=r17,r16	}
1104
{ .mib;
1105 1106
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,carry2	};;
1107
{ .mii;	st8		[r33]=r28,16
1108 1109
(p6)	add		carry1=1,carry1		}

1110
{ .mii;		mov		carry2=0
1111 1112
		cmp.ltu		p7,p0=r17,r16
		add		r18=r18,r17	};;
1113 1114
{ .mfb;	getf.sig	r24=f116		}
{ .mii;	(p7)	add		carry2=1,carry2
1115 1116
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	};;
1117 1118
{ .mfb;	getf.sig	r25=f107		}
{ .mii;	(p7)	add		carry2=1,carry2
1119 1120
		cmp.ltu		p7,p0=r19,r18
		add		r19=r19,carry1	};;
1121 1122
{ .mfb;	getf.sig	r26=f98			}
{ .mii;	(p7)	add		carry2=1,carry2
1123
		cmp.ltu		p7,p0=r19,carry1};;
1124
{ .mii;		st8		[r32]=r19,16
1125 1126
	(p7)	add		carry2=1,carry2	}

1127
{ .mfb;	add		r25=r25,r24		};;
1128

1129 1130
{ .mfb;		getf.sig	r16=f117	}
{ .mii;	mov		carry1=0
1131 1132
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
1133 1134
{ .mfb;		getf.sig	r17=f108	}
{ .mii;
1135 1136 1137
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r26=r26,carry2		};;
1138 1139
{ .mfb;	nop.m	0x0				}
{ .mii;
1140 1141
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,carry2	};;
1142
{ .mii;	st8		[r33]=r26,16
1143 1144
(p6)	add		carry1=1,carry1		}

1145 1146 1147
{ .mfb;		add		r17=r17,r16	};;
{ .mfb;	getf.sig	r24=f118		}
{ .mii;		mov		carry2=0
1148 1149
		cmp.ltu		p7,p0=r17,r16
		add		r17=r17,carry1	};;
1150
{ .mii;	(p7)	add		carry2=1,carry2
1151
		cmp.ltu		p7,p0=r17,carry1};;
1152
{ .mii;		st8		[r32]=r17
1153
	(p7)	add		carry2=1,carry2	};;
1154 1155
{ .mfb;	add		r24=r24,carry2		};;
{ .mib;	st8		[r33]=r24		}
1156

1157
{ .mib;	rum		1<<5		// clear um.mfh
1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179
	br.ret.sptk.many	b0	};;
.endp	bn_mul_comba8#
#undef	carry3
#undef	carry2
#undef	carry1
#endif

#if 1
// It's possible to make it faster (see comment to bn_sqr_comba8), but
// I reckon it's not worth the effort, basically because the routine
// (actually both of them) is practically never called... So I just
// play the same trick as with bn_sqr_comba8.
//
// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
//
// Thin front end for bn_mul_comba4: it copies a (r33) into r34, sets up
// the same element pointers the bn_mul_comba4 prologue computes and
// then branches to .L_cheat_entry_point4 inside that routine.
//
//	r32		r
//	r33		a
//	r34		copy of a, standing in for bn_mul_comba4's third argument
//	r14,r15,r16	a+8, a+16, a+24
//	r17,r18		r34+8, r34+16
//
.global	bn_sqr_comba4#
.proc	bn_sqr_comba4#
.align	64
bn_sqr_comba4:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
// 32-bit HP-UX build: pass both pointer arguments through addp4 before
// they are used as addresses.
{ .mii;	alloc   r2=ar.pfs,2,1,0,0
	addp4	r32=0,r32
	addp4	r33=0,r33		};;
{ .mii;
#else
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
#endif
	// Two inputs plus one local: r34 takes the place of the
	// multiplier's third argument.
	mov	r34=r33
	add	r14=8,r33		};;
	.body
{ .mii;	add	r17=8,r34
	add	r15=16,r33
	add	r18=16,r34		}
{ .mfb;	add	r16=24,r33
	// Continue in bn_mul_comba4, which also performs the return.
	br	.L_cheat_entry_point4	};;
.endp	bn_sqr_comba4#
#endif

#if 1
// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
//
// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
//
// Multiplies the four 64-bit words at a by the four 64-bit words at b
// and stores eight 64-bit result words at r.
//
// Register use:
//	r32		r; store pointer for r[0], r[2], r[4], r[6]
//	r33		a; once a[0] has been loaded it is re-pointed at
//			r+8 and becomes the store pointer for r[1], r[3],
//			r[5], r[7]
//	r34		b
//	r14-r16		a+8, a+16, a+24 (load addresses only)
//	r17-r19		b+8, b+16, b+24 (load addresses only)
//	f32-f35		a[0..3]
//	f120-f123	b[0..3]
//	f40-f74		partial products, chained through xma's addend
//	r16-r19,r24-r27	partial products moved to integer registers
//	carry1, carry2	carry counters; they alias r14/r15, which is safe
//			because both are zeroed only after the ldf8's that
//			use r14/r15 as addresses
//	p6, p7		"the last add wrapped" for the carry1/carry2 chains
//
// Both st8 streams post-increment by 16, so r32 and r33 leapfrog each
// other through the result.
//
// bn_sqr_comba4 enters at .L_cheat_entry_point4 with r34 equal to r33
// and r14-r18 already set up.
//
#define	carry1	r14
#define	carry2	r15
.global	bn_mul_comba4#
.proc	bn_mul_comba4#
.align	64
bn_mul_comba4:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
// 32-bit HP-UX build: pass all three pointer arguments through addp4
// before they are used as addresses.
{ .mii;	alloc   r2=ar.pfs,3,0,0,0
	addp4	r33=0,r33
	addp4	r34=0,r34		};;
{ .mii;	addp4	r32=0,r32
#else
{ .mii;	alloc	r2=ar.pfs,3,0,0,0
#endif
	add	r14=8,r33
	add	r17=8,r34		}
	.body
{ .mii;	add	r15=16,r33
	add	r18=16,r34
	add	r16=24,r33		};;
.L_cheat_entry_point4:
// Load all eight operand words into floating-point registers.
{ .mmi;	add	r19=24,r34
	ldf8	f32=[r33]		}
{ .mmi;	ldf8	f120=[r34]
	ldf8	f121=[r17]		};;
{ .mmi;	ldf8	f122=[r18]
	ldf8	f123=[r19]		}
{ .mmi;	ldf8	f33=[r14]
	ldf8	f34=[r15]		}
// a[0] * b[0..3]: xma.lu/xma.hu give the low/high half of each product.
{ .mfi;	ldf8	f35=[r16]
		xma.hu	f41=f32,f120,f0		}
{ .mfi;		xma.lu	f40=f32,f120,f0		};;
{ .mfi;		xma.hu	f51=f32,f121,f0		}
{ .mfi;		xma.lu	f50=f32,f121,f0		};;
{ .mfi;		xma.hu	f61=f32,f122,f0		}
{ .mfi;		xma.lu	f60=f32,f122,f0		};;
{ .mfi;		xma.hu	f71=f32,f123,f0		}
{ .mfi;		xma.lu	f70=f32,f123,f0		};;//
// Major stall takes place here, and 3 more places below. Result from
// first xma is not available for another 3 ticks.
//
// a[1] * b[0..3], each product adding the high half left by the
// previous row in the same column. r[0] is stored and r33 switches
// from "a" to "r+8".
{ .mfi;	getf.sig	r16=f40
		xma.hu	f42=f33,f120,f41
	add		r33=8,r32		}
{ .mfi;		xma.lu	f41=f33,f120,f41	};;
{ .mfi;	getf.sig	r24=f50
		xma.hu	f52=f33,f121,f51	}
{ .mfi;		xma.lu	f51=f33,f121,f51	};;
{ .mfi;	st8		[r32]=r16,16
		xma.hu	f62=f33,f122,f61	}
{ .mfi;		xma.lu	f61=f33,f122,f61	};;
{ .mfi;		xma.hu	f72=f33,f123,f71	}
{ .mfi;		xma.lu	f71=f33,f123,f71	};;//
//-------------------------------------------------//
// a[2] * b[0..3]; r[1] = f41 + f50 is summed and stored, its carry
// goes to p6/carry1.
{ .mfi;	getf.sig	r25=f41
		xma.hu	f43=f34,f120,f42	}
{ .mfi;		xma.lu	f42=f34,f120,f42	};;
{ .mfi;	getf.sig	r16=f60
		xma.hu	f53=f34,f121,f52	}
{ .mfi;		xma.lu	f52=f34,f121,f52	};;
{ .mfi;	getf.sig	r17=f51
		xma.hu	f63=f34,f122,f62
	add		r25=r25,r24		}
{ .mfi;	mov		carry1=0
		xma.lu	f62=f34,f122,f62	};;
{ .mfi;	st8		[r33]=r25,16
		xma.hu	f73=f34,f123,f72
	cmp.ltu		p6,p0=r25,r24		}
{ .mfi;		xma.lu	f72=f34,f123,f72	};;//
//-------------------------------------------------//
// a[3] * b[0..3]; meanwhile r[2] = f42 + f51 + f60 + carry1 is summed,
// counting its wrap-arounds in carry2.
{ .mfi;	getf.sig	r18=f42
		xma.hu	f44=f35,f120,f43
(p6)	add		carry1=1,carry1		}
{ .mfi;	add		r17=r17,r16
		xma.lu	f43=f35,f120,f43
	mov		carry2=0		};;
{ .mfi;	getf.sig	r24=f70
		xma.hu	f54=f35,f121,f53
	cmp.ltu		p7,p0=r17,r16		}
{ .mfi;		xma.lu	f53=f35,f121,f53	};;
{ .mfi;	getf.sig	r25=f61
		xma.hu	f64=f35,f122,f63
	add		r18=r18,r17		}
{ .mfi;		xma.lu	f63=f35,f122,f63
(p7)	add		carry2=1,carry2		};;
{ .mfi;	getf.sig	r26=f52
		xma.hu	f74=f35,f123,f73
	cmp.ltu		p7,p0=r18,r17		}
{ .mfi;		xma.lu	f73=f35,f123,f73
	add		r18=r18,carry1		};;
//-------------------------------------------------//
// All multiplies are issued; what remains is integer column summation.
// Each column adds its partial products plus the carry count of the
// previous column, and counts its own wrap-arounds for the next one.
{ .mii;	st8		[r32]=r18,16
(p7)	add		carry2=1,carry2
	cmp.ltu		p7,p0=r18,carry1	};;

// r[3] = f43 + f52 + f61 + f70 + carry2
{ .mfi;	getf.sig	r27=f43	// last major stall
(p7)	add		carry2=1,carry2		};;
{ .mii;		getf.sig	r16=f71
	add		r25=r25,r24
	mov		carry1=0		};;
{ .mii;		getf.sig	r17=f62
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r27=r27,carry2		};;
{ .mii;		getf.sig	r18=f53
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,carry2	};;
{ .mfi;	st8		[r33]=r27,16
(p6)	add		carry1=1,carry1		}

// r[4] = f44 + f53 + f62 + f71 + carry1
{ .mii;		getf.sig	r19=f44
		add		r17=r17,r16
		mov		carry2=0	};;
{ .mii;	getf.sig	r24=f72
		cmp.ltu		p7,p0=r17,r16
		add		r18=r18,r17	};;
{ .mii;	(p7)	add		carry2=1,carry2
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	};;
{ .mii;	(p7)	add		carry2=1,carry2
		cmp.ltu		p7,p0=r19,r18
		add		r19=r19,carry1	};;
{ .mii;	getf.sig	r25=f63
	(p7)	add		carry2=1,carry2
		cmp.ltu		p7,p0=r19,carry1};;
{ .mii;		st8		[r32]=r19,16
	(p7)	add		carry2=1,carry2	}

// r[5] = f54 + f63 + f72 + carry2
{ .mii;	getf.sig	r26=f54
	add		r25=r25,r24
	mov		carry1=0		};;
{ .mii;		getf.sig	r16=f73
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r26=r26,carry2		};;
{ .mii;		getf.sig	r17=f64
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,carry2	};;
{ .mii;	st8		[r33]=r26,16
(p6)	add		carry1=1,carry1		}

// r[6] = f64 + f73 + carry1
{ .mii;	getf.sig	r24=f74
		add		r17=r17,r16
		mov		carry2=0	};;
{ .mii;		cmp.ltu		p7,p0=r17,r16
		add		r17=r17,carry1	};;

{ .mii;	(p7)	add		carry2=1,carry2
		cmp.ltu		p7,p0=r17,carry1};;
{ .mii;		st8		[r32]=r17,16
	(p7)	add		carry2=1,carry2	};;

// r[7] = f74 + carry2; no carry is tracked out of the top word.
{ .mii;	add		r24=r24,carry2		};;
{ .mii;	st8		[r33]=r24		}

{ .mib;	rum		1<<5		// clear um.mfh
	br.ret.sptk.many	b0	};;
.endp	bn_mul_comba4#
#undef	carry2
#undef	carry1
#endif

#if 1
//
// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
//
// In a nutshell it's a port of my MIPS III/IV implementation.
//
// Divides the 128-bit value h:l by d.
//
// in:	r32 = h, r33 = l, r34 = d
// out:	r8  = quotient, or -1 if d is zero
//	r9  = remainder
//
// Outline:
//   1. d is shifted left until its top bit is set; h:l is shifted by
//      the same amount. If that would push set bits out of h, abort()
//      is called.
//   2. If the normalized h is not below d, d is subtracted from it once.
//   3. The quotient is produced as two 32-bit halves. Each half is
//      estimated by dividing the current H by the top 32 bits of D (or
//      taken as 0xffffffff when the top 32 bits of H and D are equal),
//      and the estimate is then decremented until q*D no longer exceeds
//      the 96 bits of dividend under consideration.
//   4. The final H is shifted back right by the normalization count to
//      form the remainder.
//
// r32-r39 are allocated as a rotating frame (see alloc below) and the
// three loops end in br.wtop, so a value written to rN inside a loop is
// read back as rN+1 on the following pass.
//
#define	AT	r14
#define	H	r16
#define	HH	r20
#define	L	r17
#define	D	r18
#define	DH	r22
#define	I	r21

#if 0
// Some preprocessors (most notably HP-UX) appear to be allergic to
// macros enclosed in parentheses [as these three were].
#define	cont	p16
#define	break	p0	// p20
#define	equ	p24
#else
cont=p16
break=p0
equ=p24
#endif

.global	abort#
.global	bn_div_words#
.proc	bn_div_words#
.align	64
bn_div_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,3,5,0,8
	// Each .save sits directly in front of the instruction that
	// performs the save it describes; pr is restored from r10 at
	// exit, so the unwinder is told about it as well.
	.save	b0,r3
	mov		r3=b0
	.save	pr,r10
	mov		r10=pr		};;
{ .mmb;	cmp.eq		p6,p0=r34,r0
	mov		r8=-1
(p6)	br.ret.spnt.many	b0	};;	// d == 0: return -1

	.body
{ .mii;	mov		H=r32		// save h
	mov		ar.ec=0		// don't rotate at exit
	mov		pr.rot=0	}
{ .mii;	mov		L=r33		// save l
	mov		r36=r0		};;	// shift count starts at 0

// Normalize d. While d is still positive as a signed number (top bit
// clear), write d<<1 to r33 and count+1 to r35; rotation turns them
// into r34 and r36 for the next pass. The exit pass does not rotate,
// so r34/r36 then hold the normalized d and the shift count.
.L_divw_shift:	// -vv- note signed comparison
{ .mfi;	(p0)	cmp.lt		p16,p0=r0,r34	// d
	(p0)	shladd		r33=r34,1,r0	}
{ .mfb;	(p0)	add		r35=1,r36
	(p0)	nop.f		0x0
(p16)	br.wtop.dpnt		.L_divw_shift	};;

{ .mii;	mov		D=r34
	shr.u		DH=r34,32
	sub		r35=64,r36		};;	// 64 - shift count
{ .mii;	setf.sig	f7=DH
	shr.u		AT=H,r35		// bits of h that the shift drops
	mov		I=r36			};;
{ .mib;	cmp.ne		p6,p0=r0,AT
	shl		H=H,r36
(p6)	br.call.spnt.clr	b0=abort	};;	// overflow, die...

// Shift the top bits of l into the space vacated in H.
{ .mfi;	fcvt.xuf.s1	f7=f7			// f7 = (double)DH
	shr.u		AT=L,r35		};;
{ .mii;	shl		L=L,r36
	or		H=H,AT			};;

{ .mii;	nop.m		0x0
	cmp.leu		p6,p0=D,H;;
(p6)	sub		H=H,D			}	// if (H >= D) H -= D

{ .mlx;	setf.sig	f14=D
	movl		AT=0xffffffff		};;
///////////////////////////////////////////////////////////
// Upper quotient half: estimate q.
{ .mii;	setf.sig	f6=H
	shr.u		HH=H,32;;
	cmp.eq		p6,p7=HH,DH		};;
{ .mfb;
(p6)	setf.sig	f8=AT			// top halves equal: q = 0xffffffff
(p7)	fcvt.xuf.s1	f6=f6
(p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;	// else q = H / DH

// t = q * D as th:tl; H becomes the low 64 bits of (H:L) >> 32, with
// HH still holding the 32 bits above it.
{ .mfi;	getf.sig	r33=f8				// q
	xmpy.lu		f9=f8,f14		}
{ .mfi;	xmpy.hu		f10=f8,f14
	shrp		H=H,L,32		};;

{ .mmi;	getf.sig	r35=f9				// tl
	getf.sig	r31=f10			};;	// th

// Correction loop: while th:tl > HH:H, step to q-1 (r32 -> r33) and
// tl-D (r34 -> r35), taking the borrow, if any, out of th.
.L_divw_1st_iter:
{ .mii;	(p0)	add		r32=-1,r33
	(p0)	cmp.eq		equ,cont=HH,r31		};;
{ .mii;	(p0)	cmp.ltu		p8,p0=r35,D
	(p0)	sub		r34=r35,D
	(equ)	cmp.leu		break,cont=r35,H	};;
{ .mib;	(cont)	cmp.leu		cont,break=HH,r31
	(p8)	add		r31=-1,r31
(cont)	br.wtop.spnt		.L_divw_1st_iter	};;
///////////////////////////////////////////////////////////
{ .mii;	sub		H=H,r35			// H -= tl
	shl		r8=r33,32		// upper half of the quotient
	shl		L=L,32			};;
///////////////////////////////////////////////////////////
// Lower quotient half: the same estimate-and-correct sequence.
{ .mii;	setf.sig	f6=H
	shr.u		HH=H,32;;
	cmp.eq		p6,p7=HH,DH		};;
{ .mfb;
(p6)	setf.sig	f8=AT
(p7)	fcvt.xuf.s1	f6=f6
(p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;

{ .mfi;	getf.sig	r33=f8				// q
	xmpy.lu		f9=f8,f14		}
{ .mfi;	xmpy.hu		f10=f8,f14
	shrp		H=H,L,32		};;

{ .mmi;	getf.sig	r35=f9				// tl
	getf.sig	r31=f10			};;	// th

.L_divw_2nd_iter:
{ .mii;	(p0)	add		r32=-1,r33
	(p0)	cmp.eq		equ,cont=HH,r31		};;
{ .mii;	(p0)	cmp.ltu		p8,p0=r35,D
	(p0)	sub		r34=r35,D
	(equ)	cmp.leu		break,cont=r35,H	};;
{ .mib;	(cont)	cmp.leu		cont,break=HH,r31
	(p8)	add		r31=-1,r31
(cont)	br.wtop.spnt		.L_divw_2nd_iter	};;
///////////////////////////////////////////////////////////
{ .mii;	sub	H=H,r35
	or	r8=r8,r33		// merge the lower quotient half
	mov	ar.pfs=r2		};;	// value saved by alloc
{ .mii;	shr.u	r9=H,I			// remainder if anybody wants it
	mov	pr=r10,0x1ffff		}	// put the saved predicates back
{ .mfb;	br.ret.sptk.many	b0	};;

// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
// procedure.
//
// inputs:	f6 = (double)a, f7 = (double)b
// output:	f8 = (int)(a/b)
// clobbered:	f8,f9,f10,f11,pred
pred=p15
// One can argue that this snippet is copyrighted to Intel
// Corporation, as it's essentially identical to one of those
// found in "Divide, Square Root and Remainder" section at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// Yes, I admit that the referred code was used as template,
// but after I realized that there hardly is any other instruction
// sequence which would perform this operation. I mean I figure that
// any independent attempt to implement high-performance division
// will result in code virtually identical to the Intel code. It
// should be noted though that below division kernel is 1 cycle
// faster than Intel one (note commented splits:-), not to mention
// original prologue (rather lack of one) and epilogue.
.align	32
.skip	16
.L_udiv64_32_b6:
	frcpa.s1	f8,pred=f6,f7;;		// [0]  y0 = 1 / b

(pred)	fnma.s1		f9=f7,f8,f1		// [5]  e0 = 1 - b * y0
(pred)	fmpy.s1		f10=f6,f8;;		// [5]  q0 = a * y0
(pred)	fmpy.s1		f11=f9,f9		// [10] e1 = e0 * e0
(pred)	fma.s1		f10=f9,f10,f10;;	// [10] q1 = q0 + e0 * q0
(pred)	fma.s1		f8=f9,f8,f8	//;;	// [15] y1 = y0 + e0 * y0
(pred)	fma.s1		f9=f11,f10,f10;;	// [15] q2 = q1 + e1 * q1
(pred)	fma.s1		f8=f11,f8,f8	//;;	// [20] y2 = y1 + e1 * y1
(pred)	fnma.s1		f10=f7,f9,f6;;		// [20] r2 = a - b * q2
(pred)	fma.s1		f8=f10,f8,f9;;		// [25] q3 = q2 + r2 * y2

	fcvt.fxu.trunc.s1	f8=f8		// [30] q = trunc(q3)
	br.ret.sptk.many	b6;;
.endp	bn_div_words#
#endif