// IA-64 (Itanium) assembler implementations of the bignum word primitives
// bn_add_words, bn_sub_words, bn_mul_words, bn_mul_add_words, bn_sqr_words
// and the comba multiplication/squaring routines.
//
// The code below is written with hand-placed bundles ({ ... }) and
// instruction group stops (;;), hence the .explicit directive.
.explicit
.text
.ident	"ia64.S, Version 2.1"
.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
//
// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
// different from Itanium to this module viewpoint. Most notably, is it
// "wider" than Itanium? Can you experience loop scalability as
// discussed in commentary sections? Not really:-( Itanium2 has 6
// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
// spin twice as fast, as I need 8 IALU ports. Amount of floating point
// ports is the same, i.e. 2, while I need 4. In other words, to this
// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
// essentially different in respect to this module, and a re-tune was
// required. Well, because some instruction latencies have changed. Most
// noticeably those intensively used:
//
//			Itanium	Itanium2
//	ldf8		9	6		L2 hit
//	ld8		2	1		L1 hit
//	getf		2	5
//	xma[->getf]	7[+1]	4[+0]
//	add[->st8]	1[+1]	1[+0]
//
// What does it mean? You might ratiocinate that the original code
// should run just faster... Because sum of latencies is smaller...
// Wrong! Note that getf latency increased. This means that if a loop is
// scheduled for lower latency (as they were), then it will suffer from
// stall condition and the code will therefore turn anti-scalable, e.g.
// original bn_mul_words spun at 5*n or 2.5 times slower than expected
// on Itanium2! What to do? Reschedule loops for Itanium2? But then
// Itanium would exhibit anti-scalability. So I've chosen to reschedule
// for worst latency for every instruction aiming for best *all-round*
// performance.

// Q.	How much faster does it get?
// A.	Here is the output from 'openssl speed rsa dsa' for vanilla
//	0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
//	Linux 7.1 2.96-81):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
//	rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
//	rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
//	rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0035s   0.0043s    288.3    234.8
//	dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
//
//	And here is similar output but for this assembler
//	implementation:-)
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
//	rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
//	rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
//	rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0012s   0.0013s    891.9    756.6
//	dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
//	
//	Yes, you may argue that it's not fair comparison as it's
//	possible to craft the C implementation with BN_UMULT_HIGH
//	inline assembler macro. But of course! Here is the output
//	with the macro:
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
//	rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
//	rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
//	rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0016s   0.0020s    613.1    510.5
//	dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
//
//	My code is still way faster, huh:-) And I believe that even
//	higher performance can be achieved. Note that as keys get
//	longer, performance gain is larger. Why? According to the
//	profiler there is another player in the field, namely
//	BN_from_montgomery consuming larger and larger portion of CPU
//	time as keysize decreases. I therefore consider putting effort
//	to assembler implementation of the following routine:
//
//	void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
//	{
//	int      i,j;
//	BN_ULONG v;
//
//	for (i=0; i<nl; i++)
//		{
//		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
//		nrp++;
//		rp++;
//		if (((nrp[-1]+=v)&BN_MASK2) < v)
//			for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
//		}
//	}
//
//	It might as well be beneficial to implement even combaX
//	variants, as it appears as it can literally unleash the
//	performance (see comment section to bn_mul_comba8 below).
//
//	And finally for your reference the output for 0.9.6a compiled
//	with SGIcc version 0.01.0-12 (keep in mind that for the moment
//	of this writing it's not possible to convince SGIcc to use
//	BN_UMULT_HIGH inline assembler macro, yet the code is fast,
//	i.e. for a compiler generated one:-):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
//	rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
//	rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
//	rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0018s   0.0022s    547.3    459.6
//	dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
//
//	Oh! Benchmarks were performed on 733MHz Lion-class Itanium
//	system running Redhat Linux 7.1 (very special thanks to Ray
//	McCaffity of Williams Communications for providing an account).
//
// Q.	What's the heck with 'rum 1<<5' at the end of every function?
// A.	Well, by clearing the "upper FP registers written" bit of the
//	User Mask I want to excuse the kernel from preserving upper
//	(f32-f127) FP register bank over process context switch, thus
//	minimizing bus bandwidth consumption during the switch (i.e.
//	after PKI operation completes and the program is off doing
//	something else like bulk symmetric encryption). Having said
//	this, I also want to point out that it might be good idea
//	to compile the whole toolkit (as well as majority of the
//	programs for that matter) with -mfixed-range=f32-f127 command
//	line option. No, it doesn't prevent the compiler from writing
//	to upper bank, but at least discourages to do so. If you don't
//	like the idea you have the option to compile the module with
//	-Drum=nop.m in command line.
//

// ADDP is used below as "ADDP rX=0,rY" to turn a pointer argument into the
// address actually dereferenced. When building with _HPUX_SOURCE defined
// and _LP64 not defined (32-bit pointers) it expands to addp4; in every
// other configuration it is a plain 64-bit add.
#if defined(_HPUX_SOURCE) && !defined(_LP64)
#define	ADDP	addp4
#else
#define	ADDP	add
#endif

#if 1
//
// bn_[add|sub]_words routines.
//
// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
// compress the epilogue and get down to 2*n+6, but at the cost of
// scalability (the neat feature of this implementation is that it
// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
// I consider that the epilogue is short enough as it is to trade tiny
// performance loss on Itanium for scalability.
//
// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
// Adds the num-word arrays ap[] and bp[] word by word with carry
// propagation, stores the sums to rp[] and returns the final carry.
//
// In:	r32 = rp, r33 = ap, r34 = bp, r35 = num (tested as a 32-bit value)
// Out:	r8  = carry out of the last word (0 or 1); 0 if num <= 0
// Saved:	ar.pfs -> r2, ar.lc -> r3, pr -> r9 (ar.lc and pr are restored
//	before returning)
//
// The loop is a modulo-scheduled (br.ctop) pipeline with 6 stages
// (ar.ec=6), driven by the rotating predicates p16..p21:
//	p16	load b and a
//	p18	sum = a + b
//	p19	carry = (sum < a)			-> p56
//	p20	if carry-in (p58): carry |= (sum == -1), sum++
//	p21	store sum
// After the final rotation the carry of the last word is found in p59.
//
.global	bn_add_words#
.proc	bn_add_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_add_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;	// p6 = (num <= 0)
{ .mfb;	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;	// nothing to do

{ .mib;	sub		r10=r35,r0,1		// loop count = num-1
	.save	ar.lc,r3
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
					}
{ .mib;	ADDP		r14=0,r32		// rp
	.save	pr,r9
	mov		r9=pr		};;
	.body
{ .mii;	ADDP		r15=0,r33		// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}
{ .mib;	ADDP		r16=0,r34		// bp
	mov		pr.rot=1<<16	};;	// p16=1, other rotating predicates=0

.L_bn_add_words_ctop:
{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	add		r39=r37,r34
	(p19)	cmp.ltu.unc	p56,p0=r40,r38	}
{ .mfb;	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=-1,r41	  // (p20)
	(p58)	add		r41=1,r41	} // (p20)
{ .mfb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_add_words_ctop	};;
.L_bn_add_words_cend:

{ .mii;
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mbb;	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_add_words#

//
// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
// Subtracts the num-word array bp[] from ap[] word by word with borrow
// propagation, stores the differences to rp[] and returns the final
// borrow.
//
// In:	r32 = rp, r33 = ap, r34 = bp, r35 = num (tested as a 32-bit value)
// Out:	r8  = borrow out of the last word (0 or 1); 0 if num <= 0
// Saved:	ar.pfs -> r2, ar.lc -> r3, pr -> r9 (ar.lc and pr are restored
//	before returning)
//
// Same 6-stage modulo-scheduled pipeline as bn_add_words:
//	p16	load b and a
//	p18	diff = a - b
//	p19	borrow = (diff > a)			-> p56
//	p20	if borrow-in (p58): borrow |= (diff == 0), diff--
//	p21	store diff
// After the final rotation the borrow of the last word is found in p59.
//
.global	bn_sub_words#
.proc	bn_sub_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_sub_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;	// p6 = (num <= 0)
{ .mfb;	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;	// nothing to do

{ .mib;	sub		r10=r35,r0,1		// loop count = num-1
	.save	ar.lc,r3
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
					}
{ .mib;	ADDP		r14=0,r32		// rp
	.save	pr,r9
	mov		r9=pr		};;
	.body
{ .mii;	ADDP		r15=0,r33		// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}
{ .mib;	ADDP		r16=0,r34		// bp
	mov		pr.rot=1<<16	};;	// p16=1, other rotating predicates=0

.L_bn_sub_words_ctop:
{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	sub		r39=r37,r34
	(p19)	cmp.gtu.unc	p56,p0=r40,r38	}
{ .mfb;	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=0,r41	  // (p20)
	(p58)	add		r41=-1,r41	} // (p20)
{ .mbb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.b		0x0
	br.ctop.sptk	.L_bn_sub_words_ctop	};;
.L_bn_sub_words_cend:

{ .mii;
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mbb;	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sub_words#
#endif

// XMA_TEMPTATION selects the alternative xma-based loop in bn_mul_words
// (see the #ifdef XMA_TEMPTATION branches there). The define is disabled
// here, so the regular loop is the one that gets assembled.
#if 0
#define XMA_TEMPTATION
#endif

#if 1
//
// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
// Multiplies the num-word array ap[] by the single word w, stores the low
// words (plus the carry coming from the previous word) to rp[] and
// returns the final carry word.
//
// In:	r32 = rp, r33 = ap, r34 = num (tested as a 32-bit value), r35 = w
// Out:	r8  = most significant (carry) word; 0 if num <= 0
// Saved:	ar.pfs -> r2, ar.lc -> r3, pr -> r9 (ar.lc and pr are restored
//	before returning)
// Uses:	f8 = w, rotating FP registers f32 and up for the products
//
// Default (non-XMA_TEMPTATION) loop, 13 stages (ar.ec=13):
//	p16	ldf8 a
//	p21	xmpy.lu / xmpy.hu: low and high halves of a*w
//	p25	getf.sig: move both halves to integer registers
//	p27	sum = low + high of previous word (+1 if carry-in)
//	p28	compute carry of that sum, store sum
// The multiplications run in the FPU; the carry chain runs in the
// integer ALU via the mutually exclusive predicates p50/p54.
//
.global	bn_mul_words#
.proc	bn_mul_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_mul_words:
	.prologue
	.save	ar.pfs,r2
#ifdef XMA_TEMPTATION
{ .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
#else
{ .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
#endif
{ .mib;	mov		r8=r0			// return value
	cmp4.le		p6,p0=r34,r0		// p6 = (num <= 0)
(p6)	br.ret.spnt.many	b0		};;

{ .mii;	sub	r10=r34,r0,1			// loop count = num-1
	.save	ar.lc,r3
	mov	r3=ar.lc
	.save	pr,r9
	mov	r9=pr			};;

	.body
{ .mib;	setf.sig	f8=r35	// w
	mov		pr.rot=0x800001<<16
			// ------^----- serves as (p50) at first (p27)
	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
					}

#ifndef XMA_TEMPTATION

{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mmi;	mov		r40=0		// serves as r35 at first (p27)
	mov		ar.ec=13	};;

// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
// bypass L1 cache and L2 latency is actually best-case scenario for
// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
// would give us ~5% in *overall* performance improvement on "wider"
// IA-64, but would hurt Itanium for about same because of longer
// epilogue. As it's a matter of few percents in either case I've
// chosen to trade the scalability for development time (you can see
// this very instruction sequence in bn_mul_add_words loop which in
// turn is scalable).
.L_bn_mul_words_ctop:
{ .mfi;	(p25)	getf.sig	r36=f52			// low
	(p21)	xmpy.lu		f48=f37,f8
	(p28)	cmp.ltu		p54,p50=r41,r39	}
{ .mfi;	(p16)	ldf8		f32=[r15],8
	(p21)	xmpy.hu		f40=f37,f8
	(p0)	nop.i		0x0		};;
{ .mii;	(p25)	getf.sig	r32=f44			// high
	.pred.rel	"mutex",p50,p54
	(p50)	add		r40=r38,r35		// (p27)
	(p54)	add		r40=r38,r35,1	}	// (p27)
{ .mfb;	(p28)	st8		[r14]=r41,8
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

// Return value: high half of the last product plus the last carry.
{ .mii;	nop.m		0x0
.pred.rel	"mutex",p51,p55
(p51)	add		r8=r36,r0
(p55)	add		r8=r36,r0,1	}
{ .mfb;	nop.m	0x0
	nop.f	0x0
	nop.b	0x0			}

#else	// XMA_TEMPTATION

	setf.sig	f37=r0	// serves as carry at (p18) tick
	mov		ar.lc=r10
	mov		ar.ec=5;;

// Most of you examining this code very likely wonder why in the name
// of Intel the following loop is commented out? Indeed, it looks so
// neat that you find it hard to believe that it's something wrong
// with it, right? The catch is that every iteration depends on the
// result from previous one and the latter isn't available instantly.
// The loop therefore spins at the latency of xma minus 1, or in other
// words at 6*(n+4) ticks:-( Compare to the "production" loop above
// that runs in 2*(n+12) where the low latency problem is worked around
// by moving the dependency to one-tick latent integer ALU. Note that
// "distance" between ldf8 and xma is not latency of ldf8, but the
// *difference* between xma and ldf8 latencies.
.L_bn_mul_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p18)	xma.hu		f38=f34,f8,f39	}
{ .mfb;	(p20)	stf8		[r32]=f37,8
	(p18)	xma.lu		f35=f34,f8,f39
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

	getf.sig	r8=f41		// the return value

#endif	// XMA_TEMPTATION

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_mul_words#
#endif

#if 1
//
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
// Multiplies the num-word array ap[] by the single word w, adds the
// products to the words already in rp[] (with carry propagation), stores
// the result back to rp[] and returns the final carry word.
//
// In:	r32 = rp, r33 = ap, r34 = num (tested as a 32-bit value), r35 = w
// Out:	r8  = most significant (carry) word; 0 if num <= 0
// Saved:	ar.pfs -> r2, ar.lc -> r3, pr -> r9 (ar.lc and pr are restored
//	before returning)
// Uses:	f8 = w; r15 walks ap, r16 walks rp for loads, r14 walks rp for
//	stores
//
// Loop stages (ar.ec=11):
//	p16	ldf8 a and ldf8 r (old rp word)
//	p20	xma.lu / xma.hu: low and high halves of a*w + r
//	p23/p24	getf.sig: move low / high half to integer registers
//	(p27)	low += high of previous word (+1 if carry-in), new carry
//	p26	store
// The carry state lives in the mutually exclusive predicates p40/p42.
//
.global	bn_mul_add_words#
.proc	bn_mul_add_words#
.align	64
.skip	48	// makes the loop body aligned at 64-byte boundary
bn_mul_add_words:
	.prologue
	.save	ar.pfs,r2
{ .mmi;	alloc		r2=ar.pfs,4,4,0,8
	cmp4.le		p6,p0=r34,r0		// p6 = (num <= 0)
	.save	ar.lc,r3
	mov		r3=ar.lc	};;
{ .mib;	mov		r8=r0		// return value
	sub		r10=r34,r0,1	// loop count = num-1
(p6)	br.ret.spnt.many	b0	};;

{ .mib;	setf.sig	f8=r35		// w
	.save	pr,r9
	mov		r9=pr
	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
					}
	.body
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mii;	ADDP		r16=0,r32	// rp copy
	mov		pr.rot=0x2001<<16
			// ------^----- serves as (p40) at first (p27)
	mov		ar.ec=11	};;

// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
// Itanium 2. Yes, unlike previous versions it scales:-) Previous
// version was performing *all* additions in IALU and was starving
// for those even on Itanium 2. In this version one addition is
// moved to FPU and is folded with multiplication. This is at cost
// of propagating the result from previous call to this subroutine
// to L2 cache... In other words negligible even for shorter keys.
// *Overall* performance improvement [over previous version] varies
// from 11 to 22 percent depending on key length.
.L_bn_mul_add_words_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p23)	getf.sig	r36=f45			// low
	(p20)	xma.lu		f42=f36,f8,f50		// low
	(p40)	add		r39=r39,r35	}	// (p27)
{ .mfi;	(p16)	ldf8		f32=[r15],8		// *(ap++)
	(p20)	xma.hu		f36=f36,f8,f50		// high
	(p42)	add		r39=r39,r35,1	};;	// (p27)
{ .mmi;	(p24)	getf.sig	r32=f40			// high
	(p16)	ldf8		f46=[r16],8		// *(rp1++)
	(p40)	cmp.ltu		p41,p39=r39,r35	}	// (p27)
{ .mib;	(p26)	st8		[r14]=r39,8		// *(rp2++)
	(p42)	cmp.leu		p41,p39=r39,r35		// (p27)
	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
.L_bn_mul_add_words_cend:

// Return value: last high half plus the last carry.
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		r8=r35,r0
(p42)	add		r8=r35,r0,1
	mov		pr=r9,0x1ffff	}
{ .mib;	rum		1<<5		// clear um.mfh
	mov		ar.lc=r3
	br.ret.sptk.many	b0	};;
.endp	bn_mul_add_words#
#endif

#if 1
//
// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//
// Squares each of the num words of ap[] and stores the 128-bit results
// to rp[]: the low half of ap[i]^2 goes to rp[2*i] and the high half to
// rp[2*i+1].
//
// In:	r32 = rp, r33 = ap, r34 = num (sign-extended from 32 bits)
// Out:	nothing (r8 is cleared); returns immediately if num <= 0
// Saved:	ar.pfs -> r2, ar.lc -> r3, pr -> r9 (ar.lc and pr are restored
//	before returning)
//
// r32 walks the even (low) result words and r34 = rp+8 walks the odd
// (high) result words, both advancing by 16 bytes per iteration.
// Loop stages (ar.ec=18): p16 ldf8, p25 xmpy.lu/xmpy.hu, p33 stf8.
//
.global	bn_sqr_words#
.proc	bn_sqr_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary 
bn_sqr_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,3,0,0,0
	sxt4		r34=r34		};;
{ .mii;	cmp.le		p6,p0=r34,r0		// p6 = (num <= 0)
	mov		r8=r0		}	// return value
{ .mfb;	ADDP		r32=0,r32
	nop.f		0x0
(p6)	br.ret.spnt.many	b0	};;

{ .mii;	sub	r10=r34,r0,1			// loop count = num-1
	.save	ar.lc,r3
	mov	r3=ar.lc
	.save	pr,r9
	mov	r9=pr			};;

	.body
{ .mib;	ADDP		r33=0,r33
	mov		pr.rot=1<<16
	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
					}
{ .mii;	add		r34=8,r32		// pointer to the high words
	mov		ar.lc=r10
	mov		ar.ec=18	};;

// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
// possible to compress the epilogue (I'm getting tired to write this
// comment over and over) and get down to 2*n+16 at the cost of
// scalability. The decision will very likely be reconsidered after the
// benchmark program is profiled. I.e. if performance gain on Itanium
// will appear larger than loss on "wider" IA-64, then the loop should
// be explicitly split and the epilogue compressed.
.L_bn_sqr_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p25)	xmpy.lu		f42=f41,f41
	(p0)	nop.i		0x0		}
{ .mib;	(p33)	stf8		[r32]=f50,16
	(p0)	nop.i		0x0
	(p0)	nop.b		0x0		}
{ .mfi;	(p0)	nop.m		0x0
	(p25)	xmpy.hu		f52=f41,f41
	(p0)	nop.i		0x0		}
{ .mib;	(p33)	stf8		[r34]=f60,16
	(p0)	nop.i		0x0
	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
.L_bn_sqr_words_cend:

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sqr_words#
#endif

#if 1
// Apparently we win nothing by implementing special bn_sqr_comba8.
// Yes, it is possible to reduce the number of multiplications by
// almost factor of two, but then the amount of additions would
// increase by factor of two (as we would have to perform those
// otherwise performed by xma ourselves). Normally we would trade
// anyway as multiplications are way more expensive, but not this
// time... Multiplication kernel is fully pipelined and as we drain
// one 128-bit multiplication result per clock cycle multiplications
// are effectively as inexpensive as additions. Special implementation
// might become of interest for "wider" IA-64 implementation as you'll
// be able to get through the multiplication phase faster (there won't
// be any stall issues as discussed in the commentary section below and
// you therefore will be able to employ all 4 FP units)... But these
// Itanium days it's simply too hard to justify the effort so I just
// drop down to bn_mul_comba8 code:-)
//
// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
//
// Squaring is implemented as a multiplication of a by itself: this stub
// copies a (r33) into the register bn_mul_comba8 uses for b (r34), sets
// up the same offset pointers as the bn_mul_comba8 prologue (r14/r15/r16
// = a+8/+16/+24, r17/r18 = b+8/+16) and branches into bn_mul_comba8 at
// .L_cheat_entry_point8. It never returns on its own.
//
// In:	r32 = r, r33 = a
// Note:	one local register is allocated (alloc ...,2,1,0,0) so that r34
//	exists in this frame.
//
.global	bn_sqr_comba8#
.proc	bn_sqr_comba8#
.align	64
bn_sqr_comba8:
	.prologue
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
	addp4	r33=0,r33
	addp4	r32=0,r32		};;
{ .mii;
#else
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
#endif
	mov	r34=r33				// b = a
	add	r14=8,r33		};;
	.body
{ .mii;	add	r17=8,r34
	add	r15=16,r33
	add	r18=16,r34		}
{ .mfb;	add	r16=24,r33
	br	.L_cheat_entry_point8	};;	// continue in bn_mul_comba8
.endp	bn_sqr_comba8#
#endif

#if 1
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
// result from getf.sig. I do nothing about it at this point for
// reasons depicted below.
//
// However! It should be noted that even 160 ticks is darn good result
// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
// C version (compiled with gcc with inline assembler). I really
// kicked compiler's butt here, didn't I? Yeah! This brings us to the
// following statement. It's damn shame that this routine isn't called
// very often nowadays! According to the profiler most CPU time is
// consumed by bn_mul_add_words called from BN_from_montgomery. In
// order to estimate what we're missing, I've compared the performance
// of this routine against "traditional" implementation, i.e. against
// following routine:
//
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
// {	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
//	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
//	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
//	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
//	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
//	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
//	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
//	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
// }
//
// The one below is over 8 times faster than the one above:-( Even
// more reasons to "combafy" bn_mul_add_mont...
//
// And yes, this routine really made me wish there were an optimizing
// assembler! It also feels like it deserves a dedication.
//
//	To my wife for being there and to my kids...
//
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
//
#define	carry1	r14
#define	carry2	r15
#define	carry3	r34
.global	bn_mul_comba8#
.proc	bn_mul_comba8#
.align	64
bn_mul_comba8:
	.prologue
	.save	ar.pfs,r2
619
#if defined(_HPUX_SOURCE) && !defined(_LP64)
620
{ .mii;	alloc	r2=ar.pfs,3,0,0,0
A
Andy Polyakov 已提交
621 622 623 624 625 626
	addp4	r33=0,r33
	addp4	r34=0,r34		};;
{ .mii;	addp4	r32=0,r32
#else
{ .mii;	alloc   r2=ar.pfs,3,0,0,0
#endif
627 628 629
	add	r14=8,r33
	add	r17=8,r34		}
	.body
630
{ .mii;	add	r15=16,r33
631
	add	r18=16,r34
632
	add	r16=24,r33		}
633
.L_cheat_entry_point8:
634
{ .mmi;	add	r19=24,r34
635

636
	ldf8	f32=[r33],32		};;
637

638 639 640 641 642
{ .mmi;	ldf8	f120=[r34],32
	ldf8	f121=[r17],32		}
{ .mmi;	ldf8	f122=[r18],32
	ldf8	f123=[r19],32		};;
{ .mmi;	ldf8	f124=[r34]
643
	ldf8	f125=[r17]		}
644
{ .mmi;	ldf8	f126=[r18]
645 646
	ldf8	f127=[r19]		}

647
{ .mmi;	ldf8	f33=[r14],32
648
	ldf8	f34=[r15],32		}
649
{ .mmi;	ldf8	f35=[r16],32;;
650
	ldf8	f36=[r33]		}
651
{ .mmi;	ldf8	f37=[r14]
652
	ldf8	f38=[r15]		}
653
{ .mfi;	ldf8	f39=[r16]
654 655 656 657 658
// -------\ Entering multiplier's heaven /-------
// ------------\                    /------------
// -----------------\          /-----------------
// ----------------------\/----------------------
		xma.hu	f41=f32,f120,f0		}
659 660 661 662 663 664 665 666 667 668 669 670 671 672 673
{ .mfi;		xma.lu	f40=f32,f120,f0		};; // (*)
{ .mfi;		xma.hu	f51=f32,f121,f0		}
{ .mfi;		xma.lu	f50=f32,f121,f0		};;
{ .mfi;		xma.hu	f61=f32,f122,f0		}
{ .mfi;		xma.lu	f60=f32,f122,f0		};;
{ .mfi;		xma.hu	f71=f32,f123,f0		}
{ .mfi;		xma.lu	f70=f32,f123,f0		};;
{ .mfi;		xma.hu	f81=f32,f124,f0		}
{ .mfi;		xma.lu	f80=f32,f124,f0		};;
{ .mfi;		xma.hu	f91=f32,f125,f0		}
{ .mfi;		xma.lu	f90=f32,f125,f0		};;
{ .mfi;		xma.hu	f101=f32,f126,f0	}
{ .mfi;		xma.lu	f100=f32,f126,f0	};;
{ .mfi;		xma.hu	f111=f32,f127,f0	}
{ .mfi;		xma.lu	f110=f32,f127,f0	};;//
674 675 676 677 678 679 680 681 682 683 684
// (*)	You can argue that splitting at every second bundle would
//	prevent "wider" IA-64 implementations from achieving the peak
//	performance. Well, not really... The catch is that if you
//	intend to keep 4 FP units busy by splitting at every fourth
//	bundle and thus perform these 16 multiplications in 4 ticks,
//	the first bundle *below* would stall because the result from
//	the first xma bundle *above* won't be available for another 3
//	ticks (if not more, being an optimist, I assume that "wider"
//	implementation will have same latency:-). This stall will hold
//	you back and the performance would be as if every second bundle
//	were split *anyway*...
685
{ .mfi;	getf.sig	r16=f40
686 687
		xma.hu	f42=f33,f120,f41
	add		r33=8,r32		}
688 689
{ .mfi;		xma.lu	f41=f33,f120,f41	};;
{ .mfi;	getf.sig	r24=f50
690
		xma.hu	f52=f33,f121,f51	}
691 692
{ .mfi;		xma.lu	f51=f33,f121,f51	};;
{ .mfi;	st8		[r32]=r16,16
693
		xma.hu	f62=f33,f122,f61	}
694 695 696 697 698 699 700 701 702 703 704
{ .mfi;		xma.lu	f61=f33,f122,f61	};;
{ .mfi;		xma.hu	f72=f33,f123,f71	}
{ .mfi;		xma.lu	f71=f33,f123,f71	};;
{ .mfi;		xma.hu	f82=f33,f124,f81	}
{ .mfi;		xma.lu	f81=f33,f124,f81	};;
{ .mfi;		xma.hu	f92=f33,f125,f91	}
{ .mfi;		xma.lu	f91=f33,f125,f91	};;
{ .mfi;		xma.hu	f102=f33,f126,f101	}
{ .mfi;		xma.lu	f101=f33,f126,f101	};;
{ .mfi;		xma.hu	f112=f33,f127,f111	}
{ .mfi;		xma.lu	f111=f33,f127,f111	};;//
705
//-------------------------------------------------//
706
{ .mfi;	getf.sig	r25=f41
707
		xma.hu	f43=f34,f120,f42	}
708 709
{ .mfi;		xma.lu	f42=f34,f120,f42	};;
{ .mfi;	getf.sig	r16=f60
710
		xma.hu	f53=f34,f121,f52	}
711 712
{ .mfi;		xma.lu	f52=f34,f121,f52	};;
{ .mfi;	getf.sig	r17=f51
713 714
		xma.hu	f63=f34,f122,f62
	add		r25=r25,r24		}
715
{ .mfi;		xma.lu	f62=f34,f122,f62
716
	mov		carry1=0		};;
717
{ .mfi;	cmp.ltu		p6,p0=r25,r24
718
		xma.hu	f73=f34,f123,f72	}
719 720
{ .mfi;		xma.lu	f72=f34,f123,f72	};;
{ .mfi;	st8		[r33]=r25,16
721 722
		xma.hu	f83=f34,f124,f82
(p6)	add		carry1=1,carry1		}
723 724 725 726 727 728 729
{ .mfi;		xma.lu	f82=f34,f124,f82	};;
{ .mfi;		xma.hu	f93=f34,f125,f92	}
{ .mfi;		xma.lu	f92=f34,f125,f92	};;
{ .mfi;		xma.hu	f103=f34,f126,f102	}
{ .mfi;		xma.lu	f102=f34,f126,f102	};;
{ .mfi;		xma.hu	f113=f34,f127,f112	}
{ .mfi;		xma.lu	f112=f34,f127,f112	};;//
730
//-------------------------------------------------//
731
{ .mfi;	getf.sig	r18=f42
732 733
		xma.hu	f44=f35,f120,f43
	add		r17=r17,r16		}
734 735
{ .mfi;		xma.lu	f43=f35,f120,f43	};;
{ .mfi;	getf.sig	r24=f70
736
		xma.hu	f54=f35,f121,f53	}
737
{ .mfi;	mov		carry2=0
738
		xma.lu	f53=f35,f121,f53	};;
739
{ .mfi;	getf.sig	r25=f61
740 741
		xma.hu	f64=f35,f122,f63
	cmp.ltu		p7,p0=r17,r16		}
742
{ .mfi;	add		r18=r18,r17
743
		xma.lu	f63=f35,f122,f63	};;
744
{ .mfi;	getf.sig	r26=f52
745 746
		xma.hu	f74=f35,f123,f73
(p7)	add		carry2=1,carry2		}
747
{ .mfi;	cmp.ltu		p7,p0=r18,r17
748 749
		xma.lu	f73=f35,f123,f73
	add		r18=r18,carry1		};;
750
{ .mfi;
751 752
		xma.hu	f84=f35,f124,f83
(p7)	add		carry2=1,carry2		}
753
{ .mfi;	cmp.ltu		p7,p0=r18,carry1
754
		xma.lu	f83=f35,f124,f83	};;
755
{ .mfi;	st8		[r32]=r18,16
756 757
		xma.hu	f94=f35,f125,f93
(p7)	add		carry2=1,carry2		}
758 759 760 761 762
{ .mfi;		xma.lu	f93=f35,f125,f93	};;
{ .mfi;		xma.hu	f104=f35,f126,f103	}
{ .mfi;		xma.lu	f103=f35,f126,f103	};;
{ .mfi;		xma.hu	f114=f35,f127,f113	}
{ .mfi;	mov		carry1=0
763
		xma.lu	f113=f35,f127,f113
764
	add		r25=r25,r24		};;//
765
//-------------------------------------------------//
766
{ .mfi;	getf.sig	r27=f43
767 768
		xma.hu	f45=f36,f120,f44
	cmp.ltu		p6,p0=r25,r24		}
769
{ .mfi;		xma.lu	f44=f36,f120,f44	
770
	add		r26=r26,r25		};;
771
{ .mfi;	getf.sig	r16=f80
772 773
		xma.hu	f55=f36,f121,f54
(p6)	add		carry1=1,carry1		}
774 775
{ .mfi;		xma.lu	f54=f36,f121,f54	};;
{ .mfi;	getf.sig	r17=f71
776 777
		xma.hu	f65=f36,f122,f64
	cmp.ltu		p6,p0=r26,r25		}
778
{ .mfi;		xma.lu	f64=f36,f122,f64
779
	add		r27=r27,r26		};;
780
{ .mfi;	getf.sig	r18=f62
781 782
		xma.hu	f75=f36,f123,f74
(p6)	add		carry1=1,carry1		}
783
{ .mfi;	cmp.ltu		p6,p0=r27,r26
784 785
		xma.lu	f74=f36,f123,f74
	add		r27=r27,carry2		};;
786
{ .mfi;	getf.sig	r19=f53
787 788
		xma.hu	f85=f36,f124,f84
(p6)	add		carry1=1,carry1		}
789
{ .mfi;		xma.lu	f84=f36,f124,f84
790
	cmp.ltu		p6,p0=r27,carry2	};;
791
{ .mfi;	st8		[r33]=r27,16
792 793
		xma.hu	f95=f36,f125,f94
(p6)	add		carry1=1,carry1		}
794 795 796
{ .mfi;		xma.lu	f94=f36,f125,f94	};;
{ .mfi;		xma.hu	f105=f36,f126,f104	}
{ .mfi;	mov		carry2=0
797 798
		xma.lu	f104=f36,f126,f104
	add		r17=r17,r16		};;
799
{ .mfi;		xma.hu	f115=f36,f127,f114
800
	cmp.ltu		p7,p0=r17,r16		}
801 802
{ .mfi;		xma.lu	f114=f36,f127,f114
	add		r18=r18,r17		};;//
803
//-------------------------------------------------//
804
{ .mfi;	getf.sig	r20=f44
805 806
		xma.hu	f46=f37,f120,f45
(p7)	add		carry2=1,carry2		}
807
{ .mfi;	cmp.ltu		p7,p0=r18,r17
808 809
		xma.lu	f45=f37,f120,f45
	add		r19=r19,r18		};;
810
{ .mfi;	getf.sig	r24=f90
811
		xma.hu	f56=f37,f121,f55	}
812 813
{ .mfi;		xma.lu	f55=f37,f121,f55	};;
{ .mfi;	getf.sig	r25=f81
814 815
		xma.hu	f66=f37,f122,f65
(p7)	add		carry2=1,carry2		}
816
{ .mfi;	cmp.ltu		p7,p0=r19,r18
817 818
		xma.lu	f65=f37,f122,f65
	add		r20=r20,r19		};;
819
{ .mfi;	getf.sig	r26=f72
820 821
		xma.hu	f76=f37,f123,f75
(p7)	add		carry2=1,carry2		}
822
{ .mfi;	cmp.ltu		p7,p0=r20,r19
823 824
		xma.lu	f75=f37,f123,f75
	add		r20=r20,carry1		};;
825
{ .mfi;	getf.sig	r27=f63
826 827
		xma.hu	f86=f37,f124,f85
(p7)	add		carry2=1,carry2		}
828
{ .mfi;		xma.lu	f85=f37,f124,f85
829
	cmp.ltu		p7,p0=r20,carry1	};;
830
{ .mfi;	getf.sig	r28=f54
831 832
		xma.hu	f96=f37,f125,f95
(p7)	add		carry2=1,carry2		}
833
{ .mfi;	st8		[r32]=r20,16
834
		xma.lu	f95=f37,f125,f95	};;
835 836
{ .mfi;		xma.hu	f106=f37,f126,f105	}
{ .mfi;	mov		carry1=0
837 838
		xma.lu	f105=f37,f126,f105
	add		r25=r25,r24		};;
839
{ .mfi;		xma.hu	f116=f37,f127,f115
840
	cmp.ltu		p6,p0=r25,r24		}
841 842
{ .mfi;		xma.lu	f115=f37,f127,f115
	add		r26=r26,r25		};;//
843
//-------------------------------------------------//
844
{ .mfi;	getf.sig	r29=f45
845 846
		xma.hu	f47=f38,f120,f46
(p6)	add		carry1=1,carry1		}
847
{ .mfi;	cmp.ltu		p6,p0=r26,r25
848 849
		xma.lu	f46=f38,f120,f46
	add		r27=r27,r26		};;
850
{ .mfi;	getf.sig	r16=f100
851 852
		xma.hu	f57=f38,f121,f56
(p6)	add		carry1=1,carry1		}
853
{ .mfi;	cmp.ltu		p6,p0=r27,r26
854 855
		xma.lu	f56=f38,f121,f56
	add		r28=r28,r27		};;
856
{ .mfi;	getf.sig	r17=f91
857 858
		xma.hu	f67=f38,f122,f66
(p6)	add		carry1=1,carry1		}
859
{ .mfi;	cmp.ltu		p6,p0=r28,r27
860 861
		xma.lu	f66=f38,f122,f66
	add		r29=r29,r28		};;
862
{ .mfi;	getf.sig	r18=f82
863 864
		xma.hu	f77=f38,f123,f76
(p6)	add		carry1=1,carry1		}
865
{ .mfi;	cmp.ltu		p6,p0=r29,r28
866 867
		xma.lu	f76=f38,f123,f76
	add		r29=r29,carry2		};;
868
{ .mfi;	getf.sig	r19=f73
869 870
		xma.hu	f87=f38,f124,f86
(p6)	add		carry1=1,carry1		}
871
{ .mfi;		xma.lu	f86=f38,f124,f86
872
	cmp.ltu		p6,p0=r29,carry2	};;
873
{ .mfi;	getf.sig	r20=f64
874 875
		xma.hu	f97=f38,f125,f96
(p6)	add		carry1=1,carry1		}
876
{ .mfi;	st8		[r33]=r29,16
877
		xma.lu	f96=f38,f125,f96	};;
878
{ .mfi;	getf.sig	r21=f55
879
		xma.hu	f107=f38,f126,f106	}
880
{ .mfi;	mov		carry2=0
881 882
		xma.lu	f106=f38,f126,f106
	add		r17=r17,r16		};;
883
{ .mfi;		xma.hu	f117=f38,f127,f116
884
	cmp.ltu		p7,p0=r17,r16		}
885 886
{ .mfi;		xma.lu	f116=f38,f127,f116
	add		r18=r18,r17		};;//
887
//-------------------------------------------------//
888
{ .mfi;	getf.sig	r22=f46
889 890
		xma.hu	f48=f39,f120,f47
(p7)	add		carry2=1,carry2		}
891
{ .mfi;	cmp.ltu		p7,p0=r18,r17
892 893
		xma.lu	f47=f39,f120,f47
	add		r19=r19,r18		};;
894
{ .mfi;	getf.sig	r24=f110
895 896
		xma.hu	f58=f39,f121,f57
(p7)	add		carry2=1,carry2		}
897
{ .mfi;	cmp.ltu		p7,p0=r19,r18
898 899
		xma.lu	f57=f39,f121,f57
	add		r20=r20,r19		};;
900
{ .mfi;	getf.sig	r25=f101
901 902
		xma.hu	f68=f39,f122,f67
(p7)	add		carry2=1,carry2		}
903
{ .mfi;	cmp.ltu		p7,p0=r20,r19
904 905
		xma.lu	f67=f39,f122,f67
	add		r21=r21,r20		};;
906
{ .mfi;	getf.sig	r26=f92
907 908
		xma.hu	f78=f39,f123,f77
(p7)	add		carry2=1,carry2		}
909
{ .mfi;	cmp.ltu		p7,p0=r21,r20
910 911
		xma.lu	f77=f39,f123,f77
	add		r22=r22,r21		};;
912
{ .mfi;	getf.sig	r27=f83
913 914
		xma.hu	f88=f39,f124,f87
(p7)	add		carry2=1,carry2		}
915
{ .mfi;	cmp.ltu		p7,p0=r22,r21
916 917
		xma.lu	f87=f39,f124,f87
	add		r22=r22,carry1		};;
918
{ .mfi;	getf.sig	r28=f74
919 920
		xma.hu	f98=f39,f125,f97
(p7)	add		carry2=1,carry2		}
921
{ .mfi;		xma.lu	f97=f39,f125,f97
922
	cmp.ltu		p7,p0=r22,carry1	};;
923
{ .mfi;	getf.sig	r29=f65
924 925
		xma.hu	f108=f39,f126,f107
(p7)	add		carry2=1,carry2		}
926
{ .mfi;	st8		[r32]=r22,16
927
		xma.lu	f107=f39,f126,f107	};;
928
{ .mfi;	getf.sig	r30=f56
929
		xma.hu	f118=f39,f127,f117	}
930
{ .mfi;		xma.lu	f117=f39,f127,f117	};;//
931 932 933
//-------------------------------------------------//
// Leaving multiplier's heaven... Quite a ride, huh?

934
{ .mii;	getf.sig	r31=f47
935 936
	add		r25=r25,r24
	mov		carry1=0		};;
937
{ .mii;		getf.sig	r16=f111
938 939
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
940 941
{ .mfb;		getf.sig	r17=f102	}
{ .mii;
942 943 944
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
945 946
{ .mfb;	nop.m	0x0				}
{ .mii;
947 948 949
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r28=r28,r27		};;
950
{ .mii;		getf.sig	r18=f93
951 952
		add		r17=r17,r16
		mov		carry3=0	}
953
{ .mii;
954 955 956
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,r27
	add		r29=r29,r28		};;
957
{ .mii;		getf.sig	r19=f84
958
		cmp.ltu		p7,p0=r17,r16	}
959
{ .mii;
960 961 962
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r29,r28
	add		r30=r30,r29		};;
963
{ .mii;		getf.sig	r20=f75
964
		add		r18=r18,r17	}
965
{ .mii;
966 967 968
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r30,r29
	add		r31=r31,r30		};;
969 970
{ .mfb;		getf.sig	r21=f66		}
{ .mii;	(p7)	add		carry3=1,carry3
971 972
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	}
973 974
{ .mfb;	nop.m	0x0				}
{ .mii;
975 976 977
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r31,r30
	add		r31=r31,carry2		};;
978 979
{ .mfb;		getf.sig	r22=f57		}
{ .mii;	(p7)	add		carry3=1,carry3
980 981
		cmp.ltu		p7,p0=r19,r18
		add		r20=r20,r19	}
982 983
{ .mfb;	nop.m	0x0				}
{ .mii;
984 985
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r31,carry2	};;
986 987
{ .mfb;		getf.sig	r23=f48		}
{ .mii;	(p7)	add		carry3=1,carry3
988 989
		cmp.ltu		p7,p0=r20,r19
		add		r21=r21,r20	}
990
{ .mii;
991
(p6)	add		carry1=1,carry1		}
992
{ .mfb;	st8		[r33]=r31,16		};;
993

994 995
{ .mfb;	getf.sig	r24=f112		}
{ .mii;	(p7)	add		carry3=1,carry3
996 997
		cmp.ltu		p7,p0=r21,r20
		add		r22=r22,r21	};;
998 999
{ .mfb;	getf.sig	r25=f103		}
{ .mii;	(p7)	add		carry3=1,carry3
1000 1001
		cmp.ltu		p7,p0=r22,r21
		add		r23=r23,r22	};;
1002 1003
{ .mfb;	getf.sig	r26=f94			}
{ .mii;	(p7)	add		carry3=1,carry3
1004 1005
		cmp.ltu		p7,p0=r23,r22
		add		r23=r23,carry1	};;
1006 1007
{ .mfb;	getf.sig	r27=f85			}
{ .mii;	(p7)	add		carry3=1,carry3
1008
		cmp.ltu		p7,p8=r23,carry1};;
1009
{ .mii;	getf.sig	r28=f76
1010 1011
	add		r25=r25,r24
	mov		carry1=0		}
1012
{ .mii;		st8		[r32]=r23,16
1013 1014 1015
	(p7)	add		carry2=1,carry3
	(p8)	add		carry2=0,carry3	};;

1016 1017
{ .mfb;	nop.m	0x0				}
{ .mii;	getf.sig	r29=f67
1018 1019
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
1020 1021
{ .mfb;	getf.sig	r30=f58			}
{ .mii;
1022 1023 1024
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
1025 1026
{ .mfb;		getf.sig	r16=f113	}
{ .mii;
1027 1028 1029
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r28=r28,r27		};;
1030 1031
{ .mfb;		getf.sig	r17=f104	}
{ .mii;
1032 1033 1034
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,r27
	add		r29=r29,r28		};;
1035 1036
{ .mfb;		getf.sig	r18=f95		}
{ .mii;
1037 1038 1039
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r29,r28
	add		r30=r30,r29		};;
1040
{ .mii;		getf.sig	r19=f86
1041 1042
		add		r17=r17,r16
		mov		carry3=0	}
1043
{ .mii;
1044 1045 1046
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r30,r29
	add		r30=r30,carry2		};;
1047
{ .mii;		getf.sig	r20=f77
1048 1049
		cmp.ltu		p7,p0=r17,r16
		add		r18=r18,r17	}
1050
{ .mii;
1051 1052
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r30,carry2	};;
1053 1054
{ .mfb;		getf.sig	r21=f68		}
{ .mii;	st8		[r33]=r30,16
1055 1056
(p6)	add		carry1=1,carry1		};;

1057 1058
{ .mfb;	getf.sig	r24=f114		}
{ .mii;	(p7)	add		carry3=1,carry3
1059 1060
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	};;
1061 1062
{ .mfb;	getf.sig	r25=f105		}
{ .mii;	(p7)	add		carry3=1,carry3
1063 1064
		cmp.ltu		p7,p0=r19,r18
		add		r20=r20,r19	};;
1065 1066
{ .mfb;	getf.sig	r26=f96			}
{ .mii;	(p7)	add		carry3=1,carry3
1067 1068
		cmp.ltu		p7,p0=r20,r19
		add		r21=r21,r20	};;
1069 1070
{ .mfb;	getf.sig	r27=f87			}
{ .mii;	(p7)	add		carry3=1,carry3
1071 1072
		cmp.ltu		p7,p0=r21,r20
		add		r21=r21,carry1	};;
1073
{ .mib;	getf.sig	r28=f78			
1074
	add		r25=r25,r24		}
1075
{ .mib;	(p7)	add		carry3=1,carry3
1076
		cmp.ltu		p7,p8=r21,carry1};;
1077
{ .mii;		st8		[r32]=r21,16
1078 1079 1080
	(p7)	add		carry2=1,carry3
	(p8)	add		carry2=0,carry3	}

1081
{ .mii;	mov		carry1=0
1082 1083
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
1084 1085
{ .mfb;		getf.sig	r16=f115	}
{ .mii;
1086 1087 1088
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
1089 1090
{ .mfb;		getf.sig	r17=f106	}
{ .mii;
1091 1092 1093
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r28=r28,r27		};;
1094 1095
{ .mfb;		getf.sig	r18=f97		}
{ .mii;
1096 1097 1098
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,r27
	add		r28=r28,carry2		};;
1099
{ .mib;		getf.sig	r19=f88
1100
		add		r17=r17,r16	}
1101
{ .mib;
1102 1103
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r28,carry2	};;
1104
{ .mii;	st8		[r33]=r28,16
1105 1106
(p6)	add		carry1=1,carry1		}

1107
{ .mii;		mov		carry2=0
1108 1109
		cmp.ltu		p7,p0=r17,r16
		add		r18=r18,r17	};;
1110 1111
{ .mfb;	getf.sig	r24=f116		}
{ .mii;	(p7)	add		carry2=1,carry2
1112 1113
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	};;
1114 1115
{ .mfb;	getf.sig	r25=f107		}
{ .mii;	(p7)	add		carry2=1,carry2
1116 1117
		cmp.ltu		p7,p0=r19,r18
		add		r19=r19,carry1	};;
1118 1119
{ .mfb;	getf.sig	r26=f98			}
{ .mii;	(p7)	add		carry2=1,carry2
1120
		cmp.ltu		p7,p0=r19,carry1};;
1121
{ .mii;		st8		[r32]=r19,16
1122 1123
	(p7)	add		carry2=1,carry2	}

1124
{ .mfb;	add		r25=r25,r24		};;
1125

1126 1127
{ .mfb;		getf.sig	r16=f117	}
{ .mii;	mov		carry1=0
1128 1129
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
1130 1131
{ .mfb;		getf.sig	r17=f108	}
{ .mii;
1132 1133 1134
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r26=r26,carry2		};;
1135 1136
{ .mfb;	nop.m	0x0				}
{ .mii;
1137 1138
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,carry2	};;
1139
{ .mii;	st8		[r33]=r26,16
1140 1141
(p6)	add		carry1=1,carry1		}

1142 1143 1144
{ .mfb;		add		r17=r17,r16	};;
{ .mfb;	getf.sig	r24=f118		}
{ .mii;		mov		carry2=0
1145 1146
		cmp.ltu		p7,p0=r17,r16
		add		r17=r17,carry1	};;
1147
{ .mii;	(p7)	add		carry2=1,carry2
1148
		cmp.ltu		p7,p0=r17,carry1};;
1149
{ .mii;		st8		[r32]=r17
1150
	(p7)	add		carry2=1,carry2	};;
1151 1152
{ .mfb;	add		r24=r24,carry2		};;
{ .mib;	st8		[r33]=r24		}
1153

1154
{ .mib;	rum		1<<5		// clear um.mfh
1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175
	br.ret.sptk.many	b0	};;
.endp	bn_mul_comba8#
#undef	carry3
#undef	carry2
#undef	carry1
#endif

#if 1
// It's possible to make it faster (see comment to bn_sqr_comba8), but
// I reckon it isn't worth the effort. Basically because the routine
// (actually both of them) is practically never called... So I just play
// the same trick as with bn_sqr_comba8.
//
// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
//
// Implementation note: squaring is done as a plain multiplication of
// a by itself. The prologue below allocates one local (r34), copies
// the a pointer into it so that r34 plays the role of the b argument,
// computes the same element pointers that the bn_mul_comba4 prologue
// computes (r14/r15/r16 = a+8/a+16/a+24, r17/r18 = b+8/b+16), and then
// branches to .L_cheat_entry_point4 inside bn_mul_comba4, which does
// the actual work and returns to our caller.
//
.global	bn_sqr_comba4#
.proc	bn_sqr_comba4#
.align	64
bn_sqr_comba4:
	.prologue
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
// ILP32 HP-UX: widen the 32-bit pointer arguments first.
{ .mii;	alloc   r2=ar.pfs,2,1,0,0
	addp4	r32=0,r32
	addp4	r33=0,r33		};;
{ .mii;
#else
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
#endif
	mov	r34=r33			// b = a
	add	r14=8,r33		};;	// &a[1]
	.body
{ .mii;	add	r17=8,r34		// &b[1]
	add	r15=16,r33		// &a[2]
	add	r18=16,r34		}	// &b[2]
{ .mfb;	add	r16=24,r33		// &a[3]
	br	.L_cheat_entry_point4	};;
.endp	bn_sqr_comba4#
#endif

#if 1
// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
//
// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
//
// Register roles, as set up by the code below:
//	r32		r, result pointer (even result words, stepped by 16)
//	r33		a on entry; later r+8 (odd result words, stepped by 16)
//	r34		b
//	r14-r16		&a[1..3] on entry; r14/r15 are then recycled as
//			carry1/carry2 once the operands have been loaded
//	r17-r19		&b[1..3] on entry, scratch afterwards
//	f32-f35		a[0..3]
//	f120-f123	b[0..3]
// Eight 64-bit words are stored to r[0..7]. Column sums are accumulated
// with add, and every add is followed by an unsigned cmp.ltu whose
// predicate bumps the carry counter that is folded into the next column.
//
// .L_cheat_entry_point4 is also branched to by bn_sqr_comba4, which
// arrives with r34 == r33 and r14-r18 already computed; r19 is therefore
// computed after the label.
//
#define	carry1	r14
#define	carry2	r15
.global	bn_mul_comba4#
.proc	bn_mul_comba4#
.align	64
bn_mul_comba4:
	.prologue
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
// ILP32 HP-UX: widen the 32-bit pointer arguments first.
{ .mii;	alloc   r2=ar.pfs,3,0,0,0
	addp4	r33=0,r33
	addp4	r34=0,r34		};;
{ .mii;	addp4	r32=0,r32
#else
{ .mii;	alloc	r2=ar.pfs,3,0,0,0
#endif
	add	r14=8,r33
	add	r17=8,r34		}
	.body
{ .mii;	add	r15=16,r33
	add	r18=16,r34
	add	r16=24,r33		};;
.L_cheat_entry_point4:
{ .mmi;	add	r19=24,r34

	ldf8	f32=[r33]		}

{ .mmi;	ldf8	f120=[r34]
	ldf8	f121=[r17]		};;
{ .mmi;	ldf8	f122=[r18]
	ldf8	f123=[r19]		}

{ .mmi;	ldf8	f33=[r14]
	ldf8	f34=[r15]		}
{ .mfi;	ldf8	f35=[r16]

		xma.hu	f41=f32,f120,f0		}
// a[0]*b[0..3]: low halves land in f40/f50/f60/f70, high halves in
// f41/f51/f61/f71 and are fed as addends into the next row.
{ .mfi;		xma.lu	f40=f32,f120,f0		};;
{ .mfi;		xma.hu	f51=f32,f121,f0		}
{ .mfi;		xma.lu	f50=f32,f121,f0		};;
{ .mfi;		xma.hu	f61=f32,f122,f0		}
{ .mfi;		xma.lu	f60=f32,f122,f0		};;
{ .mfi;		xma.hu	f71=f32,f123,f0		}
{ .mfi;		xma.lu	f70=f32,f123,f0		};;//
// Major stall takes place here, and 3 more places below. Result from
// first xma is not available for another 3 ticks.
{ .mfi;	getf.sig	r16=f40
		xma.hu	f42=f33,f120,f41
	add		r33=8,r32		}	// r33 = &r[1] from now on
{ .mfi;		xma.lu	f41=f33,f120,f41	};;
{ .mfi;	getf.sig	r24=f50
		xma.hu	f52=f33,f121,f51	}
{ .mfi;		xma.lu	f51=f33,f121,f51	};;
{ .mfi;	st8		[r32]=r16,16		// r[0]
		xma.hu	f62=f33,f122,f61	}
{ .mfi;		xma.lu	f61=f33,f122,f61	};;
{ .mfi;		xma.hu	f72=f33,f123,f71	}
{ .mfi;		xma.lu	f71=f33,f123,f71	};;//
//-------------------------------------------------//
{ .mfi;	getf.sig	r25=f41
		xma.hu	f43=f34,f120,f42	}
{ .mfi;		xma.lu	f42=f34,f120,f42	};;
{ .mfi;	getf.sig	r16=f60
		xma.hu	f53=f34,f121,f52	}
{ .mfi;		xma.lu	f52=f34,f121,f52	};;
{ .mfi;	getf.sig	r17=f51
		xma.hu	f63=f34,f122,f62
	add		r25=r25,r24		}
{ .mfi;	mov		carry1=0
		xma.lu	f62=f34,f122,f62	};;
{ .mfi;	st8		[r33]=r25,16		// r[1]
		xma.hu	f73=f34,f123,f72
	cmp.ltu		p6,p0=r25,r24		}
{ .mfi;		xma.lu	f72=f34,f123,f72	};;//
//-------------------------------------------------//
{ .mfi;	getf.sig	r18=f42
		xma.hu	f44=f35,f120,f43
(p6)	add		carry1=1,carry1		}
{ .mfi;	add		r17=r17,r16
		xma.lu	f43=f35,f120,f43
	mov		carry2=0		};;
{ .mfi;	getf.sig	r24=f70
		xma.hu	f54=f35,f121,f53
	cmp.ltu		p7,p0=r17,r16		}
{ .mfi;		xma.lu	f53=f35,f121,f53	};;
{ .mfi;	getf.sig	r25=f61
		xma.hu	f64=f35,f122,f63
	add		r18=r18,r17		}
{ .mfi;		xma.lu	f63=f35,f122,f63
(p7)	add		carry2=1,carry2		};;
{ .mfi;	getf.sig	r26=f52
		xma.hu	f74=f35,f123,f73
	cmp.ltu		p7,p0=r18,r17		}
{ .mfi;		xma.lu	f73=f35,f123,f73
	add		r18=r18,carry1		};;
//-------------------------------------------------//
// All multiplications have been issued; what follows only drains the
// products out of the FP registers and sums up the remaining columns.
{ .mii;	st8		[r32]=r18,16		// r[2]
(p7)	add		carry2=1,carry2
	cmp.ltu		p7,p0=r18,carry1	};;

{ .mfi;	getf.sig	r27=f43	// last major stall
(p7)	add		carry2=1,carry2		};;
{ .mii;		getf.sig	r16=f71
	add		r25=r25,r24
	mov		carry1=0		};;
{ .mii;		getf.sig	r17=f62
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r27=r27,r26		};;
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,r26
	add		r27=r27,carry2		};;
{ .mii;		getf.sig	r18=f53
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r27,carry2	};;
{ .mfi;	st8		[r33]=r27,16		// r[3]
(p6)	add		carry1=1,carry1		}

{ .mii;		getf.sig	r19=f44
		add		r17=r17,r16
		mov		carry2=0	};;
{ .mii;	getf.sig	r24=f72
		cmp.ltu		p7,p0=r17,r16
		add		r18=r18,r17	};;
{ .mii;	(p7)	add		carry2=1,carry2
		cmp.ltu		p7,p0=r18,r17
		add		r19=r19,r18	};;
{ .mii;	(p7)	add		carry2=1,carry2
		cmp.ltu		p7,p0=r19,r18
		add		r19=r19,carry1	};;
{ .mii;	getf.sig	r25=f63
	(p7)	add		carry2=1,carry2
		cmp.ltu		p7,p0=r19,carry1};;
{ .mii;		st8		[r32]=r19,16	// r[4]
	(p7)	add		carry2=1,carry2	}

{ .mii;	getf.sig	r26=f54
	add		r25=r25,r24
	mov		carry1=0		};;
{ .mii;		getf.sig	r16=f73
	cmp.ltu		p6,p0=r25,r24
	add		r26=r26,r25		};;
{ .mii;
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,r25
	add		r26=r26,carry2		};;
{ .mii;		getf.sig	r17=f64
(p6)	add		carry1=1,carry1
	cmp.ltu		p6,p0=r26,carry2	};;
{ .mii;	st8		[r33]=r26,16		// r[5]
(p6)	add		carry1=1,carry1		}

{ .mii;	getf.sig	r24=f74
		add		r17=r17,r16
		mov		carry2=0	};;
{ .mii;		cmp.ltu		p7,p0=r17,r16
		add		r17=r17,carry1	};;

{ .mii;	(p7)	add		carry2=1,carry2
		cmp.ltu		p7,p0=r17,carry1};;
{ .mii;		st8		[r32]=r17,16	// r[6]
	(p7)	add		carry2=1,carry2	};;

{ .mii;	add		r24=r24,carry2		};;
{ .mii;	st8		[r33]=r24		}	// r[7]

{ .mib;	rum		1<<5		// clear um.mfh
	br.ret.sptk.many	b0	};;
.endp	bn_mul_comba4#
#undef	carry2
#undef	carry1
#endif

#if 1
//
// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
//
// In a nutshell it's a port of my MIPS III/IV implementation.
//
// Inputs:	r32 = h, r33 = l, r34 = d
// Outputs:	r8 = quotient (all ones if d is zero),
//		r9 = remainder (see the final shr.u)
// Calls abort() if the quotient would not fit, i.e. if bits are lost
// from h when it is shifted left by the divisor's normalization count.
// The quotient is produced in two 32-bit halves; each half is first
// estimated with the floating-point helper .L_udiv64_32_b6 (or taken
// as 0xffffffff when the top halves of H and D are equal) and then
// corrected in a short loop.
//
#define	AT	r14
#define	H	r16
#define	HH	r20
#define	L	r17
#define	D	r18
#define	DH	r22
#define	I	r21

#if 0
// Some preprocessors (most notably HP-UX) appear to be allergic to
// macros enclosed in parentheses [as these three were].
#define	cont	p16
#define	break	p0	// p20
#define	equ	p24
#else
cont=p16
break=p0
equ=p24
#endif

.global	abort#
.global	bn_div_words#
.proc	bn_div_words#
.align	64
bn_div_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,3,5,0,8
	.save	b0,r3
	mov		r3=b0
	.save	pr,r10
	mov		r10=pr		};;
// Division by zero: return -1 straight away.
{ .mmb;	cmp.eq		p6,p0=r34,r0
	mov		r8=-1
(p6)	br.ret.spnt.many	b0	};;

	.body
{ .mii;	mov		H=r32		// save h
	mov		ar.ec=0		// don't rotate at exit
	mov		pr.rot=0	}
{ .mii;	mov		L=r33		// save l
	mov		r36=r0		};;

// Normalize the divisor: keep doubling d while its top bit is clear,
// counting the shifts. NOTE(review): this relies on br.wtop register
// rotation to carry r33 -> r34 (shifted d) and r35 -> r36 (shift count)
// into the next iteration -- confirm against the architecture manual.
.L_divw_shift:	// -vv- note signed comparison
{ .mfi;	(p0)	cmp.lt		p16,p0=r0,r34	// d
	(p0)	shladd		r33=r34,1,r0	}
{ .mfb;	(p0)	add		r35=1,r36
	(p0)	nop.f		0x0
(p16)	br.wtop.dpnt		.L_divw_shift	};;

{ .mii;	mov		D=r34
	shr.u		DH=r34,32
	sub		r35=64,r36		};;
{ .mii;	setf.sig	f7=DH
	shr.u		AT=H,r35	// bits of h that the shift would drop
	mov		I=r36			};;
{ .mib;	cmp.ne		p6,p0=r0,AT
	shl		H=H,r36
(p6)	br.call.spnt.clr	b0=abort	};;	// overflow, die...

// Apply the same shift to the h:l pair.
{ .mfi;	fcvt.xuf.s1	f7=f7
	shr.u		AT=L,r35		};;
{ .mii;	shl		L=L,r36
	or		H=H,AT			};;

{ .mii;	nop.m		0x0
	cmp.leu		p6,p0=D,H;;
(p6)	sub		H=H,D			}

{ .mlx;	setf.sig	f14=D
	movl		AT=0xffffffff		};;
///////////////////////////////////////////////////////////
// Upper quotient half: estimate q.
{ .mii;	setf.sig	f6=H
	shr.u		HH=H,32;;
	cmp.eq		p6,p7=HH,DH		};;
{ .mfb;
(p6)	setf.sig	f8=AT
(p7)	fcvt.xuf.s1	f6=f6
(p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;

{ .mfi;	getf.sig	r33=f8				// q
	xmpy.lu		f9=f8,f14		}
{ .mfi;	xmpy.hu		f10=f8,f14
	shrp		H=H,L,32		};;

{ .mmi;	getf.sig	r35=f9				// tl
	getf.sig	r31=f10			};;	// th

// Correction loop: each pass prepares q-1 (r32) and t-D (r34, with the
// borrow taken out of th). NOTE(review): as in the shift loop, rotation
// by br.wtop presumably makes these the new r33/r35 -- confirm.
.L_divw_1st_iter:
{ .mii;	(p0)	add		r32=-1,r33
	(p0)	cmp.eq		equ,cont=HH,r31		};;
{ .mii;	(p0)	cmp.ltu		p8,p0=r35,D
	(p0)	sub		r34=r35,D
	(equ)	cmp.leu		break,cont=r35,H	};;
{ .mib;	(cont)	cmp.leu		cont,break=HH,r31
	(p8)	add		r31=-1,r31
(cont)	br.wtop.spnt		.L_divw_1st_iter	};;
///////////////////////////////////////////////////////////
{ .mii;	sub		H=H,r35
	shl		r8=r33,32		// upper quotient half
	shl		L=L,32			};;
///////////////////////////////////////////////////////////
// Lower quotient half: same estimate-and-correct sequence.
{ .mii;	setf.sig	f6=H
	shr.u		HH=H,32;;
	cmp.eq		p6,p7=HH,DH		};;
{ .mfb;
(p6)	setf.sig	f8=AT
(p7)	fcvt.xuf.s1	f6=f6
(p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;

{ .mfi;	getf.sig	r33=f8				// q
	xmpy.lu		f9=f8,f14		}
{ .mfi;	xmpy.hu		f10=f8,f14
	shrp		H=H,L,32		};;

{ .mmi;	getf.sig	r35=f9				// tl
	getf.sig	r31=f10			};;	// th

.L_divw_2nd_iter:
{ .mii;	(p0)	add		r32=-1,r33
	(p0)	cmp.eq		equ,cont=HH,r31		};;
{ .mii;	(p0)	cmp.ltu		p8,p0=r35,D
	(p0)	sub		r34=r35,D
	(equ)	cmp.leu		break,cont=r35,H	};;
{ .mib;	(cont)	cmp.leu		cont,break=HH,r31
	(p8)	add		r31=-1,r31
(cont)	br.wtop.spnt		.L_divw_2nd_iter	};;
///////////////////////////////////////////////////////////
{ .mii;	sub	H=H,r35
	or	r8=r8,r33		// merge in lower quotient half
	mov	ar.pfs=r2		};;
{ .mii;	shr.u	r9=H,I			// remainder if anybody wants it
	mov	pr=r10,0x1ffff		}
{ .mfb;	br.ret.sptk.many	b0	};;

// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
// procedure.
//
// inputs:	f6 = (double)a, f7 = (double)b
// output:	f8 = (int)(a/b)
// clobbered:	f8,f9,f10,f11,pred
pred=p15
// One can argue that this snippet is copyrighted to Intel
// Corporation, as it's essentially identical to one of those
// found in "Divide, Square Root and Remainder" section at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// Yes, I admit that the referred code was used as template,
// but after I realized that there hardly is any other instruction
// sequence which would perform this operation. I mean I figure that
// any independent attempt to implement high-performance division
// will result in code virtually identical to the Intel code. It
// should be noted though that below division kernel is 1 cycle
// faster than Intel one (note commented splits:-), not to mention
// original prologue (rather lack of one) and epilogue.
.align	32
.skip	16
.L_udiv64_32_b6:
	frcpa.s1	f8,pred=f6,f7;;		// [0]  y0 = 1 / b

(pred)	fnma.s1		f9=f7,f8,f1		// [5]  e0 = 1 - b * y0
(pred)	fmpy.s1		f10=f6,f8;;		// [5]  q0 = a * y0
(pred)	fmpy.s1		f11=f9,f9		// [10] e1 = e0 * e0
(pred)	fma.s1		f10=f9,f10,f10;;	// [10] q1 = q0 + e0 * q0
(pred)	fma.s1		f8=f9,f8,f8	//;;	// [15] y1 = y0 + e0 * y0
(pred)	fma.s1		f9=f11,f10,f10;;	// [15] q2 = q1 + e1 * q1
(pred)	fma.s1		f8=f11,f8,f8	//;;	// [20] y2 = y1 + e1 * y1
(pred)	fnma.s1		f10=f7,f9,f6;;		// [20] r2 = a - b * q2
(pred)	fma.s1		f8=f10,f8,f9;;		// [25] q3 = q2 + r2 * y2

	fcvt.fxu.trunc.s1	f8=f8		// [30] q = trunc(q3)
	br.ret.sptk.many	b6;;
.endp	bn_div_words#
#endif