poly1305-s390x.pl 4.5 KB
Newer Older
C
code4lala 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for s390x.
#
# June 2015
#
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compiler improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
# On side note, z13 enables vector base 2^26 implementation...

$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$sp="%r15";

my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));

$code.=<<___;
.text

.globl	poly1305_init
.type	poly1305_init,\@function
.align	16
poly1305_init:
	lghi	%r0,0
	lghi	%r1,-1
	stg	%r0,0($ctx)		# zero hash value
	stg	%r0,8($ctx)
	stg	%r0,16($ctx)

	cl${g}r	$inp,%r0
	je	.Lno_key

	lrvg	%r4,0($inp)		# load little-endian key
	lrvg	%r5,8($inp)

	nihl	%r1,0xffc0		# 0xffffffc0ffffffff
	srlg	%r0,%r1,4		# 0x0ffffffc0fffffff
	srlg	%r1,%r1,4
	nill	%r1,0xfffc		# 0x0ffffffc0ffffffc

	ngr	%r4,%r0
	ngr	%r5,%r1

	stg	%r4,32($ctx)
	stg	%r5,40($ctx)

.Lno_key:
	lghi	%r2,0
	br	%r14
.size	poly1305_init,.-poly1305_init
___
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));

$code.=<<___;
.globl	poly1305_blocks
.type	poly1305_blocks,\@function
.align	16
poly1305_blocks:
	srl${g}	$len,4			# fixed-up in 64-bit build
	lghi	%r0,0
	cl${g}r	$len,%r0
	je	.Lno_data

	stm${g}	%r6,%r14,`6*$SIZE_T`($sp)

	llgfr   $padbit,$padbit		# clear upper half, much needed with
					# non-64-bit ABI
	lg	$r0,32($ctx)		# load key
	lg	$r1,40($ctx)

	lg	$h0,0($ctx)		# load hash value
	lg	$h1,8($ctx)
	lg	$h2,16($ctx)

	st$g	$ctx,`2*$SIZE_T`($sp)	# off-load $ctx
	srlg	$s1,$r1,2
	algr	$s1,$r1			# s1 = r1 + r1>>2
	j	.Loop

.align	16
.Loop:
	lrvg	$d0lo,0($inp)		# load little-endian input
	lrvg	$d1lo,8($inp)
	la	$inp,16($inp)

	algr	$d0lo,$h0		# accumulate input
	alcgr	$d1lo,$h1

	lgr	$h0,$d0lo
	mlgr	$d0hi,$r0		# h0*r0	  -> $d0hi:$d0lo
	lgr	$h1,$d1lo
	mlgr	$d1hi,$s1		# h1*5*r1 -> $d1hi:$d1lo

	mlgr	$t0,$r1			# h0*r1   -> $t0:$h0
	mlgr	$t1,$r0			# h1*r0   -> $t1:$h1
	alcgr	$h2,$padbit

	algr	$d0lo,$d1lo
	lgr	$d1lo,$h2
	alcgr	$d0hi,$d1hi
	lghi	$d1hi,0

	algr	$h1,$h0
	alcgr	$t1,$t0

	msgr	$d1lo,$s1		# h2*s1
	msgr	$h2,$r0			# h2*r0

	algr	$h1,$d1lo
	alcgr	$t1,$d1hi		# $d1hi is zero

	algr	$h1,$d0hi
	alcgr	$h2,$t1

	lghi	$h0,-4			# final reduction step
	ngr	$h0,$h2
	srlg	$t0,$h2,2
	algr	$h0,$t0
	lghi	$t1,3
	ngr	$h2,$t1

	algr	$h0,$d0lo
	alcgr	$h1,$d1hi		# $d1hi is still zero
	alcgr	$h2,$d1hi		# $d1hi is still zero

	brct$g	$len,.Loop

	l$g	$ctx,`2*$SIZE_T`($sp)	# restore $ctx

	stg	$h0,0($ctx)		# store hash value
	stg	$h1,8($ctx)
	stg	$h2,16($ctx)

	lm${g}	%r6,%r14,`6*$SIZE_T`($sp)
.Lno_data:
	br	%r14
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));

$code.=<<___;
.globl	poly1305_emit
.type	poly1305_emit,\@function
.align	16
poly1305_emit:
	stm${g}	%r6,%r9,`6*$SIZE_T`($sp)

	lg	$h0,0($ctx)
	lg	$h1,8($ctx)
	lg	$h2,16($ctx)

	lghi	%r0,5
	lghi	%r1,0
	lgr	$d0,$h0
	lgr	$d1,$h1

	algr	$h0,%r0			# compare to modulus
	alcgr	$h1,%r1
	alcgr	$h2,%r1

	srlg	$h2,$h2,2		# did it borrow/carry?
	slgr	%r1,$h2			# 0-$h2>>2
	lg	$h2,0($nonce)		# load nonce
	lghi	%r0,-1
	lg	$ctx,8($nonce)
	xgr	%r0,%r1			# ~%r1

	ngr	$h0,%r1
	ngr	$d0,%r0
	ngr	$h1,%r1
	ngr	$d1,%r0
	ogr	$h0,$d0
	rllg	$d0,$h2,32		# flip nonce words
	ogr	$h1,$d1
	rllg	$d1,$ctx,32

	algr	$h0,$d0			# accumulate nonce
	alcgr	$h1,$d1

	strvg	$h0,0($mac)		# write little-endian result
	strvg	$h1,8($mac)

	lm${g}	%r6,%r9,`6*$SIZE_T`($sp)
	br	%r14
.size	poly1305_emit,.-poly1305_emit

.string	"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm;

print $code;
close STDOUT or die "error closing STDOUT: $!";