rc4-amd64.pl 4.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from config
# line results in ~40% improvement (yes, even for C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* config
# line *without* RC4_CHAR! As for coding "secret," I bet on partial
# register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
# I simply use 'inc %r8b'. Even though the optimization manual discourages
# operating on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...

# The sole command-line argument is the name of the file to generate;
# that name also selects the assembler dialect.
$output=shift;

# A name such as "rc4-win64.s" or "rc4-win64.asm" selects Win64/MASM
# output.  [The former pattern, /win64.[s|asm]/, used a character class
# where an alternation was meant and an unescaped dot, so it also accepted
# unrelated names such as "win64xm".]
$win64=1 if (defined($output) && $output =~ /win64\.(?:s|asm)/);

# Redirect STDOUT to the requested file.  The low-precedence "or" is
# essential: with "||" the die was attached to the file-name string and
# could never fire, so a failed open went unnoticed.  The three-argument
# form keeps odd characters in the name from being read as an open mode.
# When no file name is given the assembly simply goes to the inherited
# STDOUT.
if (defined($output)) {
    open(STDOUT,">",$output) or die "can't open $output: $!";
}

# Registers holding the four RC4() arguments, in argument order
# (arg1..arg4).  For Win64 arg3 and arg4 arrive in %r8 and %r9 and are
# moved to %rsi and %rdi by the generated prologue, so $inp/$out name
# their final home; otherwise every argument is used where it arrives.
($dat,$len,$inp,$out) = defined($win64)
    ? ("%rcx","%rdx","%rsi","%rdi")
    : ("%rdi","%rsi","%rdx","%rcx");

# $XX/$YY carry the two table indices that are loaded from, and written
# back to, the words in front of the table; $TX/$TY are scratch registers
# for the table entries being swapped.
($XX,$YY) = ("%r10","%r11");
($TX,$TY) = ("%r8","%r9");

# PTR(operand) - translate a generic memory operand into the syntax of
# the assembler being targeted and return the translated string.
#
# The operand is written as SIZE[disp-free address], optionally with a
# displacement between the size keyword and the bracket, for example
# "DWORD-8[%rdi]" or "DWORD[%rdi+%r10*4]".
#
#   Win64:     base and scaled index are swapped, [%rN+%rM*4]->[%rM*4+%rN];
#   otherwise: the bracket form becomes the (base,index,scale) form.
#
# The size keyword is left in place; it is dealt with when the complete
# output is post-processed.  Reads the global $win64.
#
# Declared without a prototype: the previous empty prototype "()" claimed
# that the sub takes no arguments and only worked because every caller
# uses the prototype-bypassing &PTR(...) form.
sub PTR {
    my ($ret) = @_;
    if (defined($win64)) {
	$ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g; # [%rN+%rM*4]->[%rM*4+%rN]
    } else {
	$ret =~ s/[\+\*]/,/g;		# [%rN+%rM*4]->[%rN,%rM,4]
	$ret =~ s/\[([^\]]+)\]/($1)/g;	# [%rN]->(%rN)
    }
    return $ret;
}

# Function prologue, GNU as flavour: section, symbol and alignment
# directives, then an immediate return when the length argument is zero
# ("or" of the register with itself sets ZF for a zero length).  The
# return is emitted as raw bytes for the two-byte "repz ret" form.
$code=<<___ if (!defined($win64));
.text

.globl	RC4
.type	RC4,\@function
.align	16
RC4:	or	$len,$len
	jne	.Lentry
	.byte	0xF3,0xC3	# repz ret, 2-byte ret
.Lentry:
___
# Function prologue, Win64/MASM flavour: same zero-length check, then
# %rdi and %rsi are pushed (they are popped again at .Lexit), 40 bytes of
# stack are reserved and arg3/arg4 are moved from %r8/%r9 into $inp/$out.
#
# This is 64-bit code, so the full-width %rdi/%rsi/%rsp must be named;
# the 32-bit %edi/%esi/%esp forms cannot be pushed in 64-bit mode and
# would truncate the stack pointer.
#
# The return bytes are written one per DB line with a leading zero: MASM
# hex literals must start with a digit, and a single "DB a,b" line would
# have its two bytes reversed by the operand-swapping substitution that
# is applied to the whole Win64 output.
$code=<<___ if (defined($win64));
TEXT	SEGMENT
PUBLIC	RC4
ALIGN	16
RC4	PROC NEAR
	or	$len,$len
	jne	.Lentry
	DB	0F3h		; repz ret: 2-byte return
	DB	0C3h
.Lentry:
	push	%rdi
	push	%rsi
	sub	\$40,%rsp
	mov	%r8,$inp
	mov	%r9,$out
___
# Common entry code and the 8-bytes-per-iteration loop.
#
# $dat is advanced by 8 so that the two 32-bit words loaded into $XX and
# $YY sit at offsets -8 and -4 and the table of 32-bit entries can be
# indexed directly as [$dat+index*4].  If no bit above the low three is
# set in $len (fewer than 8 bytes) control goes straight to .Lloop1.
#
# Each .Lloop8 iteration loads one quadword of input into %rax and xors
# eight generated bytes into it.  One step is: bump $XX (byte-wide, so it
# wraps at 256), load entry $TX at $XX, add it to $YY (byte-wide), swap
# the entries at $XX and $YY, then fetch the entry at index $TX+$TY
# (byte-wide sum) as the output byte.  The fragment below performs the
# step for byte 0 and already bumps $XX for byte 1.
$code.=<<___;
	add	\$8,$dat
	movl	`&PTR("DWORD-8[$dat]")`,$XX#d
	movl	`&PTR("DWORD-4[$dat]")`,$YY#d
	test	\$-8,$len
	jz	.Lloop1
.align	16
.Lloop8:
	movq	`&PTR("QWORD[$inp]")`,%rax

	inc	$XX#b
	movl	`&PTR("DWORD[$dat+$XX*4]")`,$TX#d
	add	$TX#b,$YY#b
	movl	`&PTR("DWORD[$dat+$YY*4]")`,$TY#d
	movl	$TX#d,`&PTR("DWORD[$dat+$YY*4]")`
	movl	$TY#d,`&PTR("DWORD[$dat+$XX*4]")`
	add	$TY#b,$TX#b
	inc	$XX#b
	movl	`&PTR("DWORD[$dat+$TX*4]")`,$TY#d
	xor	$TY,%rax
___
# Bytes 1..6 of the quadword: the same step, with the output byte shifted
# left by 8*$i bits before it is xored into %rax.  $i is interpolated into
# the heredoc right away; the resulting backticked product (e.g. 8*1) is
# evaluated when the finished text is post-processed.  Each copy ends by
# bumping $XX for the following byte.
for ($i=1;$i<=6;$i++) {
$code.=<<___;
	movl	`&PTR("DWORD[$dat+$XX*4]")`,$TX#d
	add	$TX#b,$YY#b
	movl	`&PTR("DWORD[$dat+$YY*4]")`,$TY#d
	movl	$TX#d,`&PTR("DWORD[$dat+$YY*4]")`
	movl	$TY#d,`&PTR("DWORD[$dat+$XX*4]")`
	add	$TY#b,$TX#b
	movl	`&PTR("DWORD[$dat+$TX*4]")`,$TY#d
	shl	\$`8*$i`,$TY
	inc	$XX#b
	xor	$TY,%rax
___
}
# Byte 7, interleaved with the length and pointer updates.  There is no
# trailing increment of $XX here: both .Lloop8 and .Lloop1 bump it at
# their top.  The finished quadword is stored at -8[$out] because $out
# has already been advanced.  The loop repeats while at least 8 bytes
# remain, a non-zero remainder is handed to .Lloop1, and at .Lexit $XX
# and $YY are written back to the two words in front of the table.  The
# return sequence that follows .Lexit is appended separately.
$code.=<<___;
	movl	`&PTR("DWORD[$dat+$XX*4]")`,$TX#d
	add	$TX#b,$YY#b
	movl	`&PTR("DWORD[$dat+$YY*4]")`,$TY#d
	movl	$TX#d,`&PTR("DWORD[$dat+$YY*4]")`
	movl	$TY#d,`&PTR("DWORD[$dat+$XX*4]")`
	sub	\$8,$len
	add	$TY#b,$TX#b
	add	\$8,$out
	movl	`&PTR("DWORD[$dat+$TX*4]")`,$TY#d
	shl	\$56,$TY
	add	\$8,$inp
	xor	$TY,%rax

	mov	%rax,`&PTR("QWORD-8[$out]")`

	test	\$-8,$len
	jnz	.Lloop8
	cmp	\$0,$len
	jne	.Lloop1
.Lexit:
	movl	$XX#d,`&PTR("DWORD-8[$dat]")`
	movl	$YY#d,`&PTR("DWORD-4[$dat]")`
___
# Return sequence following .Lexit.
#
# Win64: release the 40 bytes of stack reserved on entry and pop the two
# registers pushed there, in reverse order.  This is 64-bit code, so the
# full-width %rsp/%rsi/%rdi must be named; the 32-bit %esp/%esi/%edi
# forms cannot be popped in 64-bit mode and would truncate the stack
# pointer.  The "repz ret" bytes are written one per DB line with a
# leading zero: MASM hex literals must start with a digit, and a single
# "DB a,b" line would have its two bytes reversed by the operand-swapping
# substitution applied to the whole Win64 output.
$code.=<<___ if (defined($win64));
	add	\$40,%rsp
	pop	%rsi
	pop	%rdi
	DB	0F3h		; repz ret: 2-byte return
	DB	0C3h
___
# GNU as: nothing to restore, just the two-byte return.
$code.=<<___ if (!defined($win64));
	.byte	0xF3,0xC3	# repz ret, 2-byte ret
___
# Byte-at-a-time loop.  It is entered from the initial length test when
# fewer than 8 bytes were requested and after .Lloop8 when a remainder is
# left.  Per byte: load the input byte zero-extended into %rax, bump $XX
# (byte-wide), load the entry at $XX, add it to $YY (byte-wide), swap the
# entries at $XX and $YY, fetch the entry at their byte-wide sum, xor it
# into %rax and store the low byte to the output.  Both pointers advance
# by one and $len counts down; when it reaches zero control returns to
# .Lexit.
# NOTE(review): the purpose of the lone "nop" is not evident from this
# file - presumably instruction alignment/scheduling; confirm before
# removing it.
$code.=<<___;
.align	16
.Lloop1:
	movzb	`&PTR("BYTE[$inp]")`,%rax
	inc	$XX#b
	nop
	movl	`&PTR("DWORD[$dat+$XX*4]")`,$TX#d
	add	$TX#b,$YY#b
	movl	`&PTR("DWORD[$dat+$YY*4]")`,$TY#d
	movl	$TX#d,`&PTR("DWORD[$dat+$YY*4]")`
	movl	$TY#d,`&PTR("DWORD[$dat+$XX*4]")`
	add	$TY#b,$TX#b
	movl	`&PTR("DWORD[$dat+$TX*4]")`,$TY#d
	xor	$TY,%rax
	inc	$inp
	movb	%al,`&PTR("BYTE[$out]")`
	inc	$out
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit
___
# Close the function.
#
# Win64/MASM: end the procedure, close the TEXT segment that the Win64
# prologue opens with "TEXT SEGMENT", and terminate the module with the
# END directive; without the last two the assembler rejects the file
# (unmatched segment nesting / missing END).
# GNU as: record the symbol size.
if (defined($win64)) {
    $code.="RC4	ENDP\nTEXT	ENDS\nEND\n";
} else {
    $code.=".size	RC4,.-RC4\n";
}

# Post-process the accumulated text and write it out.

# "%r10#d" -> "%r10d": the "#" merely separates a register variable from
# its size suffix inside the heredocs.
$code =~ s/#([bwd])/$1/gm;

# Replace every `expr` with the value of the Perl expression (the &PTR()
# calls and the shift counts).  A failing expression is now fatal instead
# of silently becoming an empty operand.
$code =~ s{\`([^\`]*)\`}{
    my $expr = $1;
    my $val  = eval $expr;
    die "can't evaluate '$expr': $@" if $@;
    $val;
}gem;

if (defined($win64)) {
    # Convert the AT&T-flavoured text to MASM syntax.
    $code =~ s/\.align/ALIGN/gm;
    $code =~ s/[\$%]//gm;		# no "$" immediates, no "%" registers
    $code =~ s/\.L/\$L/gm;		# local labels
    # "op src,dst" -> "op dst,src".  DB lines carry data bytes, not
    # operands, and must keep their order.
    $code =~ s/\b(?!DB\b)([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm;
    $code =~ s/([QD]*WORD|BYTE)/$1 PTR /gm;
    # Zero-extending byte load is spelled "movzx" in MASM; it has to be
    # handled before the size suffixes are dropped, as stripping the "b"
    # would leave the non-existent mnemonic "movz".
    $code =~ s/movzb/movzx/gm;
    $code =~ s/\bmov[bwlq]\b/mov/gm;	# MASM takes the size from operands
} else {
    # GNU as takes the operand size from the mnemonic suffix.
    $code =~ s/[QD]*WORD|BYTE//gm;
}
print $code;

# Buffered write errors (e.g. a full disk) only surface on close.
close(STDOUT) or die "error closing STDOUT: $!";