Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
d675c74d
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
1 年多 前同步成功
通知
10
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d675c74d
编写于
11月 26, 2004
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
RC4 IA-64 assembler implementation.
上级
59c70298
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
148 addition
and
0 deletion
+148
-0
crypto/rc4/asm/rc4-ia64.S
crypto/rc4/asm/rc4-ia64.S
+148
-0
未找到文件。
crypto/rc4/asm/rc4-ia64.S
0 → 100644
浏览文件 @
d675c74d
//
===================================================================
=
//
Written
by
Andy
Polyakov
<
appro
@
fy
.
chalmers
.
se
>
for
the
OpenSSL
//
project
.
//
//
Rights
for
redistribution
and
usage
in
source
and
binary
forms
are
//
granted
according
to
the
OpenSSL
license
.
Warranty
of
any
kind
is
//
disclaimed
.
//
===================================================================
=
.
ident
"
rc4
-
ia64.S
,
Version
1
.0
"
.
ident
"
IA
-64
ISA
artwork
by
Andy
Polyakov
<
appro
@
fy
.
chalmers
.
se
>
"
//
What
'
s
wrong
with
compiler
generated
code
?
Because
of
the
nature
of
//
C
language
,
compiler
doesn
't [dare to] reorder loads and stores. But
//
being
memory
-
bound
,
RC4
should
benefit
from
reorder
[
on
in
-
order
-
//
execution
core
such
as
IA
-
64
]
.
But
what
can
we
reorder
?
At
the
very
//
least
we
can
safely
reorder
references
to
key
schedule
in
respect
//
to
input
and
output
streams
.
Secondly
,
less
obvious
,
it
's possible
//
to
pull
up
some
references
to
elements
of
the
key
schedule
itself
.
//
Fact
is
that
such
prior
loads
are
not
safe
only
for
"degenerated"
//
key
schedule
,
when
all
elements
equal
to
the
same
value
,
which
is
//
never
the
case
[
key
schedule
setup
routine
makes
sure
it
's not].
//
Furthermore
.
In
order
to
compress
loop
body
to
the
minimum
,
I
chose
//
to
deploy
deposit
instruction
,
which
substitutes
for
the
whole
//
key
->
data
+((
x
&255)<<
log2
(
sizeof
(
key
->
data
[0]))).
This
unfortunately
//
requires
key
->
data
to
be
aligned
at
sizeof
(
key
->
data
)
boundary
.
//
This
is
why
you
'll find "RC4_INT pad[512-256-2];" addendum to RC4_KEY
//
and
"d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));"
in
//
rc4_skey
.
c
[
and
rc4_enc
.
c
,
where
it
's retained for debugging
//
purposes
].
Throughput
is
~
210
MBps
on
900
MHz
CPU
,
which
is
>
3
x
//
faster
than
gcc
generated
code
and
+
30
%
-
if
compared
to
HP
-
UX
C
.
//
Unrolling
loop
below
should
give
>
30
%
on
top
of
that
...
.
text
.
explicit
// ADDP is the instruction used below to copy the pointer arguments into
// working registers: plain add, except when _HPUX_SOURCE is defined and
// _LP64 is not, in which case addp4 is used.
#if !defined(_HPUX_SOURCE) || defined(_LP64)
# define ADDP add
#else
# define ADDP addp4
#endif
// Size in bytes of one key-schedule element; this is set to sizeof(RC4_INT).
#define SZ 4
//
SZ
==4
seems
to
be
optimal
.
At
least
SZ
==
8
is
not
any
faster
,
not
for
//
assembler
implementation
,
while
SZ
==
1
code
is
~
30
%
slower
.
// Pick the key-schedule load/store mnemonics that match SZ, and OFF, which
// is log2(SZ) and is passed as the bit position to the dep (deposit)
// instructions that form key-schedule element addresses.
#if SZ==1 // RC4_INT is unsigned char
# define LDKEY ld1
# define STKEY st1
# define OFF 0
#elif SZ==4 // RC4_INT is unsigned int
# define LDKEY ld4
# define STKEY st4
# define OFF 2
#elif SZ==8 // RC4_INT is unsigned long
# define LDKEY ld8
# define STKEY st8
# define OFF 3
#else
// Fail early with a clear message instead of leaving LDKEY/STKEY/OFF
// undefined and letting the assembler choke on them later.
# error "SZ must be 1, 4 or 8 (sizeof(RC4_INT))"
#endif
// Symbolic register names used throughout RC4.
out
=
r8
; // [expanded] output pointer
inp
=
r9
; // [expanded] input pointer
prsave
=
r10
; // copy of pr taken on entry, restored at exit
key
=
r28
; // [expanded] pointer to RC4_KEY
ksch
=
r29
; // (key->data+255)[&~(sizeof(key->data)-1)]
xx
=
r30
; // running x index, loaded from key->x
yy
=
r31
; // running y index, loaded from key->y
//
void
RC4
(
RC4_KEY
*
key
,
size_t
len
,
const
void
*
inp
,
void
*
out
)
;
// RC4 entry point. Arguments are read from in0..in3 below:
//   in0 = key (RC4_KEY *), in1 = len, in2 = inp, in3 = out.
// The code up to .Ltop saves ar.pfs/ar.lc/pr, returns early when len==0,
// loads key->x and key->y, and primes the loop counter, the epilogue
// counter and the first key[x] address for the pipelined loop.
.
global
RC4
#
.
proc
RC4
#
.
align
32
.
skip
16
RC4
:
.
prologue
.
fframe
0
.
save
ar
.
pfs
,
r2
.
save
ar
.
lc
,
r3
.
save
pr
,
prsave
// Allocate the register frame and keep copies of pr and the key pointer.
{
.
mii
; alloc r2=ar.pfs,4,12,0,16
mov
prsave
=
pr
ADDP
key
=
0
,
in0
}
;;
// Save ar.lc in r3; return straight away if there is nothing to process.
{
.
mib
; cmp.eq p6,p0=0,in1 // len==0?
mov
r3
=
ar
.
lc
(
p6
)
br.ret.spnt.many
b0
}
;; // emergency exit
.
body
// Names for the rotating registers that carry values between loop stages.
.
rotr
dat
[
4
],
key_x
[
4
],
tx
[
2
],
rnd
[
2
],
key_y
[
2
],
ty
[
1
]
;
// The post-incremented load leaves key pointing at key->y.
{
.
mib
; LDKEY xx=[key],SZ // load key->x
add
in1
=-
1
,
in1
//
adjust
len
for
loop
counter
nop.b
0
}
// Copy the input/output pointers into inp/out.
{
.
mib
; ADDP inp=0,in2
ADDP
out
=
0
,
in3
brp.loop.imp
.
Ltop
,
.
Lexit
-
16
}
;;
// Load key->y, derive ksch from key, and set the loop count to len-1.
{
.
mmi
; LDKEY yy=[key] // load key->y
add
ksch
=(
255
+
1
)*
SZ
,
key
//
as
ksch
will
be
used
with
//
deposit
instruction
only
,
//
I
don
't have to &~255...
mov
ar
.
lc
=
in1
}
// Pre-increment x for the first byte and initialise the rotating predicates.
{
.
mmi
; nop.m 0
add
xx
=
1
,
xx
mov
pr
.
rot
=
1
<<
16
}
;;
// Form the address of key[xx&255] for the first iteration; set ar.ec.
{
.
mii
; nop.m 0
dep
key_x
[
1
]=
xx
,
ksch
,
OFF
,
8
mov
ar
.
ec
=
3
}
;; // note that epilogue counter
//
is
off
by
1
.
I
compensate
//
for
this
at
exit
...
.
Ltop
:
//
The
loop
is
scheduled
for
3
*(
n
+
2
)
spin
-
rate
on
Itanium
2
,
which
//
theoretically
gives
asymptotic
performance
of
clock
frequency
//
divided
by
3
bytes
per
second
,
or
500
MBps
on
1
.5
GHz
CPU
.
Measured
//
performance
however
is
distinctly
lower
than
1
/
4
:
-(
The
culprit
//
seems
to
be
*(
out
++)=
dat
,
which
inadvertently
splits
the
bundle
,
//
even
though
there
is
M
-
unit
available
...
Unrolling
is
due
...
//
Unrolled
loop
should
collect
output
with
variable
shift
instruction
//
in
order
to
avoid
starvation
for
integer
shifter
...
Only
output
//
pointer
has
to
be
aligned
...
It
should
be
possible
to
get
pretty
//
close
to
theoretical
peak
...
// Loop body. Each instruction is gated by one of the predicates p16..p19,
// so a single pass through these bundles does work belonging to several
// different bytes at once; values travel between passes in the rotating
// registers dat[], key_x[], tx[], rnd[], key_y[] and ty[].
// First group: load key[xx] and key[yy], form the address of
// key[(tx+ty)&255], store the finished output byte and advance x.
{
.
mmi
; (p16) LDKEY tx[0]=[key_x[1]] // tx=key[xx]
(
p17
)
LDKEY
ty
[
0
]=[
key_y
[
1
]]
//
ty
=
key
[
yy
]
(
p18
)
dep
rnd
[
1
]=
rnd
[
1
],
ksch
,
OFF
,
8
}
//
&
key
[(
tx
+
ty
)
&255
]
{
.
mmi
; (p19) st1 [out]=dat[3],1 // *(out++)=dat
(
p16
)
add
xx
=
1
,
xx
//
x
++
(
p0
)
nop
.
i
0
}
;;
// Second group: fetch the keystream element and the next input byte,
// form the next key[xx] address and accumulate y.
{
.
mmi
; (p18) LDKEY rnd[1]=[rnd[1]] // rnd=key[(tx+ty)&255]
(
p16
)
ld1
dat
[
0
]=[
inp
],
1
//
dat
=*(
inp
++)
(
p16
)
dep
key_x
[
0
]=
xx
,
ksch
,
OFF
,
8
}
//
&
key
[
xx
&255
]
{
.
mmi
; (p0) nop.m 0
(
p16
)
add
yy
=
yy
,
tx
[
0
]
//
y
+=
tx
(
p0
)
nop
.
i
0
}
;;
// Third group: swap key[xx] and key[yy], form the key[yy] address,
// compute tx+ty into rnd, XOR the data byte with the keystream, and loop.
{
.
mmi
; (p17) STKEY [key_y[1]]=tx[1] // key[yy]=tx
(
p17
)
STKEY
[
key_x
[
2
]]=
ty
[
0
]
//
key
[
xx
]=
ty
(
p16
)
dep
key_y
[
0
]=
yy
,
ksch
,
OFF
,
8
}
//
&
key
[
yy
&255
]
{
.
mmb
; (p17) add rnd[0]=tx[1],ty[0] // rnd=tx+ty
(
p18
)
xor
dat
[
2
]=
dat
[
2
],
rnd
[
1
]
//
dat
^=
rnd
br.ctop.sptk
.
Ltop
}
;;
// Loop exit: write back key->y and key->x, store the one output byte the
// loop did not store itself, restore pr and ar.lc, and return.
.
Lexit
:
// key still points at key->y here; the post-decrement moves it to key->x.
{
.
mib
; STKEY [key]=yy,-SZ // save key->y
mov
pr
=
prsave
,
0x1ffff
nop.b
0
}
// xx was incremented ahead of use, so step it back before saving it.
{
.
mib
; st1 [out]=dat[3],1 // compensate for truncated
//
epilogue
counter
add
xx
=-
1
,
xx
nop.b
0
}
;;
// Restore the caller's ar.lc from r3 and return.
{
.
mib
; STKEY [key]=xx // save key->x
mov
ar
.
lc
=
r3
br.ret.sptk.many
b0
}
;;
.
endp
RC4
#
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录