Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
openanolis
dragonwell8_hotspot
提交
1b55810b
D
dragonwell8_hotspot
项目概览
openanolis
/
dragonwell8_hotspot
通知
2
Star
2
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
dragonwell8_hotspot
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
1b55810b
编写于
9月 22, 2016
作者:
M
mdoerr
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
8164920: ppc: enhancement of CRC32 intrinsic
Reviewed-by: goetz, mdoerr Contributed-by:
N
Hiroshi H Horii
<
horii@jp.ibm.com
>
上级
d9bd1ec1
变更
9
展开全部
显示空白变更内容
内联
并排
Showing
9 changed file
with
954 addition
and
21 deletion
+954
-21
src/cpu/ppc/vm/assembler_ppc.hpp
src/cpu/ppc/vm/assembler_ppc.hpp
+8
-0
src/cpu/ppc/vm/assembler_ppc.inline.hpp
src/cpu/ppc/vm/assembler_ppc.inline.hpp
+4
-0
src/cpu/ppc/vm/macroAssembler_ppc.cpp
src/cpu/ppc/vm/macroAssembler_ppc.cpp
+559
-0
src/cpu/ppc/vm/macroAssembler_ppc.hpp
src/cpu/ppc/vm/macroAssembler_ppc.hpp
+7
-0
src/cpu/ppc/vm/stubGenerator_ppc.cpp
src/cpu/ppc/vm/stubGenerator_ppc.cpp
+51
-19
src/cpu/ppc/vm/stubRoutines_ppc_64.cpp
src/cpu/ppc/vm/stubRoutines_ppc_64.cpp
+309
-0
src/cpu/ppc/vm/stubRoutines_ppc_64.hpp
src/cpu/ppc/vm/stubRoutines_ppc_64.hpp
+8
-0
src/cpu/ppc/vm/vm_version_ppc.cpp
src/cpu/ppc/vm/vm_version_ppc.cpp
+5
-2
src/cpu/ppc/vm/vm_version_ppc.hpp
src/cpu/ppc/vm/vm_version_ppc.hpp
+3
-0
未找到文件。
src/cpu/ppc/vm/assembler_ppc.hpp
浏览文件 @
1b55810b
...
...
@@ -468,6 +468,10 @@ class Assembler : public AbstractAssembler {
LVSL_OPCODE
=
(
31u
<<
OPCODE_SHIFT
|
6u
<<
1
),
LVSR_OPCODE
=
(
31u
<<
OPCODE_SHIFT
|
38u
<<
1
),
// Vector-Scalar (VSX) instruction support.
MTVSRD_OPCODE
=
(
31u
<<
OPCODE_SHIFT
|
179u
<<
1
),
MFVSRD_OPCODE
=
(
31u
<<
OPCODE_SHIFT
|
51u
<<
1
),
// Vector Permute and Formatting
VPKPX_OPCODE
=
(
4u
<<
OPCODE_SHIFT
|
782u
),
VPKSHSS_OPCODE
=
(
4u
<<
OPCODE_SHIFT
|
398u
),
...
...
@@ -1938,6 +1942,10 @@ class Assembler : public AbstractAssembler {
inline
void
mtvscr
(
VectorRegister
b
);
inline
void
mfvscr
(
VectorRegister
d
);
// Vector-Scalar (VSX) instructions.
inline
void
mtvrd
(
VectorRegister
d
,
Register
a
);
inline
void
mfvrd
(
Register
a
,
VectorRegister
d
);
// AES (introduced with Power 8)
inline
void
vcipher
(
VectorRegister
d
,
VectorRegister
a
,
VectorRegister
b
);
inline
void
vcipherlast
(
VectorRegister
d
,
VectorRegister
a
,
VectorRegister
b
);
...
...
src/cpu/ppc/vm/assembler_ppc.inline.hpp
浏览文件 @
1b55810b
...
...
@@ -623,6 +623,10 @@ inline void Assembler::stvxl( VectorRegister d, Register s1, Register s2) { emit
// Emits one lvsl instruction word: d is the target vector register, s1 is
// encoded through ra0mem() and s2 through rb().
inline void Assembler::lvsl(VectorRegister d, Register s1, Register s2) {
  emit_int32(LVSL_OPCODE
             | vrt(d)
             | ra0mem(s1)
             | rb(s2));
}
// Emits one lvsr instruction word: d is the target vector register, s1 is
// encoded through ra0mem() and s2 through rb().
inline void Assembler::lvsr(VectorRegister d, Register s1, Register s2) {
  emit_int32(LVSR_OPCODE
             | vrt(d)
             | ra0mem(s1)
             | rb(s2));
}
// Vector-Scalar (VSX) instructions.
// Emits mtvsrd with general purpose register a as source and d as target.
// The lowest bit of the instruction word is set (1u) so that d is treated
// as a Vector (VMX/Altivec) register.
inline void Assembler::mtvrd(VectorRegister d, Register a) {
  emit_int32(MTVSRD_OPCODE
             | vrt(d)
             | ra(a)
             | 1u);
}
// Emits mfvsrd with d as source and general purpose register a as target.
// The lowest bit of the instruction word is set (1u) so that d is treated
// as a Vector (VMX/Altivec) register.
inline void Assembler::mfvrd(Register a, VectorRegister d) {
  emit_int32(MFVSRD_OPCODE
             | vrt(d)
             | ra(a)
             | 1u);
}
// Emits one vpkpx instruction word with target d and source vector registers a and b.
inline void Assembler::vpkpx(VectorRegister d, VectorRegister a, VectorRegister b) {
  emit_int32(VPKPX_OPCODE
             | vrt(d)
             | vra(a)
             | vrb(b));
}
// Emits one vpkshss instruction word with target d and source vector registers a and b.
inline void Assembler::vpkshss(VectorRegister d, VectorRegister a, VectorRegister b) {
  emit_int32(VPKSHSS_OPCODE
             | vrt(d)
             | vra(a)
             | vrb(b));
}
// Emits one vpkswss instruction word with target d and source vector registers a and b.
inline void Assembler::vpkswss(VectorRegister d, VectorRegister a, VectorRegister b) {
  emit_int32(VPKSWSS_OPCODE
             | vrt(d)
             | vra(a)
             | vrb(b));
}
...
...
src/cpu/ppc/vm/macroAssembler_ppc.cpp
浏览文件 @
1b55810b
...
...
@@ -3423,6 +3423,565 @@ void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len
BLOCK_COMMENT
(
"} kernel_crc32_1byte"
);
}
/**
 * Emits the top-level CRC32 kernel that dispatches between the table-driven
 * word kernel (short inputs) and the vpmsum-based kernel (long inputs).
 *
 * For len < 384 bytes the work is delegated to kernel_crc32_1word().
 * Otherwise the buffer is split into three parts:
 *   - a head of 0..127 bytes up to the first 128-byte aligned address,
 *     processed with update_byteLoop_crc32(),
 *   - an aligned middle part whose length is a multiple of 128 bytes,
 *     processed with kernel_crc32_1word_aligned(),
 *   - a tail of 0..127 bytes, processed with update_byteLoop_crc32().
 * On the long path the crc is complemented before and after the computation.
 *
 * @param crc              register containing existing CRC (32-bit), receives the result
 * @param buf              register pointing to input byte buffer (byte*)
 * @param len              register containing number of bytes (upper 32 bits are cleared here)
 * @param table            register pointing to CRC table
 * @param constants        register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants  register pointing to table for barrett reduction
 * @param t0               volatile register (holds the head length, later the tail length)
 * @param t1               volatile register
 * @param t2               volatile register
 * @param t3               volatile register
 * @param t4               volatile register (only used as work register on the short path)
 */
void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
                                                Register constants, Register barretConstants,
                                                Register t0, Register t1, Register t2, Register t3, Register t4) {
  assert_different_registers(crc, buf, len, table);

  Label L_alignedHead, L_tail, L_start, L_end;

  // Head and tail lengths are never live at the same time, so they share t0.
  // t0 must survive kernel_crc32_1word_aligned(), which is only given t1..t3.
  Register prealign  = t0;
  Register postalign = t0;

  BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");

  // 1. Use kernel_crc32_1word for inputs shorter than 384 bytes.
  clrldi(len, len, 32);
  cmpdi(CCR0, len, 384);
  bge(CCR0, L_start);

    // On the short path the constant table registers are not needed and
    // are handed to kernel_crc32_1word as additional work registers.
    Register tc0 = t4;
    Register tc1 = constants;
    Register tc2 = barretConstants;
    kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
    b(L_end);

  BIND(L_start);

    // 2. ~c
    nand(crc, crc, crc);

    // 3. Calculate from the start of the buffer to the first 128-byte aligned address.
    clrldi_(prealign, buf, 57);            // prealign = buf & 127, sets CCR0
    beq(CCR0, L_alignedHead);              // already aligned, no head to process

    subfic(prealign, prealign, 128);       // number of bytes up to the next 128-byte boundary

    subf(len, prealign, len);
    update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);

    // 4. Calculate from the first 128-byte aligned address to the last 128-byte aligned address.
    BIND(L_alignedHead);

    clrldi(postalign, len, 57);            // postalign = len & 127
    subf(len, postalign, len);             // len is now a multiple of 128

    // The original len was >= 384 and at most 127 bytes were consumed by the head,
    // so at least 256 bytes remain for the aligned kernel.
    kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);

    // 5. Calculate the remaining 0..127 bytes.
    cmpdi(CCR0, postalign, 0);
    beq(CCR0, L_tail);

    update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);

    BIND(L_tail);

    // 6. ~c
    nand(crc, crc, crc);

  BIND(L_end);

  BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
}
/**
 * Emits the vpmsumd/vpmsumw based CRC32 kernel for the aligned part of a buffer.
 *
 * The input is consumed in blocks of 128 bytes using eight parallel
 * accumulators (VR0-VR7); the accumulated 1024 bits are then folded to
 * 64 bits and finally reduced to the 32-bit CRC by a Barrett reduction.
 *
 * Register usage, as visible in the emitted code:
 *   - R22-R31 and VR20-VR28 are used as work registers. They are stored below
 *     R1_SP without pushing a frame (9 * 16 + 10 * 8 = 224 bytes) and reloaded
 *     at the end.
 *   - VR0-VR19, CTR and CCR0 are overwritten without being saved.
 *   - buf is advanced past the processed data, len is overwritten with
 *     len & 127 and barretConstants is advanced by 16.
 *
 * NOTE(review): the main loop is entered with CTR = (bytes / 128) - 1 and left
 * with bdz, so at least 256 bytes seem to be required; the visible caller
 * guarantees this. The data is loaded with lvx, so buf is presumably expected
 * to be at least 16-byte aligned - confirm against callers.
 *
 * @param crc              register containing existing CRC (32-bit), receives the result
 * @param buf              register pointing to input byte buffer (byte*)
 * @param len              register containing number of bytes
 * @param constants        register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants  register pointing to table for barrett reduction
 * @param t0               volatile register
 * @param t1               volatile register
 * @param t2               volatile register
 */
void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
                                                Register constants, Register barretConstants,
                                                Register t0, Register t1, Register t2) {
  Label L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR;
  Label L_1, L_2, L_3, L_4;

  Register  rLoaded      = t0;
  Register  rTmp1        = t1;
  Register  rTmp2        = t2;
  Register  off16        = R22;
  Register  off32        = R23;
  Register  off48        = R24;
  Register  off64        = R25;
  Register  off80        = R26;
  Register  off96        = R27;
  Register  off112       = R28;
  Register  rIdx         = R29;
  Register  rMax         = R30;
  Register  constantsPos = R31;

  VectorRegister mask_32bit = VR24;
  VectorRegister mask_64bit = VR25;
  VectorRegister zeroes     = VR26;
  VectorRegister const1     = VR27;
  VectorRegister const2     = VR28;

  // Save non-volatile vector registers (frameless).
  // stvx takes its displacement from a register, so 'offset' mirrors the
  // compile-time counter 'offsetInt' that is used for the std stores below.
  Register offset = t1;
  int offsetInt = 0;
  offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
  // Save the general purpose work registers right below the vector registers.
  offsetInt -= 8; std(R22, offsetInt, R1_SP);
  offsetInt -= 8; std(R23, offsetInt, R1_SP);
  offsetInt -= 8; std(R24, offsetInt, R1_SP);
  offsetInt -= 8; std(R25, offsetInt, R1_SP);
  offsetInt -= 8; std(R26, offsetInt, R1_SP);
  offsetInt -= 8; std(R27, offsetInt, R1_SP);
  offsetInt -= 8; std(R28, offsetInt, R1_SP);
  offsetInt -= 8; std(R29, offsetInt, R1_SP);
  offsetInt -= 8; std(R30, offsetInt, R1_SP);
  offsetInt -= 8; std(R31, offsetInt, R1_SP);

  // Set constants
  li(off16, 16);
  li(off32, 32);
  li(off48, 48);
  li(off64, 64);
  li(off80, 80);
  li(off96, 96);
  li(off112, 112);

  clrldi(crc, crc, 32);

  vxor(zeroes, zeroes, zeroes);
  vspltisw(VR0, -1);

  // The vsldoi shift amount is a 4-bit byte count (0..15); use 8, not -8.
  vsldoi(mask_32bit, zeroes, VR0, 4);
  vsldoi(mask_64bit, zeroes, VR0, 8);

  // Get the initial value into VR8
  vxor(VR8, VR8, VR8);
  mtvrd(VR8, crc);
  vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits

  li(rLoaded, 0);

  rldicr(rIdx, len, 0, 56);    // rIdx = len rounded down to a multiple of 128

  {
    BIND(L_1);
    // Checksum in blocks of MAX_SIZE (32768)
    lis(rMax, 0);
    ori(rMax, rMax, 32768);
    mr(rTmp2, rMax);
    cmpd(CCR0, rIdx, rMax);
    bgt(CCR0, L_2);
    mr(rMax, rIdx);

    BIND(L_2);
    subf(rIdx, rMax, rIdx);

    // our main loop does 128 bytes at a time
    srdi(rMax, rMax, 7);

    /*
     * Work out the offset into the constants table to start at. Each
     * constant is 16 bytes, and it is used against 128 bytes of input
     * data - 128 / 16 = 8
     */
    sldi(rTmp1, rMax, 4);
    srdi(rTmp2, rTmp2, 3);
    subf(rTmp1, rTmp1, rTmp2);

    // We reduce our final 128 bytes in a separate step
    addi(rMax, rMax, -1);
    mtctr(rMax);

    // Find the start of our constants
    add(constantsPos, constants, rTmp1);

    // Zero VR0-VR7 which will contain our checksums
    vxor(VR0, VR0, VR0);
    vxor(VR1, VR1, VR1);
    vxor(VR2, VR2, VR2);
    vxor(VR3, VR3, VR3);
    vxor(VR4, VR4, VR4);
    vxor(VR5, VR5, VR5);
    vxor(VR6, VR6, VR6);
    vxor(VR7, VR7, VR7);

    lvx(const1, constantsPos);

    /*
     * If we are looping back to consume more data we use the values
     * already in VR16-VR23.
     */
    cmpdi(CCR0, rLoaded, 1);
    beq(CCR0, L_3);
    {
      // First warm up pass
      lvx(VR16, buf);
      lvx(VR17, off16, buf);
      lvx(VR18, off32, buf);
      lvx(VR19, off48, buf);
      lvx(VR20, off64, buf);
      lvx(VR21, off80, buf);
      lvx(VR22, off96, buf);
      lvx(VR23, off112, buf);
      addi(buf, buf, 8 * 16);

      // xor in initial value
      vxor(VR16, VR16, VR8);
    }

    BIND(L_3);
    bdz(L_first_warm_up_done);

    addi(constantsPos, constantsPos, 16);
    lvx(const2, constantsPos);

    // Second warm up pass
    vpmsumd(VR8, VR16, const1);  lvx(VR16, buf);
    vpmsumd(VR9, VR17, const1);  lvx(VR17, off16, buf);
    vpmsumd(VR10, VR18, const1); lvx(VR18, off32, buf);
    vpmsumd(VR11, VR19, const1); lvx(VR19, off48, buf);
    vpmsumd(VR12, VR20, const1); lvx(VR20, off64, buf);
    vpmsumd(VR13, VR21, const1); lvx(VR21, off80, buf);
    vpmsumd(VR14, VR22, const1); lvx(VR22, off96, buf);
    vpmsumd(VR15, VR23, const1); lvx(VR23, off112, buf);

    addi(buf, buf, 8 * 16);
    bdz(L_first_cool_down);

    /*
     * main loop. We modulo schedule it such that it takes three iterations
     * to complete - first iteration load, second iteration vpmsum, third
     * iteration xor.
     */
    {
      BIND(L_4);
      lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);

      vxor(VR0, VR0, VR8);  vpmsumd(VR8, VR16, const2);  lvx(VR16, buf);
      vxor(VR1, VR1, VR9);  vpmsumd(VR9, VR17, const2);  lvx(VR17, off16, buf);
      vxor(VR2, VR2, VR10); vpmsumd(VR10, VR18, const2); lvx(VR18, off32, buf);
      vxor(VR3, VR3, VR11); vpmsumd(VR11, VR19, const2); lvx(VR19, off48, buf);

      lvx(const2, constantsPos);

      vxor(VR4, VR4, VR12); vpmsumd(VR12, VR20, const1); lvx(VR20, off64, buf);
      vxor(VR5, VR5, VR13); vpmsumd(VR13, VR21, const1); lvx(VR21, off80, buf);
      vxor(VR6, VR6, VR14); vpmsumd(VR14, VR22, const1); lvx(VR22, off96, buf);
      vxor(VR7, VR7, VR15); vpmsumd(VR15, VR23, const1); lvx(VR23, off112, buf);

      addi(buf, buf, 8 * 16);
      bdnz(L_4);
    }

    BIND(L_first_cool_down);

    // First cool down pass
    lvx(const1, constantsPos);
    addi(constantsPos, constantsPos, 16);

    vxor(VR0, VR0, VR8);  vpmsumd(VR8, VR16, const1);
    vxor(VR1, VR1, VR9);  vpmsumd(VR9, VR17, const1);
    vxor(VR2, VR2, VR10); vpmsumd(VR10, VR18, const1);
    vxor(VR3, VR3, VR11); vpmsumd(VR11, VR19, const1);
    vxor(VR4, VR4, VR12); vpmsumd(VR12, VR20, const1);
    vxor(VR5, VR5, VR13); vpmsumd(VR13, VR21, const1);
    vxor(VR6, VR6, VR14); vpmsumd(VR14, VR22, const1);
    vxor(VR7, VR7, VR15); vpmsumd(VR15, VR23, const1);

    BIND(L_second_cool_down);
    // Second cool down pass
    vxor(VR0, VR0, VR8);
    vxor(VR1, VR1, VR9);
    vxor(VR2, VR2, VR10);
    vxor(VR3, VR3, VR11);
    vxor(VR4, VR4, VR12);
    vxor(VR5, VR5, VR13);
    vxor(VR6, VR6, VR14);
    vxor(VR7, VR7, VR15);

    /*
     * vpmsumd produces a 96 bit result in the least significant bits
     * of the register. Since we are bit reflected we have to shift it
     * left 32 bits so it occupies the least significant bits in the
     * bit reflected domain.
     */
    vsldoi(VR0, VR0, zeroes, 4);
    vsldoi(VR1, VR1, zeroes, 4);
    vsldoi(VR2, VR2, zeroes, 4);
    vsldoi(VR3, VR3, zeroes, 4);
    vsldoi(VR4, VR4, zeroes, 4);
    vsldoi(VR5, VR5, zeroes, 4);
    vsldoi(VR6, VR6, zeroes, 4);
    vsldoi(VR7, VR7, zeroes, 4);

    // xor with last 1024 bits
    lvx(VR8, buf);
    lvx(VR9, off16, buf);
    lvx(VR10, off32, buf);
    lvx(VR11, off48, buf);
    lvx(VR12, off64, buf);
    lvx(VR13, off80, buf);
    lvx(VR14, off96, buf);
    lvx(VR15, off112, buf);
    addi(buf, buf, 8 * 16);

    vxor(VR16, VR0, VR8);
    vxor(VR17, VR1, VR9);
    vxor(VR18, VR2, VR10);
    vxor(VR19, VR3, VR11);
    vxor(VR20, VR4, VR12);
    vxor(VR21, VR5, VR13);
    vxor(VR22, VR6, VR14);
    vxor(VR23, VR7, VR15);

    li(rLoaded, 1);
    cmpdi(CCR0, rIdx, 0);
    addi(rIdx, rIdx, 128);
    bne(CCR0, L_1);
  }

  // Work out how many bytes we have left
  andi_(len, len, 127);

  // Calculate where in the constant table we need to start
  subfic(rTmp1, len, 128);
  add(constantsPos, constantsPos, rTmp1);

  // How many 16 byte chunks are in the tail
  srdi(rIdx, len, 4);
  mtctr(rIdx);

  /*
   * Reduce the previously calculated 1024 bits to 64 bits, shifting
   * 32 bits to include the trailing 32 bits of zeros
   */
  lvx(VR0, constantsPos);
  lvx(VR1, off16, constantsPos);
  lvx(VR2, off32, constantsPos);
  lvx(VR3, off48, constantsPos);
  lvx(VR4, off64, constantsPos);
  lvx(VR5, off80, constantsPos);
  lvx(VR6, off96, constantsPos);
  lvx(VR7, off112, constantsPos);
  addi(constantsPos, constantsPos, 8 * 16);

  vpmsumw(VR0, VR16, VR0);
  vpmsumw(VR1, VR17, VR1);
  vpmsumw(VR2, VR18, VR2);
  vpmsumw(VR3, VR19, VR3);
  vpmsumw(VR4, VR20, VR4);
  vpmsumw(VR5, VR21, VR5);
  vpmsumw(VR6, VR22, VR6);
  vpmsumw(VR7, VR23, VR7);

  // Now reduce the tail (0 - 112 bytes)
  cmpdi(CCR0, rIdx, 0);
  beq(CCR0, L_XOR);

  // CTR holds the number of 16 byte chunks in the tail (see mtctr above).
  // After each chunk, bdz decrements CTR and leaves once all chunks are done.
  // Re-testing CCR0 here would never branch, because nothing below updates it.
  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off16, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off32, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off48, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off64, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off80, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  bdz(L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off96, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);

  // Now xor all the parallel chunks together
  BIND(L_XOR);
  vxor(VR0, VR0, VR1);
  vxor(VR2, VR2, VR3);
  vxor(VR4, VR4, VR5);
  vxor(VR6, VR6, VR7);

  vxor(VR0, VR0, VR2);
  vxor(VR4, VR4, VR6);

  vxor(VR0, VR0, VR4);
  b(L_barrett_reduction);

  // Reached from the first warm up pass when only one 128 byte block precedes
  // the final block: do the multiplications and join the second cool down pass.
  BIND(L_first_warm_up_done);
  lvx(const1, constantsPos);
  addi(constantsPos, constantsPos, 16);
  vpmsumd(VR8,  VR16, const1);
  vpmsumd(VR9,  VR17, const1);
  vpmsumd(VR10, VR18, const1);
  vpmsumd(VR11, VR19, const1);
  vpmsumd(VR12, VR20, const1);
  vpmsumd(VR13, VR21, const1);
  vpmsumd(VR14, VR22, const1);
  vpmsumd(VR15, VR23, const1);
  b(L_second_cool_down);

  BIND(L_barrett_reduction);

  lvx(const1, barretConstants);
  addi(barretConstants, barretConstants, 16);
  lvx(const2, barretConstants);

  vsldoi(VR1, VR0, VR0, 8);
  vxor(VR0, VR0, VR1);    // xor two 64 bit results together

  // shift left one bit
  vspltisb(VR1, 1);
  vsl(VR0, VR0, VR1);

  vand(VR0, VR0, mask_64bit);

  /*
   * The reflected version of Barrett reduction. Instead of bit
   * reflecting our data (which is expensive to do), we bit reflect our
   * constants and our algorithm, which means the intermediate data in
   * our vector registers goes from 0-63 instead of 63-0. We can reflect
   * the algorithm because we don't carry in mod 2 arithmetic.
   */
  vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
  vpmsumd(VR1, VR1, const1);   // ma
  vand(VR1, VR1, mask_32bit);  // bottom 32 bits of ma
  vpmsumd(VR1, VR1, const2);   // qn
  vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)

  /*
   * Since we are bit reflected, the result (ie the low 32 bits) is in
   * the high 32 bits. We just need to shift it left 4 bytes
   * V0 [ 0 1 X 3 ]
   * V0 [ 0 X 2 3 ]
   */
  vsldoi(VR0, VR0, zeroes, 4);    // shift result into top 64 bits of VR0

  // Get it into the crc register
  mfvrd(crc, VR0);

  BIND(L_end);

  offsetInt = 0;
  // Restore non-volatile Vector registers (frameless).
  offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
  // Restore the general purpose work registers.
  offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
}
void
MacroAssembler
::
kernel_crc32_singleByte
(
Register
crc
,
Register
buf
,
Register
len
,
Register
table
,
Register
tmp
)
{
assert_different_registers
(
crc
,
buf
,
/* len, not used!! */
table
,
tmp
);
...
...
src/cpu/ppc/vm/macroAssembler_ppc.hpp
浏览文件 @
1b55810b
...
...
@@ -656,6 +656,13 @@ class MacroAssembler: public Assembler {
Register
tc0
,
Register
tc1
,
Register
tc2
,
Register
tc3
);
void
kernel_crc32_1byte
(
Register
crc
,
Register
buf
,
Register
len
,
Register
table
,
Register
t0
,
Register
t1
,
Register
t2
,
Register
t3
);
void
kernel_crc32_1word_vpmsumd
(
Register
crc
,
Register
buf
,
Register
len
,
Register
table
,
Register
constants
,
Register
barretConstants
,
Register
t0
,
Register
t1
,
Register
t2
,
Register
t3
,
Register
t4
);
void
kernel_crc32_1word_aligned
(
Register
crc
,
Register
buf
,
Register
len
,
Register
constants
,
Register
barretConstants
,
Register
t0
,
Register
t1
,
Register
t2
);
void
kernel_crc32_singleByte
(
Register
crc
,
Register
buf
,
Register
len
,
Register
table
,
Register
tmp
);
//
...
...
src/cpu/ppc/vm/stubGenerator_ppc.cpp
浏览文件 @
1b55810b
...
...
@@ -2482,9 +2482,7 @@ class StubGenerator: public StubCodeGenerator {
* R5_ARG3 - int length (of buffer)
*
* scratch:
* R6_ARG4 - crc table address
* R7_ARG5 - tmp1
* R8_ARG6 - tmp2
* R2, R6-R12
*
* Output:
* R3_RET - int crc result
...
...
@@ -2496,28 +2494,62 @@ class StubGenerator: public StubCodeGenerator {
address
start
=
__
function_entry
();
// Remember stub start address (is rtn value).
// arguments to kernel_crc32:
Register
crc
=
R3_ARG1
;
// Current checksum, preset by caller or result from previous call.
Register
data
=
R4_ARG2
;
// source byte array
Register
dataLen
=
R5_ARG3
;
// #bytes to process
Register
table
=
R6_ARG4
;
// crc table address
const
Register
crc
=
R3_ARG1
;
// Current checksum, preset by caller or result from previous call.
const
Register
data
=
R4_ARG2
;
// source byte array
const
Register
dataLen
=
R5_ARG3
;
// #bytes to process
Register
t0
=
R9
;
// work reg for kernel* emitters
Register
t1
=
R10
;
// work reg for kernel* emitters
Register
t2
=
R11
;
// work reg for kernel* emitters
Register
t3
=
R12
;
// work reg for kernel* emitters
const
Register
table
=
R6
;
// crc table address
#ifdef VM_LITTLE_ENDIAN
if
(
VM_Version
::
has_vpmsumb
())
{
const
Register
constants
=
R2
;
// constants address
const
Register
bconstants
=
R8
;
// barret table address
const
Register
t0
=
R9
;
const
Register
t1
=
R10
;
const
Register
t2
=
R11
;
const
Register
t3
=
R12
;
const
Register
t4
=
R7
;
BLOCK_COMMENT
(
"Stub body {"
);
assert_different_registers
(
crc
,
data
,
dataLen
,
table
);
StubRoutines
::
ppc64
::
generate_load_crc_table_addr
(
_masm
,
table
);
StubRoutines
::
ppc64
::
generate_load_crc_constants_addr
(
_masm
,
constants
);
StubRoutines
::
ppc64
::
generate_load_crc_barret_constants_addr
(
_masm
,
bconstants
);
__
kernel_crc32_1word_vpmsumd
(
crc
,
data
,
dataLen
,
table
,
constants
,
bconstants
,
t0
,
t1
,
t2
,
t3
,
t4
);
BLOCK_COMMENT
(
"return"
);
__
mr_if_needed
(
R3_RET
,
crc
);
// Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__
blr
();
BLOCK_COMMENT
(
"} Stub body"
);
}
else
#endif
{
const
Register
t0
=
R2
;
const
Register
t1
=
R7
;
const
Register
t2
=
R8
;
const
Register
t3
=
R9
;
const
Register
tc0
=
R10
;
const
Register
tc1
=
R11
;
const
Register
tc2
=
R12
;
BLOCK_COMMENT
(
"Stub body {"
);
assert_different_registers
(
crc
,
data
,
dataLen
,
table
);
StubRoutines
::
ppc64
::
generate_load_crc_table_addr
(
_masm
,
table
);
__
kernel_crc32_1byte
(
crc
,
data
,
dataLen
,
table
,
t0
,
t1
,
t2
,
t3
);
__
kernel_crc32_1word
(
crc
,
data
,
dataLen
,
table
,
t0
,
t1
,
t2
,
t3
,
tc0
,
tc1
,
tc2
,
table
);
BLOCK_COMMENT
(
"return"
);
__
mr_if_needed
(
R3_RET
,
crc
);
// Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__
blr
();
BLOCK_COMMENT
(
"} Stub body"
);
}
return
start
;
}
...
...
src/cpu/ppc/vm/stubRoutines_ppc_64.cpp
浏览文件 @
1b55810b
此差异已折叠。
点击以展开。
src/cpu/ppc/vm/stubRoutines_ppc_64.hpp
浏览文件 @
1b55810b
...
...
@@ -45,6 +45,8 @@ enum platform_dependent_constants {
#else
#define CRC32_TABLES 1
#endif
#define CRC32_CONSTANTS_SIZE 1084
#define CRC32_BARRET_CONSTANTS 10
class
ppc64
{
friend
class
StubGenerator
;
...
...
@@ -53,11 +55,17 @@ class ppc64 {
// CRC32 Intrinsics.
static
juint
_crc_table
[
CRC32_TABLES
][
CRC32_COLUMN_SIZE
];
static
juint
*
_constants
;
static
juint
*
_barret_constants
;
public:
// CRC32 Intrinsics.
static
void
generate_load_crc_table_addr
(
MacroAssembler
*
masm
,
Register
table
);
static
void
generate_load_crc_constants_addr
(
MacroAssembler
*
masm
,
Register
table
);
static
void
generate_load_crc_barret_constants_addr
(
MacroAssembler
*
masm
,
Register
table
);
static
juint
*
generate_crc_constants
();
static
juint
*
generate_crc_barret_constants
();
};
...
...
src/cpu/ppc/vm/vm_version_ppc.cpp
浏览文件 @
1b55810b
...
...
@@ -102,7 +102,7 @@ void VM_Version::initialize() {
// Create and print feature-string.
char
buf
[(
num_features
+
1
)
*
16
];
// Max 16 chars per feature.
jio_snprintf
(
buf
,
sizeof
(
buf
),
"ppc64%s%s%s%s%s%s%s%s%s"
,
"ppc64%s%s%s%s%s%s%s%s%s
%s
"
,
(
has_fsqrt
()
?
" fsqrt"
:
""
),
(
has_isel
()
?
" isel"
:
""
),
(
has_lxarxeh
()
?
" lxarxeh"
:
""
),
...
...
@@ -112,7 +112,8 @@ void VM_Version::initialize() {
(
has_popcntw
()
?
" popcntw"
:
""
),
(
has_fcfids
()
?
" fcfids"
:
""
),
(
has_vand
()
?
" vand"
:
""
),
(
has_vcipher
()
?
" aes"
:
""
)
(
has_vcipher
()
?
" aes"
:
""
),
(
has_vpmsumb
()
?
" vpmsumb"
:
""
)
// Make sure number of %s matches num_features!
);
_features_str
=
strdup
(
buf
);
...
...
@@ -485,6 +486,7 @@ void VM_Version::determine_features() {
a
->
fcfids
(
F3
,
F4
);
// code[8] -> fcfids
a
->
vand
(
VR0
,
VR0
,
VR0
);
// code[9] -> vand
a
->
vcipher
(
VR0
,
VR1
,
VR2
);
// code[10] -> vcipher
a
->
vpmsumb
(
VR0
,
VR1
,
VR2
);
// code[11] -> vpmsumb
a
->
blr
();
// Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
...
...
@@ -529,6 +531,7 @@ void VM_Version::determine_features() {
if
(
code
[
feature_cntr
++
])
features
|=
fcfids_m
;
if
(
code
[
feature_cntr
++
])
features
|=
vand_m
;
if
(
code
[
feature_cntr
++
])
features
|=
vcipher_m
;
if
(
code
[
feature_cntr
++
])
features
|=
vpmsumb_m
;
// Print the detection code.
if
(
PrintAssembly
)
{
...
...
src/cpu/ppc/vm/vm_version_ppc.hpp
浏览文件 @
1b55810b
...
...
@@ -43,6 +43,7 @@ protected:
vand
,
dcba
,
vcipher
,
vpmsumb
,
num_features
// last entry to count features
};
enum
Feature_Flag_Set
{
...
...
@@ -58,6 +59,7 @@ protected:
vand_m
=
(
1
<<
vand
),
dcba_m
=
(
1
<<
dcba
),
vcipher_m
=
(
1
<<
vcipher
),
vpmsumb_m
=
(
1
<<
vpmsumb
),
all_features_m
=
-
1
};
static
int
_features
;
...
...
@@ -86,6 +88,7 @@ public:
// True if the vand bit is set in the _features mask.
static bool has_vand() {
  const bool present = (_features & vand_m) != 0;
  return present;
}
// True if the dcba bit is set in the _features mask.
static bool has_dcba() {
  const bool present = (_features & dcba_m) != 0;
  return present;
}
// True if the vcipher bit is set in the _features mask.
static bool has_vcipher() {
  const bool present = (_features & vcipher_m) != 0;
  return present;
}
// True if the vpmsumb bit is set in the _features mask.
static bool has_vpmsumb() {
  const bool present = (_features & vpmsumb_m) != 0;
  return present;
}
// Returns the pointer stored in _features_str.
static const char* cpu_features() { return _features_str; }
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录