提交 847cb7ef 编写于 作者: J Jussi Kivilinna 提交者: Herbert Xu

crypto: serpent-sse2 - change transpose_4x4 to only use integer instructions

The matrix transpose macro in serpent-sse2 uses a mix of SSE2 integer and SSE
floating-point instructions, which might cause a performance penalty on some CPUs.

This patch replaces the transpose_4x4 macro with a version that uses only SSE2
integer instructions.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
上级 4c58464b
...@@ -463,23 +463,20 @@ ...@@ -463,23 +463,20 @@
pand x0, x4; \ pand x0, x4; \
pxor x2, x4; pxor x2, x4;
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
movdqa x2, t3; \
movdqa x0, t1; \
unpcklps x3, t3; \
movdqa x0, t2; \ movdqa x0, t2; \
unpcklps x1, t1; \ punpckldq x1, x0; \
unpckhps x1, t2; \ punpckhdq x1, t2; \
movdqa t3, x1; \ movdqa x2, t1; \
unpckhps x3, x2; \ punpckhdq x3, x2; \
movdqa t1, x0; \ punpckldq x3, t1; \
movhlps t1, x1; \ movdqa x0, x1; \
movdqa t2, t1; \ punpcklqdq t1, x0; \
movlhps t3, x0; \ punpckhqdq t1, x1; \
movlhps x2, t1; \ movdqa t2, x3; \
movhlps t2, x2; \ punpcklqdq x2, t2; \
movdqa x2, x3; \ punpckhqdq x2, x3; \
movdqa t1, x2; movdqa t2, x2;
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
movdqu (0*4*4)(in), x0; \ movdqu (0*4*4)(in), x0; \
......
...@@ -585,23 +585,20 @@ ...@@ -585,23 +585,20 @@
get_key(i, 1, RK1); \ get_key(i, 1, RK1); \
SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
movdqa x2, t3; \
movdqa x0, t1; \
unpcklps x3, t3; \
movdqa x0, t2; \ movdqa x0, t2; \
unpcklps x1, t1; \ punpckldq x1, x0; \
unpckhps x1, t2; \ punpckhdq x1, t2; \
movdqa t3, x1; \ movdqa x2, t1; \
unpckhps x3, x2; \ punpckhdq x3, x2; \
movdqa t1, x0; \ punpckldq x3, t1; \
movhlps t1, x1; \ movdqa x0, x1; \
movdqa t2, t1; \ punpcklqdq t1, x0; \
movlhps t3, x0; \ punpckhqdq t1, x1; \
movlhps x2, t1; \ movdqa t2, x3; \
movhlps t2, x2; \ punpcklqdq x2, t2; \
movdqa x2, x3; \ punpckhqdq x2, x3; \
movdqa t1, x2; movdqa t2, x2;
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
movdqu (0*4*4)(in), x0; \ movdqu (0*4*4)(in), x0; \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册