提交 e7e96fc5 编写于 作者: R Richard Henderson 提交者: Peter Maydell

target/arm: Convert PMULL.8 to gvec

We still need two different helpers, since NEON and SVE2 get the
inputs from different locations within the source vector.  However,
we can convert both to the same internal form for computation.

The sve2 helper is not used yet, but adding it with this patch
helps illustrate why the neon changes are helpful.
Tested-by: NAlex Bennée <alex.bennee@linaro.org>
Reviewed-by: NAlex Bennée <alex.bennee@linaro.org>
Signed-off-by: NRichard Henderson <richard.henderson@linaro.org>
Message-id: 20200216214232.4230-5-richard.henderson@linaro.org
Signed-off-by: NPeter Maydell <peter.maydell@linaro.org>
上级 b9ed510e
......@@ -1574,3 +1574,5 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
......@@ -342,7 +342,6 @@ DEF_HELPER_2(neon_sub_u8, i32, i32, i32)
DEF_HELPER_2(neon_sub_u16, i32, i32, i32)
DEF_HELPER_2(neon_mul_u8, i32, i32, i32)
DEF_HELPER_2(neon_mul_u16, i32, i32, i32)
DEF_HELPER_2(neon_mull_p8, i64, i32, i32)
DEF_HELPER_2(neon_tst_u8, i32, i32, i32)
DEF_HELPER_2(neon_tst_u16, i32, i32, i32)
......@@ -695,6 +694,8 @@ DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#include "helper-sve.h"
......
......@@ -1129,38 +1129,6 @@ NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN
/* Polynomial multiplication is like integer multiplication except the
partial products are XORed, not added. */
uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
{
uint64_t result = 0;
uint64_t mask;
uint64_t op2ex = op2;
op2ex = (op2ex & 0xff) |
((op2ex & 0xff00) << 8) |
((op2ex & 0xff0000) << 16) |
((op2ex & 0xff000000) << 24);
while (op1) {
mask = 0;
if (op1 & 1) {
mask |= 0xffff;
}
if (op1 & (1 << 8)) {
mask |= (0xffffU << 16);
}
if (op1 & (1 << 16)) {
mask |= (0xffffULL << 32);
}
if (op1 & (1 << 24)) {
mask |= (0xffffULL << 48);
}
result ^= op2ex & mask;
op1 = (op1 >> 1) & 0x7f7f7f7f;
op2ex <<= 1;
}
return result;
}
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
......
......@@ -10542,10 +10542,6 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
tcg_passres, tcg_passres);
break;
case 14: /* PMULL */
assert(size == 0);
gen_helper_neon_mull_p8(tcg_passres, tcg_op1, tcg_op2);
break;
default:
g_assert_not_reached();
}
......@@ -10709,11 +10705,21 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
break;
case 14: /* PMULL, PMULL2 */
if (is_u || size == 1 || size == 2) {
if (is_u) {
unallocated_encoding(s);
return;
}
if (size == 3) {
switch (size) {
case 0: /* PMULL.P8 */
if (!fp_access_check(s)) {
return;
}
/* The Q field specifies lo/hi half input for this insn. */
gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
gen_helper_neon_pmull_h);
break;
case 3: /* PMULL.P64 */
if (!dc_isar_feature(aa64_pmull, s)) {
unallocated_encoding(s);
return;
......@@ -10724,9 +10730,13 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
/* The Q field specifies lo/hi half input for this insn. */
gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
gen_helper_gvec_pmull_q);
return;
break;
default:
unallocated_encoding(s);
break;
}
goto is_widening;
return;
case 9: /* SQDMLAL, SQDMLAL2 */
case 11: /* SQDMLSL, SQDMLSL2 */
case 13: /* SQDMULL, SQDMULL2 */
......@@ -10747,7 +10757,6 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
unallocated_encoding(s);
return;
}
is_widening:
if (!fp_access_check(s)) {
return;
}
......
......@@ -5866,15 +5866,20 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
return 1;
}
/* Handle VMULL.P64 (Polynomial 64x64 to 128 bit multiply)
* outside the loop below as it only performs a single pass.
*/
if (op == 14 && size == 2) {
if (!dc_isar_feature(aa32_pmull, s)) {
return 1;
/* Handle polynomial VMULL in a single pass. */
if (op == 14) {
if (size == 0) {
/* VMULL.P8 */
tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
0, gen_helper_neon_pmull_h);
} else {
/* VMULL.P64 */
if (!dc_isar_feature(aa32_pmull, s)) {
return 1;
}
tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
0, gen_helper_gvec_pmull_q);
}
tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
0, gen_helper_gvec_pmull_q);
return 0;
}
......@@ -5952,11 +5957,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
/* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */
gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
break;
case 14: /* Polynomial VMULL */
gen_helper_neon_mull_p8(cpu_V0, tmp, tmp2);
tcg_temp_free_i32(tmp2);
tcg_temp_free_i32(tmp);
break;
default: /* 15 is RESERVED: caught earlier */
abort();
}
......
......@@ -1197,3 +1197,63 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
/*
* 8x8->16 polynomial multiply.
*
* The byte inputs are expanded to (or extracted from) half-words.
* Note that neon and sve2 get the inputs from different positions.
* This allows 4 bytes to be processed in parallel with uint64_t.
*/
static uint64_t expand_byte_to_half(uint64_t x)
{
return (x & 0x000000ff)
| ((x & 0x0000ff00) << 8)
| ((x & 0x00ff0000) << 16)
| ((x & 0xff000000) << 24);
}
static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
uint64_t result = 0;
int i;
for (i = 0; i < 8; ++i) {
uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
result ^= op2 & mask;
op1 >>= 1;
op2 <<= 1;
}
return result;
}
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
int hi = simd_data(desc);
uint64_t *d = vd, *n = vn, *m = vm;
uint64_t nn = n[hi], mm = m[hi];
d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
nn >>= 32;
mm >>= 32;
d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
clear_tail(d, 16, simd_maxsz(desc));
}
#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
int shift = simd_data(desc) * 8;
intptr_t i, opr_sz = simd_oprsz(desc);
uint64_t *d = vd, *n = vn, *m = vm;
for (i = 0; i < opr_sz / 8; ++i) {
uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
d[i] = pmull_h(nn, mm);
}
}
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册