From 8e18cde30b06d2e7411bf38091c4e30602f85cdd Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Tue, 15 Mar 2011 16:26:51 +0000 Subject: [PATCH] target-arm: Fix VLD of single element to all lanes Fix several bugs in VLD of single element to all lanes: The "single element to all lanes" form of VLD1 differs from those for VLD2, VLD3 and VLD4 in that bit 5 indicates whether the loaded element should be written to one or two Dregs (rather than being a register stride). Handle this by special-casing VLD1 rather than trying to have one loop which deals with both VLD1 and 2/3/4. Handle VLD4.32 with 16 byte alignment specified, rather than UNDEFfing. UNDEF for the invalid size and alignment combinations. Signed-off-by: Peter Maydell Signed-off-by: Aurelien Jarno --- target-arm/translate.c | 84 +++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index f69912f20c..1dfd4823f5 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -2648,6 +2648,28 @@ static void gen_neon_dup_high16(TCGv var) tcg_temp_free_i32(tmp); } +static TCGv gen_load_and_replicate(DisasContext *s, TCGv addr, int size) +{ + /* Load a single Neon element and replicate into a 32 bit TCG reg */ + TCGv tmp; + switch (size) { + case 0: + tmp = gen_ld8u(addr, IS_USER(s)); + gen_neon_dup_u8(tmp, 0); + break; + case 1: + tmp = gen_ld16u(addr, IS_USER(s)); + gen_neon_dup_low16(tmp); + break; + case 2: + tmp = gen_ld32(addr, IS_USER(s)); + break; + default: /* Avoid compiler warnings. */ + abort(); + } + return tmp; +} + /* Disassemble a VFP instruction. Returns nonzero if an error occured (ie. an undefined instruction). */ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn) @@ -3890,36 +3912,48 @@ static int disas_neon_ls_insn(CPUState * env, DisasContext *s, uint32_t insn) size = (insn >> 10) & 3; if (size == 3) { /* Load single element to all lanes. */ - if (!load) + int a = (insn >> 4) & 1; + if (!load) { return 1; + } size = (insn >> 6) & 3; nregs = ((insn >> 8) & 3) + 1; - stride = (insn & (1 << 5)) ? 2 : 1; - load_reg_var(s, addr, rn); - for (reg = 0; reg < nregs; reg++) { - switch (size) { - case 0: - tmp = gen_ld8u(addr, IS_USER(s)); - gen_neon_dup_u8(tmp, 0); - break; - case 1: - tmp = gen_ld16u(addr, IS_USER(s)); - gen_neon_dup_low16(tmp); - break; - case 2: - tmp = gen_ld32(addr, IS_USER(s)); - break; - case 3: + + if (size == 3) { + if (nregs != 4 || a == 0) { return 1; - default: /* Avoid compiler warnings. */ - abort(); } - tcg_gen_addi_i32(addr, addr, 1 << size); - tmp2 = tcg_temp_new_i32(); - tcg_gen_mov_i32(tmp2, tmp); - neon_store_reg(rd, 0, tmp2); - neon_store_reg(rd, 1, tmp); - rd += stride; + /* For VLD4 size==3 a == 1 means 32 bits at 16 byte alignment */ + size = 2; + } + if (nregs == 1 && a == 1 && size == 0) { + return 1; + } + if (nregs == 3 && a == 1) { + return 1; + } + load_reg_var(s, addr, rn); + if (nregs == 1) { + /* VLD1 to all lanes: bit 5 indicates how many Dregs to write */ + tmp = gen_load_and_replicate(s, addr, size); + tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd, 0)); + tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd, 1)); + if (insn & (1 << 5)) { + tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd + 1, 0)); + tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd + 1, 1)); + } + tcg_temp_free_i32(tmp); + } else { + /* VLD2/3/4 to all lanes: bit 5 indicates register stride */ + stride = (insn & (1 << 5)) ? 2 : 1; + for (reg = 0; reg < nregs; reg++) { + tmp = gen_load_and_replicate(s, addr, size); + tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd, 0)); + tcg_gen_st_i32(tmp, cpu_env, neon_reg_offset(rd, 1)); + tcg_temp_free_i32(tmp); + tcg_gen_addi_i32(addr, addr, 1 << size); + rd += stride; + } } stride = (1 << size) * nregs; } else { -- GitLab