diff --git a/eng/pipelines/common/templates/runtimes/run-test-job.yml b/eng/pipelines/common/templates/runtimes/run-test-job.yml index a26dfd9041dd68790bcd532cf9f6866d9f192922..fb1a3b55f9afc9ccbe66679f402630ff2d473b64 100644 --- a/eng/pipelines/common/templates/runtimes/run-test-job.yml +++ b/eng/pipelines/common/templates/runtimes/run-test-job.yml @@ -326,6 +326,7 @@ jobs: osSubgroup: ${{ parameters.osSubgroup}} runtimeFlavorDisplayName: ${{ parameters.runtimeFlavorDisplayName }} shouldContinueOnError: ${{ parameters.shouldContinueOnError }} + runtimeVariant: ${{ parameters.runtimeVariant }} ${{ if eq(variables['System.TeamProject'], 'public') }}: creator: $(Build.DefinitionName) diff --git a/src/mono/mono/mini/cpu-amd64.md b/src/mono/mono/mini/cpu-amd64.md index e581a983944f7c5bf82bbef27c332306ae272e3b..48a1b9f0fbafaebb9e94f7aade82120d81a3dd11 100644 --- a/src/mono/mono/mini/cpu-amd64.md +++ b/src/mono/mono/mini/cpu-amd64.md @@ -779,9 +779,7 @@ extract_i4: dest:i src1:x len:5 extract_i8: dest:i src1:x len:9 extract_i2: dest:i src1:x len:13 -extract_u2: dest:i src1:x len:13 extract_i1: dest:i src1:x len:13 -extract_u1: dest:i src1:x len:13 extract_r8: dest:f src1:x len:5 iconv_to_r4_raw: dest:f src1:i len:10 diff --git a/src/mono/mono/mini/cpu-x86.md b/src/mono/mono/mini/cpu-x86.md index a0f0fc14f15dd95ecab652edffefb28e155ac14a..25e44d16eea5b7a7d19121e96763606c3a145afa 100644 --- a/src/mono/mono/mini/cpu-x86.md +++ b/src/mono/mono/mini/cpu-x86.md @@ -631,9 +631,7 @@ iconv_to_x: dest:x src1:i len:4 extract_i4: dest:i src1:x len:4 extract_i2: dest:i src1:x len:10 -extract_u2: dest:i src1:x len:10 extract_i1: dest:i src1:x len:10 -extract_u1: dest:i src1:x len:10 extract_r8: dest:f src1:x len:8 insert_i2: dest:x src1:x src2:i len:5 clob:1 diff --git a/src/mono/mono/mini/intrinsics.c b/src/mono/mono/mini/intrinsics.c index d194525124dbd9b6239aadb65ef896c9cc0f41f1..948fd5dde5412844fd2ae353196d8ed2ee2e233c 100644 --- a/src/mono/mono/mini/intrinsics.c +++ b/src/mono/mono/mini/intrinsics.c @@ -140,15 +140,18 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign } #if defined(TARGET_X86) || defined(TARGET_AMD64) else if (!strcmp (cmethod->name, "Round") && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0) { - // special case: emit vroundps for MathF.Round directly instead of what llvm.round.f32 emits + // special case: emit vroundss for MathF.Round directly instead of what llvm.round.f32 emits // to align with CoreCLR behavior int xreg = alloc_xreg (cfg); EMIT_NEW_UNALU (cfg, ins, OP_FCONV_TO_R4_X, xreg, args [0]->dreg); - EMIT_NEW_UNALU (cfg, ins, OP_SSE41_ROUNDS, xreg, xreg); + int xround = alloc_xreg (cfg); + EMIT_NEW_BIALU (cfg, ins, OP_SSE41_ROUNDS, xround, xreg, xreg); ins->inst_c0 = 0x4; // vroundss xmm0, xmm0, xmm0, 0x4 (mode for rounding) ins->inst_c1 = MONO_TYPE_R4; int dreg = alloc_freg (cfg); - EMIT_NEW_UNALU (cfg, ins, OP_EXTRACT_R4, dreg, xreg); + EMIT_NEW_UNALU (cfg, ins, OP_EXTRACT_R4, dreg, xround); + ins->inst_c0 = 0; + ins->inst_c1 = MONO_TYPE_R4; return ins; } #endif diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 367324519dcd88d71fff925e61afbc8a6cdb2072..150791b62c3dde2761d5d6fca7f7f9487d18fb56 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -4076,8 +4076,8 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb) case OP_XEXTRACT_R4: case OP_XEXTRACT_R8: - case OP_XEXTRACT_I32: - case OP_XEXTRACT_I64: { + case OP_XEXTRACT_I4: + case OP_XEXTRACT_I8: { // TODO g_assert_not_reached(); break; @@ -7235,19 +7235,17 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; case OP_EXTRACT_I1: - case OP_EXTRACT_U1: amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); if (ins->inst_c0) amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8); - amd64_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE); + amd64_widen_reg (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I1, FALSE); break; case OP_EXTRACT_I2: - case OP_EXTRACT_U2: /*amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); if (ins->inst_c0) amd64_shift_reg_imm_size (code, X86_SHR, ins->dreg, 16, 4);*/ amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); - amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE, 4); + amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I2, TRUE, 4); break; case OP_EXTRACT_R8: if (ins->inst_c0) @@ -8965,6 +8963,8 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho ins->inst_c1 = MONO_TYPE_R8; int dreg = alloc_freg (cfg); EMIT_NEW_UNALU (cfg, ins, OP_EXTRACT_R8, dreg, xreg); + ins->inst_c0 = 0; + ins->inst_c1 = MONO_TYPE_R8; return ins; } } diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index 8c880d19c54447f93901a2515d6063185386356e..b429d49ca12305e9d713f0bc75b8e4fe45613fb3 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -287,6 +287,20 @@ static LLVMRealPredicate fpcond_to_llvm_cond [] = { LLVMRealUNO }; +/* See Table 3-1 ("Comparison Predicate for CMPPD and CMPPS Instructions") in + * Vol. 2A of the Intel SDM. + */ +enum { + SSE_eq_ord_nosignal = 0, + SSE_lt_ord_signal = 1, + SSE_le_ord_signal = 2, + SSE_unord_nosignal = 3, + SSE_neq_unord_nosignal = 4, + SSE_nlt_unord_signal = 5, + SSE_nle_unord_signal = 6, + SSE_ord_nosignal = 7, +}; + static MonoLLVMModule aot_module; static GHashTable *intrins_id_to_intrins; @@ -467,6 +481,12 @@ const_int1 (int v) return LLVMConstInt (LLVMInt1Type (), v ? 1 : 0, FALSE); } +static LLVMValueRef +const_int8 (int v) +{ + return LLVMConstInt (LLVMInt8Type (), v, FALSE); +} + static LLVMValueRef const_int32 (int v) { @@ -1133,12 +1153,10 @@ simd_op_to_llvm_type (int opcode) case OP_EXPAND_I4: return sse_i4_t; case OP_EXTRACT_I2: - case OP_EXTRACT_U2: case OP_EXTRACTX_U2: case OP_EXPAND_I2: return sse_i2_t; case OP_EXTRACT_I1: - case OP_EXTRACT_U1: case OP_EXPAND_I1: return sse_i1_t; case OP_EXTRACT_R4: @@ -7354,6 +7372,53 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) } #if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_WASM) + case OP_EXTRACTX_U2: + case OP_XEXTRACT_I1: + case OP_XEXTRACT_I2: + case OP_XEXTRACT_I4: + case OP_XEXTRACT_I8: + case OP_XEXTRACT_R4: + case OP_XEXTRACT_R8: + case OP_EXTRACT_I1: + case OP_EXTRACT_I2: + case OP_EXTRACT_I4: + case OP_EXTRACT_I8: + case OP_EXTRACT_R4: + case OP_EXTRACT_R8: { + MonoTypeEnum mono_elt_t = inst_c1_type (ins); + LLVMTypeRef elt_t = primitive_type_to_llvm_type (mono_elt_t); + gboolean sext = FALSE; + gboolean zext = FALSE; + switch (mono_elt_t) { + case MONO_TYPE_I1: case MONO_TYPE_I2: sext = TRUE; break; + case MONO_TYPE_U1: case MONO_TYPE_U2: zext = TRUE; break; + } + LLVMValueRef element_ix = NULL; + switch (ins->opcode) { + case OP_XEXTRACT_I1: + case OP_XEXTRACT_I2: + case OP_XEXTRACT_I4: + case OP_XEXTRACT_R4: + case OP_XEXTRACT_R8: + case OP_XEXTRACT_I8: + element_ix = rhs; + break; + default: + element_ix = const_int32 (ins->inst_c0); + } + LLVMTypeRef lhs_t = LLVMTypeOf (lhs); + int vec_width = mono_llvm_get_prim_size_bits (lhs_t); + int elem_width = mono_llvm_get_prim_size_bits (elt_t); + LLVMTypeRef ret_t = LLVMVectorType (elt_t, vec_width / elem_width); + LLVMValueRef src = LLVMBuildBitCast (builder, lhs, ret_t, "extract"); + LLVMValueRef result = LLVMBuildExtractElement (builder, src, element_ix, "extract"); + if (zext) + result = LLVMBuildZExt (builder, result, i4_t, "extract_zext"); + else if (sext) + result = LLVMBuildSExt (builder, result, i4_t, "extract_sext"); + values [ins->dreg] = result; + break; + } case OP_EXPAND_I1: case OP_EXPAND_I2: case OP_EXPAND_I4: @@ -7720,44 +7785,6 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) values [ins->dreg] = LLVMBuildSExt (builder, pcmp, retType, ""); break; } - case OP_EXTRACT_R4: - case OP_EXTRACT_R8: - case OP_EXTRACT_I8: - case OP_EXTRACT_I4: - case OP_EXTRACT_I2: - case OP_EXTRACT_U2: - case OP_EXTRACTX_U2: - case OP_EXTRACT_I1: - case OP_EXTRACT_U1: { - LLVMTypeRef t; - gboolean zext = FALSE; - - t = simd_op_to_llvm_type (ins->opcode); - - switch (ins->opcode) { - case OP_EXTRACT_R4: - case OP_EXTRACT_R8: - case OP_EXTRACT_I8: - case OP_EXTRACT_I4: - case OP_EXTRACT_I2: - case OP_EXTRACT_I1: - break; - case OP_EXTRACT_U2: - case OP_EXTRACTX_U2: - case OP_EXTRACT_U1: - zext = TRUE; - break; - default: - t = LLVMInt32Type (); - g_assert_not_reached (); - } - - lhs = LLVMBuildBitCast (builder, lhs, t, ""); - values [ins->dreg] = LLVMBuildExtractElement (builder, lhs, LLVMConstInt (LLVMInt32Type (), ins->inst_c0, FALSE), ""); - if (zext) - values [ins->dreg] = LLVMBuildZExt (builder, values [ins->dreg], LLVMInt32Type (), ""); - break; - } case OP_XINSERT_I2: { LLVMBasicBlockRef bbs [64]; LLVMValueRef switch_ins; @@ -8313,36 +8340,6 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) break; } - case OP_SSE_SHUFFLE: { - LLVMValueRef shuffle_vec = create_const_vector_4_i32 ( - ((ins->inst_c0 >> 0) & 0x3) + 0, // take two elements from lhs - ((ins->inst_c0 >> 2) & 0x3) + 0, - ((ins->inst_c0 >> 4) & 0x3) + 4, // and two from rhs - ((ins->inst_c0 >> 6) & 0x3) + 4); - values [ins->dreg] = LLVMBuildShuffleVector (builder, lhs, rhs, shuffle_vec, ""); - break; - } - - case OP_SSE2_SHUFFLE: { - LLVMValueRef right_vec; - LLVMValueRef shuffle_vec; - if (ins->inst_c1 == MONO_TYPE_R8) { - right_vec = rhs; - shuffle_vec = create_const_vector_2_i32 ( - ((ins->inst_c0 >> 0) & 0x1) + 0, - ((ins->inst_c0 >> 1) & 0x1) + 2); - } else { - right_vec = LLVMGetUndef (LLVMVectorType (LLVMInt32Type (), 4)); - shuffle_vec = create_const_vector_4_i32 ( - (ins->inst_c0 >> 0) & 0x3, - (ins->inst_c0 >> 2) & 0x3, - (ins->inst_c0 >> 4) & 0x3, - (ins->inst_c0 >> 6) & 0x3); - } - values [ins->dreg] = LLVMBuildShuffleVector (builder, lhs, right_vec, shuffle_vec, ""); - break; - } - case OP_SSE_OR: { LLVMValueRef vec_lhs_i64 = convert (ctx, lhs, sse_i8_t); LLVMValueRef vec_rhs_i64 = convert (ctx, rhs, sse_i8_t); @@ -8418,30 +8415,35 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) case OP_SSE_CMPSS: case OP_SSE2_CMPSD: { int imm = -1; + gboolean swap = FALSE; switch (ins->inst_c0) { - case CMP_EQ: imm = 0; break; - case CMP_GT: imm = 6; break; - case CMP_GE: imm = 5; break; - case CMP_LT: imm = 1; break; - case CMP_LE: imm = 2; break; - case CMP_NE: imm = 4; break; - case CMP_ORD: imm = 7; break; - case CMP_UNORD: imm = 3; break; + case CMP_EQ: imm = SSE_eq_ord_nosignal; break; + case CMP_GT: imm = SSE_lt_ord_signal; swap = TRUE; break; + case CMP_GE: imm = SSE_le_ord_signal; swap = TRUE; break; + case CMP_LT: imm = SSE_lt_ord_signal; break; + case CMP_LE: imm = SSE_le_ord_signal; break; + case CMP_GT_UN: imm = SSE_nle_unord_signal; break; + case CMP_GE_UN: imm = SSE_nlt_unord_signal; break; + case CMP_LT_UN: imm = SSE_nle_unord_signal; swap = TRUE; break; + case CMP_LE_UN: imm = SSE_nlt_unord_signal; swap = TRUE; break; + case CMP_NE: imm = SSE_neq_unord_nosignal; break; + case CMP_ORD: imm = SSE_ord_nosignal; break; + case CMP_UNORD: imm = SSE_unord_nosignal; break; default: g_assert_not_reached (); break; } LLVMValueRef cmp = LLVMConstInt (LLVMInt8Type (), imm, FALSE); LLVMValueRef args [] = { lhs, rhs, cmp }; + if (swap) { + args [0] = rhs; + args [1] = lhs; + } + IntrinsicId id = (IntrinsicId) 0; switch (ins->opcode) { - case OP_SSE_CMPSS: - values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_CMPSS, args, ""); - break; - case OP_SSE2_CMPSD: - values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_CMPSD, args, ""); - break; - default: - g_assert_not_reached (); - break; + case OP_SSE_CMPSS: id = INTRINS_SSE_CMPSS; break; + case OP_SSE2_CMPSD: id = INTRINS_SSE_CMPSD; break; + default: g_assert_not_reached (); break; } + values [ins->dreg] = call_intrins (ctx, id, args, ""); break; } case OP_SSE_COMISS: { @@ -8778,50 +8780,50 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) break; } + case OP_SSE_SHUFPS: case OP_SSE2_SHUFPD: case OP_SSE2_PSHUFD: case OP_SSE2_PSHUFHW: case OP_SSE2_PSHUFLW: { - LLVMBasicBlockRef bbs [256 + 1]; - LLVMValueRef switch_ins; - LLVMValueRef v1, v2, mask; - LLVMValueRef phi_values [256 + 1]; - int ncases; - - // FIXME: Optimize constant shuffle mask - - if (ins->opcode == OP_SSE2_SHUFPD) { - /* 3 parameter version */ - v1 = lhs; - v2 = rhs; - mask = values [ins->sreg3]; - ncases = 4; - } else { - /* 2 parameter version */ - v1 = v2 = lhs; - mask = rhs; - ncases = 256; - } - - for (int i = 0; i < ncases; ++i) - bbs [i] = gen_bb (ctx, "PSHUFHW_CASE_BB"); - cbb = gen_bb (ctx, "PSHUFHW_COND_BB"); - /* No default case */ - switch_ins = LLVMBuildSwitch (builder, mask, bbs [0], 0); - for (int i = 0; i < ncases; ++i) { - LLVMAddCase (switch_ins, LLVMConstInt (LLVMInt32Type (), i, FALSE), bbs [i]); - LLVMPositionBuilderAtEnd (builder, bbs [i]); + LLVMTypeRef ret_t = LLVMTypeOf (lhs); + LLVMValueRef l = lhs; + LLVMValueRef r = rhs; + LLVMValueRef ctl = arg3; + const char *oname = ""; + int ncases = 0; + + switch (ins->opcode) { + case OP_SSE_SHUFPS: ncases = 256; break; + case OP_SSE2_SHUFPD: ncases = 4; break; + case OP_SSE2_PSHUFD: case OP_SSE2_PSHUFHW: case OP_SSE2_PSHUFLW: ncases = 256; r = lhs; ctl = rhs; break; + } + + switch (ins->opcode) { + case OP_SSE_SHUFPS: oname = "sse_shufps"; break; + case OP_SSE2_SHUFPD: oname = "sse2_shufpd"; break; + case OP_SSE2_PSHUFD: oname = "sse2_pshufd"; break; + case OP_SSE2_PSHUFHW: oname = "sse2_pshufhw"; break; + case OP_SSE2_PSHUFLW: oname = "sse2_pshuflw"; break; + } - /* Convert the x86 shuffle mask to LLVM's */ - guint32 imask = i; - int mask_values [8]; - int mask_len = 0; + ctl = LLVMBuildAnd (builder, ctl, const_int32 (ncases - 1), ""); + ImmediateUnrollCtx ictx = immediate_unroll_begin (ctx, bb, ncases, ctl, ret_t, oname); + int mask_values [8]; + int mask_len = 0; + int i = 0; + while (immediate_unroll_next (&ictx, &i)) { switch (ins->opcode) { + case OP_SSE_SHUFPS: + mask_len = 4; + mask_values [0] = ((i >> 0) & 0x3) + 0; // take two elements from lhs + mask_values [1] = ((i >> 2) & 0x3) + 0; + mask_values [2] = ((i >> 4) & 0x3) + 4; // and two from rhs + mask_values [3] = ((i >> 6) & 0x3) + 4; + break; case OP_SSE2_SHUFPD: - /* Bit 0 selects v1[0] or v1[1], bit 1 selects v2[0] or v2[1] */ mask_len = 2; - mask_values [0] = ((imask >> 0) & 1); - mask_values [1] = ((imask >> 1) & 1) + 2; + mask_values [0] = ((i >> 0) & 0x1) + 0; + mask_values [1] = ((i >> 1) & 0x1) + 2; break; case OP_SSE2_PSHUFD: /* @@ -8830,7 +8832,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) */ mask_len = 4; for (int j = 0; j < 4; ++j) { - int windex = (imask >> (j * 2)) & 0x3; + int windex = (i >> (j * 2)) & 0x3; mask_values [j] = windex; } break; @@ -8844,7 +8846,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) for (int j = 0; j < 4; ++j) mask_values [j] = j; for (int j = 0; j < 4; ++j) { - int windex = (imask >> (j * 2)) & 0x3; + int windex = (i >> (j * 2)) & 0x3; mask_values [j + 4] = 4 + windex; } break; @@ -8854,21 +8856,18 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) for (int j = 0; j < 4; ++j) mask_values [j + 4] = j + 4; for (int j = 0; j < 4; ++j) { - int windex = (imask >> (j * 2)) & 0x3; + int windex = (i >> (j * 2)) & 0x3; mask_values [j] = windex; } break; - default: - g_assert_not_reached (); - break; } - phi_values [i] = LLVMBuildShuffleVector (builder, v1, v2, create_const_vector_i32 (mask_values, mask_len), ""); - LLVMBuildBr (builder, cbb); + LLVMValueRef mask = create_const_vector_i32 (mask_values, mask_len); + LLVMValueRef result = LLVMBuildShuffleVector (builder, l, r, mask, oname); + immediate_unroll_commit (&ictx, i, result); } - - LLVMPositionBuilderAtEnd (builder, cbb); - values [ins->dreg] = LLVMBuildPhi (builder, LLVMTypeOf (phi_values [0]), ""); - LLVMAddIncoming (values [ins->dreg], phi_values, bbs, ncases); + immediate_unroll_default (&ictx); + immediate_unroll_commit_default (&ictx, LLVMGetUndef (ret_t)); + values [ins->dreg] = immediate_unroll_end (&ictx, &cbb); break; } @@ -8880,10 +8879,11 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) break; } case OP_SSE3_MOVDDUP_MEM: { - int mask [] = { 0, 0 }; - LLVMTypeRef t = type_to_sse_type (ins->inst_c1); - LLVMValueRef value = mono_llvm_build_load (builder, convert (ctx, lhs, LLVMPointerType (t, 0)), "", FALSE); - values [ins->dreg] = LLVMBuildShuffleVector (builder, value, LLVMGetUndef (LLVMTypeOf (value)), create_const_vector_i32 (mask, 2), ""); + LLVMValueRef undef = LLVMGetUndef (v128_r8_t); + LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (r8_t, 0)); + LLVMValueRef elem = mono_llvm_build_aligned_load (builder, addr, "sse3_movddup_mem", FALSE, 1); + LLVMValueRef val = LLVMBuildInsertElement (builder, undef, elem, const_int32 (0), "sse3_movddup_mem"); + values [ins->dreg] = LLVMBuildShuffleVector (builder, val, undef, LLVMConstNull (LLVMVectorType (i4_t, 2)), "sse3_movddup_mem"); break; } case OP_SSE3_MOVSHDUP: { @@ -8941,15 +8941,51 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) break; } - case OP_SSE41_DPPS_IMM: { - LLVMValueRef args [] = { lhs, rhs, LLVMConstInt (LLVMInt8Type (), ins->inst_c0, FALSE) }; - values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_DPPS, args, dname); - break; - } - - case OP_SSE41_DPPD_IMM: { - LLVMValueRef args [] = { lhs, rhs, LLVMConstInt (LLVMInt8Type (), ins->inst_c0, FALSE) }; - values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_DPPD, args, dname); + case OP_SSE41_DPPS: + case OP_SSE41_DPPD: { + /* Bits 0, 1, 4, 5 are meaningful for the control mask + * in dppd; all bits are meaningful for dpps. + */ + LLVMTypeRef ret_t = NULL; + LLVMValueRef mask = NULL; + int mask_bits = 0; + int high_shift = 0; + int low_mask = 0; + IntrinsicId iid = (IntrinsicId) 0; + const char *oname = ""; + switch (ins->opcode) { + case OP_SSE41_DPPS: + ret_t = v128_r4_t; + mask = const_int8 (0xff); // 0b11111111 + mask_bits = 8; + high_shift = 4; + low_mask = 0xf; + iid = INTRINS_SSE_DPPS; + oname = "sse41_dpps"; + break; + case OP_SSE41_DPPD: + ret_t = v128_r8_t; + mask = const_int8 (0x33); // 0b00110011 + mask_bits = 4; + high_shift = 2; + low_mask = 0x3; + iid = INTRINS_SSE_DPPD; + oname = "sse41_dppd"; + break; + } + LLVMValueRef args [] = { lhs, rhs, NULL }; + LLVMValueRef index = LLVMBuildAnd (builder, convert (ctx, arg3, i1_t), mask, oname); + ImmediateUnrollCtx ictx = immediate_unroll_begin (ctx, bb, 1 << mask_bits, index, ret_t, oname); + int i = 0; + while (immediate_unroll_next (&ictx, &i)) { + int imm = ((i >> high_shift) << 4) | (i & low_mask); + args [2] = const_int8 (imm); + LLVMValueRef result = call_intrins (ctx, iid, args, dname); + immediate_unroll_commit (&ictx, imm, result); + } + immediate_unroll_default (&ictx); + immediate_unroll_commit_default (&ictx, LLVMGetUndef (ret_t)); + values [ins->dreg] = immediate_unroll_end (&ictx, &cbb); break; } @@ -8980,19 +9016,31 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) break; } - case OP_SSE41_BLEND_IMM: { - int nelem = LLVMGetVectorSize (LLVMTypeOf (lhs)); - g_assert(nelem >= 2 && nelem <= 8); // I2, U2, R4, R8 - - int mask_values [8]; - for (int i = 0; i < nelem; i++) { - // n-bit in inst_c0 (control byte) is set to 1 - gboolean bit_set = ((ins->inst_c0 & ( 1 << i )) >> i); - mask_values [i] = i + (bit_set ? 1 : 0) * nelem; + case OP_SSE41_BLEND: { + LLVMTypeRef ret_t = LLVMTypeOf (lhs); + int nelem = LLVMGetVectorSize (ret_t); + g_assert (nelem >= 2 && nelem <= 8); // I2, U2, R4, R8 + int unique_ctl_patterns = 1 << nelem; + int ctlmask = unique_ctl_patterns - 1; + LLVMValueRef ctl = convert (ctx, arg3, i1_t); + ctl = LLVMBuildAnd (builder, ctl, const_int8 (ctlmask), "sse41_blend"); + + ImmediateUnrollCtx ictx = immediate_unroll_begin (ctx, bb, unique_ctl_patterns, ctl, ret_t, "sse41_blend"); + int i = 0; + int mask_values [MAX_VECTOR_ELEMS] = { 0 }; + while (immediate_unroll_next (&ictx, &i)) { + for (int lane = 0; lane < nelem; ++lane) { + // n-bit in inst_c0 (control byte) is set to 1 + gboolean bit_set = (i & (1 << lane)) >> lane; + mask_values [lane] = lane + (bit_set ? nelem : 0); + } + LLVMValueRef mask = create_const_vector_i32 (mask_values, nelem); + LLVMValueRef result = LLVMBuildShuffleVector (builder, lhs, rhs, mask, "sse41_blend"); + immediate_unroll_commit (&ictx, i, result); } - - LLVMValueRef mask = create_const_vector_i32 (mask_values, nelem); - values [ins->dreg] = LLVMBuildShuffleVector (builder, lhs, rhs, mask, ""); + immediate_unroll_default (&ictx); + immediate_unroll_commit_default (&ictx, LLVMGetUndef (ret_t)); + values [ins->dreg] = immediate_unroll_end (&ictx, &cbb); break; } @@ -9213,24 +9261,6 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) values [ins->dreg] = LLVMBuildZExt (builder, cmp_zero, LLVMInt8Type (), ""); break; } - case OP_XEXTRACT_I32: - case OP_XEXTRACT_I64: - case OP_XEXTRACT_R8: - case OP_XEXTRACT_R4: { - LLVMTypeRef rhst = LLVMTypeOf (rhs); - LLVMValueRef mask = NULL; - switch (ins->opcode) { - case OP_XEXTRACT_I32: case OP_XEXTRACT_R4: - mask = LLVMConstInt (rhst, 0x3, FALSE); break; - case OP_XEXTRACT_I64: case OP_XEXTRACT_R8: - mask = LLVMConstInt (rhst, 0x1, FALSE); break; - default: - g_assert_not_reached (); - } - LLVMValueRef selector = LLVMBuildAnd (builder, rhs, mask, ""); - values [ins->dreg] = LLVMBuildExtractElement (builder, lhs, selector, ""); - break; - } case OP_POPCNT32: values [ins->dreg] = call_intrins (ctx, INTRINS_CTPOP_I32, &lhs, ""); break; @@ -9301,66 +9331,15 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) #if defined(TARGET_ARM64) || defined(TARGET_X86) || defined(TARGET_AMD64) case OP_LZCNT32: case OP_LZCNT64: { - LLVMValueRef args [2]; - args [0] = lhs; - args [1] = LLVMConstInt (LLVMInt1Type (), 1, FALSE); - values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, ins->opcode == OP_LZCNT32 ? INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64), args, 2, ""); + IntrinsicId iid = ins->opcode == OP_LZCNT32 ? INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64; + LLVMValueRef args [] = { lhs, const_int1 (FALSE) }; + values [ins->dreg] = call_intrins (ctx, iid, args, ""); break; } #endif #if defined(TARGET_ARM64) - case OP_EXTRACT_VAR_I1: - case OP_EXTRACT_VAR_U1: - case OP_EXTRACT_VAR_I2: - case OP_EXTRACT_VAR_U2: - case OP_EXTRACT_VAR_I4: - case OP_EXTRACT_VAR_R4: - case OP_EXTRACT_VAR_R8: - case OP_EXTRACT_VAR_I8: - case OP_EXTRACT_U1: - case OP_EXTRACT_I1: - case OP_EXTRACT_U2: - case OP_EXTRACT_I2: - case OP_EXTRACT_I4: - case OP_EXTRACT_I8: - case OP_EXTRACT_R4: - case OP_EXTRACT_R8: { - gboolean sext = FALSE; - gboolean zext = FALSE; - switch (ins->opcode) { - case OP_EXTRACT_U1: case OP_EXTRACT_U2: zext = TRUE; break; - case OP_EXTRACT_I1: case OP_EXTRACT_I2: sext = TRUE; break; - case OP_EXTRACT_VAR_U1: case OP_EXTRACT_VAR_U2: zext = TRUE; break; - case OP_EXTRACT_VAR_I1: case OP_EXTRACT_VAR_I2: sext = TRUE; break; - } - LLVMValueRef element_ix = NULL; - switch (ins->opcode) { - case OP_EXTRACT_VAR_I1: - case OP_EXTRACT_VAR_U1: - case OP_EXTRACT_VAR_I2: - case OP_EXTRACT_VAR_U2: - case OP_EXTRACT_VAR_I4: - case OP_EXTRACT_VAR_R4: - case OP_EXTRACT_VAR_R8: - case OP_EXTRACT_VAR_I8: - element_ix = rhs; - break; - default: - element_ix = const_int32 (ins->inst_c0); - } - LLVMValueRef result = LLVMBuildExtractElement (builder, lhs, element_ix, "extract"); - /* TODO: Scalar types smaller than i32 seem to be - * normalized to i32 via zero or sign extension. - * Is this still necessary? - */ - if (zext) - result = LLVMBuildZExt (builder, result, i4_t, "extract_zext"); - else if (sext) - result = LLVMBuildSExt (builder, result, i4_t, "extract_sext"); - values [ins->dreg] = result; - break; - } + case OP_XOP_I4_I4: case OP_XOP_I8_I8: { IntrinsicId id = (IntrinsicId)ins->inst_c0; diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index d7ab7c739f3c1c3e8d18acaeb8bcb37438d96557..a0a785fb6241a7aaeb89bd112159fd9f847b181e 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -787,16 +787,19 @@ MINI_OP(OP_NOT_NULL, "not_null", NONE, IREG, NONE) #if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_WASM) || defined(TARGET_ARM64) -MINI_OP(OP_EXTRACT_I4, "extract_i4", IREG, XREG, NONE) MINI_OP(OP_ICONV_TO_R4_RAW, "iconv_to_r4_raw", FREG, IREG, NONE) -MINI_OP(OP_EXTRACT_I2, "extract_i2", IREG, XREG, NONE) -MINI_OP(OP_EXTRACT_U2, "extract_u2", IREG, XREG, NONE) +/* Extract an element from a vector with a constant lane index. + * inst_c0 is the lane index. + * inst_c1 is a MonoTypeEnum representing the element type. + */ MINI_OP(OP_EXTRACT_I1, "extract_i1", IREG, XREG, NONE) -MINI_OP(OP_EXTRACT_U1, "extract_u1", IREG, XREG, NONE) +MINI_OP(OP_EXTRACT_I2, "extract_i2", IREG, XREG, NONE) +MINI_OP(OP_EXTRACT_I4, "extract_i4", IREG, XREG, NONE) +MINI_OP(OP_EXTRACT_I8, "extract_i8", LREG, XREG, NONE) MINI_OP(OP_EXTRACT_R4, "extract_r4", FREG, XREG, NONE) MINI_OP(OP_EXTRACT_R8, "extract_r8", FREG, XREG, NONE) -MINI_OP(OP_EXTRACT_I8, "extract_i8", LREG, XREG, NONE) +MINI_OP(OP_EXTRACTX_U2, "extractx_u2", IREG, XREG, NONE) /* Used by LLVM */ MINI_OP(OP_INSERT_I1, "insert_i1", XREG, XREG, IREG) @@ -806,8 +809,6 @@ MINI_OP(OP_INSERT_I8, "insert_i8", XREG, XREG, LREG) MINI_OP(OP_INSERT_R4, "insert_r4", XREG, XREG, FREG) MINI_OP(OP_INSERT_R8, "insert_r8", XREG, XREG, FREG) -MINI_OP(OP_EXTRACTX_U2, "extractx_u2", IREG, XREG, NONE) - /*these slow ops are modeled around the availability of a fast 2 bytes insert op*/ /*insertx_u1_slow takes old value and new value as source regs */ MINI_OP(OP_INSERTX_U1_SLOW, "insertx_u1_slow", XREG, IREG, IREG) @@ -1003,10 +1004,6 @@ MINI_OP(OP_CVTPS2PD, "cvtps2pd", XREG, XREG, NONE) MINI_OP(OP_CVTTPD2DQ, "cvttpd2dq", XREG, XREG, NONE) MINI_OP(OP_CVTTPS2DQ, "cvttps2dq", XREG, XREG, NONE) -/* r4 dot product */ -/* multiply all 4 single precision float elements, add them together, and store the result to the lowest element */ -MINI_OP(OP_DPPS, "dpps", XREG, XREG, XREG) - /* sse 1 */ /* inst_c1 is target type */ MINI_OP(OP_SSE_LOADU, "sse_loadu", XREG, XREG, NONE) @@ -1019,7 +1016,7 @@ MINI_OP(OP_SSE_MOVEHL, "sse_movehl", XREG, XREG, XREG) MINI_OP(OP_SSE_MOVELH, "sse_movelh", XREG, XREG, XREG) MINI_OP(OP_SSE_UNPACKLO, "sse_unpacklo", XREG, XREG, XREG) MINI_OP(OP_SSE_UNPACKHI, "sse_unpackhi", XREG, XREG, XREG) -MINI_OP(OP_SSE_SHUFFLE, "sse_shuffle", XREG, XREG, XREG) +MINI_OP3(OP_SSE_SHUFPS, "sse_shufps", XREG, XREG, XREG, IREG) MINI_OP(OP_SSE_AND, "sse_and", XREG, XREG, XREG) MINI_OP(OP_SSE_OR, "sse_or", XREG, XREG, XREG) MINI_OP(OP_SSE_XOR, "sse_xor", XREG, XREG, XREG) @@ -1051,7 +1048,6 @@ MINI_OP(OP_SSE_CVTSI2SS64, "sse_cvtsi2ss64", XREG, XREG, LREG) /* sse 2 */ MINI_OP(OP_SSE2_PACKUS, "sse2_packus", XREG, XREG, XREG) MINI_OP(OP_SSE2_SRLI, "sse2_srli", XREG, XREG, XREG) -MINI_OP(OP_SSE2_SHUFFLE, "sse2_shuffle", XREG, XREG, XREG) MINI_OP(OP_SSE2_ADDS, "sse2_adds", XREG, XREG, XREG) MINI_OP(OP_SSE2_SUBS, "sse2_subs", XREG, XREG, XREG) MINI_OP(OP_SSE2_CMPSD, "sse2_cmpsd", XREG, XREG, XREG) @@ -1100,16 +1096,16 @@ MINI_OP3(OP_SSSE3_ALIGNR, "ssse3_alignr", XREG, XREG, XREG, IREG) /* sse 4.1 */ MINI_OP(OP_SSE41_ROUNDP, "roundp", XREG, XREG, NONE) // packed, inst_c0 - mode, inst_c1 - r4 or r8 -MINI_OP(OP_SSE41_ROUNDS, "rounds", XREG, XREG, XREG) // scalar, inst_c0 - mode, inst_c1 - r4 or r8 +MINI_OP(OP_SSE41_ROUNDS, "sse41_rounds", XREG, XREG, XREG) // scalar, inst_c0 - mode, inst_c1 - r4 or r8 MINI_OP3(OP_SSE41_INSERT, "sse41_insert", XREG, XREG, XREG, IREG) MINI_OP3(OP_SSE41_BLENDV, "sse41_blendv", XREG, XREG, XREG, XREG) -MINI_OP(OP_SSE41_BLEND_IMM, "sse41_blend", XREG, XREG, XREG) +MINI_OP3(OP_SSE41_BLEND, "sse41_blend", XREG, XREG, XREG, IREG) MINI_OP(OP_SSE41_LOADANT, "sse41_loadant", XREG, XREG, NONE) MINI_OP(OP_SSE41_MUL, "sse41_mul", XREG, XREG, XREG) MINI_OP(OP_SSE41_MULLO, "sse41_mullo", XREG, XREG, XREG) MINI_OP(OP_SSE_CVTII, "sse_cvtii", XREG, XREG, NONE) -MINI_OP(OP_SSE41_DPPS_IMM, "sse_dpps", XREG, XREG, XREG) -MINI_OP(OP_SSE41_DPPD_IMM, "sse_dppd", XREG, XREG, XREG) +MINI_OP3(OP_SSE41_DPPS, "sse41_dpps", XREG, XREG, XREG, IREG) +MINI_OP3(OP_SSE41_DPPD, "sse41_dppd", XREG, XREG, XREG, IREG) MINI_OP(OP_SSE41_MPSADBW_IMM, "sse_mpsadbw", XREG, XREG, XREG) /* pclmulqdq */ @@ -1568,14 +1564,17 @@ MINI_OP(OP_XOP_OVR_BYSCALAR_X_X_X, "xop_ovr_byscalar_x_x_x", XREG, XREG, XREG) MINI_OP(OP_XCONCAT, "xconcat", XREG, XREG, XREG) MINI_OP(OP_XCAST, "xcast", XREG, XREG, NONE) -/* Extract element of vector */ -/* The index is assumed to be in range */ -/* inst_i0 is the element type */ -MINI_OP(OP_XEXTRACT_I32, "xextract_i32", IREG, XREG, IREG) -MINI_OP(OP_XEXTRACT_I64, "xextract_i64", LREG, XREG, IREG) -MINI_OP(OP_XEXTRACT_R8, "xextract_r8", FREG, XREG, IREG) -/* Return an R4 */ + +/* Extract an element from a vector with a variable lane index. + * The index is assumed to be in range. + * inst_c1 is a MonoTypeEnum representing the element type. + */ +MINI_OP(OP_XEXTRACT_I1, "xextract_i1", IREG, XREG, IREG) +MINI_OP(OP_XEXTRACT_I2, "xextract_i2", IREG, XREG, IREG) +MINI_OP(OP_XEXTRACT_I4, "xextract_i4", IREG, XREG, IREG) +MINI_OP(OP_XEXTRACT_I8, "xextract_i8", LREG, XREG, IREG) MINI_OP(OP_XEXTRACT_R4, "xextract_r4", FREG, XREG, IREG) +MINI_OP(OP_XEXTRACT_R8, "xextract_r8", FREG, XREG, IREG) /* Insert element into a vector */ /* sreg1 is the vector, sreg2 is the value, sreg3 is the index */ @@ -1738,15 +1737,6 @@ MINI_OP(OP_ARM64_SQXTUN2, "arm64_sqxtun2", XREG, XREG, XREG) MINI_OP(OP_ARM64_SELECT_SCALAR, "arm64_select_scalar", XREG, XREG, IREG) -MINI_OP(OP_EXTRACT_VAR_I1, "extract_var_i1", IREG, XREG, IREG) -MINI_OP(OP_EXTRACT_VAR_U1, "extract_var_u1", IREG, XREG, IREG) -MINI_OP(OP_EXTRACT_VAR_I2, "extract_var_i2", IREG, XREG, IREG) -MINI_OP(OP_EXTRACT_VAR_U2, "extract_var_u2", IREG, XREG, IREG) -MINI_OP(OP_EXTRACT_VAR_I4, "extract_var_i4", IREG, XREG, IREG) -MINI_OP(OP_EXTRACT_VAR_R4, "extract_var_r4", FREG, XREG, IREG) -MINI_OP(OP_EXTRACT_VAR_R8, "extract_var_r8", FREG, XREG, IREG) -MINI_OP(OP_EXTRACT_VAR_I8, "extract_var_i8", LREG, XREG, IREG) - MINI_OP(OP_ARM64_FCVTZU, "arm64_fcvtzu", XREG, XREG, NONE) MINI_OP(OP_ARM64_FCVTZS, "arm64_fcvtzs", XREG, XREG, NONE) MINI_OP(OP_ARM64_FCVTZU_SCALAR, "arm64_fcvtzu_scalar", XREG, XREG, NONE) diff --git a/src/mono/mono/mini/mini-s390x.c b/src/mono/mono/mini/mini-s390x.c index 9f276742151e521ccee134996fa690dca2085111..bc4e9b24d114c6ec65389c406b266cb5d25f19ed 100644 --- a/src/mono/mono/mini/mini-s390x.c +++ b/src/mono/mono/mini/mini-s390x.c @@ -5131,19 +5131,17 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; case OP_EXTRACT_I1: - case OP_EXTRACT_U1: amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); if (ins->inst_c0) amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8); - amd64_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE); + amd64_widen_reg (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I1, FALSE); break; case OP_EXTRACT_I2: - case OP_EXTRACT_U2: /*amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); if (ins->inst_c0) amd64_shift_reg_imm_size (code, X86_SHR, ins->dreg, 16, 4);*/ s390x_pextrw_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); - amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE, 4); + amd64_widen_reg_size (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I2, TRUE, 4); break; case OP_EXTRACT_R8: if (ins->inst_c0) diff --git a/src/mono/mono/mini/mini-x86.c b/src/mono/mono/mini/mini-x86.c index 5143ea8dc4a1350e67670f415def153ab76fc80b..5c43d7ae72796f85debb193cad2888d2208e36d9 100644 --- a/src/mono/mono/mini/mini-x86.c +++ b/src/mono/mono/mini/mini-x86.c @@ -4745,18 +4745,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) x86_movd_reg_xreg (code, ins->dreg, ins->sreg1); break; case OP_EXTRACT_I1: - case OP_EXTRACT_U1: x86_movd_reg_xreg (code, ins->dreg, ins->sreg1); if (ins->inst_c0) x86_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8); - x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I1, FALSE); + x86_widen_reg (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I1, FALSE); break; case OP_EXTRACT_I2: - case OP_EXTRACT_U2: x86_movd_reg_xreg (code, ins->dreg, ins->sreg1); if (ins->inst_c0) x86_shift_reg_imm (code, X86_SHR, ins->dreg, 16); - x86_widen_reg (code, ins->dreg, ins->dreg, ins->opcode == OP_EXTRACT_I2, TRUE); + x86_widen_reg (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I2, TRUE); break; case OP_EXTRACT_R8: if (ins->inst_c0) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 95c7ec07d744a6288830c701197790db74b01619..bc77b5a49507f9210192999aad0e2e9ea27da66c 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -388,6 +388,10 @@ typedef MonoInst * (* EmitIntrinsicFn) ( const SimdIntrinsic *info, int id, MonoTypeEnum arg0_type, gboolean is_64bit); +static const IntrinGroup unsupported_intrin_group [] = { + { "", 0, unsupported, sizeof (unsupported) }, +}; + static MonoInst * emit_hardware_intrinsics ( MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, @@ -395,9 +399,8 @@ emit_hardware_intrinsics ( EmitIntrinsicFn custom_emit) { MonoClass *klass = cmethod->klass; - const IntrinGroup *intrin_group = NULL; + const IntrinGroup *intrin_group = unsupported_intrin_group; gboolean is_64bit = FALSE; - int id = -1; int groups_size = groups_size_bytes / sizeof (groups [0]); for (int i = 0; i < groups_size; ++i) { const IntrinGroup *group = &groups [i]; @@ -407,31 +410,30 @@ emit_hardware_intrinsics ( } } - const SimdIntrinsic *info = NULL; - MonoInst *ins = NULL; gboolean supported = FALSE; MonoTypeEnum arg0_type = fsig->param_count > 0 ? get_underlying_type (fsig->params [0]) : MONO_TYPE_VOID; + int id = -1; uint16_t op = 0; uint16_t c0 = 0; - if (intrin_group) { - const SimdIntrinsic *intrinsics = intrin_group->intrinsics; - int intrinsics_size = intrin_group->intrinsics_size; - MonoCPUFeatures feature = intrin_group->feature; - + const SimdIntrinsic *intrinsics = intrin_group->intrinsics; + int intrinsics_size = intrin_group->intrinsics_size; + MonoCPUFeatures feature = intrin_group->feature; + const SimdIntrinsic *info = lookup_intrins_info ((SimdIntrinsic *) intrinsics, intrinsics_size, cmethod); + { + if (!info) + goto support_probe_complete; + id = info->id; // Hardware intrinsics are LLVM-only. if (!COMPILE_LLVM (cfg) && !intrin_group->jit_supported) goto support_probe_complete; - info = lookup_intrins_info ((SimdIntrinsic *) intrinsics, intrinsics_size, cmethod); - if (!info) - goto support_probe_complete; - - if (feature) - supported = ((mini_get_cpu_features (cfg) & feature) != 0) && (intrin_group->intrinsics != unsupported); + if (intrin_group->intrinsics == unsupported) + supported = FALSE; + else if (feature) + supported = (mini_get_cpu_features (cfg) & feature) != 0; else supported = TRUE; - id = info->id; op = info->default_op; c0 = info->default_instc0; @@ -457,16 +459,16 @@ emit_hardware_intrinsics ( op = info->floating_op; c0 = info->floating_instc0; } - } support_probe_complete: if (id == SN_get_IsSupported) { + MonoInst *ins = NULL; EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0); return ins; } if (!supported) { // Can't emit non-supported llvm intrinsics - if (!intrin_group || cfg->method != cmethod) { + if (cfg->method != cmethod) { // Keep the original call so we end up in the intrinsic method return NULL; } else { @@ -497,6 +499,27 @@ emit_vector_create_elementwise ( #if defined(TARGET_AMD64) || defined(TARGET_ARM64) +static int +type_to_xextract_op (MonoTypeEnum type) +{ + switch (type) { + case MONO_TYPE_I1: case MONO_TYPE_U1: return OP_XEXTRACT_I1; + case MONO_TYPE_I2: case MONO_TYPE_U2: return OP_XEXTRACT_I2; + case MONO_TYPE_I4: case MONO_TYPE_U4: return OP_XEXTRACT_I4; + case MONO_TYPE_I8: case MONO_TYPE_U8: return OP_XEXTRACT_I8; + case MONO_TYPE_R4: return OP_XEXTRACT_R4; + case MONO_TYPE_R8: return OP_XEXTRACT_R8; + case MONO_TYPE_I: + case MONO_TYPE_U: +#if TARGET_SIZEOF_VOID_P == 8 + return OP_XEXTRACT_I8; +#else + return OP_XEXTRACT_I4; +#endif + default: g_assert_not_reached (); + } +} + static guint16 sri_vector_methods [] = { SN_AsByte, SN_AsDouble, @@ -759,48 +782,11 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig return NULL; MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, len); MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "IndexOutOfRangeException"); - int opcode = -1; - int dreg; - gboolean is64 = FALSE; - switch (etype->type) { - case MONO_TYPE_I8: - case MONO_TYPE_U8: - opcode = OP_XEXTRACT_I64; - is64 = TRUE; - dreg = alloc_lreg (cfg); - break; - case MONO_TYPE_R8: - opcode = OP_XEXTRACT_R8; - dreg = alloc_freg (cfg); - break; - case MONO_TYPE_R4: - g_assert (cfg->r4fp); - opcode = OP_XEXTRACT_R4; - dreg = alloc_freg (cfg); - break; - case MONO_TYPE_I: - case MONO_TYPE_U: -#if TARGET_SIZEOF_VOID_P == 8 - opcode = OP_XEXTRACT_I64; - is64 = TRUE; - dreg = alloc_lreg (cfg); -#else - opcode = OP_XEXTRACT_I32; - dreg = alloc_ireg (cfg); -#endif - break; - default: - opcode = OP_XEXTRACT_I32; - dreg = alloc_ireg (cfg); - break; - } - MONO_INST_NEW (cfg, ins, opcode); - ins->dreg = dreg; - ins->sreg1 = load_simd_vreg (cfg, cmethod, args [0], NULL); - ins->sreg2 = args [1]->dreg; - ins->inst_c0 = etype->type; - mini_type_to_eval_stack_type (cfg, etype, ins); - MONO_ADD_INS (cfg->cbb, ins); + MonoTypeEnum ty = etype->type; + int opcode = type_to_xextract_op (ty); + int src1 = load_simd_vreg (cfg, cmethod, args [0], NULL); + MonoInst *ins = emit_simd_ins (cfg, klass, opcode, src1, args [1]->dreg); + ins->inst_c1 = ty; return ins; } case SN_ctor: @@ -1019,29 +1005,6 @@ emit_invalid_operation (MonoCompile *cfg, const char* message) #ifdef TARGET_ARM64 -static int -type_to_extract_var_op (MonoTypeEnum type) -{ - switch (type) { - case MONO_TYPE_I1: return OP_EXTRACT_VAR_U1; - case MONO_TYPE_U1: return OP_EXTRACT_VAR_I1; - case MONO_TYPE_I2: return OP_EXTRACT_VAR_U2; - case MONO_TYPE_U2: return OP_EXTRACT_VAR_I2; - case MONO_TYPE_I4: case MONO_TYPE_U4: return OP_EXTRACT_VAR_I4; - case MONO_TYPE_I8: case MONO_TYPE_U8: return OP_EXTRACT_VAR_I8; - case MONO_TYPE_R4: return OP_EXTRACT_VAR_R4; - case MONO_TYPE_R8: return OP_EXTRACT_VAR_R8; - case MONO_TYPE_I: - case MONO_TYPE_U: -#if TARGET_SIZEOF_VOID_P == 8 - return OP_EXTRACT_VAR_I8; -#else - return OP_EXTRACT_VAR_I4; -#endif - default: g_assert_not_reached (); - } -} - static SimdIntrinsic armbase_methods [] = { {SN_LeadingSignCount}, {SN_LeadingZeroCount}, @@ -1578,7 +1541,8 @@ emit_arm64_intrinsics ( switch (id) { case SN_DuplicateSelectedScalarToVector128: case SN_DuplicateSelectedScalarToVector64: { - MonoInst *ins = emit_simd_ins (cfg, ret_klass, type_to_extract_var_op (rtype->type), args [0]->dreg, args [1]->dreg); + MonoInst *ins = emit_simd_ins (cfg, ret_klass, type_to_xextract_op (rtype->type), args [0]->dreg, args [1]->dreg); + ins->inst_c1 = arg0_type; scalar_src_reg = ins->dreg; break; } @@ -1586,7 +1550,7 @@ emit_arm64_intrinsics ( return emit_simd_ins (cfg, ret_klass, type_to_expand_op (rtype), scalar_src_reg, -1); } case SN_Extract: { - int extract_op = type_to_extract_var_op (arg0_type); + int extract_op = type_to_xextract_op (arg0_type); MonoInst *ins = emit_simd_ins (cfg, klass, extract_op, args [0]->dreg, args [1]->dreg); ins->inst_c1 = arg0_type; return ins; @@ -1597,10 +1561,8 @@ emit_arm64_intrinsics ( int insert_op = 0; int extract_op = 0; switch (arg0_type) { - case MONO_TYPE_I1: insert_op = OP_XINSERT_I1; extract_op = OP_EXTRACT_U1; break; - case MONO_TYPE_U1: insert_op = OP_XINSERT_I1; extract_op = OP_EXTRACT_I1; break; - case MONO_TYPE_I2: insert_op = OP_XINSERT_I2; extract_op = OP_EXTRACT_U2; break; - case MONO_TYPE_U2: insert_op = OP_XINSERT_I2; extract_op = OP_EXTRACT_I2; break; + case MONO_TYPE_I1: case MONO_TYPE_U1: insert_op = OP_XINSERT_I1; extract_op = OP_EXTRACT_I1; break; + case MONO_TYPE_I2: case MONO_TYPE_U2: insert_op = OP_XINSERT_I2; extract_op = OP_EXTRACT_I2; break; case MONO_TYPE_I4: case MONO_TYPE_U4: insert_op = OP_XINSERT_I4; extract_op = OP_EXTRACT_I4; break; case MONO_TYPE_I8: case MONO_TYPE_U8: insert_op = OP_XINSERT_I8; extract_op = OP_EXTRACT_I8; break; case MONO_TYPE_R4: insert_op = OP_XINSERT_R4; extract_op = OP_EXTRACT_R4; break; @@ -1627,6 +1589,7 @@ emit_arm64_intrinsics ( case SN_InsertScalar: { MonoInst *ins = emit_simd_ins (cfg, klass, extract_op, val_src_reg, -1); ins->inst_c0 = 0; + ins->inst_c1 = arg0_type; val_src_reg = ins->dreg; break; } @@ -1782,10 +1745,10 @@ static SimdIntrinsic sse_methods [] = { {SN_CompareLessThan, OP_XCOMPARE_FP, CMP_LT}, {SN_CompareLessThanOrEqual, OP_XCOMPARE_FP, CMP_LE}, {SN_CompareNotEqual, OP_XCOMPARE_FP, CMP_NE}, - {SN_CompareNotGreaterThan, OP_XCOMPARE_FP, CMP_LE}, - {SN_CompareNotGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_LT}, - {SN_CompareNotLessThan, OP_XCOMPARE_FP, CMP_GE}, - {SN_CompareNotLessThanOrEqual, OP_XCOMPARE_FP, CMP_GT}, + {SN_CompareNotGreaterThan, OP_XCOMPARE_FP, CMP_LE_UN}, + {SN_CompareNotGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_LT_UN}, + {SN_CompareNotLessThan, OP_XCOMPARE_FP, CMP_GE_UN}, + {SN_CompareNotLessThanOrEqual, OP_XCOMPARE_FP, CMP_GT_UN}, {SN_CompareOrdered, OP_XCOMPARE_FP, CMP_ORD}, {SN_CompareScalarEqual, OP_SSE_CMPSS, CMP_EQ}, {SN_CompareScalarGreaterThan, OP_SSE_CMPSS, CMP_GT}, @@ -1793,10 +1756,10 @@ static SimdIntrinsic sse_methods [] = { {SN_CompareScalarLessThan, OP_SSE_CMPSS, CMP_LT}, {SN_CompareScalarLessThanOrEqual, OP_SSE_CMPSS, CMP_LE}, {SN_CompareScalarNotEqual, OP_SSE_CMPSS, CMP_NE}, - {SN_CompareScalarNotGreaterThan, OP_SSE_CMPSS, CMP_LE}, - {SN_CompareScalarNotGreaterThanOrEqual, OP_SSE_CMPSS, CMP_LT}, - {SN_CompareScalarNotLessThan, OP_SSE_CMPSS, CMP_GE}, - {SN_CompareScalarNotLessThanOrEqual, OP_SSE_CMPSS, CMP_GT}, + {SN_CompareScalarNotGreaterThan, OP_SSE_CMPSS, CMP_LE_UN}, + {SN_CompareScalarNotGreaterThanOrEqual, OP_SSE_CMPSS, CMP_LT_UN}, + {SN_CompareScalarNotLessThan, OP_SSE_CMPSS, CMP_GE_UN}, + {SN_CompareScalarNotLessThanOrEqual, OP_SSE_CMPSS, CMP_GT_UN}, {SN_CompareScalarOrdered, OP_SSE_CMPSS, CMP_ORD}, {SN_CompareScalarOrderedEqual, OP_SSE_COMISS, CMP_EQ}, {SN_CompareScalarOrderedGreaterThan, OP_SSE_COMISS, CMP_GT}, @@ -1874,10 +1837,10 @@ static SimdIntrinsic sse2_methods [] = { {SN_CompareLessThan}, {SN_CompareLessThanOrEqual, OP_XCOMPARE_FP, CMP_LE}, {SN_CompareNotEqual, OP_XCOMPARE_FP, CMP_NE}, - {SN_CompareNotGreaterThan, OP_XCOMPARE_FP, CMP_LE}, - {SN_CompareNotGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_LT}, - {SN_CompareNotLessThan, OP_XCOMPARE_FP, CMP_GE}, - {SN_CompareNotLessThanOrEqual, OP_XCOMPARE_FP, CMP_GT}, + {SN_CompareNotGreaterThan, OP_XCOMPARE_FP, CMP_LE_UN}, + {SN_CompareNotGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_LT_UN}, + {SN_CompareNotLessThan, OP_XCOMPARE_FP, CMP_GE_UN}, + {SN_CompareNotLessThanOrEqual, OP_XCOMPARE_FP, CMP_GT_UN}, {SN_CompareOrdered, OP_XCOMPARE_FP, CMP_ORD}, {SN_CompareScalarEqual, OP_SSE2_CMPSD, CMP_EQ}, {SN_CompareScalarGreaterThan, OP_SSE2_CMPSD, CMP_GT}, @@ -1885,10 +1848,10 @@ static SimdIntrinsic sse2_methods [] = { {SN_CompareScalarLessThan, OP_SSE2_CMPSD, CMP_LT}, {SN_CompareScalarLessThanOrEqual, OP_SSE2_CMPSD, CMP_LE}, {SN_CompareScalarNotEqual, OP_SSE2_CMPSD, CMP_NE}, - {SN_CompareScalarNotGreaterThan, OP_SSE2_CMPSD, CMP_LE}, - {SN_CompareScalarNotGreaterThanOrEqual, OP_SSE2_CMPSD, CMP_LT}, - {SN_CompareScalarNotLessThan, OP_SSE2_CMPSD, CMP_GE}, - {SN_CompareScalarNotLessThanOrEqual, OP_SSE2_CMPSD, CMP_GT}, + {SN_CompareScalarNotGreaterThan, OP_SSE2_CMPSD, CMP_LE_UN}, + {SN_CompareScalarNotGreaterThanOrEqual, OP_SSE2_CMPSD, CMP_LT_UN}, + {SN_CompareScalarNotLessThan, OP_SSE2_CMPSD, CMP_GE_UN}, + {SN_CompareScalarNotLessThanOrEqual, OP_SSE2_CMPSD, CMP_GT_UN}, {SN_CompareScalarOrdered, OP_SSE2_CMPSD, CMP_ORD}, {SN_CompareScalarOrderedEqual, OP_SSE2_COMISD, CMP_EQ}, {SN_CompareScalarOrderedGreaterThan, OP_SSE2_COMISD, CMP_GT}, @@ -2125,10 +2088,7 @@ emit_x86_intrinsics ( if (feature == MONO_CPU_X86_SSE) { switch (id) { case SN_Shuffle: - if (args [2]->opcode == OP_ICONST) - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFFLE, args [2]->inst_c0, arg0_type, fsig, args); - // FIXME: handle non-constant mask (generate a switch) - return emit_invalid_operation (cfg, "mask in Sse.Shuffle must be constant"); + return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFPS, 0, arg0_type, fsig, args); case SN_ConvertScalarToVector128Single: { int op = 0; switch (fsig->params [1]->type) { @@ -2298,7 +2258,7 @@ emit_x86_intrinsics ( return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_PACKUS, -1, arg0_type, fsig, args); case SN_Extract: g_assert (arg0_type == MONO_TYPE_U2); - return emit_simd_ins_for_sig (cfg, klass, OP_XEXTRACT_I32, arg0_type, 0, fsig, args); + return emit_simd_ins_for_sig (cfg, klass, OP_XEXTRACT_I4, 0, arg0_type, fsig, args); case SN_Insert: g_assert (arg0_type == MONO_TYPE_I2 || arg0_type == MONO_TYPE_U2); return emit_simd_ins_for_sig (cfg, klass, OP_XINSERT_I2, 0, arg0_type, fsig, args); @@ -2467,45 +2427,42 @@ emit_x86_intrinsics ( if (feature == MONO_CPU_X86_SSE41) { switch (id) { - case SN_DotProduct: - if (args [2]->opcode == OP_ICONST && arg0_type == MONO_TYPE_R4) - return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_DPPS_IMM, args [2]->inst_c0, arg0_type, fsig, args); - else if (args [2]->opcode == OP_ICONST && arg0_type == MONO_TYPE_R8) - return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_DPPD_IMM, args [2]->inst_c0, arg0_type, fsig, args); - // FIXME: handle non-constant control byte (generate a switch) - return emit_invalid_operation (cfg, "control byte in Sse41.DotProduct must be constant"); + case SN_DotProduct: { + int op = 0; + switch (arg0_type) { + case MONO_TYPE_R4: op = OP_SSE41_DPPS; break; + case MONO_TYPE_R8: op = OP_SSE41_DPPD; break; + default: g_assert_not_reached (); break; + } + return emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args); + } case SN_MultipleSumAbsoluteDifferences: if (args [2]->opcode == OP_ICONST) return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_MPSADBW_IMM, args [2]->inst_c0, arg0_type, fsig, args); // FIXME: handle non-constant control byte (generate a switch) return emit_invalid_operation (cfg, "control byte in Sse41.MultipleSumAbsoluteDifferences must be constant"); case SN_Blend: - if (args [2]->opcode == OP_ICONST) - return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_BLEND_IMM, args [2]->inst_c0, arg0_type, fsig, args); - // FIXME: handle non-constant control byte (generate a switch) - return emit_invalid_operation (cfg, "control byte in Sse41.Blend must be constant"); + return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_BLEND, 0, arg0_type, fsig, args); case SN_BlendVariable: return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_BLENDV, -1, arg0_type, fsig, args); case SN_Extract: { int op = 0; switch (arg0_type) { - case MONO_TYPE_U1: - case MONO_TYPE_U4: - case MONO_TYPE_I4: op = OP_XEXTRACT_I32; break; - case MONO_TYPE_I8: - case MONO_TYPE_U8: op = OP_XEXTRACT_I64; break; + case MONO_TYPE_U1: op = OP_XEXTRACT_I1; break; + case MONO_TYPE_U4: case MONO_TYPE_I4: op = OP_XEXTRACT_I4; break; + case MONO_TYPE_U8: case MONO_TYPE_I8: op = OP_XEXTRACT_I8; break; case MONO_TYPE_R4: op = OP_XEXTRACT_R4; break; case MONO_TYPE_I: case MONO_TYPE_U: #if TARGET_SIZEOF_VOID_P == 8 - op = OP_XEXTRACT_I64; + op = OP_XEXTRACT_I8; #else - op = OP_XEXTRACT_I32; + op = OP_XEXTRACT_I4; #endif break; default: g_assert_not_reached(); break; } - return emit_simd_ins_for_sig (cfg, klass, op, arg0_type, 0, fsig, args); + return emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args); } case SN_Insert: if (args [2]->opcode == OP_ICONST) @@ -2593,12 +2550,7 @@ emit_x86_intrinsics ( if (feature == MONO_CPU_X86_LZCNT) { switch (id) { case SN_LeadingZeroCount: - MONO_INST_NEW (cfg, ins, is_64bit ? OP_LZCNT64 : OP_LZCNT32); - ins->dreg = is_64bit ? alloc_lreg (cfg) : alloc_ireg (cfg); - ins->sreg1 = args [0]->dreg; - ins->type = is_64bit ? STACK_I8 : STACK_I4; - MONO_ADD_INS (cfg->cbb, ins); - return ins; + return emit_simd_ins_for_sig (cfg, klass, is_64bit ? OP_LZCNT64 : OP_LZCNT32, 0, arg0_type, fsig, args); default: return NULL; } diff --git a/src/tests/Common/CLRTest.Execute.Bash.targets b/src/tests/Common/CLRTest.Execute.Bash.targets index 8ffa05d6b8c060018d39b3d4b20ebb8abca95c79..cb28cbfac5e5a1e150d41e669bc8fa32eb405187 100644 --- a/src/tests/Common/CLRTest.Execute.Bash.targets +++ b/src/tests/Common/CLRTest.Execute.Bash.targets @@ -325,8 +325,6 @@ if [ ! -z ${RunCrossGen+x} ]%3B then TakeLock fi -export MONO_ENV_OPTIONS=--llvm - echo $LAUNCHER $ExePath %24(printf "'%s' " "${CLRTestExecutionArguments[@]}") $LAUNCHER $ExePath "${CLRTestExecutionArguments[@]}" diff --git a/src/tests/Common/helixpublishwitharcade.proj b/src/tests/Common/helixpublishwitharcade.proj index 8daf5b0e0eabb1cc9b4f868ce0c7449050dc1fe8..b98bd8e8289b17140fdd74956d0c83a1e5a1580a 100644 --- a/src/tests/Common/helixpublishwitharcade.proj +++ b/src/tests/Common/helixpublishwitharcade.proj @@ -194,7 +194,7 @@ <_PayloadGroups Include="$(PayloadGroups)" /> <_ProjectsToBuild Include="testenvironment.proj"> - Scenario=$(Scenario);TestEnvFileName=$(PayloadsRootDirectory)%(_PayloadGroups.Identity)\$(TestEnvFileName);TargetsWindows=$(TestWrapperTargetsWindows) + Scenario=$(Scenario);TestEnvFileName=$(PayloadsRootDirectory)%(_PayloadGroups.Identity)\$(TestEnvFileName);TargetsWindows=$(TestWrapperTargetsWindows);RuntimeVariant=$(_RuntimeVariant) diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index b1a359d90283bfda1dbbff7e0ec4bd23a6a2a176..4dc72fdf06d1cc6534a751400494be9b35520f55 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -210,6 +210,9 @@ <_TestEnvFileLine Include="export MONO_ENV_OPTIONS=--interpreter" Condition="'$(Scenario)' == 'interpreter'" /> + + <_TestEnvFileLine Condition="'$(RuntimeVariant)' == 'llvmaot'" Include="export MONO_ENV_OPTIONS=--llvm" /> + <_TestEnvFileLine Condition="'$(Scenario)' == 'clrinterpreter'" Include="export COMPlus_Interpret=%2A" /> <_TestEnvFileLine Condition="'$(Scenario)' == 'clrinterpreter'" Include="export COMPlus_InterpreterHWIntrinsicsIsSupportedFalse=1" /> diff --git a/src/tests/issues.targets b/src/tests/issues.targets index f2f7bd6ba37f4a3c12698cac90a013a353a71861..6c336ce6d106e3116952a66e3e1193c4387e85fa 100644 --- a/src/tests/issues.targets +++ b/src/tests/issues.targets @@ -1333,30 +1333,6 @@ needs triage - - needs triage - - - needs triage - - - needs triage - - - needs triage - - - needs triage - - - needs triage - - - needs triage - - - needs triage - needs triage @@ -2534,6 +2510,13 @@ needs triage + + + https://github.com/dotnet/runtime/issues/48914 + + + https://github.com/dotnet/runtime/issues/54113 + @@ -2549,13 +2532,7 @@ Doesn't compile with LLVM AOT. - - https://github.com/dotnet/runtime/issues/48914 - - - needs triage - needs triage @@ -2700,6 +2677,13 @@ + + https://github.com/dotnet/runtime/issues/54122 + + + https://github.com/dotnet/runtime/issues/54122 + + https://github.com/dotnet/runtime/issues/48819 diff --git a/src/tests/run.proj b/src/tests/run.proj index b866b805b0b2afae528767a065f9f6c1e010a47e..c092a4ae0e7ba32989300be1c063b2ef04d9edf2 100644 --- a/src/tests/run.proj +++ b/src/tests/run.proj @@ -599,13 +599,11 @@ namespace $([System.String]::Copy($(Category)).Replace(".","_").Replace("\",""). - @(MonoAotOption->'%(Identity)', ',')