tcg-target.inc.c 116.0 KB
Newer Older
B
bellard 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24

25 26
#include "tcg-pool.inc.c"

27
#ifdef CONFIG_DEBUG_TCG
28
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 30 31 32
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33 34 35 36 37 38
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39
#endif
B
bellard 已提交
40
};
41
#endif
B
bellard 已提交
42

43
static const int tcg_target_reg_alloc_order[] = {
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
B
bellard 已提交
61 62 63 64
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
65 66 67
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
90
#endif
B
bellard 已提交
91 92
};

93 94
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
95 96 97 98
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
99 100 101 102
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
103
#endif
104 105 106
    TCG_REG_R8,
    TCG_REG_R9,
#else
107
    /* 32 bit mode uses stack based calling convention (GCC default). */
108 109 110
#endif
};

111
static const int tcg_target_call_oarg_regs[] = {
112
    TCG_REG_EAX,
113
#if TCG_TARGET_REG_BITS == 32
114
    TCG_REG_EDX
115
#endif
116
};
B
bellard 已提交
117

118 119 120
/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
121
#define TCG_CT_CONST_I32 0x400
122
#define TCG_CT_CONST_WSZ 0x800
123

124
/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

135 136 137 138
/* The host compiler should supply <cpuid.h> to enable runtime features
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
139
#include "qemu/cpuid.h"
140 141
#endif

142
/* For 64-bit, we always know that CMOV is available.  */
143 144
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
145
#elif defined(CONFIG_CPUID_H)
146 147 148 149 150
static bool have_cmov;
#else
# define have_cmov 0
#endif

151
/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
154
bool have_popcnt;
155 156
bool have_avx1;
bool have_avx2;
157

158 159
#ifdef CONFIG_CPUID_H
static bool have_movbe;
160
static bool have_bmi2;
161 162
static bool have_lzcnt;
#else
163 164
# define have_movbe 0
# define have_bmi2 0
165 166
# define have_lzcnt 0
#endif
167

168
static tcg_insn_unit *tb_ret_addr;
169

170
/* Apply one relocation of kind TYPE at CODE_PTR.

   The value patched in is VALUE + ADDEND; for the pc-relative kinds the
   address of CODE_PTR is subtracted from that sum first.

   Returns true once the field has been patched, or false when a
   pc-relative displacement does not fit in the field (32 bits for
   R_386_PC32, 8 bits for R_386_PC8).  Any other TYPE aborts.  */
static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    intptr_t target = value + addend;
    intptr_t pcrel;

    switch (type) {
    case R_386_32:
        /* Absolute: no range check is performed.  */
        tcg_patch32(code_ptr, target);
        break;

    case R_386_PC32:
        pcrel = target - (uintptr_t)code_ptr;
        if (pcrel != (int32_t)pcrel) {
            return false;
        }
        tcg_patch32(code_ptr, pcrel);
        break;

    case R_386_PC8:
        pcrel = target - (uintptr_t)code_ptr;
        if (pcrel != (int8_t)pcrel) {
            return false;
        }
        tcg_patch8(code_ptr, pcrel);
        break;

    default:
        tcg_abort();
    }
    return true;
}

197 198 199 200 201 202 203 204
#if TCG_TARGET_REG_BITS == 64
#define ALL_GENERAL_REGS   0x0000ffffu
#define ALL_VECTOR_REGS    0xffff0000u
#else
#define ALL_GENERAL_REGS   0x000000ffu
#define ALL_VECTOR_REGS    0x00ff0000u
#endif

B
bellard 已提交
205
/* parse target specific constraints */
206 207
static const char *target_parse_constraint(TCGArgConstraint *ct,
                                           const char *ct_str, TCGType type)
B
bellard 已提交
208
{
209
    switch(*ct_str++) {
B
bellard 已提交
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
    case 'a':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
        break;
    case 'b':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
        break;
    case 'c':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
        break;
    case 'd':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
        break;
    case 'S':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
        break;
    case 'D':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
        break;
    case 'q':
235
        /* A register that can be used as a byte operand.  */
B
bellard 已提交
236
        ct->ct |= TCG_CT_REG;
237
        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
B
bellard 已提交
238
        break;
239
    case 'Q':
240
        /* A register with an addressable second byte (e.g. %ah).  */
241
        ct->ct |= TCG_CT_REG;
242
        ct->u.regs = 0xf;
243
        break;
B
bellard 已提交
244
    case 'r':
245
        /* A general register.  */
B
bellard 已提交
246
        ct->ct |= TCG_CT_REG;
247
        ct->u.regs |= ALL_GENERAL_REGS;
B
bellard 已提交
248
        break;
249 250 251 252
    case 'W':
        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
        ct->ct |= TCG_CT_CONST_WSZ;
        break;
253 254 255 256 257
    case 'x':
        /* A vector register.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs |= ALL_VECTOR_REGS;
        break;
B
bellard 已提交
258 259 260 261

        /* qemu_ld/st address constraint */
    case 'L':
        ct->ct |= TCG_CT_REG;
262
        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
263 264
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
265 266 267
        break;

    case 'e':
268
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
269 270
        break;
    case 'Z':
271
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
B
bellard 已提交
272
        break;
273
    case 'I':
274
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
275
        break;
276

B
bellard 已提交
277
    default:
278
        return NULL;
B
bellard 已提交
279
    }
280
    return ct_str;
B
bellard 已提交
281 282 283
}

/* test if a constant matches the constraint */
284
static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
B
bellard 已提交
285 286
                                         const TCGArgConstraint *arg_ct)
{
287 288
    int ct = arg_ct->ct;
    if (ct & TCG_CT_CONST) {
B
bellard 已提交
289
        return 1;
290 291 292 293 294 295 296
    }
    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
        return 1;
    }
297 298 299
    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
        return 1;
    }
300 301 302
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
303
    return 0;
B
bellard 已提交
304 305
}

306 307
# define LOWREGMASK(x)	((x) & 7)

308
#define P_EXT		0x100		/* 0x0f opcode prefix */
309 310
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
311
#if TCG_TARGET_REG_BITS == 64
312 313 314 315 316
# define P_ADDR32       0x800           /* 0x67 opcode prefix */
# define P_REXW         0x1000          /* Set REX.W = 1 */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
317 318 319 320 321
#else
# define P_ADDR32	0
# define P_REXW		0
# define P_REXB_R	0
# define P_REXB_RM	0
322
# define P_GS           0
323
#endif
324 325 326 327
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
328

329 330
#define OPC_ARITH_EvIz	(0x81)
#define OPC_ARITH_EvIb	(0x83)
331
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
332
#define OPC_ANDN        (0xf2 | P_EXT38)
333
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
334
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
335 336
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
337
#define OPC_BSWAP	(0xc8 | P_EXT)
R
Richard Henderson 已提交
338
#define OPC_CALL_Jz	(0xe8)
339
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
340 341
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32	(0x48)
342 343 344
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
#define OPC_IMUL_GvEvIb	(0x6b)
#define OPC_IMUL_GvEvIz	(0x69)
345
#define OPC_INC_r32	(0x40)
R
Richard Henderson 已提交
346 347 348 349
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
#define OPC_JMP_long	(0xe9)
#define OPC_JMP_short	(0xeb)
R
Richard Henderson 已提交
350
#define OPC_LEA         (0x8d)
351
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
352 353 354
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
355
#define OPC_MOVB_EvIz   (0xc6)
356
#define OPC_MOVL_EvIz	(0xc7)
R
Richard Henderson 已提交
357
#define OPC_MOVL_Iv     (0xb8)
358 359
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
360 361 362 363 364 365 366 367 368
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
369 370
#define OPC_MOVSBL	(0xbe | P_EXT)
#define OPC_MOVSWL	(0xbf | P_EXT)
371
#define OPC_MOVSLQ	(0x63 | P_REXW)
372 373
#define OPC_MOVZBL	(0xb6 | P_EXT)
#define OPC_MOVZWL	(0xb7 | P_EXT)
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
422
#define OPC_POP_r32	(0x58)
423
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
424 425 426
#define OPC_PUSH_r32	(0x50)
#define OPC_PUSH_Iv	(0x68)
#define OPC_PUSH_Ib	(0x6a)
R
Richard Henderson 已提交
427
#define OPC_RET		(0xc3)
428
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
429 430 431
#define OPC_SHIFT_1	(0xd1)
#define OPC_SHIFT_Ib	(0xc1)
#define OPC_SHIFT_cl	(0xd3)
432
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
433
#define OPC_SHUFPS      (0xc6 | P_EXT)
434 435
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
436
#define OPC_TESTL	(0x85)
437
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
438 439 440 441 442 443 444 445 446 447
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
R
Richard Henderson 已提交
448
#define OPC_XCHG_ax_r32	(0x90)
449

450 451
#define OPC_GRP3_Ev	(0xf7)
#define OPC_GRP5	(0xff)
452
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
453 454 455

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
B
bellard 已提交
456 457 458 459 460 461 462 463 464
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

R
Richard Henderson 已提交
465
/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
466 467
#define SHIFT_ROL 0
#define SHIFT_ROR 1
B
bellard 已提交
468 469 470 471
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

472 473 474 475 476 477 478 479 480
/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
481 482
#define EXT5_INC_Ev	0
#define EXT5_DEC_Ev	1
483 484
#define EXT5_CALLN_Ev	2
#define EXT5_JMPN_Ev	4
R
Richard Henderson 已提交
485 486

/* Condition codes to be added to OPC_JCC_{long,short}.  */
B
bellard 已提交
487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

505
static const uint8_t tcg_cond_to_jcc[] = {
B
bellard 已提交
506 507 508 509 510 511 512 513 514 515 516 517
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

518 519 520 521 522
#if TCG_TARGET_REG_BITS == 64
/* Emit the prefixes, escape bytes and final opcode byte described by OPC
   (64-bit host version).

   OPC carries the opcode in its low byte and P_* flags above it.  R, RM
   and X are the register numbers destined for the ModRM.reg, ModRM.rm
   (or SIB.base) and SIB.index fields; only their contribution to the
   REX prefix is consumed here.  */
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex = 0;

    /* Legacy prefixes.  */
    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_ADDR32) {
        tcg_out8(s, 0x67);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    /* Collect the REX payload bits.  */
    if (opc & P_REXW) {
        rex |= 0x8;                     /* REX.W */
    }
    if (r & 8) {
        rex |= 0x4;                     /* REX.R */
    }
    if (x & 8) {
        rex |= 0x2;                     /* REX.X */
    }
    if (rm & 8) {
        rex |= 0x1;                     /* REX.B */
    }

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    if (r >= 4) {
        rex |= opc & P_REXB_R;
    }
    if (rm >= 4) {
        rex |= opc & P_REXB_RM;
    }

    if (rex != 0) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    /* Opcode map escape: 0x0f, optionally followed by 0x38 or 0x3a.
       P_EXT38 takes precedence over P_EXT3A.  */
    if (opc & P_EXT38) {
        tcg_out8(s, 0x0f);
        tcg_out8(s, 0x38);
    } else if (opc & P_EXT3A) {
        tcg_out8(s, 0x0f);
        tcg_out8(s, 0x3a);
    } else if (opc & P_EXT) {
        tcg_out8(s, 0x0f);
    }

    tcg_out8(s, opc);
}
#else
/* Emit the prefixes, escape bytes and final opcode byte described by OPC
   (32-bit host version: no REX, address-size or segment prefixes).  */
static void tcg_out_opc(TCGContext *s, int opc)
{
    /* Legacy prefixes.  */
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    /* Opcode map escape: 0x0f, optionally followed by 0x38 or 0x3a.
       P_EXT38 takes precedence over P_EXT3A.  */
    if (opc & P_EXT38) {
        tcg_out8(s, 0x0f);
        tcg_out8(s, 0x38);
    } else if (opc & P_EXT3A) {
        tcg_out8(s, 0x0f);
        tcg_out8(s, 0x3a);
    } else if (opc & P_EXT) {
        tcg_out8(s, 0x0f);
    }

    tcg_out8(s, opc);
}
590 591 592 593 594
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif
B
bellard 已提交
595

596
/* Emit OPC followed by a register-direct ModRM byte (mod = 3) with R in
   the reg field and RM in the r/m field.  */
static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    const int reg_field = LOWREGMASK(r) << 3;
    const int rm_field = LOWREGMASK(rm);

    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | reg_field | rm_field);
}

602 603
/* Emit a VEX prefix followed by the opcode byte of OPC.

   R, RM and INDEX are the register numbers destined for ModRM.reg,
   ModRM.rm (or SIB.base) and SIB.index; V is the register encoded in
   VEX.vvvv.  The R/X/B bits and vvvv are written in inverted form.  */
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    const int map_mask = P_EXT | P_EXT38 | P_EXT3A | P_REXW;
    int last;                   /* final byte of the VEX prefix */

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & map_mask) == P_EXT && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        last = (r & 8) ? 0 : 0x80;              /* VEX.R */
    } else {
        int second;

        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            second = 3;
        } else if (opc & P_EXT38) {
            second = 2;
        } else if (opc & P_EXT) {
            second = 1;
        } else {
            g_assert_not_reached();
        }
        if (!(r & 8)) {
            second |= 0x80;                     /* VEX.R */
        }
        if (!(index & 8)) {
            second |= 0x40;                     /* VEX.X */
        }
        if (!(rm & 8)) {
            second |= 0x20;                     /* VEX.B */
        }
        tcg_out8(s, second);

        last = (opc & P_REXW) ? 0x80 : 0;       /* VEX.W */
    }

    if (opc & P_VEXL) {
        last |= 0x04;                           /* VEX.L */
    }

    /* VEX.pp */
    if (opc & P_DATA16) {
        last |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        last |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        last |= 3;                              /* 0xf2 */
    }

    last |= (~v & 15) << 3;                     /* VEX.vvvv */

    tcg_out8(s, last);
    tcg_out8(s, opc);
}

/* Emit VEX-encoded OPC followed by a register-direct ModRM byte (mod = 3)
   with R in the reg field and RM in the r/m field; V goes in VEX.vvvv.  */
static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    const int reg_field = LOWREGMASK(r) << 3;
    const int rm_field = LOWREGMASK(rm);

    tcg_out_vex_opc(s, opc, r, v, rm, 0);
    tcg_out8(s, 0xc0 | reg_field | rm_field);
}

R
Richard Henderson 已提交
657
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
658 659 660
   We handle either RM and INDEX missing with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */
R
Richard Henderson 已提交
661

662 663
static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
B
bellard 已提交
664
{
R
Richard Henderson 已提交
665 666
    int mod, len;

667 668 669 670
    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
671 672
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
673 674 675 676 677
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }
R
Richard Henderson 已提交
678

679 680 681 682 683 684 685 686 687 688 689
            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
690
            g_assert_not_reached();
691 692 693 694 695 696 697
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }
R
Richard Henderson 已提交
698 699 700

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
701
    if (rm < 0) {
R
Richard Henderson 已提交
702
        mod = 0, len = 4, rm = 5;
703
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
R
Richard Henderson 已提交
704 705 706
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
B
bellard 已提交
707
    } else {
R
Richard Henderson 已提交
708 709 710 711 712
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
713
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
R
Richard Henderson 已提交
714
        /* Single byte MODRM format.  */
715
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
R
Richard Henderson 已提交
716 717 718 719
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
720 721 722
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
R
Richard Henderson 已提交
723
            index = 4;
B
bellard 已提交
724
        } else {
725
            tcg_debug_assert(index != TCG_REG_ESP);
B
bellard 已提交
726
        }
R
Richard Henderson 已提交
727

728 729
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
R
Richard Henderson 已提交
730 731 732 733 734
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
B
bellard 已提交
735 736 737 738
        tcg_out32(s, offset);
    }
}

739 740 741 742 743 744 745 746 747 748 749 750 751 752 753
/* Emit OPC followed by the ModRM/SIB/displacement bytes addressing
   "rm + (index << shift) + offset".  A negative RM or INDEX means that
   component is absent; the prefix logic is handed 0 in its place, while
   the addressing bytes see the original (negative) value.  */
static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, (rm >= 0 ? rm : 0), (index >= 0 ? index : 0));
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* VEX-encoded counterpart of tcg_out_modrm_sib_offset: emit the VEX
   prefix and opcode for OPC (with V in VEX.vvvv), then the
   ModRM/SIB/displacement bytes addressing "rm + (index << shift) + offset".
   A negative RM or INDEX means that component is absent; the prefix
   logic is handed 0 in its place.  */
static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    const int prefix_rm = (rm >= 0 ? rm : 0);
    const int prefix_index = (index >= 0 ? index : 0);

    tcg_out_vex_opc(s, opc, r, v, prefix_rm, prefix_index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

754 755
/* Emit OPC with a memory operand of the form "rm + offset": the same as
   tcg_out_modrm_sib_offset with no index register and no shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    const int no_index = -1;
    const int no_shift = 0;

    tcg_out_modrm_sib_offset(s, opc, r, rm, no_index, no_shift, offset);
}

761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784
/* Emit VEX-encoded OPC with a memory operand of the form "rm + offset":
   the same as tcg_out_vex_modrm_sib_offset with no index register and
   no shift.  */
static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    const int no_index = -1;
    const int no_shift = 0;

    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, no_index, no_shift, offset);
}

/* Output an opcode with an expected reference to the constant pool.
   A ModRM byte (mod = 0, r/m = 5) and a zero 32-bit placeholder are
   emitted after OPC; the placeholder is presumably filled in later by
   a relocation against the pool entry.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    const int modrm = (LOWREGMASK(r) << 3) | 5;

    tcg_out_opc(s, opc, r, 0, 0);
    tcg_out8(s, modrm);
    tcg_out32(s, 0);
}

/* Output a VEX-encoded opcode with an expected reference to the constant
   pool.  A ModRM byte (mod = 0, r/m = 5) and a zero 32-bit placeholder
   are emitted after OPC; the placeholder is presumably filled in later
   by a relocation against the pool entry.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    const int modrm = (LOWREGMASK(r) << 3) | 5;

    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    tcg_out8(s, modrm);
    tcg_out32(s, 0);
}

785 786 787
/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.
   The low three bits of SUBOP select the operation; any higher bits are
   opcode prefix flags (such as P_REXW) and are passed through.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    const int prefix_flags = subop & ~0x7;
    const int arith_op = subop & 0x7;
    const int opc = OPC_ARITH_GvEv + (arith_op << 3) + prefix_flags;

    tcg_out_modrm(s, opc, dest, src);
}

795
/* Emit a register-to-register move of ARG into RET for a value of TYPE.

   Nothing is emitted when both registers are the same.  Register numbers
   below 16 are general registers and 16 and above are vector registers.
   Integer types may move between either class; vector types require
   both registers to be vector registers.  */
static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    if (arg == ret) {
        return;
    }

    switch (type) {
    case TCG_TYPE_I32:
    case TCG_TYPE_I64: {
        const int rexw = (type == TCG_TYPE_I64 ? P_REXW : 0);
        const bool ret_is_vec = (ret >= 16);
        const bool arg_is_vec = (arg >= 16);

        if (!ret_is_vec && !arg_is_vec) {
            /* general <- general */
            tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
        } else if (!ret_is_vec) {
            /* general <- vector */
            tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
        } else if (!arg_is_vec) {
            /* vector <- general */
            tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
        } else {
            /* vector <- vector */
            tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        }
        break;
    }

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;

    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;

    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
}

/* Emit code that replicates the low element of vector register A, of
   element size VECE (MO_8 .. MO_64), across vector register R.

   With AVX2 a single VPBROADCAST{B,W,D,Q} is used, widened with VEX.L
   when TYPE is TCG_TYPE_V256.  Otherwise the element is doubled in
   width with unpack instructions until it is 32 bits wide and then
   spread with PSHUFD; 64-bit elements use PUNPCKLQDQ.  */
static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        static const int dup_insn[4] = {
            OPC_VPBROADCASTB, OPC_VPBROADCASTW,
            OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
        };
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

        /* Guard the table lookup; the non-AVX2 path below likewise
           rejects an unknown element size.  */
        tcg_debug_assert(vece < sizeof(dup_insn) / sizeof(dup_insn[0]));
        tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
}

/*
 * Load the constant ARG, replicated across the vector, into vector
 * register RET.
 *
 * All-zeros and all-ones are produced without a memory operand (PXOR and
 * PCMPEQB of the register with itself).  Any other value is placed in the
 * constant pool and loaded from there; the pool reference is registered
 * against the last 4 bytes of the instruction just emitted.
 */
static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 64) {
        /* 64-bit host: pc-relative reference to the pool entry.  */
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else if (have_avx2) {
        /* 32-bit host: absolute reference to the pool entry.  */
        tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        /* 32-bit host without AVX2: load 32 bits, then broadcast them.  */
        tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
        tcg_out_dup_vec(s, type, MO_32, ret, ret);
    }
}

908
static void tcg_out_movi(TCGContext *s, TCGType type,
909
                         TCGReg ret, tcg_target_long arg)
B
bellard 已提交
910
{
911 912
    tcg_target_long diff;

913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            break;
        }
        /* fallthru */
    case TCG_TYPE_V64:
    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16);
        tcg_out_dupi_vec(s, type, ret, arg);
        return;
    default:
        g_assert_not_reached();
    }

B
bellard 已提交
932
    if (arg == 0) {
933
        tgen_arithr(s, ARITH_XOR, ret, ret);
934
        return;
935 936
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
937 938
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
939 940 941
        return;
    }
    if (arg == (int32_t)arg) {
942 943
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
944
        return;
B
bellard 已提交
945
    }
946 947

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
948
    diff = arg - ((uintptr_t)s->code_ptr + 7);
949 950 951 952 953 954 955 956 957
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
B
bellard 已提交
958 959
}

960 961 962
/*
 * Push the immediate VAL, using the 8-bit immediate form when VAL fits in
 * a signed byte and the 32-bit immediate form when it fits in a signed
 * 32-bit quantity.  Any other value aborts.
 */
static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        tcg_abort();
    }
}

973 974 975 976 977 978 979 980 981 982 983 984
/*
 * Emit a memory barrier for the ordering requirements in A0.
 * Only a store-load requirement (TCG_MO_ST_LD) produces any code.
 */
static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* x86 ordering is strong enough that everything except store-load
       ordering comes for free.  */
    if (!(a0 & TCG_MO_ST_LD)) {
        return;
    }

    /* Experimentally, "lock orl $0,0(%esp)" is faster than "mfence",
       so the sse insn is not used.  */
    tcg_out8(s, 0xf0);                      /* lock prefix */
    tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
    tcg_out8(s, 0);                         /* immediate operand */
}

985 986
/* Emit a push of general register REG.  */
static inline void tcg_out_push(TCGContext *s, int reg)
{
    /* The low register bits are folded into the opcode byte.  */
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

/* Emit a pop into general register REG.  */
static inline void tcg_out_pop(TCGContext *s, int reg)
{
    /* The low register bits are folded into the opcode byte.  */
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

995 996
/*
 * Load a value of TYPE from memory at ARG1 + ARG2 into register RET.
 *
 * Integer types may target either a general register (RET < 16) or a
 * vector register; vector types require a vector register (asserted).
 * The 128- and 256-bit loads use the MOVDQU opcode.
 */
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* A 64-bit integer in a vector register loads like V64.  */
        /* FALLTHRU */
    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

1030 1031
/*
 * Store register ARG, holding a value of TYPE, to memory at ARG1 + ARG2.
 *
 * Integer types may come from either a general register (ARG < 16) or a
 * vector register; vector types require a vector register (asserted).
 * The 128- and 256-bit stores use the MOVDQU opcode.
 */
static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* A 64-bit integer in a vector register stores like V64.  */
        /* FALLTHRU */
    case TCG_TYPE_V64:
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

1065 1066
/*
 * Try to store the constant VAL of TYPE directly to memory at BASE + OFS.
 *
 * Returns true when the store was emitted.  Returns false, emitting
 * nothing, when TYPE is neither TCG_TYPE_I32 nor (on a 64-bit host)
 * TCG_TYPE_I64, or when a 64-bit VAL does not fit in a sign-extended
 * 32-bit immediate.
 */
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

1082 1083
/*
 * Emit a shift or rotate of register REG by the constant COUNT.
 *
 * SUBOPC holds the shift kind in its low three bits; any higher bits are
 * opcode prefix flags that are passed through to the emitted opcode.
 * A count of one uses the shift-by-one form, which has no immediate byte.
 */
static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

1096 1097
/* Emit a 32-bit byte swap of register REG.  */
static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    /* The low register bits are folded into the opcode byte.  */
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

/* Emit a rotate-left by 8 of REG with the 16-bit operand size prefix.  */
static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

1106 1107 1108
/*
 * Zero-extend the low byte of SRC into DEST (movzbl).
 * On a 32-bit host SRC must be one of registers 0..3 (asserted).
 */
static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

1113
/*
 * Sign-extend the low byte of SRC into DEST (movsbl).  REXW is added to
 * the opcode flags; callers pass P_REXW for a 64-bit result or 0.
 * On a 32-bit host SRC must be one of registers 0..3 (asserted).
 */
static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

1120 1121 1122 1123 1124 1125
/* Zero-extend the low 16 bits of SRC into DEST.  */
static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    const int movzwl = OPC_MOVZWL;

    tcg_out_modrm(s, movzwl, dest, src);
}

1126
/*
 * Sign-extend the low 16 bits of SRC into DEST.  REXW is added to the
 * opcode flags; callers pass P_REXW for a 64-bit result or 0.
 */
static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

1132
/* Zero-extend the low 32 bits of SRC into DEST.  */
static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

/* Sign-extend the low 32 bits of SRC into DEST (MOVSLQ).  */
static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    const int movslq = OPC_MOVSLQ;

    tcg_out_modrm(s, movslq, dest, src);
}

/* Emit a 64-bit (REX.W) byte swap of register REG.  */
static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    /* The low register bits are folded into the opcode byte.  */
    const int opc = OPC_BSWAP + P_REXW + LOWREGMASK(reg);

    tcg_out_opc(s, opc, 0, reg, 0);
}

/*
 * Emit the arithmetic operation C on register R0 with the immediate VAL.
 *
 * C is an ARITH_* code; on a 64-bit host it may also carry opcode flag
 * bits (such as P_REXW) above the low three bits.
 * CF nonzero disables the INC/DEC shortcut for +/-1.
 *
 * Special cases: ADD/SUB of +/-1 become INC/DEC; AND with 0xff, 0xffff or
 * (64-bit host) 0xffffffff becomes a zero-extension; otherwise the 8-bit
 * immediate form is preferred over the 32-bit one.  A 64-bit operation
 * whose immediate does not fit in a sign-extended 32 bits aborts.
 */
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        /* Split the opcode flags from the 3-bit arithmetic code.  */
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        /* ADD +1 and SUB -1 increment; ADD -1 and SUB +1 decrement.  */
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    tcg_abort();
}

A
aurel32 已提交
1209
/*
 * Add the constant VAL to register REG, with P_REXW set on the opcode.
 * Nothing is emitted when VAL is zero.
 */
static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

1216
/* Use SMALL != 0 to force a short forward branch.  */
1217
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
B
bellard 已提交
1218 1219
{
    int32_t val, val1;
1220

B
bellard 已提交
1221
    if (l->has_value) {
1222
        val = tcg_pcrel_diff(s, l->u.value_ptr);
B
bellard 已提交
1223 1224
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
1225
            if (opc == -1) {
R
Richard Henderson 已提交
1226
                tcg_out8(s, OPC_JMP_short);
1227
            } else {
R
Richard Henderson 已提交
1228
                tcg_out8(s, OPC_JCC_short + opc);
1229
            }
B
bellard 已提交
1230 1231
            tcg_out8(s, val1);
        } else {
1232 1233 1234
            if (small) {
                tcg_abort();
            }
B
bellard 已提交
1235
            if (opc == -1) {
R
Richard Henderson 已提交
1236
                tcg_out8(s, OPC_JMP_long);
B
bellard 已提交
1237 1238
                tcg_out32(s, val - 5);
            } else {
1239
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
B
bellard 已提交
1240 1241 1242
                tcg_out32(s, val - 6);
            }
        }
1243 1244
    } else if (small) {
        if (opc == -1) {
R
Richard Henderson 已提交
1245
            tcg_out8(s, OPC_JMP_short);
1246
        } else {
R
Richard Henderson 已提交
1247
            tcg_out8(s, OPC_JCC_short + opc);
1248
        }
1249
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1250
        s->code_ptr += 1;
B
bellard 已提交
1251 1252
    } else {
        if (opc == -1) {
R
Richard Henderson 已提交
1253
            tcg_out8(s, OPC_JMP_long);
B
bellard 已提交
1254
        } else {
1255
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
B
bellard 已提交
1256
        }
1257
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
P
pbrook 已提交
1258
        s->code_ptr += 4;
B
bellard 已提交
1259 1260 1261
    }
}

1262
/*
 * Emit a comparison of register ARG1 with ARG2.
 *
 * CONST_ARG2 nonzero means ARG2 is an immediate, otherwise it is a
 * register.  An immediate of zero is emitted as "test r, r".
 * REXW is added to the opcode; callers pass P_REXW for a 64-bit
 * comparison or 0 for 32-bit.
 */
static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

1277 1278
static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
1279
                             TCGLabel *label, int small)
1280
{
1281
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1282
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
B
bellard 已提交
1283 1284
}

1285 1286 1287
#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
1288
                             TCGLabel *label, int small)
1289 1290
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1291
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1292 1293
}
#else
B
bellard 已提交
1294 1295
/* XXX: we implement it at the target level to avoid having to
   handle cross basic blocks temporaries */
1296 1297
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
B
bellard 已提交
1298
{
1299 1300
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
1301

B
bellard 已提交
1302 1303
    switch(args[4]) {
    case TCG_COND_EQ:
1304 1305 1306
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1307
                         label_this, small);
B
bellard 已提交
1308 1309
        break;
    case TCG_COND_NE:
1310
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1311
                         label_this, small);
1312
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1313
                         label_this, small);
B
bellard 已提交
1314 1315
        break;
    case TCG_COND_LT:
1316
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1317
                         label_this, small);
1318
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1319
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1320
                         label_this, small);
B
bellard 已提交
1321 1322
        break;
    case TCG_COND_LE:
1323
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1324
                         label_this, small);
1325
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1326
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1327
                         label_this, small);
B
bellard 已提交
1328 1329
        break;
    case TCG_COND_GT:
1330
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1331
                         label_this, small);
1332
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1333
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1334
                         label_this, small);
B
bellard 已提交
1335 1336
        break;
    case TCG_COND_GE:
1337
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1338
                         label_this, small);
1339
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1340
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1341
                         label_this, small);
B
bellard 已提交
1342 1343
        break;
    case TCG_COND_LTU:
1344
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1345
                         label_this, small);
1346
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1347
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1348
                         label_this, small);
B
bellard 已提交
1349 1350
        break;
    case TCG_COND_LEU:
1351
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1352
                         label_this, small);
1353
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1354
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1355
                         label_this, small);
B
bellard 已提交
1356 1357
        break;
    case TCG_COND_GTU:
1358
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1359
                         label_this, small);
1360
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1361
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1362
                         label_this, small);
B
bellard 已提交
1363 1364
        break;
    case TCG_COND_GEU:
1365
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1366
                         label_this, small);
1367
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1368
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1369
                         label_this, small);
B
bellard 已提交
1370 1371 1372 1373
        break;
    default:
        tcg_abort();
    }
1374
    tcg_out_label(s, label_next, s->code_ptr);
B
bellard 已提交
1375
}
1376
#endif
B
bellard 已提交
1377

1378 1379
/*
 * Set DEST to 1 if the 32-bit comparison of ARG1 with ARG2 satisfies
 * COND, else to 0: compare, SETcc into DEST, then zero-extend the byte.
 */
static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

1386 1387 1388 1389 1390 1391 1392 1393 1394
#if TCG_TARGET_REG_BITS == 64
/*
 * 64-bit variant of setcond: compare with REX.W, SETcc into DEST, then
 * zero-extend the resulting byte.
 */
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    const int setcc = OPC_SETCC | tcg_cond_to_jcc[cond];

    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, setcc, 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
1395 1396 1397 1398
/*
 * Set args[0] to 1 or 0 according to a double-word comparison.
 *
 * args[1..5] are forwarded to tcg_out_brcond2() as its args[0..4]
 * (operands and condition); the label slot is filled in here.
 * CONST_ARGS is forwarded the same way, shifted by one.
 */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true, s->code_ptr);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over, s->code_ptr);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over, s->code_ptr);
    }
}
1436 1437
#endif

1438 1439
/*
 * Move V1 into DEST if COND holds for the flags already set by a
 * preceding instruction.  REXW is OR-ed into the CMOVcc opcode.
 *
 * Without CMOV support, a short jump over a plain move is emitted
 * instead.
 * NOTE(review): that fallback always moves with TCG_TYPE_I32 and ignores
 * REXW; confirm that hosts lacking CMOV never request a 64-bit move.
 */
static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        /* Skip the move when the inverted condition holds.  */
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over, s->code_ptr);
    }
}

1451 1452 1453 1454 1455 1456 1457 1458
/*
 * 32-bit conditional move: compare C1 with C2 (an immediate when
 * CONST_C2 is nonzero), then move V1 into DEST if COND holds.
 */
static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    /* No REX.W: both the compare and the move are 32-bit.  */
    const int rexw = 0;

    tcg_out_cmp(s, c1, c2, const_c2, rexw);
    tcg_out_cmov(s, cond, rexw, dest, v1);
}

1459
#if TCG_TARGET_REG_BITS == 64
1460 1461 1462
/*
 * 64-bit conditional move: compare C1 with C2 (an immediate when
 * CONST_C2 is nonzero) using REX.W, then move V1 into DEST if COND holds.
 */
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

1469 1470 1471
/*
 * Emit a count-trailing-zeros of ARG1 into DEST, with ARG2 supplying the
 * result for a zero input.
 *
 * REXW is P_REXW for a 64-bit operation or 0 for 32-bit.
 * CONST_A2 set means ARG2 is a constant, which must equal the operand
 * width (asserted) and is only accepted on the TZCNT path; otherwise
 * ARG2 is a register, distinct from DEST, that is conditionally moved
 * into DEST.
 */
static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            /* No fix-up is emitted for a constant equal to the width.  */
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

/*
 * Emit a count-leading-zeros of ARG1 into DEST, with ARG2 supplying the
 * result for a zero input.
 *
 * REXW is P_REXW for a 64-bit operation or 0 for 32-bit.
 * On the LZCNT path a constant ARG2 must equal the operand width
 * (asserted).  On the BSR path ARG2 must be a register, and DEST must
 * differ from both ARG1 and ARG2 because ARG1 is re-tested after DEST
 * has been written.
 */
static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        /* Convert the bit index into a count: index ^ (width - 1).  */
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

1513
/*
 * Emit a direct call (CALL nonzero) or jump (CALL zero) to DEST.
 *
 * The 5-byte pc-relative form is used when the displacement fits in 32
 * bits; otherwise an indirect call/jump through a constant pool entry
 * holding DEST is emitted.
 */
static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        /* Placeholder displacement, to be filled in for the pool entry.  */
        tcg_out32(s, 0);
    }
}

1532
/* Emit a call to DEST.  */
static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
{
    tcg_out_branch(s, 1, dest);
}
1536

1537
/* Emit an unconditional jump to DEST.  */
static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556
/*
 * Emit a single N-byte nop: N - 1 operand size prefixes (0x66) in front
 * of the standard one byte nop (0x90).  N must be at least 1.
 */
static void tcg_out_nopn(TCGContext *s, int n)
{
    int remaining = n;

    tcg_debug_assert(n >= 1);

    /* "xchg %eax,%eax" with operand size prefixes forms "xchg %ax,%ax".
     * All cores accept the duplicate prefix, and all of the interesting
     * recent cores can decode and discard the duplicates in a single
     * cycle.
     */
    while (remaining > 1) {
        tcg_out8(s, 0x66);
        --remaining;
    }
    tcg_out8(s, 0x90);
}

B
bellard 已提交
1557
#if defined(CONFIG_SOFTMMU)
1558 1559
#include "tcg-ldst.inc.c"

1560 1561 1562
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
1563
static void * const qemu_ld_helpers[16] = {
1564 1565 1566 1567 1568 1569 1570
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
1571 1572
};

1573 1574 1575
/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
1576
static void * const qemu_st_helpers[16] = {
1577 1578 1579 1580 1581 1582 1583
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
1584
};
1585 1586 1587 1588

/* Perform the TLB load and compare.

   Inputs:
1589
   ADDRLO and ADDRHI contain the low and high part of the address.
1590 1591 1592 1593 1594 1595 1596 1597 1598 1599

   MEM_INDEX and S_BITS are the memory context and log2 size of the load.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   Outputs:
   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   positions of the displacements of forward jumps to the TLB miss case.

1600
   Second argument register is loaded with the low part of the address.
1601 1602 1603
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.
1604

1605
   First argument register is clobbered.  */
1606

1607
static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1608
                                    int mem_index, TCGMemOp opc,
1609
                                    tcg_insn_unit **label_ptr, int which)
1610
{
1611 1612
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
1613
    TCGType ttype = TCG_TYPE_I32;
1614 1615
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
1616 1617 1618 1619
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
1620
    target_ulong tlb_mask;
1621

1622 1623 1624 1625 1626 1627 1628
    if (TCG_TARGET_REG_BITS == 64) {
        if (TARGET_LONG_BITS == 64) {
            ttype = TCG_TYPE_I64;
            trexw = P_REXW;
        }
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
1629 1630 1631 1632
            if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
1633
        }
1634
    }
1635

1636
    tcg_out_mov(s, tlbtype, r0, addrlo);
1637 1638 1639 1640
    /* If the required alignment is at least as large as the access, simply
       copy the address and mask.  For lesser alignments, check that we don't
       cross pages for the complete access.  */
    if (a_bits >= s_bits) {
1641 1642
        tcg_out_mov(s, ttype, r1, addrlo);
    } else {
1643
        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1644
    }
1645
    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1646

1647
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1648
                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1649

1650
    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1651
    tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1652
                (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
1653

1654
    tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1655
                             offsetof(CPUArchState, tlb_table[mem_index][0])
1656 1657
                             + which);

1658
    /* cmp 0(r0), r1 */
1659
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1660

1661 1662 1663 1664 1665 1666 1667
    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  There are two cases worth note:
       For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
       before the fastpath ADDQ below.  For 64-bit guest and x32 host, MOVQ
       copies the entire guest address for the slow path, while truncation
       for the 32-bit host happens with the fastpath ADDL below.  */
    tcg_out_mov(s, ttype, r1, addrlo);
1668

1669 1670
    /* jne slow_path */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1671
    label_ptr[0] = s->code_ptr;
1672
    s->code_ptr += 4;
1673

1674
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1675
        /* cmp 4(r0), addrhi */
1676
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1677

1678 1679
        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1680
        label_ptr[1] = s->code_ptr;
1681
        s->code_ptr += 4;
1682 1683 1684 1685
    }

    /* TLB Hit.  */

1686
    /* add addend(r0), r1 */
1687
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1688 1689
                         offsetof(CPUTLBEntry, addend) - which);
}
1690 1691 1692 1693 1694

/*
 * Record the context of a call to the out of line helper code for the slow path
 * for a load or store, so that we can later generate the correct helper code
 */
1695 1696
static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
                                TCGMemOpIdx oi,
1697 1698
                                TCGReg datalo, TCGReg datahi,
                                TCGReg addrlo, TCGReg addrhi,
1699
                                tcg_insn_unit *raddr,
1700
                                tcg_insn_unit **label_ptr)
1701 1702 1703 1704
{
    TCGLabelQemuLdst *label = new_ldst_label(s);

    label->is_ld = is_ld;
1705
    label->oi = oi;
1706
    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722
    label->datalo_reg = datalo;
    label->datahi_reg = datahi;
    label->addrlo_reg = addrlo;
    label->addrhi_reg = addrhi;
    label->raddr = raddr;
    label->label_ptr[0] = label_ptr[0];
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        label->label_ptr[1] = label_ptr[1];
    }
}

/*
 * Generate code for the slow path for a load at the end of block.
 *
 * The forward branches recorded in @l are patched to land here, the helper
 * arguments (env, guest address, oi, return address) are marshalled, the
 * load helper selected by the memop is called, and its result is moved or
 * sign-extended from EAX (and EDX for a 64-bit value on a 32-bit host) into
 * the destination register(s) before jumping back to l->raddr.
 */
static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    TCGMemOpIdx oi = l->oi;
    TCGMemOp opc = get_memop(oi);
    TCGReg data_reg;
    tcg_insn_unit **label_ptr = &l->label_ptr[0];
    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    if (TCG_TARGET_REG_BITS == 32) {
        /* 32-bit host: all helper arguments are stored at successive
           offsets from ESP.  */
        int ofs = 0;

        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
        ofs += 4;

        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
        ofs += 4;

        if (TARGET_LONG_BITS == 64) {
            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
            ofs += 4;
        }

        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
        ofs += 4;

        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
        /* The second argument is already loaded with addrlo.  */
        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
                     (uintptr_t)l->raddr);
    }

    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);

    data_reg = l->datalo_reg;
    switch (opc & MO_SSIZE) {
    case MO_SB:
        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
        break;
    case MO_SW:
        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
        break;
#endif
    case MO_UB:
    case MO_UW:
        /* Note that the helpers have zero-extended to tcg_target_long.  */
        /* FALLTHRU */
    case MO_UL:
        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
        break;
    case MO_Q:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
        } else if (data_reg == TCG_REG_EDX) {
            /* The low destination is EDX, which currently holds the high
               half of the result: swap the halves first, then copy the
               high half (now in EAX) to its destination.  */
            /* xchg %edx, %eax */
            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
        } else {
            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
        }
        break;
    default:
        tcg_abort();
    }

    /* Jump to the code corresponding to next IR of qemu_ld */
    tcg_out_jmp(s, l->raddr);
}

/*
 * Generate code for the slow path for a store at the end of block.
 *
 * The forward branches recorded in @l are patched to land here, the helper
 * arguments (env, guest address, data, oi, return address) are marshalled,
 * and control is transferred to the store helper selected by the memop.
 * The helper is entered with a jump after pushing l->raddr, so its return
 * lands directly back in the inline code.
 */
static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    TCGMemOpIdx oi = l->oi;
    TCGMemOp opc = get_memop(oi);
    TCGMemOp s_bits = opc & MO_SIZE;
    tcg_insn_unit **label_ptr = &l->label_ptr[0];
    TCGReg retaddr;

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    if (TCG_TARGET_REG_BITS == 32) {
        /* 32-bit host: all helper arguments are stored at successive
           offsets from ESP.  */
        int ofs = 0;

        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
        ofs += 4;

        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
        ofs += 4;

        if (TARGET_LONG_BITS == 64) {
            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
            ofs += 4;
        }

        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
        ofs += 4;

        if (s_bits == MO_64) {
            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
            ofs += 4;
        }

        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
        ofs += 4;

        retaddr = TCG_REG_EAX;
        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
        /* The second argument is already loaded with addrlo.  */
        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                    tcg_target_call_iarg_regs[2], l->datalo_reg);
        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);

        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
            /* A fifth argument register exists: pass the return address
               in it.  */
            retaddr = tcg_target_call_iarg_regs[4];
            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
        } else {
            /* Only four argument registers: the return address argument
               goes to the stack, staged through RAX.  */
            retaddr = TCG_REG_RAX;
            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
                       TCG_TARGET_CALL_STACK_OFFSET);
        }
    }

    /* "Tail call" to the helper, with the return address back inline.  */
    tcg_out_push(s, retaddr);
    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
}
1869 1870 1871 1872 1873 1874 1875 1876 1877
#elif defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>

int arch_prctl(int code, unsigned long addr);

static int guest_base_flags;
static inline void setup_guest_base_seg(void)
{
1878
    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1879 1880 1881 1882 1883 1884 1885
        guest_base_flags = P_GS;
    }
}
#else
/* Fallback for the remaining configurations: no prefix flags are ever
   applied (guest_base_flags is the constant 0) and setup is a no-op.  */
# define guest_base_flags 0
static inline void setup_guest_base_seg(void) { }
#endif /* SOFTMMU */
B
bellard 已提交
1886

1887
/*
 * Emit the host instruction(s) that load a guest value from
 * [base + index + ofs] into datalo (and datahi for a 64-bit value on a
 * 32-bit host).
 *
 * @s:      code generation context
 * @datalo: destination register for the (low half of the) value
 * @datahi: destination register for the high half, when split
 * @base:   base address register
 * @index:  index register, or -1 for none
 * @ofs:    constant displacement
 * @seg:    prefix flags added to every memory-operand opcode
 * @is64:   true when the destination is a 64-bit value; selects REX.W for
 *          the sign-extending forms
 * @memop:  size, signedness and byte-swap of the access
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg base, int index, intptr_t ofs,
                                   int seg, bool is64, TCGMemOp memop)
{
    const TCGMemOp real_bswap = memop & MO_BSWAP;
    TCGMemOp bswap = real_bswap;
    int rexw = is64 * P_REXW;
    int movop = OPC_MOVL_GvEv;

    /* With MOVBE the swap is folded into the load itself, so no separate
       bswap instruction is needed for the 32/64-bit cases.  */
    if (have_movbe && real_bswap) {
        bswap = 0;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_UW:
        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
                                 base, index, 0, ofs);
        if (real_bswap) {
            tcg_out_rolw_8(s, datalo);
        }
        break;
    case MO_SW:
        if (real_bswap) {
            /* Load and swap first, then sign-extend register to register.  */
            if (have_movbe) {
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                         datalo, base, index, 0, ofs);
            } else {
                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
                                         base, index, 0, ofs);
                tcg_out_rolw_8(s, datalo);
            }
            tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
                                     datalo, base, index, 0, ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
        if (bswap) {
            tcg_out_bswap32(s, datalo);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (real_bswap) {
            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                     base, index, 0, ofs);
            if (bswap) {
                tcg_out_bswap32(s, datalo);
            }
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
                                     base, index, 0, ofs);
        }
        break;
#endif
    case MO_Q:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
                                     base, index, 0, ofs);
            if (bswap) {
                tcg_out_bswap64(s, datalo);
            }
        } else {
            /* 32-bit host: two 32-bit loads.  For a byte-swapped access
               the two halves also trade places.  */
            if (real_bswap) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            /* Order the loads so that base is not overwritten before the
               second load has used it.  */
            if (base != datalo) {
                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                         base, index, 0, ofs);
                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                         base, index, 0, ofs + 4);
            } else {
                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                         base, index, 0, ofs + 4);
                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                         base, index, 0, ofs);
            }
            if (bswap) {
                tcg_out_bswap32(s, datalo);
                tcg_out_bswap32(s, datahi);
            }
        }
        break;
    default:
        tcg_abort();
    }
}
1988

B
bellard 已提交
1989 1990 1991
/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
   EAX. It will be useful once fixed registers globals are less
   common. */
1992
static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
B
bellard 已提交
1993
{
1994
    TCGReg datalo, datahi, addrlo;
1995
    TCGReg addrhi __attribute__((unused));
1996
    TCGMemOpIdx oi;
1997
    TCGMemOp opc;
B
bellard 已提交
1998
#if defined(CONFIG_SOFTMMU)
1999
    int mem_index;
2000
    tcg_insn_unit *label_ptr[2];
B
bellard 已提交
2001 2002
#endif

2003
    datalo = *args++;
2004
    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2005
    addrlo = *args++;
2006
    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2007 2008
    oi = *args++;
    opc = get_memop(oi);
B
bellard 已提交
2009 2010

#if defined(CONFIG_SOFTMMU)
2011
    mem_index = get_mmuidx(oi);
2012

2013
    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2014
                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2015 2016

    /* TLB Hit.  */
2017
    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
B
bellard 已提交
2018

2019
    /* Record the current context of a load into ldst label */
2020
    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2021
                        s->code_ptr, label_ptr);
B
bellard 已提交
2022
#else
2023
    {
2024
        int32_t offset = guest_base;
2025
        TCGReg base = addrlo;
2026
        int index = -1;
2027 2028
        int seg = 0;

2029 2030 2031 2032
        /* For a 32-bit guest, the high 32 bits may contain garbage.
           We can do this with the ADDR32 prefix if we're not using
           a guest base, or when using segmentation.  Otherwise we
           need to zero-extend manually.  */
2033
        if (guest_base == 0 || guest_base_flags) {
2034 2035
            seg = guest_base_flags;
            offset = 0;
2036 2037 2038 2039 2040 2041 2042 2043
            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
                seg |= P_ADDR32;
            }
        } else if (TCG_TARGET_REG_BITS == 64) {
            if (TARGET_LONG_BITS == 32) {
                tcg_out_ext32u(s, TCG_REG_L0, base);
                base = TCG_REG_L0;
            }
2044 2045
            if (offset != guest_base) {
                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2046 2047 2048
                index = TCG_REG_L1;
                offset = 0;
            }
2049 2050
        }

2051
        tcg_out_qemu_ld_direct(s, datalo, datahi,
2052
                               base, index, offset, seg, is64, opc);
2053
    }
B
bellard 已提交
2054
#endif
2055
}
B
bellard 已提交
2056

2057 2058 2059
/*
 * Emit the host instruction(s) that store a guest value from datalo (and
 * datahi for a 64-bit value on a 32-bit host) to [base + ofs].
 *
 * @s:      code generation context
 * @datalo: register holding the (low half of the) value
 * @datahi: register holding the high half, when split
 * @base:   base address register
 * @ofs:    constant displacement
 * @seg:    prefix flags added to every memory-operand opcode
 * @memop:  size and byte-swap of the access
 *
 * The source registers are never modified: when a swap (or a byte-capable
 * register) is needed the value is first copied to TCG_REG_L0.
 */
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg base, intptr_t ofs, int seg,
                                   TCGMemOp memop)
{
    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
       we could perform the bswap twice to restore the original value
       instead of moving to the scratch.  But as it is, the L constraint
       means that TCG_REG_L0 is definitely free here.  */
    const TCGReg scratch = TCG_REG_L0;
    const TCGMemOp real_bswap = memop & MO_BSWAP;
    TCGMemOp bswap = real_bswap;
    int movop = OPC_MOVL_EvGv;

    /* With MOVBE the swap is folded into the store itself, so neither a
       separate bswap nor the scratch copy is needed.  */
    if (have_movbe && real_bswap) {
        bswap = 0;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
           Use the scratch register if necessary.  */
        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            datalo = scratch;
        }
        tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
                             datalo, base, ofs);
        break;
    case MO_16:
        if (bswap) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_rolw_8(s, scratch);
            datalo = scratch;
        }
        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
        break;
    case MO_32:
        if (bswap) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_bswap32(s, scratch);
            datalo = scratch;
        }
        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            if (bswap) {
                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
                tcg_out_bswap64(s, scratch);
                datalo = scratch;
            }
            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
        } else if (bswap) {
            /* 32-bit host without MOVBE: swap each half through the
               scratch register; the swapped high half goes to the lower
               address.  */
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
            tcg_out_bswap32(s, scratch);
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_bswap32(s, scratch);
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
        } else {
            /* Either no swap at all, or MOVBE swaps the bytes of each
               half; in the latter case the halves still trade places.  */
            if (real_bswap) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
        }
        break;
    default:
        tcg_abort();
    }
}

2132
/*
 * Emit code for a guest memory store.
 *
 * @args holds, in order: datalo, [datahi when a 64-bit value is split on a
 * 32-bit host], addrlo, [addrhi when the guest address is wider than a host
 * register], and the TCGMemOpIdx.  @is64 is true for a 64-bit data value.
 *
 * With CONFIG_SOFTMMU the access goes through the TLB lookup, with a slow
 * path label recorded for the miss case; otherwise the guest address is
 * combined with guest_base and accessed directly.
 */
static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
{
    TCGReg datalo, datahi, addrlo;
    TCGReg addrhi __attribute__((unused));
    TCGMemOpIdx oi;
    TCGMemOp opc;
#if defined(CONFIG_SOFTMMU)
    int mem_index;
    tcg_insn_unit *label_ptr[2];
#endif

    datalo = *args++;
    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
    addrlo = *args++;
    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
    oi = *args++;
    opc = get_memop(oi);

#if defined(CONFIG_SOFTMMU)
    mem_index = get_mmuidx(oi);

    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                     label_ptr, offsetof(CPUTLBEntry, addr_write));

    /* TLB Hit.  */
    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);

    /* Record the current context of a store into ldst label */
    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
                        s->code_ptr, label_ptr);
#else
    {
        int32_t offset = guest_base;
        TCGReg base = addrlo;
        int seg = 0;

        /* See comment in tcg_out_qemu_ld re zero-extension of addrlo.  */
        if (guest_base == 0 || guest_base_flags) {
            seg = guest_base_flags;
            offset = 0;
            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
                seg |= P_ADDR32;
            }
        } else if (TCG_TARGET_REG_BITS == 64) {
            /* ??? Note that we can't use the same SIB addressing scheme
               as for loads, since we require L0 free for bswap.  */
            if (offset != guest_base) {
                /* guest_base does not fit the 32-bit displacement: form
                   the full host address in L1.  */
                if (TARGET_LONG_BITS == 32) {
                    tcg_out_ext32u(s, TCG_REG_L0, base);
                    base = TCG_REG_L0;
                }
                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
                base = TCG_REG_L1;
                offset = 0;
            } else if (TARGET_LONG_BITS == 32) {
                tcg_out_ext32u(s, TCG_REG_L1, base);
                base = TCG_REG_L1;
            }
        }

        tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
    }
#endif
}
B
bellard 已提交
2197

2198
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
B
bellard 已提交
2199 2200
                              const TCGArg *args, const int *const_args)
{
2201 2202
    TCGArg a0, a1, a2;
    int c, const_a2, vexop, rexw = 0;
2203 2204 2205 2206 2207 2208 2209 2210 2211 2212

#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i64): \
            rexw = P_REXW; /* FALLTHRU */    \
        case glue(glue(INDEX_op_, x), _i32)
#else
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i32)
#endif
2213

2214 2215 2216 2217 2218 2219 2220
    /* Hoist the loads of the most common arguments.  */
    a0 = args[0];
    a1 = args[1];
    a2 = args[2];
    const_a2 = const_args[2];

    switch (opc) {
B
bellard 已提交
2221
    case INDEX_op_exit_tb:
2222 2223 2224 2225 2226 2227 2228
        /* Reuse the zeroing that exists for goto_ptr.  */
        if (a0 == 0) {
            tcg_out_jmp(s, s->code_gen_epilogue);
        } else {
            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
            tcg_out_jmp(s, tb_ret_addr);
        }
B
bellard 已提交
2229 2230
        break;
    case INDEX_op_goto_tb:
2231
        if (s->tb_jmp_insn_offset) {
B
bellard 已提交
2232
            /* direct jump method */
2233 2234 2235 2236 2237 2238 2239 2240
            int gap;
            /* jump displacement must be aligned for atomic patching;
             * see if we need to add extra nops before jump
             */
            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
            if (gap != 1) {
                tcg_out_nopn(s, gap - 1);
            }
R
Richard Henderson 已提交
2241
            tcg_out8(s, OPC_JMP_long); /* jmp im */
2242
            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
B
bellard 已提交
2243 2244 2245
            tcg_out32(s, 0);
        } else {
            /* indirect jump method */
2246
            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2247
                                 (intptr_t)(s->tb_jmp_target_addr + a0));
B
bellard 已提交
2248
        }
2249
        set_jmp_reset_offset(s, a0);
B
bellard 已提交
2250
        break;
2251 2252 2253 2254
    case INDEX_op_goto_ptr:
        /* jmp to the given host address (could be epilogue) */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
        break;
B
bellard 已提交
2255
    case INDEX_op_br:
2256
        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
B
bellard 已提交
2257
        break;
2258 2259
    OP_32_64(ld8u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2260
        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
B
bellard 已提交
2261
        break;
2262
    OP_32_64(ld8s):
2263
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
B
bellard 已提交
2264
        break;
2265 2266
    OP_32_64(ld16u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2267
        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
B
bellard 已提交
2268
        break;
2269
    OP_32_64(ld16s):
2270
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
B
bellard 已提交
2271
        break;
2272 2273 2274
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_ld32u_i64:
#endif
B
bellard 已提交
2275
    case INDEX_op_ld_i32:
2276
        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
B
bellard 已提交
2277
        break;
2278 2279

    OP_32_64(st8):
2280
        if (const_args[0]) {
2281 2282
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
            tcg_out8(s, a0);
2283
        } else {
2284
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2285
        }
B
bellard 已提交
2286
        break;
2287
    OP_32_64(st16):
2288
        if (const_args[0]) {
2289 2290
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
            tcg_out16(s, a0);
2291
        } else {
2292
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2293
        }
B
bellard 已提交
2294
        break;
2295 2296 2297
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_st32_i64:
#endif
B
bellard 已提交
2298
    case INDEX_op_st_i32:
2299
        if (const_args[0]) {
2300 2301
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
            tcg_out32(s, a0);
2302
        } else {
2303
            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2304
        }
B
bellard 已提交
2305
        break;
2306 2307

    OP_32_64(add):
2308
        /* For 3-operand addition, use LEA.  */
2309 2310 2311
        if (a0 != a1) {
            TCGArg c3 = 0;
            if (const_a2) {
2312 2313 2314 2315
                c3 = a2, a2 = -1;
            } else if (a0 == a2) {
                /* Watch out for dest = src + dest, since we've removed
                   the matching constraint on the add.  */
2316
                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2317 2318 2319
                break;
            }

2320
            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2321 2322 2323 2324
            break;
        }
        c = ARITH_ADD;
        goto gen_arith;
2325
    OP_32_64(sub):
B
bellard 已提交
2326 2327
        c = ARITH_SUB;
        goto gen_arith;
2328
    OP_32_64(and):
B
bellard 已提交
2329 2330
        c = ARITH_AND;
        goto gen_arith;
2331
    OP_32_64(or):
B
bellard 已提交
2332 2333
        c = ARITH_OR;
        goto gen_arith;
2334
    OP_32_64(xor):
B
bellard 已提交
2335 2336 2337
        c = ARITH_XOR;
        goto gen_arith;
    gen_arith:
2338 2339
        if (const_a2) {
            tgen_arithi(s, c + rexw, a0, a2, 0);
B
bellard 已提交
2340
        } else {
2341
            tgen_arithr(s, c + rexw, a0, a2);
B
bellard 已提交
2342 2343
        }
        break;
2344

2345
    OP_32_64(andc):
2346 2347 2348
        if (const_a2) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2349
        } else {
2350
            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2351 2352 2353
        }
        break;

2354
    OP_32_64(mul):
2355
        if (const_a2) {
B
bellard 已提交
2356
            int32_t val;
2357
            val = a2;
B
bellard 已提交
2358
            if (val == (int8_t)val) {
2359
                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
B
bellard 已提交
2360 2361
                tcg_out8(s, val);
            } else {
2362
                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
B
bellard 已提交
2363 2364 2365
                tcg_out32(s, val);
            }
        } else {
2366
            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
B
bellard 已提交
2367 2368
        }
        break;
2369 2370 2371

    OP_32_64(div2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
B
bellard 已提交
2372
        break;
2373 2374
    OP_32_64(divu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
B
bellard 已提交
2375
        break;
2376 2377

    OP_32_64(shl):
2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388
        /* For small constant 3-operand shift, use LEA.  */
        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
            if (a2 - 1 == 0) {
                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
            } else {
                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
            }
            break;
        }
B
bellard 已提交
2389
        c = SHIFT_SHL;
2390 2391
        vexop = OPC_SHLX;
        goto gen_shift_maybe_vex;
2392
    OP_32_64(shr):
B
bellard 已提交
2393
        c = SHIFT_SHR;
2394 2395
        vexop = OPC_SHRX;
        goto gen_shift_maybe_vex;
2396
    OP_32_64(sar):
B
bellard 已提交
2397
        c = SHIFT_SAR;
2398 2399
        vexop = OPC_SARX;
        goto gen_shift_maybe_vex;
2400
    OP_32_64(rotl):
2401
        c = SHIFT_ROL;
2402 2403
        goto gen_shift;
    OP_32_64(rotr):
2404
        c = SHIFT_ROR;
2405
        goto gen_shift;
2406
    gen_shift_maybe_vex:
2407 2408 2409 2410 2411 2412
        if (have_bmi2) {
            if (!const_a2) {
                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
                break;
            }
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2413 2414
        }
        /* FALLTHRU */
2415
    gen_shift:
2416 2417
        if (const_a2) {
            tcg_out_shifti(s, c + rexw, a0, a2);
2418
        } else {
2419
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2420
        }
B
bellard 已提交
2421
        break;
2422

2423 2424 2425 2426 2427 2428
    OP_32_64(ctz):
        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(clz):
        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
2429 2430 2431
    OP_32_64(ctpop):
        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
        break;
2432

B
bellard 已提交
2433
    case INDEX_op_brcond_i32:
2434
        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
B
bellard 已提交
2435
        break;
2436
    case INDEX_op_setcond_i32:
2437
        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
B
bellard 已提交
2438
        break;
2439
    case INDEX_op_movcond_i32:
2440
        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2441
        break;
B
bellard 已提交
2442

2443
    OP_32_64(bswap16):
2444
        tcg_out_rolw_8(s, a0);
A
aurel32 已提交
2445
        break;
2446
    OP_32_64(bswap32):
2447
        tcg_out_bswap32(s, a0);
2448 2449
        break;

2450
    OP_32_64(neg):
2451
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2452
        break;
2453
    OP_32_64(not):
2454
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2455 2456
        break;

2457
    OP_32_64(ext8s):
2458
        tcg_out_ext8s(s, a0, a1, rexw);
2459
        break;
2460
    OP_32_64(ext16s):
2461
        tcg_out_ext16s(s, a0, a1, rexw);
2462
        break;
2463
    OP_32_64(ext8u):
2464
        tcg_out_ext8u(s, a0, a1);
2465
        break;
2466
    OP_32_64(ext16u):
2467
        tcg_out_ext16u(s, a0, a1);
2468
        break;
2469

2470 2471
    case INDEX_op_qemu_ld_i32:
        tcg_out_qemu_ld(s, args, 0);
B
bellard 已提交
2472
        break;
2473 2474
    case INDEX_op_qemu_ld_i64:
        tcg_out_qemu_ld(s, args, 1);
B
bellard 已提交
2475
        break;
2476 2477
    case INDEX_op_qemu_st_i32:
        tcg_out_qemu_st(s, args, 0);
B
bellard 已提交
2478
        break;
2479 2480
    case INDEX_op_qemu_st_i64:
        tcg_out_qemu_st(s, args, 1);
B
bellard 已提交
2481 2482
        break;

2483 2484
    OP_32_64(mulu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2485
        break;
2486 2487 2488 2489
    OP_32_64(muls2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
        break;
    OP_32_64(add2):
2490
        if (const_args[4]) {
2491
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2492
        } else {
2493
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2494 2495
        }
        if (const_args[5]) {
2496
            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2497
        } else {
2498
            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2499 2500
        }
        break;
2501
    OP_32_64(sub2):
2502
        if (const_args[4]) {
2503
            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2504
        } else {
2505
            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2506 2507
        }
        if (const_args[5]) {
2508
            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2509
        } else {
2510
            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2511 2512
        }
        break;
2513 2514 2515 2516 2517 2518 2519 2520

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_brcond2(s, args, const_args, 0);
        break;
    case INDEX_op_setcond2_i32:
        tcg_out_setcond2(s, args, const_args);
        break;
2521 2522
#else /* TCG_TARGET_REG_BITS == 64 */
    case INDEX_op_ld32s_i64:
2523
        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2524 2525
        break;
    case INDEX_op_ld_i64:
2526
        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2527 2528
        break;
    case INDEX_op_st_i64:
2529
        if (const_args[0]) {
2530 2531
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
            tcg_out32(s, a0);
2532
        } else {
2533
            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2534
        }
2535 2536 2537
        break;

    case INDEX_op_brcond_i64:
2538
        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2539 2540
        break;
    case INDEX_op_setcond_i64:
2541
        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2542
        break;
2543
    case INDEX_op_movcond_i64:
2544
        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2545
        break;
2546 2547

    case INDEX_op_bswap64_i64:
2548
        tcg_out_bswap64(s, a0);
2549
        break;
2550
    case INDEX_op_extu_i32_i64:
2551
    case INDEX_op_ext32u_i64:
2552
    case INDEX_op_extrl_i64_i32:
2553
        tcg_out_ext32u(s, a0, a1);
2554
        break;
2555
    case INDEX_op_ext_i32_i64:
2556
    case INDEX_op_ext32s_i64:
2557
        tcg_out_ext32s(s, a0, a1);
2558
        break;
2559 2560 2561
    case INDEX_op_extrh_i64_i32:
        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
        break;
2562 2563
#endif

2564 2565 2566
    OP_32_64(deposit):
        if (args[3] == 0 && args[4] == 8) {
            /* load bits 0..7 */
2567
            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2568 2569
        } else if (args[3] == 8 && args[4] == 8) {
            /* load bits 8..15 */
2570
            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2571 2572
        } else if (args[3] == 0 && args[4] == 16) {
            /* load bits 0..15 */
2573
            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2574 2575 2576 2577 2578
        } else {
            tcg_abort();
        }
        break;

2579
    case INDEX_op_extract_i64:
2580
        if (a2 + args[3] == 32) {
2581
            /* This is a 32-bit zero-extending right shift.  */
2582 2583
            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2584 2585 2586 2587 2588 2589 2590
            break;
        }
        /* FALLTHRU */
    case INDEX_op_extract_i32:
        /* On the off-chance that we can use the high-byte registers.
           Otherwise we emit the same ext16 + shift pattern that we
           would have gotten from the normal tcg-op.c expansion.  */
2591 2592 2593
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2594
        } else {
2595 2596
            tcg_out_ext16u(s, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2597 2598 2599 2600 2601 2602 2603
        }
        break;

    case INDEX_op_sextract_i32:
        /* We don't implement sextract_i64, as we cannot sign-extend to
           64-bits without using the REX prefix that explicitly excludes
           access to the high-byte registers.  */
2604 2605 2606
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2607
        } else {
2608 2609
            tcg_out_ext16s(s, a0, a1, 0);
            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2610 2611 2612
        }
        break;

2613
    case INDEX_op_mb:
2614
        tcg_out_mb(s, a0);
2615
        break;
2616 2617
    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_mov_i64:
2618
    case INDEX_op_mov_vec:
2619 2620
    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
    case INDEX_op_movi_i64:
2621
    case INDEX_op_dupi_vec:
2622
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
B
bellard 已提交
2623 2624 2625
    default:
        tcg_abort();
    }
2626 2627

#undef OP_32_64
B
bellard 已提交
2628 2629
}

2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703
/*
 * Emit host code for a single vector TCG opcode.
 *
 * @s:          code generation context, passed through to the emitters
 * @opc:        vector opcode to emit
 * @vecl:       added to TCG_TYPE_V64 to obtain the vector TCGType
 * @vece:       element-size index; selects an entry of the 4-entry
 *              per-element-size instruction tables below
 * @args:       operands; args[0..2] are always read, args[3] is read by
 *              the opcodes that carry a condition or an immediate
 * @const_args: not consulted by this function
 *
 * Whenever the type is TCG_TYPE_V256 the P_VEXL flag is OR-ed into the
 * instruction (except for x86_psrldq_vec, which never sets it).
 * Opcodes without a case here hit g_assert_not_reached().
 */
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                           unsigned vecl, unsigned vece,
                           const TCGArg *args, const int *const_args)
{
    /* Per-element-size instruction tables, indexed by vece.  OPC_UD2 marks
       an element size for which the table has no instruction; the gen_simd
       path asserts that such an entry is never selected.  */
    static int const add_insn[4] = {
        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
    };
    static int const sub_insn[4] = {
        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
    };
    static int const mul_insn[4] = {
        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
    };
    static int const shift_imm_insn[4] = {
        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
    };
    static int const cmpeq_insn[4] = {
        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
    };
    static int const cmpgt_insn[4] = {
        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
    };
    static int const punpckl_insn[4] = {
        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
    };
    static int const punpckh_insn[4] = {
        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
    };
    static int const packss_insn[4] = {
        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
    };
    static int const packus_insn[4] = {
        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
    };

    TCGType type = vecl + TCG_TYPE_V64;
    int insn, sub;
    TCGArg a0, a1, a2;

    a0 = args[0];
    a1 = args[1];
    a2 = args[2];

    switch (opc) {
    /* Plain three-operand operations: pick the instruction, then share
       the emission code at gen_simd.  */
    case INDEX_op_add_vec:
        insn = add_insn[vece];
        goto gen_simd;
    case INDEX_op_sub_vec:
        insn = sub_insn[vece];
        goto gen_simd;
    case INDEX_op_mul_vec:
        insn = mul_insn[vece];
        goto gen_simd;
    case INDEX_op_and_vec:
        insn = OPC_PAND;
        goto gen_simd;
    case INDEX_op_or_vec:
        insn = OPC_POR;
        goto gen_simd;
    case INDEX_op_xor_vec:
        insn = OPC_PXOR;
        goto gen_simd;
    case INDEX_op_x86_punpckl_vec:
        insn = punpckl_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckh_vec:
        insn = punpckh_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packss_vec:
        insn = packss_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packus_vec:
        insn = packus_insn[vece];
        goto gen_simd;
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
        /* Constraints have already placed both 32-bit inputs in xmm regs.  */
        insn = OPC_PUNPCKLDQ;
        goto gen_simd;
#endif
    gen_simd:
        tcg_debug_assert(insn != OPC_UD2);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        break;

    case INDEX_op_cmp_vec:
        /* Only EQ and GT are accepted here; any other condition in
           args[3] is treated as unreachable.  */
        sub = args[3];
        if (sub == TCG_COND_EQ) {
            insn = cmpeq_insn[vece];
        } else if (sub == TCG_COND_GT) {
            insn = cmpgt_insn[vece];
        } else {
            g_assert_not_reached();
        }
        goto gen_simd;

    case INDEX_op_andc_vec:
        insn = OPC_PANDN;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        /* Note the swapped sources (a2 before a1): PANDN complements its
           first source operand, so the operand to be inverted goes first.  */
        tcg_out_vex_modrm(s, insn, a0, a2, a1);
        break;

    /* Shifts by immediate: 'sub' is emitted in the position where the
       other paths pass the destination register; it presumably selects
       the operation within the immediate-shift opcode group (confirm
       against the instruction reference).  The shift count follows as an
       8-bit immediate.  */
    case INDEX_op_shli_vec:
        sub = 6;
        goto gen_shift;
    case INDEX_op_shri_vec:
        sub = 2;
        goto gen_shift;
    case INDEX_op_sari_vec:
        tcg_debug_assert(vece != MO_64);
        sub = 4;
    gen_shift:
        tcg_debug_assert(vece != MO_8);
        insn = shift_imm_insn[vece];
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, sub, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_ld_vec:
        tcg_out_ld(s, type, a0, a1, a2);
        break;
    case INDEX_op_st_vec:
        tcg_out_st(s, type, a0, a1, a2);
        break;
    case INDEX_op_dup_vec:
        tcg_out_dup_vec(s, type, vece, a0, a1);
        break;

    /* Three-operand operations followed by an 8-bit immediate taken
       from args[3].  */
    case INDEX_op_x86_shufps_vec:
        insn = OPC_SHUFPS;
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_blend_vec:
        if (vece == MO_16) {
            insn = OPC_PBLENDW;
        } else if (vece == MO_32) {
            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
        } else {
            g_assert_not_reached();
        }
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_vperm2i128_vec:
        insn = OPC_VPERM2I128;
        sub = args[3];
        goto gen_simd_imm8;
    gen_simd_imm8:
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        tcg_out8(s, sub);
        break;

    case INDEX_op_x86_vpblendvb_vec:
        insn = OPC_VPBLENDVB;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        /* The fourth operand (args[3]) is placed in bits 7:4 of the
           trailing byte.  */
        tcg_out8(s, args[3] << 4);
        break;

    case INDEX_op_x86_psrldq_vec:
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
        tcg_out8(s, a2);
        break;

    default:
        g_assert_not_reached();
    }
}

2811 2812
/*
 * Return the operand-constraint description for opcode @op.
 *
 * Each TCGTargetOpDef holds one constraint string per operand in
 * .args_ct_str; the shared definitions below are named after the strings
 * they contain (e.g. r_0_re is { "r", "0", "re" }).  The meaning of the
 * individual constraint letters is defined outside this function.
 *
 * Some answers depend on detected host features (have_bmi1, have_bmi2,
 * have_lzcnt) or on the TARGET_LONG_BITS / TCG_TARGET_REG_BITS
 * configuration.
 *
 * Returns NULL for any opcode that has no case in the switch.
 */
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
    static const TCGTargetOpDef r_r_L_L
        = { .args_ct_str = { "r", "r", "L", "L" } };
    static const TCGTargetOpDef L_L_L_L
        = { .args_ct_str = { "L", "L", "L", "L" } };
    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
    static const TCGTargetOpDef x_x_x_x
        = { .args_ct_str = { "x", "x", "x", "x" } };
    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };

    switch (op) {
    case INDEX_op_goto_ptr:
        return &r;

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return &r_r;

    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return &qi_r;
    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return &ri_r;
    case INDEX_op_st_i64:
        return &re_r;

    case INDEX_op_add_i32:
    case INDEX_op_add_i64:
        return &r_r_re;
    case INDEX_op_sub_i32:
    case INDEX_op_sub_i64:
    case INDEX_op_mul_i32:
    case INDEX_op_mul_i64:
    case INDEX_op_or_i32:
    case INDEX_op_or_i64:
    case INDEX_op_xor_i32:
    case INDEX_op_xor_i64:
        return &r_0_re;

    case INDEX_op_and_i32:
    case INDEX_op_and_i64:
        {
            static const TCGTargetOpDef and
                = { .args_ct_str = { "r", "0", "reZ" } };
            return &and;
        }
    case INDEX_op_andc_i32:
    case INDEX_op_andc_i64:
        {
            static const TCGTargetOpDef andc
                = { .args_ct_str = { "r", "r", "rI" } };
            return &andc;
        }

    case INDEX_op_shl_i32:
    case INDEX_op_shl_i64:
    case INDEX_op_shr_i32:
    case INDEX_op_shr_i64:
    case INDEX_op_sar_i32:
    case INDEX_op_sar_i64:
        /* With BMI2 the shifts get the looser three-operand constraints.  */
        return have_bmi2 ? &r_r_ri : &r_0_ci;
    case INDEX_op_rotl_i32:
    case INDEX_op_rotl_i64:
    case INDEX_op_rotr_i32:
    case INDEX_op_rotr_i64:
        return &r_0_ci;

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return &r_re;

    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_neg_i32:
    case INDEX_op_neg_i64:
    case INDEX_op_not_i32:
    case INDEX_op_not_i64:
    case INDEX_op_extrh_i64_i32:
        return &r_0;

    case INDEX_op_ext8s_i32:
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
        return &r_q;
    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_ctpop_i32:
    case INDEX_op_ctpop_i64:
        return &r_r;

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        {
            static const TCGTargetOpDef dep
                = { .args_ct_str = { "Q", "0", "Q" } };
            return &dep;
        }
    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
        {
            static const TCGTargetOpDef setc
                = { .args_ct_str = { "q", "r", "re" } };
            return &setc;
        }
    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        {
            static const TCGTargetOpDef movc
                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
            return &movc;
        }
    case INDEX_op_div2_i32:
    case INDEX_op_div2_i64:
    case INDEX_op_divu2_i32:
    case INDEX_op_divu2_i64:
        {
            static const TCGTargetOpDef div2
                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
            return &div2;
        }
    case INDEX_op_mulu2_i32:
    case INDEX_op_mulu2_i64:
    case INDEX_op_muls2_i32:
    case INDEX_op_muls2_i64:
        {
            static const TCGTargetOpDef mul2
                = { .args_ct_str = { "a", "d", "a", "r" } };
            return &mul2;
        }
    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        {
            static const TCGTargetOpDef arith2
                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
            return &arith2;
        }
    case INDEX_op_ctz_i32:
    case INDEX_op_ctz_i64:
        {
            /* Entry 1 is used when have_bmi1 is set, entry 0 otherwise.  */
            static const TCGTargetOpDef ctz[2] = {
                { .args_ct_str = { "&r", "r", "r" } },
                { .args_ct_str = { "&r", "r", "rW" } },
            };
            return &ctz[have_bmi1];
        }
    case INDEX_op_clz_i32:
    case INDEX_op_clz_i64:
        {
            /* Entry 1 is used when have_lzcnt is set, entry 0 otherwise.  */
            static const TCGTargetOpDef clz[2] = {
                { .args_ct_str = { "&r", "r", "r" } },
                { .args_ct_str = { "&r", "r", "rW" } },
            };
            return &clz[have_lzcnt];
        }

    /* Guest memory accesses: the number of operands grows when the guest
       address (TARGET_LONG_BITS) or the 64-bit data value does not fit in
       one host register.  */
    case INDEX_op_qemu_ld_i32:
        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
    case INDEX_op_qemu_st_i32:
        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
    case INDEX_op_qemu_ld_i64:
        return (TCG_TARGET_REG_BITS == 64 ? &r_L
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
                : &r_r_L_L);
    case INDEX_op_qemu_st_i64:
        return (TCG_TARGET_REG_BITS == 64 ? &L_L
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
                : &L_L_L_L);

    case INDEX_op_brcond2_i32:
        {
            static const TCGTargetOpDef b2
                = { .args_ct_str = { "r", "r", "ri", "ri" } };
            return &b2;
        }
    case INDEX_op_setcond2_i32:
        {
            static const TCGTargetOpDef s2
                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
            return &s2;
        }

    case INDEX_op_ld_vec:
    case INDEX_op_st_vec:
        return &x_r;

    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return &x_x_x;
    case INDEX_op_dup_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_x86_psrldq_vec:
        return &x_x;
    case INDEX_op_x86_vpblendvb_vec:
        return &x_x_x_x;

    default:
        break;
    }
    return NULL;
}

3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380
/*
 * Report how this backend handles vector opcode @opc for vector type
 * @type and element size @vece.
 *
 * Returns:
 *    1  for the combinations that tcg_out_vec_op has a direct case for;
 *   -1  for the combinations that tcg_expand_vec_op rewrites into other
 *       operations;
 *    0  for everything else (presumably "not supported" to the caller --
 *       confirm against the generic TCG code).
 */
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    int support = 0;

    switch (opc) {
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
        support = 1;
        break;

    case INDEX_op_cmp_vec:
        support = -1;
        break;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* Byte elements have to go through the expansion.  */
        support = (vece == MO_8) ? -1 : 1;
        break;

    case INDEX_op_sari_vec:
        if (vece == MO_8) {
            /* Byte elements have to go through the expansion.  */
            support = -1;
        } else if (vece == MO_64) {
            /* Emulation is possible for 64-bit elements, but is only
               worthwhile when at least 4 values are produced.  */
            support = (type >= TCG_TYPE_V256) ? -1 : 0;
        } else {
            support = 1;
        }
        break;

    case INDEX_op_mul_vec:
        if (vece == MO_8) {
            /* Byte multiplies can be expanded.  */
            support = -1;
        } else if (vece == MO_64) {
            support = 0;
        } else {
            support = 1;
        }
        break;

    default:
        break;
    }

    return support;
}

/*
 * Expand a vector operation that has no single-instruction form here
 * into a sequence of other vector operations.
 *
 * @opc:  opcode to expand
 * @type: vector type of the operation
 * @vece: element-size index (MO_8 .. MO_64)
 * @a0:   output operand
 * @...:  remaining operands, each read as a TCGArg.  Every handled opcode
 *        reads two further operands (a1, a2); cmp_vec reads a third one
 *        holding the condition.
 *
 * Handled here: shli/shri for MO_8, sari for MO_8 and MO_64, mul for MO_8,
 * and cmp for every condition listed in the fixups table.  Any other
 * opcode falls into the default case and generates nothing.
 */
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a1, a2;
    TCGv_vec v0, t1, t2, t3, t4;

    va_start(va, a0);
    /* v0 is the output as a TCGv_vec, for the tcg_gen_* helper used below;
       a0 remains the raw argument form used by vec_gen_*.  */
    v0 = temp_tcgv_vec(arg_temp(a0));

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        tcg_debug_assert(vece == MO_8);
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        /* Unpack to W, shift, and repack.  Tricky bits:
           (1) Use punpck*bw x,x to produce DDCCBBAA,
               i.e. duplicate in other half of the 16-bit lane.
           (2) For right-shift, add 8 so that the high half of
               the lane becomes zero.  For left-shift, we must
               shift up and down again.
           (3) Step 2 leaves high half zero such that PACKUSWB
               (pack with unsigned saturation) does not modify
               the quantity.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), a1, a1);
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), a1, a1);
        if (opc == INDEX_op_shri_vec) {
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                     tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                     tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
        } else {
            /* Shift left by a2 + 8 so the result lands in the high byte of
               each 16-bit lane, then bring it back down by 8.  */
            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
                     tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
                     tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                     tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                     tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
        }
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                 a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case INDEX_op_sari_vec:
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        if (vece == MO_8) {
            /* Unpack to W, shift, and repack, as above.  The repack uses
               the signed-saturating pack since the shifted lanes are
               sign-extended bytes.  */
            t1 = tcg_temp_new_vec(type);
            t2 = tcg_temp_new_vec(type);
            vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                      tcgv_vec_arg(t1), a1, a1);
            vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                      tcgv_vec_arg(t2), a1, a1);
            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
            vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            break;
        }
        tcg_debug_assert(vece == MO_64);
        /* MO_64: If the shift is <= 32, we can emulate the sign extend by
           performing an arithmetic 32-bit shift and overwriting the high
           half of the result (note that the ISA says shift of 32 is valid). */
        if (a2 <= 32) {
            t1 = tcg_temp_new_vec(type);
            vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
            vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
            /* Blend mask 0xaa has every odd bit set: presumably it takes
               the odd 32-bit elements (the high half of each 64-bit lane)
               from t1 and keeps the even ones from a0 -- confirm against
               the blend instruction reference.  */
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      a0, a0, tcgv_vec_arg(t1), 0xaa);
            tcg_temp_free_vec(t1);
            break;
        }
        /* Otherwise we will need to use a compare vs 0 to produce the
           sign-extend, shift and merge.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_const_zeros_vec(type);
        /* t1 = (0 > a1): the per-lane sign mask of a1.  */
        vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
                  tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
        tcg_temp_free_vec(t2);
        vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
        /* Keep only the top a2 bits of the sign mask and merge them into
           the logically shifted value.  */
        vec_gen_3(INDEX_op_shli_vec, type, MO_64,
                  tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
        vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        break;

    case INDEX_op_mul_vec:
        tcg_debug_assert(vece == MO_8);
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        /* General scheme for all three vector sizes: unpack each input
           against a zero vector into 16-bit lanes, multiply as MO_16,
           shift each product right by 8 and repack with PACKUSWB.
           NOTE(review): a1 is unpacked as (a1, zero) and a2 as (zero, a2),
           presumably placing a1 in the low byte and a2 in the high byte of
           each lane so that (product >> 8) is the low byte of a1 * a2 --
           confirm against the punpck operand order.  */
        switch (type) {
        case TCG_TYPE_V64:
            /* The 64-bit case works in 128-bit temporaries and needs only
               the low unpack.  */
            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
            tcg_gen_dup16i_vec(t2, 0);
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            break;

        case TCG_TYPE_V128:
            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
            t3 = tcg_temp_new_vec(TCG_TYPE_V128);
            t4 = tcg_temp_new_vec(TCG_TYPE_V128);
            /* t4 starts as the zero vector and is overwritten last, after
               its final use as the zero source.  */
            tcg_gen_dup16i_vec(t4, 0);
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_mul_vec(MO_16, t3, t3, t4);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            tcg_gen_shri_vec(MO_16, t3, t3, 8);
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            tcg_temp_free_vec(t3);
            tcg_temp_free_vec(t4);
            break;

        case TCG_TYPE_V256:
            t1 = tcg_temp_new_vec(TCG_TYPE_V256);
            t2 = tcg_temp_new_vec(TCG_TYPE_V256);
            t3 = tcg_temp_new_vec(TCG_TYPE_V256);
            t4 = tcg_temp_new_vec(TCG_TYPE_V256);
            tcg_gen_dup16i_vec(t4, 0);
            /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
               t1: extends of B[0-7], D[0-7]
               t2: extends of X[0-7], Z[0-7]
               t3: extends of A[0-7], C[0-7]
               t4: extends of W[0-7], Y[0-7].  */
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
            /* t1: BX DZ; t3: AW CY.  */
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_mul_vec(MO_16, t3, t3, t4);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            tcg_gen_shri_vec(MO_16, t3, t3, 8);
            /* a0: AW BX CY DZ.  */
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            tcg_temp_free_vec(t3);
            tcg_temp_free_vec(t4);
            break;

        default:
            g_assert_not_reached();
        }
        break;

    case INDEX_op_cmp_vec:
        {
            /* Only EQ and GT are emitted directly (see the assertion
               before vec_gen_4 below).  Every other condition is reduced
               to one of those two by a combination of:
                 NEED_SWAP - exchange the two inputs,
                 NEED_INV  - compare with the inverted condition and
                             complement the result afterwards,
                 NEED_BIAS - subtract the sign-bit constant from both
                             inputs so an unsigned compare can be done
                             as a signed one.  */
            enum {
                NEED_SWAP = 1,
                NEED_INV  = 2,
                NEED_BIAS = 4
            };
            /* Indexed by (cond & 15); entries left at -1 (0xff) mark
               conditions that are not handled.  */
            static const uint8_t fixups[16] = {
                [0 ... 15] = -1,
                [TCG_COND_EQ] = 0,
                [TCG_COND_NE] = NEED_INV,
                [TCG_COND_GT] = 0,
                [TCG_COND_LT] = NEED_SWAP,
                [TCG_COND_LE] = NEED_INV,
                [TCG_COND_GE] = NEED_SWAP | NEED_INV,
                [TCG_COND_GTU] = NEED_BIAS,
                [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
                [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
                [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
            };

            TCGCond cond;
            uint8_t fixup;

            a1 = va_arg(va, TCGArg);
            a2 = va_arg(va, TCGArg);
            cond = va_arg(va, TCGArg);
            fixup = fixups[cond & 15];
            tcg_debug_assert(fixup != 0xff);

            if (fixup & NEED_INV) {
                cond = tcg_invert_cond(cond);
            }
            if (fixup & NEED_SWAP) {
                TCGArg t;
                t = a1, a1 = a2, a2 = t;
                cond = tcg_swap_cond(cond);
            }

            t1 = t2 = NULL;
            if (fixup & NEED_BIAS) {
                t1 = tcg_temp_new_vec(type);
                t2 = tcg_temp_new_vec(type);
                /* t2 first holds the bias (only the top bit of each
                   element set); it is consumed by both subtractions and
                   overwritten by the second one.  */
                tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
                tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
                tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
                a1 = tcgv_vec_arg(t1);
                a2 = tcgv_vec_arg(t2);
                cond = tcg_signed_cond(cond);
            }

            tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
            vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);

            if (fixup & NEED_BIAS) {
                tcg_temp_free_vec(t1);
                tcg_temp_free_vec(t2);
            }
            if (fixup & NEED_INV) {
                /* Undo the condition inversion applied above.  */
                tcg_gen_not_vec(vece, v0, v0);
            }
        }
        break;

    default:
        /* Opcodes not listed above generate nothing.  */
        break;
    }

    va_end(va);
}

3381
static const int tcg_target_callee_save_regs[] = {
3382 3383 3384
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
3385 3386 3387 3388
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
3389 3390
    TCG_REG_R12,
    TCG_REG_R13,
B
Blue Swirl 已提交
3391
    TCG_REG_R14, /* Currently used for the global env. */
3392 3393
    TCG_REG_R15,
#else
B
Blue Swirl 已提交
3394
    TCG_REG_EBP, /* Currently used for the global env. */
3395 3396 3397
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
3398
#endif
3399 3400
};

3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414
/*
 * Stack frame geometry.  These are macros rather than variables so that
 * tcg_target_qemu_prologue and tcg_register_jit both see the very same
 * compile-time constants.
 */

/* Bytes pushed on entry: one extra slot plus every callee-saved register. */
#define PUSH_SIZE \
    ((TCG_TARGET_REG_BITS / 8) \
     * (ARRAY_SIZE(tcg_target_callee_save_regs) + 1))

/*
 * Pushed area, static call-argument area and TCG temp buffer together,
 * rounded up to the next multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + PUSH_SIZE \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))

3415
/* Generate global QEMU prologue and epilogue code */
3416
static void tcg_target_qemu_prologue(TCGContext *s)
3417
{
3418
    int i, stack_addend;
3419

3420
    /* TB prologue */
3421

3422
    /* Reserve some stack space, also for TCG temps.  */
3423
    stack_addend = FRAME_SIZE - PUSH_SIZE;
3424 3425 3426 3427 3428 3429 3430 3431
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

B
Blue Swirl 已提交
3432 3433 3434
#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3435 3436 3437 3438 3439
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
		         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
			 + stack_addend);
B
Blue Swirl 已提交
3440
#else
B
Blue Swirl 已提交
3441
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
B
Blue Swirl 已提交
3442
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3443
    /* jmp *tb.  */
B
Blue Swirl 已提交
3444
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3445
#endif
3446

3447 3448 3449 3450 3451 3452 3453
    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    s->code_gen_epilogue = s->code_ptr;
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

3454 3455
    /* TB epilogue */
    tb_ret_addr = s->code_ptr;
3456

3457
    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3458

3459 3460 3461
    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
3462
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3463 3464
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
3465
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3466 3467

#if !defined(CONFIG_SOFTMMU)
3468 3469
    /* Try to set up a segment register to point to guest_base.  */
    if (guest_base) {
3470 3471 3472
        setup_guest_base_seg();
    }
#endif
3473 3474
}

3475 3476 3477 3478 3479
/* Overwrite COUNT bytes starting at P with the one-byte x86 NOP opcode. */
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    enum { X86_OPC_NOP = 0x90 };
    size_t nbytes = count;

    memset(p, X86_OPC_NOP, nbytes);
}

3480
/*
 * One-time backend initialisation.
 *
 * When CONFIG_CPUID_H is defined, probe the host CPU with CPUID and set
 * the have_* feature flags (BMI1/BMI2, CMOV, MOVBE, POPCNT, AVX1/AVX2,
 * LZCNT).  Then fill in the register sets available per TCG type, the
 * registers clobbered by a call, and the reserved registers of @s.
 */
static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
    unsigned a, b, c, d, b7 = 0;
    int max = __get_cpuid_max(0, 0);
    /*
     * Highest extended leaf.  Valid values are 0x800000xx, so this must
     * be unsigned: as a signed int it would compare as negative.
     */
    unsigned ext_max;

    if (max >= 7) {
        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
        __cpuid_count(7, 0, a, b7, c, d);
        have_bmi1 = (b7 & bit_BMI) != 0;
        have_bmi2 = (b7 & bit_BMI2) != 0;
    }

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
#ifndef have_cmov
        /* For 32-bit, 99% certainty that we're running on hardware that
           supports cmov, but we still need to check.  In case cmov is not
           available, we'll use a small forward branch.  */
        have_cmov = (d & bit_CMOV) != 0;
#endif

        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it.  */
        have_movbe = (c & bit_MOVBE) != 0;
        have_popcnt = (c & bit_POPCNT) != 0;

        /* There are a number of things we must check before we can be
           sure of not hitting invalid opcode.  */
        if (c & bit_OSXSAVE) {
            unsigned xcrl, xcrh;
            /* The xgetbv instruction is not available to older versions of
             * the assembler, so we encode the instruction manually.
             */
            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
            /* Bits 1 and 2 of XCR0 must both be set before AVX is usable. */
            if ((xcrl & 6) == 6) {
                have_avx1 = (c & bit_AVX) != 0;
                /* b7 stays 0 when leaf 7 is unavailable, so no AVX2 then. */
                have_avx2 = (b7 & bit_AVX2) != 0;
            }
        }
    }

    /*
     * The extended CPUID range starts at 0x80000000 (the previous code
     * passed 0x8000000, which is not a valid base for __get_cpuid_max).
     * Leaf 0x80000001 may only be read if the CPU reports it.
     */
    ext_max = __get_cpuid_max(0x80000000, 0);
    if (ext_max >= 0x80000001) {
        __cpuid(0x80000001, a, b, c, d);
        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
        have_lzcnt = (c & bit_LZCNT) != 0;
    }
#endif /* CONFIG_CPUID_H */

    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    /* Vector types are only offered when the matching AVX level exists. */
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    /* Registers that do not survive a call out of generated code. */
    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        /* On Win64 these two are in tcg_target_callee_save_regs instead. */
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    /* The stack pointer is never handed to the register allocator. */
    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
}
3560 3561

typedef struct {
3562
    DebugFrameHeader h;
3563 3564
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
3565 3566
} DebugFrame;

3567 3568 3569
/*
 * We're expecting a 2 byte uleb128 encoded value: the debug_frame
 * initializers emit FRAME_SIZE as (low 7 bits | 0x80) followed by
 * (FRAME_SIZE >> 7), so FRAME_SIZE must fit in 14 bits.
 */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));

3570 3571 3572
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
3573
#define ELF_HOST_MACHINE EM_X86_64
3574 3575 3576 3577 3578 3579 3580
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,
3581

3582
    /* Total FDE size does not include the "len" member.  */
3583
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3584 3585

    .fde_def_cfa = {
3586 3587 3588 3589
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
3590
    .fde_reg_ofs = {
3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
3603 3604 3605 3606 3607 3608 3609
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,
3610

3611
    /* Total FDE size does not include the "len" member.  */
3612
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3613 3614

    .fde_def_cfa = {
3615 3616 3617 3618
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
3619
    .fde_reg_ofs = {
3620 3621 3622 3623 3624 3625 3626 3627 3628 3629
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

3630
#if defined(ELF_HOST_MACHINE)
/*
 * Register the generated-code buffer together with this backend's static
 * unwind description.  Only built when ELF_HOST_MACHINE is defined.
 *
 * @buf:      start of the code buffer
 * @buf_size: size of the code buffer in bytes
 */
void tcg_register_jit(void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif
反馈
建议
客服 返回
顶部