Commit b76f0d8c authored by Yeongkyoon Lee and committed by Blue Swirl

tcg: Optimize qemu_ld/st by generating slow paths at the end of a block

Add optimized TCG qemu_ld/st generation that places the code for TLB-miss
cases at the end of a block, after the other IRs have been generated.
Currently, this optimization supports only i386 and x86_64 hosts.
Signed-off-by: Yeongkyoon Lee <yeongkyoon.lee@samsung.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
Parent fdbb84d1
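The idea in outline: each qemu_ld/st fast path emits a forward conditional branch whose 32-bit displacement is left unresolved and recorded in a label, and all TLB-miss slow paths are emitted in one pass at the end of the TB, patching those displacements as they go. Below is a minimal, self-contained C sketch of that pattern. It is not QEMU code; the byte buffer, the emit helpers, and names such as SlowPathLabel, emit_fast_path and finalize_block are invented for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_SLOW_PATHS 8

typedef struct {
    uint8_t *patch_addr;  /* location of the unresolved rel32 of the forward jne */
    int mem_index;        /* stand-in for the context recorded per qemu_ld/st */
} SlowPathLabel;

static uint8_t code_buf[256];
static uint8_t *code_ptr = code_buf;
static SlowPathLabel labels[MAX_SLOW_PATHS];
static int nb_labels;

static void emit8(uint8_t b)   { *code_ptr++ = b; }
static void emit32(uint32_t v) { memcpy(code_ptr, &v, 4); code_ptr += 4; }

/* Fast path: "jne <slow path>" with the displacement left blank, plus the
   in-line TLB-hit work; the label records what the slow path will need. */
static void emit_fast_path(int mem_index)
{
    if (nb_labels == MAX_SLOW_PATHS) {
        return;                          /* toy bound, mirrors TCG_MAX_QEMU_LDST */
    }
    emit8(0x0f); emit8(0x85);            /* long jne, rel32 follows */
    labels[nb_labels].patch_addr = code_ptr;
    labels[nb_labels].mem_index = mem_index;
    nb_labels++;
    emit32(0);                           /* placeholder displacement */
    emit8(0x90);                         /* stand-in for the TLB-hit access */
}

/* End of block: emit every recorded slow path and patch the forward jumps. */
static void finalize_block(void)
{
    for (int i = 0; i < nb_labels; i++) {
        uint32_t disp = (uint32_t)(code_ptr - labels[i].patch_addr - 4);
        memcpy(labels[i].patch_addr, &disp, 4);
        printf("slow path %d (mem_index %d) at offset %d, jne disp %u\n",
               i, labels[i].mem_index, (int)(code_ptr - code_buf), (unsigned)disp);
        emit8(0xe8); emit32(0);          /* call <MMU helper> (target elided) */
        uint8_t *raddr = labels[i].patch_addr + 5;   /* code after the fast path */
        emit8(0xe9);                     /* jmp back to raddr */
        emit32((uint32_t)(raddr - (code_ptr + 4)));
    }
}

int main(void)
{
    emit_fast_path(0);
    emit_fast_path(1);
    finalize_block();
    return 0;
}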
@@ -1002,6 +1002,17 @@ static const void *qemu_st_helpers[4] = {
helper_stq_mmu,
};
static void add_qemu_ldst_label(TCGContext *s,
int is_ld,
int opc,
int data_reg,
int data_reg2,
int addrlo_reg,
int addrhi_reg,
int mem_index,
uint8_t *raddr,
uint8_t **label_ptr);
/* Perform the TLB load and compare.
Inputs:
@@ -1060,19 +1071,19 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
tcg_out_mov(s, type, r1, addrlo);
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
/* jne slow_path */
tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
label_ptr[0] = s->code_ptr;
s->code_ptr++;
s->code_ptr += 4;
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
/* cmp 4(r0), addrhi */
tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r0, 4);
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
/* jne slow_path */
tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
label_ptr[1] = s->code_ptr;
s->code_ptr++;
s->code_ptr += 4;
}
/* TLB Hit. */
@@ -1193,10 +1204,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
int mem_index, s_bits;
#if TCG_TARGET_REG_BITS == 32
int stack_adjust;
#endif
uint8_t *label_ptr[3];
uint8_t *label_ptr[2];
#endif
data_reg = args[0];
@@ -1216,87 +1224,17 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
/* TLB Hit. */
tcg_out_qemu_ld_direct(s, data_reg, data_reg2, TCG_REG_L1, 0, 0, opc);
/* jmp label2 */
tcg_out8(s, OPC_JMP_short);
label_ptr[2] = s->code_ptr;
s->code_ptr++;
/* TLB Miss. */
/* label1: */
*label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
*label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
}
/* XXX: move that code at the end of the TB */
#if TCG_TARGET_REG_BITS == 32
tcg_out_pushi(s, mem_index);
stack_adjust = 4;
if (TARGET_LONG_BITS == 64) {
tcg_out_push(s, args[addrlo_idx + 1]);
stack_adjust += 4;
}
tcg_out_push(s, args[addrlo_idx]);
stack_adjust += 4;
tcg_out_push(s, TCG_AREG0);
stack_adjust += 4;
#else
tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
/* The second argument is already loaded with addrlo. */
tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
#endif
tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
#if TCG_TARGET_REG_BITS == 32
if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
/* Pop and discard. This is 2 bytes smaller than the add. */
tcg_out_pop(s, TCG_REG_ECX);
} else if (stack_adjust != 0) {
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
}
#endif
switch(opc) {
case 0 | 4:
tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
break;
case 1 | 4:
tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
break;
case 0:
tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
break;
case 1:
tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
break;
case 2:
tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
break;
#if TCG_TARGET_REG_BITS == 64
case 2 | 4:
tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
break;
#endif
case 3:
if (TCG_TARGET_REG_BITS == 64) {
tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
} else if (data_reg == TCG_REG_EDX) {
/* xchg %edx, %eax */
tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
} else {
tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
}
break;
default:
tcg_abort();
}
/* label2: */
*label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
/* Record the current context of a load into ldst label */
add_qemu_ldst_label(s,
1,
opc,
data_reg,
data_reg2,
args[addrlo_idx],
args[addrlo_idx + 1],
mem_index,
s->code_ptr,
label_ptr);
#else
{
int32_t offset = GUEST_BASE;
@@ -1393,8 +1331,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
int mem_index, s_bits;
int stack_adjust;
uint8_t *label_ptr[3];
uint8_t *label_ptr[2];
#endif
data_reg = args[0];
@@ -1414,20 +1351,221 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
/* TLB Hit. */
tcg_out_qemu_st_direct(s, data_reg, data_reg2, TCG_REG_L1, 0, 0, opc);
/* jmp label2 */
/* Record the current context of a store into ldst label */
add_qemu_ldst_label(s,
0,
opc,
data_reg,
data_reg2,
args[addrlo_idx],
args[addrlo_idx + 1],
mem_index,
s->code_ptr,
label_ptr);
#else
{
int32_t offset = GUEST_BASE;
int base = args[addrlo_idx];
int seg = 0;
/* ??? We assume all operations have left us with register contents
that are zero extended. So far this appears to be true. If we
want to enforce this, we can either do an explicit zero-extension
here, or (if GUEST_BASE == 0, or a segment register is in use)
use the ADDR32 prefix. For now, do nothing. */
if (GUEST_BASE && guest_base_flags) {
seg = guest_base_flags;
offset = 0;
} else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
base = TCG_REG_L1;
offset = 0;
}
tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, seg, opc);
}
#endif
}
#if defined(CONFIG_SOFTMMU)
/*
* Record the context of a call to the out of line helper code for the slow path
* for a load or store, so that we can later generate the correct helper code
*/
static void add_qemu_ldst_label(TCGContext *s,
int is_ld,
int opc,
int data_reg,
int data_reg2,
int addrlo_reg,
int addrhi_reg,
int mem_index,
uint8_t *raddr,
uint8_t **label_ptr)
{
int idx;
TCGLabelQemuLdst *label;
if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
tcg_abort();
}
idx = s->nb_qemu_ldst_labels++;
label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
label->is_ld = is_ld;
label->opc = opc;
label->datalo_reg = data_reg;
label->datahi_reg = data_reg2;
label->addrlo_reg = addrlo_reg;
label->addrhi_reg = addrhi_reg;
label->mem_index = mem_index;
label->raddr = raddr;
label->label_ptr[0] = label_ptr[0];
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
label->label_ptr[1] = label_ptr[1];
}
}
/*
* Generate code for the slow path for a load at the end of block
*/
static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
{
int s_bits;
int opc = label->opc;
int mem_index = label->mem_index;
#if TCG_TARGET_REG_BITS == 32
int stack_adjust;
int addrlo_reg = label->addrlo_reg;
int addrhi_reg = label->addrhi_reg;
#endif
int data_reg = label->datalo_reg;
int data_reg2 = label->datahi_reg;
uint8_t *raddr = label->raddr;
uint8_t **label_ptr = &label->label_ptr[0];
s_bits = opc & 3;
/* resolve label address */
*(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
*(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
}
#if TCG_TARGET_REG_BITS == 32
tcg_out_pushi(s, mem_index);
stack_adjust = 4;
if (TARGET_LONG_BITS == 64) {
tcg_out_push(s, addrhi_reg);
stack_adjust += 4;
}
tcg_out_push(s, addrlo_reg);
stack_adjust += 4;
tcg_out_push(s, TCG_AREG0);
stack_adjust += 4;
#else
tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
/* The second argument is already loaded with addrlo. */
tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
#endif
/* Code generation of qemu_ld/st's slow path calling MMU helper
PRE_PROC ...
call MMU helper
jmp POST_PROC (2b) : short forward jump <- GETRA()
jmp next_code (5b) : dummy long backward jump which is never executed
POST_PROC ... : do post-processing <- GETRA() + 7
jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
*/
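/* A concrete byte-level view of the layout described above (illustrative
   only, based on the x86 encodings used here; it is not emitted code):
       call <MMU helper>                 ; helper's return address = GETRA()
       GETRA()+0:  EB 05                 ; 2-byte short jmp, skips the 5 bytes below
       GETRA()+2:  E9 <rel32 to raddr>   ; 5-byte dummy long jmp, never executed
       GETRA()+7:  POST_PROC             ; register fixup, then jmp to raddr
   The dummy long jump only embeds the fast path's pc so the MMU helpers can
   recover it from the code stream. */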
tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
/* Jump to post-processing code */
tcg_out8(s, OPC_JMP_short);
label_ptr[2] = s->code_ptr;
s->code_ptr++;
tcg_out8(s, 5);
/* Dummy backward jump holding the fast path's pc for the MMU helpers */
tcg_out8(s, OPC_JMP_long);
*(int32_t *)s->code_ptr = (int32_t)(raddr - s->code_ptr - 4);
s->code_ptr += 4;
/* TLB Miss. */
#if TCG_TARGET_REG_BITS == 32
if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
/* Pop and discard. This is 2 bytes smaller than the add. */
tcg_out_pop(s, TCG_REG_ECX);
} else if (stack_adjust != 0) {
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
}
#endif
/* label1: */
*label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
switch(opc) {
case 0 | 4:
tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
break;
case 1 | 4:
tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
break;
case 0:
tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
break;
case 1:
tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
break;
case 2:
tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
break;
#if TCG_TARGET_REG_BITS == 64
case 2 | 4:
tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
break;
#endif
case 3:
if (TCG_TARGET_REG_BITS == 64) {
tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
} else if (data_reg == TCG_REG_EDX) {
/* xchg %edx, %eax */
tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
} else {
tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
}
break;
default:
tcg_abort();
}
/* Jump to the code corresponding to the next IR of qemu_ld */
tcg_out_jmp(s, (tcg_target_long)raddr);
}
/*
* Generate code for the slow path for a store at the end of block
*/
static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
{
int s_bits;
int stack_adjust;
int opc = label->opc;
int mem_index = label->mem_index;
int data_reg = label->datalo_reg;
#if TCG_TARGET_REG_BITS == 32
int data_reg2 = label->datahi_reg;
int addrlo_reg = label->addrlo_reg;
int addrhi_reg = label->addrhi_reg;
#endif
uint8_t *raddr = label->raddr;
uint8_t **label_ptr = &label->label_ptr[0];
s_bits = opc & 3;
/* resolve label address */
*(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
*label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
*(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
}
/* XXX: move that code at the end of the TB */
#if TCG_TARGET_REG_BITS == 32
tcg_out_pushi(s, mem_index);
stack_adjust = 4;
@@ -1438,10 +1576,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
tcg_out_push(s, data_reg);
stack_adjust += 4;
if (TARGET_LONG_BITS == 64) {
tcg_out_push(s, args[addrlo_idx + 1]);
tcg_out_push(s, addrhi_reg);
stack_adjust += 4;
}
tcg_out_push(s, args[addrlo_idx]);
tcg_out_push(s, addrlo_reg);
stack_adjust += 4;
tcg_out_push(s, TCG_AREG0);
stack_adjust += 4;
@@ -1454,8 +1592,26 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
stack_adjust = 0;
#endif
/* Code generation of qemu_ld/st's slow path calling MMU helper
PRE_PROC ...
call MMU helper
jmp POST_PROC (2b) : short forward jump <- GETRA()
jmp next_code (5b) : dummy long backward jump which is never executed
POST_PROC ... : do post-processing <- GETRA() + 7
jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
*/
tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
/* Jump to post-processing code */
tcg_out8(s, OPC_JMP_short);
tcg_out8(s, 5);
/* Dummy backward jump holding the fast path's pc for the MMU helpers */
tcg_out8(s, OPC_JMP_long);
*(int32_t *)s->code_ptr = (int32_t)(raddr - s->code_ptr - 4);
s->code_ptr += 4;
if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
/* Pop and discard. This is 2 bytes smaller than the add. */
tcg_out_pop(s, TCG_REG_ECX);
@@ -1463,33 +1619,29 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
}
/* label2: */
*label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
#else
{
int32_t offset = GUEST_BASE;
int base = args[addrlo_idx];
int seg = 0;
/* Jump to the code corresponding to the next IR of qemu_st */
tcg_out_jmp(s, (tcg_target_long)raddr);
}
/* ??? We assume all operations have left us with register contents
that are zero extended. So far this appears to be true. If we
want to enforce this, we can either do an explicit zero-extension
here, or (if GUEST_BASE == 0, or a segment register is in use)
use the ADDR32 prefix. For now, do nothing. */
if (GUEST_BASE && guest_base_flags) {
seg = guest_base_flags;
offset = 0;
} else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
base = TCG_REG_L1;
offset = 0;
/*
* Generate TB finalization at the end of block
*/
void tcg_out_tb_finalize(TCGContext *s)
{
int i;
TCGLabelQemuLdst *label;
/* qemu_ld/st slow paths */
for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
if (label->is_ld) {
tcg_out_qemu_ld_slow_path(s, label);
} else {
tcg_out_qemu_st_slow_path(s, label);
}
tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, seg, opc);
}
#endif
}
#endif /* CONFIG_SOFTMMU */
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
const TCGArg *args, const int *const_args)
@@ -299,6 +299,14 @@ void tcg_func_start(TCGContext *s)
gen_opc_ptr = gen_opc_buf;
gen_opparam_ptr = gen_opparam_buf;
#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
/* Initialize qemu_ld/st labels to assist code generation of TLB miss
cases at the end of the TB */
s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
TCG_MAX_QEMU_LDST);
s->nb_qemu_ldst_labels = 0;
#endif
}
static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2314,6 +2322,10 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
#endif
}
the_end:
#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
/* Generate TB finalization at the end of block */
tcg_out_tb_finalize(s);
#endif
return -1;
}
@@ -188,6 +188,24 @@ typedef tcg_target_ulong TCGArg;
are aliases for target_ulong and host pointer sized values respectively.
*/
#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
/* Macros/structures for qemu_ld/st IR code optimization:
TCG_MAX_QEMU_LDST is defined to be the same as OPC_BUF_SIZE in exec-all.h. */
#define TCG_MAX_QEMU_LDST 640
typedef struct TCGLabelQemuLdst {
int is_ld:1; /* qemu_ld: 1, qemu_st: 0 */
int opc:4;
int addrlo_reg; /* reg index for low word of guest virtual addr */
int addrhi_reg; /* reg index for high word of guest virtual addr */
int datalo_reg; /* reg index for low word to be loaded or stored */
int datahi_reg; /* reg index for high word to be loaded or stored */
int mem_index; /* soft MMU memory index */
uint8_t *raddr; /* gen code addr of the IR following this qemu_ld/st */
uint8_t *label_ptr[2]; /* label pointers to be updated */
} TCGLabelQemuLdst;
#endif
#ifdef CONFIG_DEBUG_TCG
#define DEBUG_TCGV 1
#endif
@@ -431,6 +449,13 @@ struct TCGContext {
int temps_in_use;
int goto_tb_issue_mask;
#endif
#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
/* labels for qemu_ld/st IRs; they help generate TLB-miss case code
at the end of the TB */
TCGLabelQemuLdst *qemu_ldst_labels;
int nb_qemu_ldst_labels;
#endif
};
extern TCGContext tcg_ctx;
@@ -634,3 +659,8 @@ extern uint8_t *code_gen_prologue;
#endif
void tcg_register_jit(void *buf, size_t buf_size);
#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
/* Generate TB finalization at the end of block */
void tcg_out_tb_finalize(TCGContext *s);
#endif