
Understanding the Key Structures and Functions in QEMU

Key Structures

./target-arm/translate.c (DisasContext is defined in this file; it exists solely to drive the generation of a single TB)

typedef struct DisasContext {
    target_ulong pc;
    int is_jmp;
    /* Nonzero if this instruction has been conditionally skipped.  */
    int condjmp;
    /* The label that will be jumped to when the instruction is skipped.  */
    int condlabel;
    /* Thumb-2 conditional execution bits.  */
    int condexec_mask;
    int condexec_cond;
    struct TranslationBlock *tb; /* the TB this context is currently generating */
    int singlestep_enabled;
    int thumb;
    int bswap_code;
#if !defined(CONFIG_USER_ONLY)
    int user;
#endif
    int vfp_enabled;
    int vec_len;
    int vec_stride;
} DisasContext;
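
For orientation: gen_intermediate_code_internal() in the same file seeds one DisasContext per TB before the per-instruction decode loop runs. A minimal sketch, abridged (exact fields vary by QEMU version):

/* Abridged from gen_intermediate_code_internal() in target-arm/translate.c:
   one DisasContext is set up per TB, then guest instructions are decoded
   until something ends the block. */
DisasContext dc1, *dc = &dc1;

dc->tb = tb;                              /* the TB being generated */
dc->pc = tb->pc;                          /* first guest insn of the block */
dc->is_jmp = DISAS_NEXT;                  /* decode until a branch/exception */
dc->condjmp = 0;
dc->thumb = ARM_TBFLAG_THUMB(tb->flags);
dc->singlestep_enabled = env->singlestep_enabled;
/* ... loop: disas_arm_insn()/disas_thumb_insn() advance dc->pc and emit
   TCG ops until dc->is_jmp != DISAS_NEXT ... */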

./include/exec/exec-all.h

struct TranslationBlock {
    target_ulong pc;   /* simulated PC corresponding to this block (EIP + CS base) */
    target_ulong cs_base; /* CS base for this block */
    uint64_t flags; /* flags defining in which context the code was generated */
    uint16_t size;      /* size of target code for this block (1 <=
                           size <= TARGET_PAGE_SIZE) */
    uint16_t cflags;    /* compile flags */
#define CF_COUNT_MASK  0x7fff
#define CF_LAST_IO     0x8000 /* Last insn may be an IO access.  */

    uint8_t *tc_ptr;    /* pointer to the translated code */    // start of the generated host code for this TB; one TB covers multiple guest insns
    /* next matching tb for physical address. */
    struct TranslationBlock *phys_hash_next;
    /* first and second physical page containing code. The lower bit
       of the pointer tells the index in page_next[] */
    struct TranslationBlock *page_next[2];
    tb_page_addr_t page_addr[2];

    /* the following data are used to directly call another TB from
       the code of this one. */
    uint16_t tb_next_offset[2]; /* offset of original jump target */
#ifdef USE_DIRECT_JUMP
    uint16_t tb_jmp_offset[2]; /* offset of jump instruction */
#else
    uintptr_t tb_next[2]; /* address of jump generated code */
#endif
    /* list of TBs jumping to this one. This is a circular list using
       the two least significant bits of the pointers to tell what is
       the next pointer: 0 = jmp_next[0], 1 = jmp_next[1], 2 =
       jmp_first */
    struct TranslationBlock *jmp_next[2];
    struct TranslationBlock *jmp_first;
    uint32_t icount;
};
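
The chaining fields above are easiest to read through tb_add_jump(), defined in the same header; abridged:

/* Chain exit n (0 or 1) of 'tb' directly to 'tb_next', so execution can
   flow between translated blocks without returning to the main loop. */
static inline void tb_add_jump(TranslationBlock *tb, int n,
                               TranslationBlock *tb_next)
{
    if (!tb->jmp_next[n]) {
        /* patch the jump emitted in tb's host code to land on tb_next */
        tb_set_jmp_target(tb, n, (uintptr_t)tb_next->tc_ptr);
        /* insert tb into the circular list of TBs that jump to tb_next,
           encoding the exit index in the pointer's low bits */
        tb->jmp_next[n] = tb_next->jmp_first;
        tb_next->jmp_first = (TranslationBlock *)((uintptr_t)tb | n);
    }
}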


./target-arm/cpu.h

typedef struct CPUARMState {
    /* Regs for current mode.  */
    uint32_t regs[16];
    /* Frequently accessed CPSR bits are stored separately for efficiency.
       This contains all the other bits.  Use cpsr_{read,write} to access
       the whole CPSR.  */
    uint32_t uncached_cpsr;
    uint32_t spsr;

    /* Banked registers.  */
    uint32_t banked_spsr[6];
    uint32_t banked_r13[6];
    uint32_t banked_r14[6];

    /* These hold r8-r12.  */
    uint32_t usr_regs[5];
    uint32_t fiq_regs[5];

    /* cpsr flag cache for faster execution */
    uint32_t CF; /* 0 or 1 */
    uint32_t VF; /* V is the bit 31. All other bits are undefined */
    uint32_t NF; /* N is bit 31. All other bits are undefined.  */
    uint32_t ZF; /* Z set if zero.  */
    uint32_t QF; /* 0 or 1 */
    uint32_t GE; /* cpsr[19:16] */
    uint32_t thumb; /* cpsr[5]. 0 = arm mode, 1 = thumb mode. */
    uint32_t condexec_bits; /* IT bits.  cpsr[15:10,26:25].  */

    /* System control coprocessor (cp15) */
    struct {
        uint32_t c0_cpuid;
        uint32_t c0_cssel; /* Cache size selection.  */
        uint32_t c1_sys; /* System control register.  */
        uint32_t c1_coproc; /* Coprocessor access register.  */
        uint32_t c1_xscaleauxcr; /* XScale auxiliary control register.  */
        uint32_t c1_scr; /* secure config register.  */
        uint32_t c2_base0; /* MMU translation table base 0.  */
        uint32_t c2_base0_hi; /* MMU translation table base 0, high 32 bits */
        uint32_t c2_base1; /* MMU translation table base 1.  */
        uint32_t c2_base1_hi; /* MMU translation table base 1, high 32 bits */
        uint32_t c2_control; /* MMU translation table base control.  */
        uint32_t c2_mask; /* MMU translation table base selection mask.  */
        uint32_t c2_base_mask; /* MMU translation table base 0 mask. */
        uint32_t c2_data; /* MPU data cachable bits.  */
        uint32_t c2_insn; /* MPU instruction cachable bits.  */
        uint32_t c3; /* MMU domain access control register
                        MPU write buffer control.  */
        uint32_t c5_insn; /* Fault status registers.  */
        uint32_t c5_data;
        uint32_t c6_region[8]; /* MPU base/size registers.  */
        uint32_t c6_insn; /* Fault address registers.  */
        uint32_t c6_data;
        uint32_t c7_par;  /* Translation result. */
        uint32_t c7_par_hi;  /* Translation result, high 32 bits */
        uint32_t c9_insn; /* Cache lockdown registers.  */
        uint32_t c9_data;
        uint32_t c9_pmcr; /* performance monitor control register */
        uint32_t c9_pmcnten; /* perf monitor counter enables */
        uint32_t c9_pmovsr; /* perf monitor overflow status */
        uint32_t c9_pmxevtyper; /* perf monitor event type */
        uint32_t c9_pmuserenr; /* perf monitor user enable */
        uint32_t c9_pminten; /* perf monitor interrupt enables */
        uint32_t c13_fcse; /* FCSE PID.  */
        uint32_t c13_context; /* Context ID.  */
        uint32_t c13_tls1; /* User RW Thread register.  */
        uint32_t c13_tls2; /* User RO Thread register.  */
        uint32_t c13_tls3; /* Privileged Thread register.  */
        uint32_t c15_cpar; /* XScale Coprocessor Access Register */
        uint32_t c15_ticonfig; /* TI925T configuration byte.  */
        uint32_t c15_i_max; /* Maximum D-cache dirty line index.  */
        uint32_t c15_i_min; /* Minimum D-cache dirty line index.  */
        uint32_t c15_threadid; /* TI debugger thread-ID.  */
        uint32_t c15_config_base_address; /* SCU base address.  */
        uint32_t c15_diagnostic; /* diagnostic register */
        uint32_t c15_power_diagnostic;
        uint32_t c15_power_control; /* power control */
    } cp15;

    struct {
        uint32_t other_sp;
        uint32_t vecbase;
        uint32_t basepri;
        uint32_t control;
        int current_sp;
        int exception;
        int pending_exception;
    } v7m;

    /* Thumb-2 EE state.  */
    uint32_t teecr;
    uint32_t teehbr;

    /* VFP coprocessor state.  */
    struct {
        float64 regs[32];

        uint32_t xregs[16];
        /* We store these fpcsr fields separately for convenience.  */
        int vec_len;
        int vec_stride;

        /* scratch space when Tn are not sufficient.  */
        uint32_t scratch[8];

        /* fp_status is the "normal" fp status. standard_fp_status retains
         * values corresponding to the ARM "Standard FPSCR Value", ie
         * default-NaN, flush-to-zero, round-to-nearest and is used by
         * any operations (generally Neon) which the architecture defines
         * as controlled by the standard FPSCR value rather than the FPSCR.
         *
         * To avoid having to transfer exception bits around, we simply
         * say that the FPSCR cumulative exception flags are the logical
         * OR of the flags in the two fp statuses. This relies on the
         * only thing which needs to read the exception flags being
         * an explicit FPSCR read.
         */
        float_status fp_status;
        float_status standard_fp_status;
    } vfp;
    uint32_t exclusive_addr;
    uint32_t exclusive_val;
    uint32_t exclusive_high;
#if defined(CONFIG_USER_ONLY)
    uint32_t exclusive_test;
    uint32_t exclusive_info;
#endif

    /* iwMMXt coprocessor state.  */
    struct {
        uint64_t regs[16];
        uint64_t val;

        uint32_t cregs[16];
    } iwmmxt;

    /* For mixed endian mode.  */
    bool bswap_code;

#if defined(CONFIG_USER_ONLY)
    /* For usermode syscall translation.  */
    int eabi;
#endif

    CPU_COMMON

    /* These fields after the common ones so they are preserved on reset.  */

    /* Internal CPU feature flags.  */
    uint64_t features;

    void *nvic;
    const struct arm_boot_info *boot_info;
} CPUARMState;
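
The split flag cache above (CF, VF, NF, ZF, QF, GE) is reassembled into a full CPSR only on demand; cpsr_read() in target-arm/helper.c does roughly this:

uint32_t cpsr_read(CPUARMState *env)
{
    int ZF = (env->ZF == 0);                 /* ZF caches the result word */
    return env->uncached_cpsr
        | (env->NF & 0x80000000)             /* N: bit 31 of NF */
        | (ZF << 30)                         /* Z */
        | (env->CF << 29)                    /* C */
        | ((env->VF & 0x80000000) >> 3)      /* V: bit 31 of VF -> bit 28 */
        | (env->QF << 27)                    /* Q */
        | (env->thumb << 5)                  /* T */
        | ((env->condexec_bits & 3) << 25)   /* IT bits, cpsr[26:25] */
        | ((env->condexec_bits & 0xfc) << 8) /* IT bits, cpsr[15:10] */
        | (env->GE << 16);                   /* GE bits */
}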

./include/exec/cpu-defs.h

#define CPU_COMMON                                                      \
    struct TranslationBlock *current_tb; /* currently executing TB  */  \
    /* soft mmu support */                                              \
    /* in order to avoid passing too many arguments to the MMIO         \
       helpers, we store some rarely used information in the CPU        \
       context */                                                      \
    uintptr_t mem_io_pc; /* host pc at which the memory was             \
                            accessed */                                 \
    target_ulong mem_io_vaddr; /* target virtual addr at which the      \
                                     memory was accessed */             \
    uint32_t halted; /* Nonzero if the CPU is in suspend state */       \
    uint32_t interrupt_request;                                         \
    volatile sig_atomic_t exit_request;                                 \
    CPU_COMMON_TLB                                                      \
    struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];           \
    /* buffer for temporaries in the code generator */                  \
    long temp_buf[CPU_TEMP_BUF_NLONGS];                                 \
                                                                        \
    int64_t icount_extra; /* Instructions until next timer event.  */   \
    /* Number of cycles left, with interrupt flag in high bit.          \
       This allows a single read-compare-cbranch-write sequence to test \
       for both decrementer underflow and exceptions.  */               \
    union {                                                             \
        uint32_t u32;                                                   \
        icount_decr_u16 u16;                                            \
    } icount_decr;                                                      \
    uint32_t can_do_io; /* nonzero if memory mapped IO is safe.  */     \
                                                                        \
    /* from this point: preserved by CPU reset */                       \
    /* ice debug support */                                             \
    QTAILQ_HEAD(breakpoints_head, CPUBreakpoint) breakpoints;            \
    int singlestep_enabled;                                             \
                                                                        \
    QTAILQ_HEAD(watchpoints_head, CPUWatchpoint) watchpoints;            \
    CPUWatchpoint *watchpoint_hit;                                      \
                                                                        \
    struct GDBRegisterState *gdb_regs;                                  \
                                                                        \
    /* Core interrupt code */                                           \
    jmp_buf jmp_env;                                                    \
    int exception_index;                                                \
                                                                        \
    CPUArchState *next_cpu; /* next CPU sharing TB cache */                 \
    uint32_t host_tid; /* host thread ID */                             \
    int running; /* Nonzero if cpu is currently running(usermode).  */  \
    /* user data */                                                     \
    void *opaque;                                                       \
                                                                        \
    const char *cpu_model_str;

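tb_jmp_cache above is the per-CPU first-level TB lookup. cpu-exec.c probes it by hashing the guest pc and falls back to the global hash table on a miss; roughly:

/* Sketch of the fast TB lookup in cpu-exec.c: hash the guest pc into
   tb_jmp_cache and verify the hit before falling back to tb_find_slow(). */
static inline TranslationBlock *tb_find_fast(CPUArchState *env)
{
    TranslationBlock *tb;
    target_ulong cs_base, pc;
    int flags;

    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
    tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
    if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base
                 || tb->flags != flags)) {
        tb = tb_find_slow(env, pc, cs_base, flags);
    }
    return tb;
}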

./tcg/tcg.h

typedef struct TCGContext TCGContext;

struct TCGContext {
    uint8_t *pool_cur, *pool_end;
    TCGPool *pool_first, *pool_current, *pool_first_large;
    TCGLabel *labels;
    int nb_labels;
    int nb_globals;
    int nb_temps;
    /* index of free temps, -1 if none */
    int first_free_temp[TCG_TYPE_COUNT * 2]; 

    /* goto_tb support */
    uint8_t *code_buf;
    uintptr_t *tb_next;
    uint16_t *tb_next_offset;
    uint16_t *tb_jmp_offset; /* != NULL if USE_DIRECT_JUMP */

    /* liveness analysis */
    uint16_t *op_dead_args; /* for each operation, each bit tells if the
                               corresponding argument is dead */
    uint8_t *op_sync_args;  /* for each operation, each bit tells if the
                               corresponding output argument needs to be
                               sync to memory. */
    
    /* tells in which temporary a given register is. It does not take
       into account fixed registers */
    int reg_to_temp[TCG_TARGET_NB_REGS];
    TCGRegSet reserved_regs;
    tcg_target_long current_frame_offset;
    tcg_target_long frame_start;
    tcg_target_long frame_end;
    int frame_reg;

    uint8_t *code_ptr;
    TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */

    TCGHelperInfo *helpers;
    int nb_helpers;
    int allocated_helpers;
    int helpers_sorted;

#ifdef CONFIG_PROFILER
    /* profiling info */
    int64_t tb_count1;
    int64_t tb_count;
    int64_t op_count; /* total insn count */
    int op_count_max; /* max insn per TB */
    int64_t temp_count;
    int temp_count_max;
    int64_t del_op_count;
    int64_t code_in_len;
    int64_t code_out_len;
    int64_t interm_time;
    int64_t code_time;
    int64_t la_time;
    int64_t opt_time;
    int64_t restore_count;
    int64_t restore_time;
#endif

#ifdef CONFIG_DEBUG_TCG
    int temps_in_use;
    int goto_tb_issue_mask;
#endif

    uint16_t gen_opc_buf[OPC_BUF_SIZE];
    TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE];

    uint16_t *gen_opc_ptr;
    TCGArg *gen_opparam_ptr;
    target_ulong gen_opc_pc[OPC_BUF_SIZE];
    uint16_t gen_opc_icount[OPC_BUF_SIZE];
    uint8_t gen_opc_instr_start[OPC_BUF_SIZE];

#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
    /* labels info for qemu_ld/st IRs
       The labels help to generate TLB miss case codes at the end of TB */
    TCGLabelQemuLdst *qemu_ldst_labels;
    int nb_qemu_ldst_labels;
#endif
};

extern TCGContext tcg_ctx;
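
The gen_opc_buf/gen_opparam_buf pair is QEMU's intermediate representation: the guest frontends append one opcode plus its operands per TCG op through gen_opc_ptr/gen_opparam_ptr. A representative emit helper from tcg/tcg-op.h, as a sketch (assuming this QEMU version, where the buffers live inside tcg_ctx):

/* Every tcg_gen_* frontend helper boils down to appends like this one. */
static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2)
{
    *tcg_ctx.gen_opc_ptr++ = opc;                    /* opcode stream */
    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1); /* operand stream */
    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
}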


Key Functions


//tcg/tcg.c
int tcg_gen_code(TCGContext *s, uint8_t *gen_code_buf)
{
#ifdef CONFIG_PROFILER
    {
        int n;
        n = (s->gen_opc_ptr - s->gen_opc_buf);
        s->op_count += n;
        if (n > s->op_count_max)
            s->op_count_max = n;

        s->temp_count += s->nb_temps;
        if (s->nb_temps > s->temp_count_max)
            s->temp_count_max = s->nb_temps;
    }
#endif

    tcg_gen_code_common(s, gen_code_buf, -1);

    /* flush instruction cache */
    flush_icache_range((tcg_target_ulong)gen_code_buf,
                       (tcg_target_ulong)s->code_ptr);

    return s->code_ptr -  gen_code_buf;
}
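
tcg_gen_code() returns the number of host-code bytes it emitted. Its caller, cpu_gen_code() in translate-all.c, first runs the guest frontend to fill the op buffers and then hands them to TCG; an abridged sketch:

/* Abridged sketch of cpu_gen_code() (translate-all.c): the frontend fills
   the op buffers, then tcg_gen_code() turns them into host code. */
int cpu_gen_code(CPUArchState *env, TranslationBlock *tb,
                 int *gen_code_size_ptr)
{
    TCGContext *s = &tcg_ctx;

    tcg_func_start(s);
    gen_intermediate_code(env, tb);      /* guest insns -> TCG ops */

    /* let TCG fill in the block-chaining slots of this TB */
    tb->tb_next_offset[0] = 0xffff;
    tb->tb_next_offset[1] = 0xffff;
    s->tb_next_offset = tb->tb_next_offset;
#ifdef USE_DIRECT_JUMP
    s->tb_jmp_offset = tb->tb_jmp_offset;
    s->tb_next = NULL;
#else
    s->tb_jmp_offset = NULL;
    s->tb_next = tb->tb_next;
#endif

    *gen_code_size_ptr = tcg_gen_code(s, tb->tc_ptr); /* TCG ops -> host code */
    return 0;
}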

//tcg/tcg.c
static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
                                      long search_pc)
{
    TCGOpcode opc;
    int op_index;
    const TCGOpDef *def;
    const TCGArg *args;

#ifdef DEBUG_DISAS
    if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
        qemu_log("OP:\n");
        tcg_dump_ops(s);
        qemu_log("\n");
    }
#endif

#ifdef CONFIG_PROFILER
    s->opt_time -= profile_getclock();
#endif

#ifdef USE_TCG_OPTIMIZATIONS
    s->gen_opparam_ptr =
        tcg_optimize(s, s->gen_opc_ptr, s->gen_opparam_buf, tcg_op_defs);
#endif

#ifdef CONFIG_PROFILER
    s->opt_time += profile_getclock();
    s->la_time -= profile_getclock();
#endif

    tcg_liveness_analysis(s);

#ifdef CONFIG_PROFILER
    s->la_time += profile_getclock();
#endif

#ifdef DEBUG_DISAS
    if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT))) {
        qemu_log("OP after optimization and liveness analysis:\n");
        tcg_dump_ops(s);
        qemu_log("\n");
    }
#endif

    tcg_reg_alloc_start(s);

    s->code_buf = gen_code_buf;
    s->code_ptr = gen_code_buf;

    args = s->gen_opparam_buf;
    op_index = 0;

    for(;;) {
        opc = s->gen_opc_buf[op_index];
#ifdef CONFIG_PROFILER
        tcg_table_op_count[opc]++;
#endif
        def = &tcg_op_defs[opc];
#if 0
        printf("%s: %d %d %d\n", def->name,
               def->nb_oargs, def->nb_iargs, def->nb_cargs);
        //        dump_regs(s);
#endif
        switch(opc) {
        case INDEX_op_mov_i32:
        case INDEX_op_mov_i64:
            tcg_reg_alloc_mov(s, def, args, s->op_dead_args[op_index],
                              s->op_sync_args[op_index]);
            break;
        case INDEX_op_movi_i32:
        case INDEX_op_movi_i64:
            tcg_reg_alloc_movi(s, args, s->op_dead_args[op_index],
                               s->op_sync_args[op_index]);
            break;
        case INDEX_op_debug_insn_start:
            /* debug instruction */
            break;
        case INDEX_op_nop:
        case INDEX_op_nop1:
        case INDEX_op_nop2:
        case INDEX_op_nop3:
            break;
        case INDEX_op_nopn:
            args += args[0];
            goto next;
        case INDEX_op_discard:
            temp_dead(s, args[0]);
            break;
        case INDEX_op_set_label:
            tcg_reg_alloc_bb_end(s, s->reserved_regs);
            tcg_out_label(s, args[0], s->code_ptr);
            break;
        case INDEX_op_call:
            args += tcg_reg_alloc_call(s, def, opc, args,
                                       s->op_dead_args[op_index],
                                       s->op_sync_args[op_index]);
            goto next;
        case INDEX_op_end:
            goto the_end;
        default:
            /* Sanity check that we've not introduced any unhandled opcodes. */
            if (def->flags & TCG_OPF_NOT_PRESENT) {
                tcg_abort();
            }
            /* Note: in order to speed up the code, it would be much
               faster to have specialized register allocator functions for
               some common argument patterns */
            tcg_reg_alloc_op(s, def, opc, args, s->op_dead_args[op_index],
                             s->op_sync_args[op_index]);
            break;
        }
        args += def->nb_args;
    next:
        if (search_pc >= 0 && search_pc < s->code_ptr - gen_code_buf) {
            return op_index;
        }
        op_index++;
#ifndef NDEBUG
        check_regs(s);
#endif
    }
 the_end:
#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
    /* Generate TB finalization at the end of block */
    tcg_out_tb_finalize(s);
#endif
    return -1;
}
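
The search_pc parameter explains the -1 passed by tcg_gen_code(): with search_pc < 0 the loop runs to completion and returns -1, while the sibling entry point below re-runs code generation to find which op corresponds to a given host-code offset, so the guest pc can be restored from gen_opc_pc[] after a fault; abridged:

/* Return the index of the op whose emitted host code crosses 'offset'
   bytes from the start of the buffer (used to recover the guest pc
   after an exception). */
int tcg_gen_code_search_pc(TCGContext *s, uint8_t *gen_code_buf, long offset)
{
    return tcg_gen_code_common(s, gen_code_buf, offset);
}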



//tcg/i386/tcg-target.c
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                              const TCGArg *args, const int *const_args)
{
    int c, rexw = 0;

#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i64): \
            rexw = P_REXW; /* FALLTHRU */    \
        case glue(glue(INDEX_op_, x), _i32)
#else
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i32)
#endif

    switch(opc) {
    case INDEX_op_exit_tb:
        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, args[0]);
        tcg_out_jmp(s, (tcg_target_long) tb_ret_addr);
        break;
    case INDEX_op_goto_tb:
        if (s->tb_jmp_offset) {
            /* direct jump method */
            tcg_out8(s, OPC_JMP_long); /* jmp im */
            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
            tcg_out32(s, 0);
        } else {
            /* indirect jump method */
            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
                                 (tcg_target_long)(s->tb_next + args[0]));
        }
        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
        break;
    case INDEX_op_call:
        if (const_args[0]) {
            tcg_out_calli(s, args[0]);
        } else {
            /* call *reg */
            tcg_out_modrm(s, OPC_GRP5, EXT5_CALLN_Ev, args[0]);
        }
        break;
    case INDEX_op_br:
        tcg_out_jxx(s, JCC_JMP, args[0], 0);
        break;
    case INDEX_op_movi_i32:
        tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
        break;
    OP_32_64(ld8u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZBL, args[0], args[1], args[2]);
        break;
    OP_32_64(ld8s):
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, args[0], args[1], args[2]);
        break;
    OP_32_64(ld16u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZWL, args[0], args[1], args[2]);
        break;
    OP_32_64(ld16s):
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, args[0], args[1], args[2]);
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_ld32u_i64:
#endif
    case INDEX_op_ld_i32:
        tcg_out_ld(s, TCG_TYPE_I32, args[0], args[1], args[2]);
        break;

    OP_32_64(st8):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz,
                                 0, args[1], args[2]);
            tcg_out8(s, args[0]);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R,
                                 args[0], args[1], args[2]);
        }
        break;
    OP_32_64(st16):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16,
                                 0, args[1], args[2]);
            tcg_out16(s, args[0]);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16,
                                 args[0], args[1], args[2]);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_st32_i64:
#endif
    case INDEX_op_st_i32:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, args[1], args[2]);
            tcg_out32(s, args[0]);
        } else {
            tcg_out_st(s, TCG_TYPE_I32, args[0], args[1], args[2]);
        }
        break;

    OP_32_64(add):
        /* For 3-operand addition, use LEA.  */
        if (args[0] != args[1]) {
            TCGArg a0 = args[0], a1 = args[1], a2 = args[2], c3 = 0;

            if (const_args[2]) {
                c3 = a2, a2 = -1;
            } else if (a0 == a2) {
                /* Watch out for dest = src + dest, since we've removed
                   the matching constraint on the add.  */
                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
                break;
            }

            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
            break;
        }
        c = ARITH_ADD;
        goto gen_arith;
    OP_32_64(sub):
        c = ARITH_SUB;
        goto gen_arith;
    OP_32_64(and):
        c = ARITH_AND;
        goto gen_arith;
    OP_32_64(or):
        c = ARITH_OR;
        goto gen_arith;
    OP_32_64(xor):
        c = ARITH_XOR;
        goto gen_arith;
    gen_arith:
        if (const_args[2]) {
            tgen_arithi(s, c + rexw, args[0], args[2], 0);
        } else {
            tgen_arithr(s, c + rexw, args[0], args[2]);
        }
        break;

    OP_32_64(mul):
        if (const_args[2]) {
            int32_t val;
            val = args[2];
            if (val == (int8_t)val) {
                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, args[0], args[0]);
                tcg_out8(s, val);
            } else {
                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, args[0], args[0]);
                tcg_out32(s, val);
            }
        } else {
            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, args[0], args[2]);
        }
        break;

    OP_32_64(div2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
        break;
    OP_32_64(divu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
        break;

    OP_32_64(shl):
        c = SHIFT_SHL;
        goto gen_shift;
    OP_32_64(shr):
        c = SHIFT_SHR;
        goto gen_shift;
    OP_32_64(sar):
        c = SHIFT_SAR;
        goto gen_shift;
    OP_32_64(rotl):
        c = SHIFT_ROL;
        goto gen_shift;
    OP_32_64(rotr):
        c = SHIFT_ROR;
        goto gen_shift;
    gen_shift:
        if (const_args[2]) {
            tcg_out_shifti(s, c + rexw, args[0], args[2]);
        } else {
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, args[0]);
        }
        break;

    case INDEX_op_brcond_i32:
        tcg_out_brcond32(s, args[2], args[0], args[1], const_args[1],
                         args[3], 0);
        break;
    case INDEX_op_setcond_i32:
        tcg_out_setcond32(s, args[3], args[0], args[1],
                          args[2], const_args[2]);
        break;
    case INDEX_op_movcond_i32:
        tcg_out_movcond32(s, args[5], args[0], args[1],
                          args[2], const_args[2], args[3]);
        break;

    OP_32_64(bswap16):
        tcg_out_rolw_8(s, args[0]);
        break;
    OP_32_64(bswap32):
        tcg_out_bswap32(s, args[0]);
        break;

    OP_32_64(neg):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, args[0]);
        break;
    OP_32_64(not):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, args[0]);
        break;

    OP_32_64(ext8s):
        tcg_out_ext8s(s, args[0], args[1], rexw);
        break;
    OP_32_64(ext16s):
        tcg_out_ext16s(s, args[0], args[1], rexw);
        break;
    OP_32_64(ext8u):
        tcg_out_ext8u(s, args[0], args[1]);
        break;
    OP_32_64(ext16u):
        tcg_out_ext16u(s, args[0], args[1]);
        break;

    case INDEX_op_qemu_ld8u:
        tcg_out_qemu_ld(s, args, 0);
        break;
    case INDEX_op_qemu_ld8s:
        tcg_out_qemu_ld(s, args, 0 | 4);
        break;
    case INDEX_op_qemu_ld16u:
        tcg_out_qemu_ld(s, args, 1);
        break;
    case INDEX_op_qemu_ld16s:
        tcg_out_qemu_ld(s, args, 1 | 4);
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_qemu_ld32u:
#endif
    case INDEX_op_qemu_ld32:
        tcg_out_qemu_ld(s, args, 2);
        break;
    case INDEX_op_qemu_ld64:
        tcg_out_qemu_ld(s, args, 3);
        break;

    case INDEX_op_qemu_st8:
        tcg_out_qemu_st(s, args, 0);
        break;
    case INDEX_op_qemu_st16:
        tcg_out_qemu_st(s, args, 1);
        break;
    case INDEX_op_qemu_st32:
        tcg_out_qemu_st(s, args, 2);
        break;
    case INDEX_op_qemu_st64:
        tcg_out_qemu_st(s, args, 3);
        break;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_brcond2(s, args, const_args, 0);
        break;
    case INDEX_op_setcond2_i32:
        tcg_out_setcond2(s, args, const_args);
        break;
    case INDEX_op_mulu2_i32:
        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_MUL, args[3]);
        break;
    case INDEX_op_add2_i32:
        if (const_args[4]) {
            tgen_arithi(s, ARITH_ADD, args[0], args[4], 1);
        } else {
            tgen_arithr(s, ARITH_ADD, args[0], args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_ADC, args[1], args[5], 1);
        } else {
            tgen_arithr(s, ARITH_ADC, args[1], args[5]);
        }
        break;
    case INDEX_op_sub2_i32:
        if (const_args[4]) {
            tgen_arithi(s, ARITH_SUB, args[0], args[4], 1);
        } else {
            tgen_arithr(s, ARITH_SUB, args[0], args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_SBB, args[1], args[5], 1);
        } else {
            tgen_arithr(s, ARITH_SBB, args[1], args[5]);
        }
        break;
#else /* TCG_TARGET_REG_BITS == 64 */
    case INDEX_op_movi_i64:
        tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]);
        break;
    case INDEX_op_ld32s_i64:
        tcg_out_modrm_offset(s, OPC_MOVSLQ, args[0], args[1], args[2]);
        break;
    case INDEX_op_ld_i64:
        tcg_out_ld(s, TCG_TYPE_I64, args[0], args[1], args[2]);
        break;
    case INDEX_op_st_i64:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW,
                                 0, args[1], args[2]);
            tcg_out32(s, args[0]);
        } else {
            tcg_out_st(s, TCG_TYPE_I64, args[0], args[1], args[2]);
        }
        break;
    case INDEX_op_qemu_ld32s:
        tcg_out_qemu_ld(s, args, 2 | 4);
        break;

    case INDEX_op_brcond_i64:
        tcg_out_brcond64(s, args[2], args[0], args[1], const_args[1],
                         args[3], 0);
        break;
    case INDEX_op_setcond_i64:
        tcg_out_setcond64(s, args[3], args[0], args[1],
                          args[2], const_args[2]);
        break;
    case INDEX_op_movcond_i64:
        tcg_out_movcond64(s, args[5], args[0], args[1],
                          args[2], const_args[2], args[3]);
        break;

    case INDEX_op_bswap64_i64:
        tcg_out_bswap64(s, args[0]);
        break;
    case INDEX_op_ext32u_i64:
        tcg_out_ext32u(s, args[0], args[1]);
        break;
    case INDEX_op_ext32s_i64:
        tcg_out_ext32s(s, args[0], args[1]);
        break;
#endif

    OP_32_64(deposit):
        if (args[3] == 0 && args[4] == 8) {
            /* load bits 0..7 */
            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM,
                          args[2], args[0]);
        } else if (args[3] == 8 && args[4] == 8) {
            /* load bits 8..15 */
            tcg_out_modrm(s, OPC_MOVB_EvGv, args[2], args[0] + 4);
        } else if (args[3] == 0 && args[4] == 16) {
            /* load bits 0..15 */
            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, args[2], args[0]);
        } else {
            tcg_abort();
        }
        break;

    default:
        tcg_abort();
    }

#undef OP_32_64
}
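
As a worked example of the OP_32_64() trick: with TCG_TARGET_REG_BITS == 64, OP_32_64(add) expands to two case labels sharing one body, with the 64-bit label setting the REX.W prefix before falling through:

/* Expansion of OP_32_64(add) when TCG_TARGET_REG_BITS == 64: */
case INDEX_op_add_i64:
    rexw = P_REXW; /* FALLTHRU */
case INDEX_op_add_i32:
    /* ...shared body follows; rexw selects the 32- vs 64-bit encoding... */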