
KVM Virtual Machine IO Processing (Part 2) ---- The QEMU/KVM I/O Handling Path

Following the first article on how IO is handled inside the guest VM (http://blog.csdn.net/dashulu/article/details/16820281), this article describes how IO is processed after it traps out of the guest VM into KVM and QEMU.

    First, a quick recap of the KVM startup process (http://blog.csdn.net/dashulu/article/details/17074675). QEMU starts KVM through a series of interfaces that KVM exposes. QEMU's entry point is the main function in vl.c, which initializes KVM by calling kvm_init and machine->init. machine->init creates the vcpus, each emulated by a dedicated thread running qemu_kvm_cpu_thread_fn; that thread eventually calls kvm_cpu_exec, which invokes kvm_vcpu_ioctl to switch into KVM. When control later returns from KVM, execution resumes right after kvm_vcpu_ioctl, where the exit_reason is examined and handled accordingly.
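
For orientation, here is a simplified sketch of that vcpu thread function, modeled on the cpus.c of this QEMU generation (locking, error handling and some helper names vary across versions, so treat it as illustrative rather than exact):

/* Simplified sketch of the vcpu thread, modeled on cpus.c of this era.
 * Locking and error handling are trimmed; helper names vary by version. */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    kvm_init_vcpu(cpu);              /* KVM_CREATE_VCPU + mmap of kvm_run */

    /* Loop forever: run the guest whenever the vcpu is runnable,
     * otherwise sleep until there is work to do. */
    while (1) {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);   /* enters the kernel via KVM_RUN */
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    }

    return NULL;
}

kvm_cpu_exec itself is shown below: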

int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        cpu->exit_request = 0;
        return EXCP_HLT;
    }

    do {
        if (cpu->kvm_vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->kvm_vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (cpu->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        qemu_mutex_unlock_iothread();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        kvm_arch_post_run(cpu, run);

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
            abort();
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    cpu->exit_request = 0;
    return ret;
}

When kvm_vcpu_ioctl executes, the KVM function invoked is kvm_vcpu_ioctl in virt/kvm/kvm_main.c. With KVM_RUN as the argument, the call eventually reaches vcpu_enter_guest, which calls kvm_x86_ops->run(vcpu); on the Intel architecture this function corresponds to vmx_vcpu_run. vmx_vcpu_run sets up the register state and then executes VMLAUNCH or VMRESUME to enter the guest VM; once a vm exit occurs, execution continues from that point.
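
To make that call chain concrete, here is a heavily condensed sketch of the KVM_RUN path (modeled on a ~3.x kernel; the real code in virt/kvm/kvm_main.c and arch/x86/kvm/x86.c contains far more state handling):

/* Heavily condensed sketch of the KVM_RUN path (kernel ~3.x).
 * Event injection, state sync and error paths are all omitted. */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
	int r;

	/* ... inject pending events, load guest state ... */
	kvm_x86_ops->run(vcpu);             /* vmx_vcpu_run on Intel */
	/* ... execution resumes here after a vm exit ... */
	r = kvm_x86_ops->handle_exit(vcpu); /* vmx_handle_exit on Intel */
	return r;                           /* r <= 0: back to userspace */
}

static int __vcpu_run(struct kvm_vcpu *vcpu)
{
	int r;

	do {
		r = vcpu_enter_guest(vcpu);
		/* ... check signals, pending requests, need_resched ... */
	} while (r > 0);

	return r;
}

vmx_vcpu_run is shown below: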

static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long debugctlmsr;

	/* ... many lines omitted here ... */
	vmx->__launched = vmx->loaded_vmcs->launched;
	asm(
		/* Store host registers */
		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
		"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
		"push %%" _ASM_CX " \n\t"
		"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
		"je 1f \n\t"
		"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
		"1: \n\t"
		/* Reload cr2 if changed */
		"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
		"mov %%cr2, %%" _ASM_DX " \n\t"
		"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
		"je 2f \n\t"
		"mov %%" _ASM_AX", %%cr2 \n\t"
		"2: \n\t"
		/* Check if vmlaunch or vmresume is needed */
		"cmpl $0, %c[launched](%0) \n\t"
		/* Load guest registers.  Don't clobber flags. */
		"mov %c[rax](%0), %%" _ASM_AX " \n\t"
		"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
		"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
		"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
		"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
		"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
		"mov %c[r8](%0),  %%r8  \n\t"
		"mov %c[r9](%0),  %%r9  \n\t"
		"mov %c[r10](%0), %%r10 \n\t"
		"mov %c[r11](%0), %%r11 \n\t"
		"mov %c[r12](%0), %%r12 \n\t"
		"mov %c[r13](%0), %%r13 \n\t"
		"mov %c[r14](%0), %%r14 \n\t"
		"mov %c[r15](%0), %%r15 \n\t"
#endif
		"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */

		/* Enter guest mode */
		"jne 1f \n\t"
		__ex(ASM_VMX_VMLAUNCH) "\n\t"
		"jmp 2f \n\t"
		"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
		"2: "
		/* Save guest registers, load host registers, keep flags */
		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
		"pop %0 \n\t"
		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
		__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
		"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
		"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
		"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
		"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
		"mov %%r8,  %c[r8](%0) \n\t"
		"mov %%r9,  %c[r9](%0) \n\t"
		"mov %%r10, %c[r10](%0) \n\t"
		"mov %%r11, %c[r11](%0) \n\t"
		"mov %%r12, %c[r12](%0) \n\t"
		"mov %%r13, %c[r13](%0) \n\t"
		"mov %%r14, %c[r14](%0) \n\t"
		"mov %%r15, %c[r15](%0) \n\t"
#endif
		"mov %%cr2, %%" _ASM_AX "   \n\t"
		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

		"pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
		"setbe %c[fail](%0) \n\t"
		".pushsection .rodata \n\t"
		".global vmx_return \n\t"
		"vmx_return: " _ASM_PTR " 2b \n\t"
		".popsection"
	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
		[wordsize]"i"(sizeof(ulong))
	      : "cc", "memory"
#ifdef CONFIG_X86_64
		, "rax", "rbx", "rdi", "rsi"
		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
		, "eax", "ebx", "edi", "esi"
#endif
	      );

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (debugctlmsr)
		update_debugctlmsr(debugctlmsr);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_load_host_state() since that function
	 * may be executed in interrupt context, which saves and restore segments
	 * around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
				  | (1 << VCPU_EXREG_RFLAGS)
				  | (1 << VCPU_EXREG_CPL)
				  | (1 << VCPU_EXREG_PDPTR)
				  | (1 << VCPU_EXREG_SEGMENTS)
				  | (1 << VCPU_EXREG_CR3));
	vcpu->arch.regs_dirty = 0;

	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	if (is_guest_mode(vcpu)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
		if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
			vmcs12->idt_vectoring_error_code =
				vmcs_read32(IDT_VECTORING_ERROR_CODE);
			vmcs12->vm_exit_instruction_len =
				vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		}
	}

	vmx->loaded_vmcs->launched = 1;

	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);

	vmx_complete_atomic_exit(vmx);
	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);
}

 

    With initialization covered, we can now follow an IO request through KVM and QEMU. When the guest VM performs an IO operation that needs to access a device, a vm exit is triggered and control returns to vmx_vcpu_run. After saving the guest state and recording the VM_EXIT_REASON from the VMCS, vmx_vcpu_run returns to its caller vcpu_enter_guest, which at its end calls r = kvm_x86_ops->handle_exit(vcpu). On Intel that function pointer corresponds to vmx_handle_exit (the mapping can be found in the static struct kvm_x86_ops vmx_x86_ops in vmx.c). vmx_handle_exit then invokes kvm_vmx_exit_handlers[exit_reason](vcpu), dispatching to a different handler for each exit_reason (for IO exits, see the sketch after the table). The dispatch table is defined as follows:

static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
	[EXIT_REASON_NMI_WINDOW]	      = handle_nmi_window,
	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
	[EXIT_REASON_CR_ACCESS]               = handle_cr,
	[EXIT_REASON_DR_ACCESS]               = handle_dr,
	[EXIT_REASON_CPUID]                   = handle_cpuid,
	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
	[EXIT_REASON_HLT]                     = handle_halt,
	[EXIT_REASON_INVD]		      = handle_invd,
	[EXIT_REASON_INVLPG]		      = handle_invlpg,
	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
	[EXIT_REASON_VMCALL]                  = handle_vmcall,
	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
	[EXIT_REASON_VMREAD]                  = handle_vmread,
	[EXIT_REASON_VMRESUME]                = handle_vmresume,
	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
	[EXIT_REASON_VMOFF]                   = handle_vmoff,
	[EXIT_REASON_VMON]                    = handle_vmon,
	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
};
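
For an IO-caused vm exit this table dispatches to handle_io. Below is a rough sketch of it, modeled on the arch/x86/kvm/vmx.c of this kernel generation (string operations and IN go through the full instruction emulator, while a plain OUT takes a fast path):

/* Rough sketch of handle_io, modeled on arch/x86/kvm/vmx.c of this era. */
static int handle_io(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;
	int size, in, string;
	unsigned port;

	/* The exit qualification encodes the port, access size, direction
	 * and whether this was a string instruction (INS/OUTS). */
	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	string = (exit_qualification & 16) != 0;
	in = (exit_qualification & 8) != 0;

	++vcpu->stat.io_exits;

	if (string || in)
		return emulate_instruction(vcpu, 0) == EMULATE_DONE;

	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;
	skip_emulated_instruction(vcpu);

	/* Fills vcpu->run with the KVM_EXIT_IO details and returns 0,
	 * which sends the vcpu back to userspace (QEMU). */
	return kvm_fast_pio_out(vcpu, size, port);
}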

So if the vm exit was caused by IO, the handler invoked is handle_io (a more detailed walkthrough is available at http://blog.csdn.net/fanwenyi/article/details/12748613). After this kernel-side step the IO still has to be completed by QEMU, so control returns to userspace and execution continues inside kvm_cpu_exec. Looking back at the kvm_cpu_exec code above, an IO-triggered return takes the KVM_EXIT_IO branch and calls kvm_handle_io:

switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;

kvm_handle_io dispatches to cpu_outb, cpu_outw, etc. (or cpu_inb, cpu_inw, etc. for the IN direction) to perform the IO operation.
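
Its body, modeled on kvm-all.c of this QEMU generation, is essentially a loop over the request (the data lives inside the shared kvm_run page, and count is greater than 1 for string IO):

/* Modeled on kvm_handle_io in kvm-all.c of this era.  stb_p/ldub_p and
 * friends are QEMU's byte-order-aware load/store helpers. */
static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
                          uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }
        ptr += size;
    }
}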

    Assuming the virtual machine uses a raw-format disk image, the IO passes through the following call stack in QEMU:

#0  bdrv_aio_writev (bs=0x55555629e9b0, sector_num=870456,
    qiov=0x555556715ab0, nb_sectors=1,
    cb=0x55555570161b <ide_sector_write_cb>, opaque=0x5555567157b8)
    at block.c:3408
#1  0x0000555555701960 in ide_sector_write (s=0x5555567157b8)
    at hw/ide/core.c:798
#2  0x00005555557047ae in ide_data_writew (opaque=0x555556715740, addr=496,
    val=8995) at hw/ide/core.c:1907
#3  0x00005555558d9e4c in portio_write (opaque=0x5555565c0670, addr=0,
    data=8995, size=2) at /home/dashu/kvm/qemu/qemu-dev-zwu/ioport.c:174
#4  0x00005555558e13d5 in memory_region_write_accessor (mr=0x5555565c0670,
    addr=0, value=0x7fffb4dbd528, size=2, shift=0, mask=65535)
    at /home/dashu/kvm/qemu/qemu-dev-zwu/memory.c:440
#5  0x00005555558e151d in access_with_adjusted_size (addr=0,
    value=0x7fffb4dbd528, size=2, access_size_min=1, access_size_max=4,
    access=0x5555558e1341 <memory_region_write_accessor>, mr=0x5555565c0670)
    at /home/dashu/kvm/qemu/qemu-dev-zwu/memory.c:477
#6  0x00005555558e3dfb in memory_region_dispatch_write (mr=0x5555565c0670,
    addr=0, data=8995, size=2)
    at /home/dashu/kvm/qemu/qemu-dev-zwu/memory.c:984
#7  0x00005555558e7384 in io_mem_write (mr=0x5555565c0670, addr=0, val=8995,
    size=2) at /home/dashu/kvm/qemu/qemu-dev-zwu/memory.c:1748
#8  0x000055555586a18e in address_space_rw (as=0x555556216d80, addr=496,
    buf=0x7fffb4dbd670 "##", len=2, is_write=true)
    at /home/dashu/kvm/qemu/qemu-dev-zwu/exec.c:1968
#9  0x000055555586a474 in address_space_write (as=0x555556216d80, addr=496,
    buf=0x7fffb4dbd670 "##", len=2)
    at /home/dashu/kvm/qemu/qemu-dev-zwu/exec.c:2030
#10 0x00005555558d98c9 in cpu_outw (addr=496, val=8995)
    at /home/dashu/kvm/qemu/qemu-dev-zwu/ioport.c:61
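
At the bottom of the stack (frame #10), cpu_outw simply stores the value in guest byte order and writes it into the IO address space; the memory API then dispatches to the IDE port handler seen in the upper frames. A sketch, modeled on ioport.c of this era (tracing omitted):

/* Sketch of cpu_outw, modeled on ioport.c of this era. */
void cpu_outw(pio_addr_t addr, uint16_t val)
{
    uint16_t buf;

    stw_p(&buf, val);   /* store in little-endian byte order */
    address_space_write(&address_space_io, addr, (uint8_t *)&buf, 2);
}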

At the top of the stack, bdrv_aio_writev eventually reaches bdrv_co_aio_rw_vector, which calls co = qemu_coroutine_create(bdrv_co_do_rw) to create a coroutine that executes bdrv_co_do_rw.
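
A sketch of that wrapper, modeled on block.c of this period (the AIOCB bookkeeping is abbreviated and the exact signature varies between versions):

/* Sketch modeled on bdrv_co_aio_rw_vector in block.c of this period. */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    /* Record the request in the AIOCB so bdrv_co_do_rw can find it. */
    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    /* Run the actual read/write inside a coroutine, so the block layer
     * can yield while the host IO is in flight. */
    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

The call stack under bdrv_co_do_rw then looks like this: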

#1  0x000055555563653c in paio_submit (bs=0x5555562a13d0, fd=10, sector_num=2,
    qiov=0x555556715ab0, nb_sectors=1,
    cb=0x5555556028b1 <bdrv_co_io_em_complete>, opaque=0x555556964e30, type=1)
    at block/raw-posix.c:825
#2  0x0000555555636659 in raw_aio_submit (bs=0x5555562a13d0, sector_num=2,
    qiov=0x555556715ab0, nb_sectors=1,
    cb=0x5555556028b1 <bdrv_co_io_em_complete>, opaque=0x555556964e30, type=1)
    at block/raw-posix.c:853
#3  0x00005555556366c9 in raw_aio_readv (bs=0x5555562a13d0, sector_num=2,
    qiov=0x555556715ab0, nb_sectors=1,
    cb=0x5555556028b1 <bdrv_co_io_em_complete>, opaque=0x555556964e30)
    at block/raw-posix.c:861
#4  0x00005555556029b8 in bdrv_co_io_em (bs=0x5555562a13d0, sector_num=2,
    nb_sectors=1, iov=0x555556715ab0, is_write=false) at block.c:4038
#5  0x0000555555602a49 in bdrv_co_readv_em (bs=0x5555562a13d0, sector_num=2,
    nb_sectors=1, iov=0x555556715ab0) at block.c:4055
#6  0x00005555555fed61 in bdrv_co_do_readv (bs=0x5555562a13d0, sector_num=2,
    nb_sectors=1, qiov=0x555556715ab0, flags=0) at block.c:2547
#7  0x00005555555fee03 in bdrv_co_readv (bs=0x5555562a13d0, sector_num=2,
    nb_sectors=1, qiov=0x555556715ab0) at block.c:2573
#8  0x0000555555637d8c in raw_co_readv (bs=0x55555629e9b0, sector_num=2,
    nb_sectors=1, qiov=0x555556715ab0) at block/raw.c:47
#9  0x00005555555fed61 in bdrv_co_do_readv (bs=0x55555629e9b0, sector_num=2,
    nb_sectors=1, qiov=0x555556715ab0, flags=0) at block.c:2547
#10 0x00005555556023af in bdrv_co_do_rw

Finally, paio_submit submits a request to a thread pool with thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); the pool's scheduler then runs aio_worker, which is the function that actually performs the IO, reading and writing the disk with pread and pwrite. See the sketch below.
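
A simplified sketch of the worker, modeled on block/raw-posix.c of this period (many request types and error paths are omitted; handle_aiocb_rw stands for the loop that retries pread()/pwrite() until the whole request has been transferred):

/* Simplified sketch modeled on aio_worker in block/raw-posix.c. */
static int aio_worker(void *arg)
{
    RawPosixAIOData *aiocb = arg;
    ssize_t ret;

    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
    case QEMU_AIO_READ:
    case QEMU_AIO_WRITE:
        /* handle_aiocb_rw loops over pread()/pwrite() on the image
         * file descriptor until aio_nbytes bytes are transferred. */
        ret = handle_aiocb_rw(aiocb);
        break;
    case QEMU_AIO_FLUSH:
        ret = qemu_fdatasync(aiocb->aio_fildes);
        break;
    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

When the worker finishes, completion is signalled back to the main loop, which re-enters the waiting coroutine and ultimately fires the device model's completion callback (ide_sector_write_cb in the first backtrace above).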

    Once QEMU has completed the IO operation, the loop in kvm_cpu_exec calls kvm_vcpu_ioctl again to re-enter KVM.

    This completes the walkthrough of how an IO operation is handled across KVM and QEMU.

 

--------------------- This article is from dashulu's CSDN blog: https://blog.csdn.net/dashulu/article/details/17090293