1. 程式人生 > >linux核心分析之缺頁中斷

linux核心分析之缺頁中斷

linux缺頁異常處理程式必須能夠區分由程式設計錯誤引起的異常,以及由引用屬於程序地址空間但尚未分配物理頁框的頁所引起的異常。在x86-ia32體系上由do_page_fault函式處理,每個版本有所差異,現分析的版本為2.6.32

 /*
  * do_page_fault - x86 page-fault exception handler (version analysed
  * here: Linux 2.6.32).
  *
  * regs:       register values of the CPU at the time the exception
  *             occurred.
  * error_code: error code pushed onto the stack by the control unit
  *             when the exception occurs. Bit meanings:
  *   - bit 0: clear => the fault was caused by an access to a page that
  *            is not present; set => caused by an invalid access
  *            permission (protection fault).
  *   - bit 1: clear => the access was a read or an instruction
  *            execution; set => the access was a write.
  *   - bit 2: clear => the processor was in kernel mode; set => the
  *            processor was in user mode.
  *   - bit 3: set => use of a reserved bit was detected.
  *   - bit 4: set => the fault occurred during an instruction fetch.
  *
  * The function returns nothing; every outcome is reported through the
  * helper it calls on the corresponding path.
  */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	unsigned long address;
	struct mm_struct *mm;
	int write;
	int fault;
	/*
	 * Fetch the descriptor of the task currently running on this CPU,
	 * then that task's memory descriptor.
	 */
	tsk = current;
	mm = tsk->mm;

	/* Get the faulting address: */
	address = read_cr2();

	/*
	 * Detect and handle instructions that would cause a page fault for
	 * both a tracked kernel page and a userspace page.
	 */
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);
	prefetchw(&mm->mmap_sem);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	/* The faulting address lies in kernel space. */
	if (unlikely(fault_in_kernel_space(address))) {
		/*
		 * Only when none of the reserved-bit, user-mode and
		 * protection flags is set, i.e. a kernel-mode access to a
		 * not-present page.
		 */
		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
			/*
			 * Access to the kernel's non-contiguous (vmalloc)
			 * area: the entry for this address is copied from
			 * the reference page table (init_mm) into the
			 * current process's page table so the fault does
			 * not recur. A non-negative result means it was
			 * handled.
			 * NOTE(review): per the original annotation a
			 * missing reference entry indicates a kernel bug
			 * and yields -1 - confirm against vmalloc_fault().
			 */
			if (vmalloc_fault(address) >= 0)
				return;
			/*
			 * kmemcheck support depends on a kernel build
			 * option; the original annotation notes it was not
			 * enabled in the analysed build, so this call can
			 * be skipped when reading.
			 */
			if (kmemcheck_fault(regs, address, error_code))
				return;
		}

		/* Can handle a stale RO->RW TLB: */
		/*
		 * NOTE(review): per the original annotation this checks the
		 * write/execute permissions of the page-table entry for
		 * the kernel address - confirm against spurious_fault().
		 */
		if (spurious_fault(error_code, address))
			return;

		/* kprobes don't want to hook the spurious faults: */
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock:
		 */
		/*
		 * None of the checks above resolved the fault: go straight
		 * to the bad-access handling.
		 */
                bad_area_nosemaphore(regs, error_code, address);

		return;
	}

	/* kprobes don't want to hook the spurious faults: */
	if (unlikely(notify_page_fault(regs)))
		return;
	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode_vm(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	/*
	 * A reserved bit was used.
	 * NOTE(review): per the original annotation pgtable_bad() dumps the
	 * CPU registers, the kernel-mode stack and page-table information
	 * to the console / system message buffer and then kills the
	 * current process - the helper is not visible here, confirm.
	 */
	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in an atomic region then we must not take the fault:
	 */
	if (unlikely(in_atomic() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in
	 * the kernel and should generate an OOPS.  Unfortunately, in the
	 * case of an erroneous fault occurring in a code path which already
	 * holds mmap_sem we will deadlock attempting to validate the fault
	 * against the address space.  Luckily the kernel only validly
	 * references user space from well defined areas of code, which are
	 * listed in the exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a
	 * deadlock. Attempt to lock the address space, if we cannot we then
	 * validate the source. If this is invalid we can skip the address
	 * space check, thus avoiding the deadlock:
	 */
	/*
	 * From here on the faulting address is known to be in user space
	 * (the kernel-space case returned above).
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * The fault was raised in kernel mode (PF_USER clear):
		 * consult the exception table. A legitimate kernel access
		 * to user space has its instruction address listed there,
		 * in which case the fault may still turn out to be either
		 * demand paging or an illegal access. If the instruction
		 * address is not listed, this is a kernel error.
		 */
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}
	/* Look up the VMA for the faulting address. */
	vma = find_vma(mm, address);
	/* No VMA was returned for this address: illegal access. */
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * 1. vma->vm_start <= address: the address lies inside the VMA,
	 *    continue with the "valid access" stage.
	 * 2. vma->vm_start > address: the address is below the VMA; this
	 *    may still be a legitimate fault caused by the user stack
	 *    growing (a push).
	 */
	if (likely(vma->vm_start <= address))
		goto good_area;
	/* Stack growth is only possible if the VMA is marked grows-down. */
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	/* The extra stack-pointer check applies to user-mode faults only. */
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535, $31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		/* Validate the faulting address against the stack pointer. */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	/* Expand the stack; a non-zero result is treated as a bad access. */
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	write = error_code & PF_WRITE;
	/* Check the access type against the VMA's permissions once more. */
	if (unlikely(access_error(error_code, write, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault:
	 */
	/*
	 * Hand the fault to the generic memory-management code, passing
	 * FAULT_FLAG_WRITE for write accesses (per the original annotation
	 * this is where a new page frame gets allocated).
	 */
	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);

	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, error_code, address, fault);
		return;
	}

	/*
	 * Account the fault as major or minor on the task and emit the
	 * matching perf software event.
	 */
	if (fault & VM_FAULT_MAJOR) {
		tsk->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
				     regs, address);
	} else {
		tsk->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
				     regs, address);
	}

	check_v8086_mode(regs, address, tsk);

	/*
	 * Release the mmap_sem read lock taken above.
	 * NOTE(review): the early-return paths after the lock was taken do
	 * not call up_read() in this function; presumably bad_area(),
	 * bad_area_access_error() and mm_fault_error() release mmap_sem
	 * themselves - confirm against their definitions.
	 */
	up_read(&mm->mmap_sem);
}

大致流程中分為:

地址為核心空間:

1,當地址為核心地址空間並且在核心中訪問時,如果是非連續記憶體地址,將init_mm中對應的項複製到本程序對應的頁表項做修正;

2,地址為核心空間時,檢查頁表的訪問許可權;

3,如果1,2沒搞定,跳到非法訪問處理(在後面詳細分析這個);

地址為使用者空間:

4,如果使用了保留位,列印資訊,殺死當前程序;

5,如果在中斷上下文中或臨界區中時,直接跳到非法訪問;

6,如果出錯發生在核心態(且無法立即取得mmap_sem),檢視異常表,進行相應的處理;

7,查詢地址對應的vma,如果找不到,直接跳到非法訪問處,如果找到且vma->vm_start<=address,跳到good_area;

8,如果vma->vm_start>address,可能是棧太小,對其進行擴充套件;

9,good_area處,再次檢查許可權;

10,許可權正確後分配新頁框,頁表等;