1. 程式人生 > >mmap核心原始碼分析,基於核心版本3.10(三)

mmap核心原始碼分析,基於核心版本3.10(三)

之前寫了(一)(二),其實只梳理到了get_unmapped_area的內容,而且有一點混亂。這裡進行第三篇的講解,講解do_mmap_pgoff中除了get_unmapped_area之外的內容,來了解mmap的具體實現。通過(一)(二)(三)來將mmap核心原始碼進行一次梳理。可能過程有一點亂,所以最後準備寫一篇總結來總結mmap這樣一個流程。

在(三)中我準備從do_mmap_pgoff中的內容出發,分析(一)、(二)中沒有分析的內容。

直接上do_mmap_pgoff函式的內容,/mm/mmap.c檔案中的do_mmap_pgoff函式,給出了一定的中文註釋:

/*
 * do_mmap_pgoff - validate an mmap request and hand it to mmap_region().
 * @file:     file to map, or NULL when no file backs the mapping
 * @addr:     address hint; rounded with round_hint_to_min() unless MAP_FIXED
 * @len:      requested length in bytes; page-aligned inside this function
 * @prot:     PROT_* protection bits
 * @flags:    MAP_* flags
 * @pgoff:    offset of the mapping, in pages
 * @populate: out parameter; cleared on entry and set to @len when the
 *            mapping succeeded and either VM_LOCKED is set or MAP_POPULATE
 *            was requested without MAP_NONBLOCK
 *
 * Returns the address produced by mmap_region() on success, or a negative
 * errno value (as an unsigned long) on failure.
 */
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate)
{
	struct mm_struct * mm = current->mm;
	struct inode *inode;
	vm_flags_t vm_flags;

	*populate = 0;

	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 *  mounted, in which case we dont add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
			prot |= PROT_EXEC;

	if (!len)
		return -EINVAL;

	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);

	/* Careful about overflows: PAGE_ALIGN() may wrap len to 0. */
	len = PAGE_ALIGN(len);
	if (!len)
		return -ENOMEM;

	/* Does the page offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
               return -EOVERFLOW;

	/* Too many mappings? */
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);

	/* A result that is not page-aligned must be an error code; return it as-is. */
	if (addr & ~PAGE_MASK)
		return addr;

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	/* calc_vm_prot_bits() folds the mmap "prot" argument into the internal "vm_flags". */
	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	if (flags & MAP_LOCKED)
		if (!can_do_mlock())
			return -EPERM;

	/* mlock MCL_FUTURE? */
	if (vm_flags & VM_LOCKED) {
		unsigned long locked, lock_limit;
		/* Pages that would be locked after this mapping, compared with RLIMIT_MEMLOCK in pages. */
		locked = len >> PAGE_SHIFT;
		locked += mm->locked_vm;
		lock_limit = rlimit(RLIMIT_MEMLOCK);
		lock_limit >>= PAGE_SHIFT;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			return -EAGAIN;
	}

	inode = file ? file_inode(file) : NULL;

	if (file) {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
				return -EACCES;

			/*
			 * Make sure we don't allow writing to an append-only
			 * file..
			 */
			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
				return -EACCES;

			/*
			 * Make sure there are no mandatory locks on the file.
			 */
			if (locks_verify_locked(inode))
				return -EAGAIN;

			vm_flags |= VM_SHARED | VM_MAYSHARE;
			/* File not opened for writing: drop VM_MAYWRITE and VM_SHARED again. */
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

			/* fall through */
		case MAP_PRIVATE:
			if (!(file->f_mode & FMODE_READ))
				return -EACCES;
			/* noexec mount: refuse VM_EXEC and clear VM_MAYEXEC. */
			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
				if (vm_flags & VM_EXEC)
					return -EPERM;
				vm_flags &= ~VM_MAYEXEC;
			}

			/* The file must provide an ->mmap method. */
			if (!file->f_op || !file->f_op->mmap)
				return -ENODEV;
			break;

		default:
			return -EINVAL;
		}
	} else {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			/*
			 * Ignore pgoff.
			 */
			pgoff = 0;
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE:
			/*
			 * Set pgoff according to addr for anon_vma.
			 */
			pgoff = addr >> PAGE_SHIFT;
			break;
		default:
			return -EINVAL;
		}
	}

	/*
	 * Set 'VM_NORESERVE' if we should not account for the
	 * memory use of this mapping.
	 */
	if (flags & MAP_NORESERVE) {
		/* We honor MAP_NORESERVE if allowed to overcommit */
		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			vm_flags |= VM_NORESERVE;

		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
		if (file && is_file_hugepages(file))
			vm_flags |= VM_NORESERVE;
	}

	addr = mmap_region(file, addr, len, vm_flags, pgoff);
	/*
	 * Report len through *populate only on success, and only for VM_LOCKED
	 * mappings or MAP_POPULATE requests that did not also set MAP_NONBLOCK.
	 */
	if (!IS_ERR_VALUE(addr) &&
	    ((vm_flags & VM_LOCKED) ||
	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
		*populate = len;
	return addr;
}

從do_mmap_pgoff中可以看到,其實get_unmapped_area函式只是返回了新線性區的地址,除此之外真正要做的工作還有很多,現在開始一步一步進行分析:

在do_mmap_pgoff的最開始做了一些標誌的判斷,然後呼叫了get_unmapped_area函式,返回了一個addr(這個過程在(一)、(二)中有詳細講解)。對get_unmapped_area返回的addr進行校驗,如果addr不滿足頁對齊,那麼說明get_unmapped_area函式返回的是一個錯誤碼,直接將這個錯誤碼返回即可。

如果addr正常,繼續往下分析,呼叫了calc_vm_prot_bits函式將mmap的prot引數合併到內部使用的vm_flags中,只有在prot中設定了相應的PROT_READ、PROT_WRITE、PROT_EXEC等標誌,calc_vm_prot_bits函式才會在vm_flags中設定VM_READ、VM_WRITE、VM_EXEC標誌。同樣的,只有在flags中設定了相應的MAP_GROWSDOWN、MAP_EXECUTABLE等標誌,calc_vm_flag_bits函式才會在vm_flags中設定VM_GROWSDOWN、VM_EXECUTABLE等標誌。

繼續往下看又是flags的判斷,然後根據是否為檔案來獲取inode,如果是檔案就獲取這個檔案的inode。if(file)……else中進行了一些判斷,保證mmap的順利進行,if(file)……else之後呼叫了mmap_region函式

addr = mmap_region(file, addr, len, vm_flags, pgoff);

我們進入到mmap_region函式中檢視其中的具體內容:

/*
 * mmap_region - install a mapping for [addr, addr + len) in current->mm.
 * @file:     file backing the mapping, or NULL
 * @addr:     start address of the mapping
 * @len:      length in bytes
 * @vm_flags: VM_* flags for the new area
 * @pgoff:    offset of the mapping, in pages
 *
 * Existing mappings in the range are removed with do_munmap(). The range is
 * then either merged into a neighbouring area by vma_merge() or given a newly
 * allocated vm_area_struct that is linked in with vma_link().
 *
 * Returns the start address of the mapping (re-read from vma->vm_start after
 * the file's ->mmap method has run), or a negative errno value on failure.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	int correct_wcount = 0;	/* set once deny_write_access() succeeded */
	int error;
	struct rb_node **rb_link, *rb_parent;
	unsigned long charged = 0;	/* pages charged via security_vm_enough_memory_mm() */
	struct inode *inode =  file ? file_inode(file) : NULL;

	/* Check against address space limit. */
	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
		unsigned long nr_pages;

		/*
		 * MAP_FIXED may remove pages of mappings that intersects with
		 * requested mapping. Account for the pages it would unmap.
		 *
		 * NOTE(review): MAP_FIXED is an mmap() flag (do_mmap_pgoff tests
		 * it against "flags"), yet here it is tested against vm_flags,
		 * which holds VM_* bits - confirm this test is intended.
		 */
		if (!(vm_flags & MAP_FIXED))
			return -ENOMEM;

		nr_pages = count_vma_pages_range(mm, addr, addr + len);

		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
			return -ENOMEM;
	}

	/* Clear old maps */
	error = -ENOMEM;
munmap_back:
	/* Retry the lookup until find_vma_links() no longer returns non-zero. */
	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/*
	 * Private writable mapping: check memory availability
	 */
	if (accountable_mapping(file, vm_flags)) {
		charged = len >> PAGE_SHIFT;
		if (security_vm_enough_memory_mm(mm, charged))
			return -ENOMEM;
		vm_flags |= VM_ACCOUNT;
	}

	/*
	 * Can we just expand an old mapping?
	 */
	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
	if (vma)
		goto out;

	/*
	 * Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;
	INIT_LIST_HEAD(&vma->anon_vma_chain);

	error = -EINVAL;	/* when rejecting VM_GROWSDOWN|VM_GROWSUP */

	if (file) {
		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
			goto free_vma;
		if (vm_flags & VM_DENYWRITE) {
			error = deny_write_access(file);
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		vma->vm_file = get_file(file);
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;

		/* Can addr have changed??
		 *
		 * Answer: Yes, several device drivers can do it in their
		 *         f_op->mmap method. -DaveM
		 * Bug: If addr is changed, prev, rb_link, rb_parent should
		 *      be updated for vma_link()
		 */
		WARN_ON_ONCE(addr != vma->vm_start);

		/* Re-read the values the ->mmap method may have modified. */
		addr = vma->vm_start;
		pgoff = vma->vm_pgoff;
		vm_flags = vma->vm_flags;
	} else if (vm_flags & VM_SHARED) {
		/* No file but VM_SHARED: set the area up via shmem_zero_setup(). */
		if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
			goto free_vma;
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

	if (vma_wants_writenotify(vma)) {
		pgprot_t pprot = vma->vm_page_prot;

		/* Can vma->vm_page_prot have changed??
		 *
		 * Answer: Yes, drivers may have changed it in their
		 *         f_op->mmap method.
		 *
		 * Ensures that vmas marked as uncached stay that way.
		 */
		vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
		if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
			vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	}

	vma_link(mm, vma, prev, rb_link, rb_parent);
	/* Pick up vma->vm_file, which shmem_zero_setup() may have set. */
	file = vma->vm_file;

	/* Once vma denies write, undo our temporary denial count */
	if (correct_wcount)
		atomic_inc(&inode->i_writecount);
out:
	perf_event_mmap(vma);

	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) {
		/*
		 * Count the pages in mm->locked_vm unless the area is
		 * VM_SPECIAL, a hugetlb area or the gate vma; in those cases
		 * clear VM_LOCKED on the vma instead.
		 */
		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
					vma == get_gate_vma(current->mm)))
			mm->locked_vm += (len >> PAGE_SHIFT);
		else
			vma->vm_flags &= ~VM_LOCKED;
	}

	if (file)
		uprobe_mmap(vma);

	return addr;

unmap_and_free_vma:
	/* ->mmap failed: undo the write denial and release the file taken with get_file(). */
	if (correct_wcount)
		atomic_inc(&inode->i_writecount);
	vma->vm_file = NULL;
	fput(file);

	/* Undo any partial mapping done by a device driver. */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	/* Zero the count so the vm_unacct_memory() call below is skipped. */
	charged = 0;
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
unacct_error:
	if (charged)
		vm_unacct_memory(charged);
	return error;
}

在mmap_region中,可以看到這樣的一條語句:

	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

mmap_region呼叫find_vma_links函式確定處於新區間之前的線性區物件的位置,以及在紅-黑樹中新線性區的位置。同時find_vma_links函式也檢查是否還存在與新區間重疊的線性區。如果存在,就呼叫do_munmap函式解除新區間範圍內已有的對映,然後跳回munmap_back重新檢查。

檢查無誤後,再檢查記憶體的可用性,可用就繼續往下通過vma_merge函式返回了一個vma,語句如下:

vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);

呼叫vma_merge檢查前一個線性區是否可以以這樣的方式進行擴充套件來包含新的區間。同時需要保證,前一個線性區的標誌必須與vm_flags區域性變數中存放的標誌完全相同。如果前一個線性區可以擴充套件,那麼vma_merge函式就會試圖把它與隨後的線性區進行合併,如果合併成功就直接跳轉到out。

如果合併不成功,就繼續往下執行,呼叫kmem_cache_zalloc函式,為新的線性區分配一個vm_area_struct結構。這裡的vma_merge和kmem_cache_zalloc這兩個函式都比較重要。

	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}

進入到kmem_cache_zalloc函式中檢視其內容:

/*
 * kmem_cache_zalloc - allocate an object from cache @k with __GFP_ZERO set.
 * @k:     cache to allocate from
 * @flags: allocation flags; __GFP_ZERO is OR'ed in before the call
 *
 * Thin wrapper: returns whatever kmem_cache_alloc() returns.
 * NOTE(review): presumably __GFP_ZERO makes the returned object zero-filled;
 * confirm against the allocator.
 */
static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
{
	return kmem_cache_alloc(k, flags | __GFP_ZERO);
}

它實際呼叫了kmem_cache_alloc函式,繼續進入到kmem_cache_alloc函式中檢視究竟

/*
 * kmem_cache_alloc - allocate an object from cache @s.
 * @s:        cache to allocate from
 * @gfpflags: allocation flags passed through to slab_alloc()
 *
 * Returns the pointer obtained from slab_alloc().
 */
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
	/* _RET_IP_ is handed to slab_alloc() and to the trace call below. */
	void *ret = slab_alloc(s, gfpflags, _RET_IP_);

	/* Trace the allocation: result, the cache's object_size and size, and the flags. */
	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);

	return ret;
}

可以發現,在kmem_cache_alloc中呼叫了slab分配函式,slab_alloc函式。

回到mmap_region函式中,kmem_cache_zalloc返回了這個vma之後,就開始對vma進行賦值,初始化了新的線性區物件。

接下來判斷如果是檔案,則通過

vma->vm_file = get_file(file);

對vm_file進行賦值等。

如果MAP_SHARED標誌被設定,同時新的線性區不對映磁碟上的檔案,則這個線性區是一個共享匿名區,因此又呼叫了shmem_zero_setup函式對vma進行了初始化。

else if (vm_flags & VM_SHARED) {
		if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
			goto free_vma;
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

進入到shmem_zero_setup函式中檢視其內容:

int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_vm_ops;
	return 0;
}

發現該函式和dev/zero有關,這與匿名記憶體對映是相關的:它通過shmem_file_setup建立了一個名為"dev/zero"的檔案,然後將這個檔案賦值給了vma->vm_file。

回到mmap_region函式中,繼續往下看

if (vma_wants_writenotify(vma))

這裡通過vma_wants_writenotify函式進行判斷:某些共享對映希望把頁面標記為只讀,以便跟蹤寫入事件。如果是這樣,會將vm_page_prot降級為私有版本(使用protection_map[]時不帶VM_SHARED位)。

繼續往下

	vma_link(mm, vma, prev, rb_link, rb_parent);
	file = vma->vm_file;

這裡呼叫了vma_link函式,進入到vma_link函式中,檢視裡面做了什麼事情:

/*
 * vma_link - link a VMA into @mm.
 * @mm:        address space receiving the VMA
 * @vma:       area to link
 * @prev:      passed through to __vma_link()
 * @rb_link:   passed through to __vma_link()
 * @rb_parent: passed through to __vma_link()
 *
 * When the VMA has a file with a non-NULL f_mapping, that mapping's
 * i_mmap_mutex is held across __vma_link() and __vma_link_file().
 * Afterwards mm->map_count is incremented and validate_mm() is called.
 */
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
		     struct vm_area_struct *prev, struct rb_node **rb_link,
		     struct rb_node *rb_parent)
{
	struct address_space *mapping;

	/* Only a VMA with a file has a mapping to lock. */
	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;

	if (mapping != NULL)
		mutex_lock(&mapping->i_mmap_mutex);

	__vma_link(mm, vma, prev, rb_link, rb_parent);
	__vma_link_file(vma);

	if (mapping != NULL)
		mutex_unlock(&mapping->i_mmap_mutex);

	mm->map_count++;
	validate_mm(mm);
}

我們發現裡面又呼叫了__vma_link函式,進入裡面繼續往下看,發現其實vma_link函式把新的線性區插入到了線性區連結串列和紅-黑樹中。

繼續往下到了out這裡

perf_event_mmap(vma);

呼叫了這樣一個函式,進入到裡面,發現在裡面對結構體perf_mmap_event進行了賦值,添加了這樣一個mmap_event事件。

繼續往下:

vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);

呼叫了vm_stat_account,我們進入到裡面

發現其實該函式主要是將記憶體描述符的total_vm欄位中存放的程序地址空間大小進行了增加。

	if (vm_flags & VM_LOCKED) {
		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
					vma == get_gate_vma(current->mm)))
			mm->locked_vm += (len >> PAGE_SHIFT);
		else
			vma->vm_flags &= ~VM_LOCKED;
	}

然後繼續判斷是否設定了VM_LOCKED標誌:如果設定了,並且該線性區不是VM_SPECIAL、hugetlb或gate vma,就把mm->locked_vm增加len >> PAGE_SHIFT個頁面;否則清除vma->vm_flags中的VM_LOCKED標誌。

最後

	if (file)
		uprobe_mmap(vma);

如果是file就呼叫了一個uprobe_mmap的函式,如果此時沒有活動的uprobe事件,我們可以跳過uprobe_mmap。

在uprobe_mmap函式中驗證指定的vma是否為可執行檔案vma。(這裡有點模糊,歡迎討論)

最後返回addr,至此mmap_region函式解析完畢,然後回到do_mmap_pgoff函式中,同樣也返回addr。

至此整個過程到此結束!!